From c29d13dec60e336d5f034e07836c237fed70108b Mon Sep 17 00:00:00 2001 From: Niko Papadopoulos <nikolaos.papadopoulos@embl.de> Date: Tue, 24 Jan 2023 16:52:20 +0100 Subject: [PATCH] looked at EC of second-best morphologs and why they sometimes disagree; looked at available choanos for HGT candidates --- analysis/revision-hgt-outgroup.ipynb | 284 ++++++++++ .../revision-second_best_morpholog-run.ipynb | 529 +++++++++++++++++- analysis/suppl-horizontal_gene_transfer.ipynb | 2 +- 3 files changed, 808 insertions(+), 7 deletions(-) create mode 100644 analysis/revision-hgt-outgroup.ipynb diff --git a/analysis/revision-hgt-outgroup.ipynb b/analysis/revision-hgt-outgroup.ipynb new file mode 100644 index 0000000..0856d47 --- /dev/null +++ b/analysis/revision-hgt-outgroup.ipynb @@ -0,0 +1,284 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "19a33b01-d8d4-49de-9e05-302c14fb7c42", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-01-24 16:43\n" + ] + } + ], + "source": [ + "from datetime import datetime, timezone\n", + "import pandas as pd\n", + "import pytz\n", + "\n", + "utc_dt = datetime.now(timezone.utc) # UTC time\n", + "dt = utc_dt.astimezone()\n", + "tz = pytz.timezone('Europe/Berlin')\n", + "berlin_now = datetime.now(tz)\n", + "print(f'{berlin_now:%Y-%m-%d %H:%M}')" + ] + }, + { + "cell_type": "markdown", + "id": "55f808fc-0a34-48d5-96b9-659d27f16f13", + "metadata": {}, + "source": [ + "The reviewers challenged us to look for the HGT candidates in the nearest non-metazoan outgroup, choanoflagellates. We are using _Salpingoeca rosetta_ and _Monosiga brevicollis_, two model choanoflagellates with publicly available genomes."
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c4ad4dc2-a115-4679-8947-c6ed78582bbd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "100 9376k 0 9376k 0 0 1607k 0 --:--:-- 0:00:05 --:--:-- 1472k\n", + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "100 6562k 0 6562k 0 0 2390k 0 --:--:-- 0:00:02 --:--:-- 2394k\n" + ] + } + ], + "source": [ + "!curl \"https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28proteome%3AUP000007799%29%29\" -o salpingoeca.faa\n", + "!curl \"https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28proteome%3AUP000001357%29%29\" -o monosiga.faa\n", + "!rm -rf tmp/" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "aae09518-9f38-4c3e-90e0-f8595589e748", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%bash --out salpingoeca.out --err salpingoeca.err\n", + "hgt_candidates=\"/Users/npapadop/Documents/data/coffe/hgt.pep\"\n", + "mmseqs easy-search ${hgt_candidates} \"./salpingoeca.faa\" salpingoeca.m8 tmp --search-type 2" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9f1aa456-1f6f-4523-b923-763b603b649c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%bash --out monosiga.out --err monosiga.err\n", + "hgt_candidates=\"/Users/npapadop/Documents/data/coffe/hgt.pep\"\n", + "mmseqs easy-search ${hgt_candidates} \"./monosiga.faa\" monosiga.m8 tmp --search-type 2" + ] + }, + { + "cell_type": "markdown", + "id": "80982650-856c-4b18-be0e-501b240533ab", + "metadata": {}, + "source": [ + "Let's have a look at the results:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "169716b0-3ce2-4eb7-8e53-915c99ec5122", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + 
"<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>1</th>\n", + " <th>2</th>\n", + " <th>3</th>\n", + " <th>4</th>\n", + " <th>5</th>\n", + " <th>6</th>\n", + " <th>7</th>\n", + " <th>8</th>\n", + " <th>9</th>\n", + " <th>10</th>\n", + " <th>11</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>c103983_g1_i1_m.71422,</td>\n", + " <td>A9V527</td>\n", + " <td>0.251</td>\n", + " <td>505</td>\n", + " <td>370</td>\n", + " <td>0</td>\n", + " <td>65</td>\n", + " <td>569</td>\n", + " <td>66</td>\n", + " <td>560</td>\n", + " <td>3.201000e-14</td>\n", + " <td>76</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 \\\n", + "0 c103983_g1_i1_m.71422, A9V527 0.251 505 370 0 65 569 66 560 \n", + "\n", + " 10 11 \n", + "0 3.201000e-14 76 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_csv(\"monosiga.m8\", sep=\"\\t\", header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8c324422-faed-4249-89da-cb1dafeac4d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " 
<th>0</th>\n", + " <th>1</th>\n", + " <th>2</th>\n", + " <th>3</th>\n", + " <th>4</th>\n", + " <th>5</th>\n", + " <th>6</th>\n", + " <th>7</th>\n", + " <th>8</th>\n", + " <th>9</th>\n", + " <th>10</th>\n", + " <th>11</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>c103983_g1_i1_m.71422,</td>\n", + " <td>F2UGB0</td>\n", + " <td>0.253</td>\n", + " <td>490</td>\n", + " <td>354</td>\n", + " <td>0</td>\n", + " <td>63</td>\n", + " <td>552</td>\n", + " <td>159</td>\n", + " <td>633</td>\n", + " <td>2.262000e-16</td>\n", + " <td>83</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 \\\n", + "0 c103983_g1_i1_m.71422, F2UGB0 0.253 490 354 0 63 552 159 633 \n", + "\n", + " 10 11 \n", + "0 2.262000e-16 83 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_csv(\"salpingoeca.m8\", sep=\"\\t\", header=None)" + ] + }, + { + "cell_type": "markdown", + "id": "8184c03c-4741-4499-a346-a31f4e4ed61c", + "metadata": {}, + "source": [ + "In both cases the only relevant hit that is found is c103983_g1, the gene EggNOG v5.0 identifies as \"metal-dependent hydrolase - Proteobacteria\" and MorF putatively identifies as an aminohydrolase." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analysis/revision-second_best_morpholog-run.ipynb b/analysis/revision-second_best_morpholog-run.ipynb index 114266e..bc753c2 100644 --- a/analysis/revision-second_best_morpholog-run.ipynb +++ b/analysis/revision-second_best_morpholog-run.ipynb @@ -10,7 +10,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2022-12-21 15:28\n" + "2023-01-24 15:27\n" ] } ], @@ -387,7 +387,7 @@ { "data": { "text/plain": [ - "<matplotlib.collections.PathCollection at 0x2cc05e740>" + "<matplotlib.collections.PathCollection at 0x2ceef8e80>" ] }, "execution_count": 19, @@ -729,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "id": "b7f15b68-ef6b-40ad-9368-f1d9a1220b2b", "metadata": {}, "outputs": [ @@ -739,7 +739,7 @@ "0.1088115396676074" ] }, - "execution_count": 33, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -750,7 +750,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 33, "id": "db54d8dc-302f-4dc3-8abf-03c357796d49", "metadata": {}, "outputs": [ @@ -760,7 +760,7 @@ "0.07933521480087802" ] }, - "execution_count": 38, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -769,6 +769,523 @@ "np.sum(top10p_12[~exclude] < 3) / np.sum(top10p_12[~exclude] > 3)" ] }, + { + "cell_type": "code", + "execution_count": 36, + "id": "d1c33949-76e8-4bdd-bbee-99824d43570a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.217579250720461" + ] + }, + "execution_count": 36, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "np.mean(top10p_12[~exclude][top10p_12[~exclude] < 4])" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "f3d9b8ec-e7fe-4708-9e90-5377a59b34e7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "query\n", + "132 0\n", + "142 3\n", + "170 0\n", + "215 0\n", + "252 1\n", + " ..\n", + "41535 1\n", + "41596 0\n", + "41621 1\n", + "41703 3\n", + "41934 1\n", + "Length: 694, dtype: int64" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top10p_12[~exclude][top10p_12[~exclude] < 4]" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "614bc862-d3ae-45bb-bae4-0dd45e9ae3cf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>query</th>\n", + " <th>bit score</th>\n", + " <th>uniprot</th>\n", + " <th>eggNOG_OGs</th>\n", + " <th>Preferred_name</th>\n", + " <th>EC</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1798577</th>\n", + " <td>41596</td>\n", + " <td>367</td>\n", + " <td>Q22707</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n", + " <td>DUSP11</td>\n", + " <td>3.1.3.16,3.1.3.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798580</th>\n", + " <td>41596</td>\n", + " <td>356</td>\n", + " <td>C0PFH1</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " <td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798581</th>\n", + " <td>41596</td>\n", + " <td>347</td>\n", + " 
<td>B7EQL6</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " <td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798582</th>\n", + " <td>41596</td>\n", + " <td>347</td>\n", + " <td>A0A1D6EGX8</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " <td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798583</th>\n", + " <td>41596</td>\n", + " <td>344</td>\n", + " <td>A0A0H5S9H8</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...</td>\n", + " <td>-</td>\n", + " <td>3.1.3.16,3.1.3.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798584</th>\n", + " <td>41596</td>\n", + " <td>343</td>\n", + " <td>Q6NXK5</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n", + " <td>DUSP11</td>\n", + " <td>3.1.3.16,3.1.3.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798585</th>\n", + " <td>41596</td>\n", + " <td>337</td>\n", + " <td>Q2QWJ7</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " <td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798588</th>\n", + " <td>41596</td>\n", + " <td>330</td>\n", + " <td>O75319</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n", + " <td>DUSP11</td>\n", + " <td>3.1.3.16,3.1.3.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798589</th>\n", + " <td>41596</td>\n", + " <td>328</td>\n", + " <td>Q4KM79</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n", + " <td>DUSP11</td>\n", + " <td>3.1.3.16,3.1.3.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798590</th>\n", + " <td>41596</td>\n", + " <td>323</td>\n", + " <td>A0A3P7GI08</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...</td>\n", + " <td>-</td>\n", + " <td>3.1.3.16,3.1.3.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798592</th>\n", + " <td>41596</td>\n", + " <td>319</td>\n", + " <td>I1N9F9</td>\n", + " 
<td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " <td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798595</th>\n", + " <td>41596</td>\n", + " <td>296</td>\n", + " <td>E9QD92</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n", + " <td>DUSP11</td>\n", + " <td>3.1.3.16,3.1.3.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798596</th>\n", + " <td>41596</td>\n", + " <td>288</td>\n", + " <td>Q8GSD7</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " <td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798597</th>\n", + " <td>41596</td>\n", + " <td>286</td>\n", + " <td>A0A0R0KQX0</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " <td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798598</th>\n", + " <td>41596</td>\n", + " <td>276</td>\n", + " <td>F4IYM6</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " <td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798599</th>\n", + " <td>41596</td>\n", + " <td>273</td>\n", + " <td>Q8SX38</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n", + " <td>DUSP11</td>\n", + " <td>3.1.3.16,3.1.3.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798600</th>\n", + " <td>41596</td>\n", + " <td>272</td>\n", + " <td>P34442</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...</td>\n", + " <td>-</td>\n", + " <td>3.1.3.16,3.1.3.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798601</th>\n", + " <td>41596</td>\n", + " <td>270</td>\n", + " <td>J9BD64</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n", + " <td>DUSP11</td>\n", + " <td>3.1.3.16,3.1.3.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798602</th>\n", + " <td>41596</td>\n", + " <td>265</td>\n", + " <td>K7MTE7</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " 
<td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798603</th>\n", + " <td>41596</td>\n", + " <td>259</td>\n", + " <td>A0A0N4U7Y1</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...</td>\n", + " <td>-</td>\n", + " <td>3.1.3.16,3.1.3.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798604</th>\n", + " <td>41596</td>\n", + " <td>259</td>\n", + " <td>K7KDH1</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " <td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798606</th>\n", + " <td>41596</td>\n", + " <td>254</td>\n", + " <td>Q6NY98</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n", + " <td>RNGTT</td>\n", + " <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798607</th>\n", + " <td>41596</td>\n", + " <td>254</td>\n", + " <td>I1KKA0</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " <td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798608</th>\n", + " <td>41596</td>\n", + " <td>250</td>\n", + " <td>K7K355</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n", + " <td>-</td>\n", + " <td>2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798609</th>\n", + " <td>41596</td>\n", + " <td>246</td>\n", + " <td>O60942</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n", + " <td>RNGTT</td>\n", + " <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798610</th>\n", + " <td>41596</td>\n", + " <td>238</td>\n", + " <td>O55236</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n", + " <td>RNGTT</td>\n", + " <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798611</th>\n", + " <td>41596</td>\n", + " <td>238</td>\n", + " <td>Q9VY44</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n", + " <td>RNGTT</td>\n", + " <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>1798612</th>\n", + " <td>41596</td>\n", + " <td>233</td>\n", + " <td>A0A5K4FAB6</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n", + " <td>RNGTT</td>\n", + " <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798614</th>\n", + " <td>41596</td>\n", + " <td>226</td>\n", + " <td>Q17607</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n", + " <td>RNGTT</td>\n", + " <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798615</th>\n", + " <td>41596</td>\n", + " <td>219</td>\n", + " <td>D3ZH30</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n", + " <td>RNGTT</td>\n", + " <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798616</th>\n", + " <td>41596</td>\n", + " <td>211</td>\n", + " <td>A0A0N4UCR5</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n", + " <td>RNGTT</td>\n", + " <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1798618</th>\n", + " <td>41596</td>\n", + " <td>198</td>\n", + " <td>A0A183XJR9</td>\n", + " <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n", + " <td>RNGTT</td>\n", + " <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " query bit score uniprot \\\n", + "1798577 41596 367 Q22707 \n", + "1798580 41596 356 C0PFH1 \n", + "1798581 41596 347 B7EQL6 \n", + "1798582 41596 347 A0A1D6EGX8 \n", + "1798583 41596 344 A0A0H5S9H8 \n", + "1798584 41596 343 Q6NXK5 \n", + "1798585 41596 337 Q2QWJ7 \n", + "1798588 41596 330 O75319 \n", + "1798589 41596 328 Q4KM79 \n", + "1798590 41596 323 A0A3P7GI08 \n", + "1798592 41596 319 I1N9F9 \n", + "1798595 41596 296 E9QD92 \n", + "1798596 41596 288 Q8GSD7 \n", + "1798597 41596 286 A0A0R0KQX0 \n", + "1798598 41596 276 F4IYM6 \n", + "1798599 41596 273 Q8SX38 \n", + "1798600 41596 272 P34442 \n", + "1798601 41596 270 J9BD64 \n", + "1798602 
41596 265 K7MTE7 \n", + "1798603 41596 259 A0A0N4U7Y1 \n", + "1798604 41596 259 K7KDH1 \n", + "1798606 41596 254 Q6NY98 \n", + "1798607 41596 254 I1KKA0 \n", + "1798608 41596 250 K7K355 \n", + "1798609 41596 246 O60942 \n", + "1798610 41596 238 O55236 \n", + "1798611 41596 238 Q9VY44 \n", + "1798612 41596 233 A0A5K4FAB6 \n", + "1798614 41596 226 Q17607 \n", + "1798615 41596 219 D3ZH30 \n", + "1798616 41596 211 A0A0N4UCR5 \n", + "1798618 41596 198 A0A183XJR9 \n", + "\n", + " eggNOG_OGs Preferred_name \\\n", + "1798577 COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33... DUSP11 \n", + "1798580 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... - \n", + "1798581 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... - \n", + "1798582 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... - \n", + "1798583 COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33... - \n", + "1798584 COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33... DUSP11 \n", + "1798585 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... - \n", + "1798588 COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33... DUSP11 \n", + "1798589 COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33... DUSP11 \n", + "1798590 COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33... - \n", + "1798592 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... - \n", + "1798595 COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33... DUSP11 \n", + "1798596 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... - \n", + "1798597 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... - \n", + "1798598 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... - \n", + "1798599 COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33... DUSP11 \n", + "1798600 COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33... - \n", + "1798601 COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33... DUSP11 \n", + "1798602 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... - \n", + "1798603 COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33... - \n", + "1798604 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... 
- \n", + "1798606 COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33... RNGTT \n", + "1798607 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... - \n", + "1798608 COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33... - \n", + "1798609 COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33... RNGTT \n", + "1798610 COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33... RNGTT \n", + "1798611 COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33... RNGTT \n", + "1798612 COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33... RNGTT \n", + "1798614 COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33... RNGTT \n", + "1798615 COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33... RNGTT \n", + "1798616 COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33... RNGTT \n", + "1798618 COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33... RNGTT \n", + "\n", + " EC \n", + "1798577 3.1.3.16,3.1.3.48 \n", + "1798580 2.7.7.50 \n", + "1798581 2.7.7.50 \n", + "1798582 2.7.7.50 \n", + "1798583 3.1.3.16,3.1.3.48 \n", + "1798584 3.1.3.16,3.1.3.48 \n", + "1798585 2.7.7.50 \n", + "1798588 3.1.3.16,3.1.3.48 \n", + "1798589 3.1.3.16,3.1.3.48 \n", + "1798590 3.1.3.16,3.1.3.48 \n", + "1798592 2.7.7.50 \n", + "1798595 3.1.3.16,3.1.3.48 \n", + "1798596 2.7.7.50 \n", + "1798597 2.7.7.50 \n", + "1798598 2.7.7.50 \n", + "1798599 3.1.3.16,3.1.3.48 \n", + "1798600 3.1.3.16,3.1.3.48 \n", + "1798601 3.1.3.16,3.1.3.48 \n", + "1798602 2.7.7.50 \n", + "1798603 3.1.3.16,3.1.3.48 \n", + "1798604 2.7.7.50 \n", + "1798606 1.6.5.3,1.6.99.3,2.7.7.50 \n", + "1798607 2.7.7.50 \n", + "1798608 2.7.7.50 \n", + "1798609 1.6.5.3,1.6.99.3,2.7.7.50 \n", + "1798610 1.6.5.3,1.6.99.3,2.7.7.50 \n", + "1798611 1.6.5.3,1.6.99.3,2.7.7.50 \n", + "1798612 1.6.5.3,1.6.99.3,2.7.7.50 \n", + "1798614 1.6.5.3,1.6.99.3,2.7.7.50 \n", + "1798615 1.6.5.3,1.6.99.3,2.7.7.50 \n", + "1798616 1.6.5.3,1.6.99.3,2.7.7.50 \n", + "1798618 1.6.5.3,1.6.99.3,2.7.7.50 " + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "slim[slim[\"query\"] == 
41596]" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "3ee9ee24-9274-403e-a269-d43b13c63573", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.hist(slim[slim[\"query\"] == 142][\"bit score\"], bins=50);" + ] + }, { "cell_type": "code", "execution_count": 37, diff --git a/analysis/suppl-horizontal_gene_transfer.ipynb b/analysis/suppl-horizontal_gene_transfer.ipynb index d6a508d..a074e64 100644 --- a/analysis/suppl-horizontal_gene_transfer.ipynb +++ b/analysis/suppl-horizontal_gene_transfer.ipynb @@ -677,7 +677,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.10.8" } }, "nbformat": 4, -- GitLab