From c29d13dec60e336d5f034e07836c237fed70108b Mon Sep 17 00:00:00 2001
From: Niko Papadopoulos <nikolaos.papadopoulos@embl.de>
Date: Tue, 24 Jan 2023 16:52:20 +0100
Subject: [PATCH] looked at EC of second-best morphologs and why they sometimes
 disagree; looked at available choanos for HGT candidates

---
 analysis/revision-hgt-outgroup.ipynb          | 284 ++++++++++
 .../revision-second_best_morpholog-run.ipynb  | 529 +++++++++++++++++-
 analysis/suppl-horizontal_gene_transfer.ipynb |   2 +-
 3 files changed, 808 insertions(+), 7 deletions(-)
 create mode 100644 analysis/revision-hgt-outgroup.ipynb

diff --git a/analysis/revision-hgt-outgroup.ipynb b/analysis/revision-hgt-outgroup.ipynb
new file mode 100644
index 0000000..0856d47
--- /dev/null
+++ b/analysis/revision-hgt-outgroup.ipynb
@@ -0,0 +1,284 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "19a33b01-d8d4-49de-9e05-302c14fb7c42",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2023-01-24 16:43\n"
+     ]
+    }
+   ],
+   "source": [
+    "from datetime import datetime, timezone\n",
+    "import pandas as pd\n",
+    "import pytz\n",
+    "\n",
+    "utc_dt = datetime.now(timezone.utc) # UTC time\n",
+    "dt = utc_dt.astimezone()\n",
+    "tz = pytz.timezone('Europe/Berlin')\n",
+    "berlin_now = datetime.now(tz)\n",
+    "print(f'{berlin_now:%Y-%m-%d %H:%M}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "55f808fc-0a34-48d5-96b9-659d27f16f13",
+   "metadata": {},
+   "source": [
+    "The reviewers challenged us to look for the HGT candidates in the nearest non-metazoan outgroup, choanoflagellates. We are using _Salpingoeca rosetta_ and _Monosiga brevicollis_, two model choanoflagellates with publicly available genomes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "c4ad4dc2-a115-4679-8947-c6ed78582bbd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
+      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
+      "100 9376k    0 9376k    0     0  1607k      0 --:--:--  0:00:05 --:--:-- 1472k\n",
+      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
+      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
+      "100 6562k    0 6562k    0     0  2390k      0 --:--:--  0:00:02 --:--:-- 2394k\n"
+     ]
+    }
+   ],
+   "source": [
+    "!curl \"https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28proteome%3AUP000007799%29%29\" -o salpingoeca.faa\n",
+    "!curl \"https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28proteome%3AUP000001357%29%29\" -o monosiga.faa\n",
+    "!rm -rf tmp/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "aae09518-9f38-4c3e-90e0-f8595589e748",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "%%bash --out salpingoeca.out --err salpingoeca.err\n",
+    "hgt_candidates=\"/Users/npapadop/Documents/data/coffe/hgt.pep\"\n",
+    "mmseqs easy-search ${hgt_candidates} \"./salpingoeca.faa\" salpingoeca.m8 tmp --search-type 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "9f1aa456-1f6f-4523-b923-763b603b649c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "%%bash --out monosiga.out --err monosiga.err\n",
+    "hgt_candidates=\"/Users/npapadop/Documents/data/coffe/hgt.pep\"\n",
+    "mmseqs easy-search ${hgt_candidates} \"./monosiga.faa\" monosiga.m8 tmp --search-type 2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "80982650-856c-4b18-be0e-501b240533ab",
+   "metadata": {},
+   "source": [
+    "Let's have a look at the results:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "169716b0-3ce2-4eb7-8e53-915c99ec5122",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "      <th>6</th>\n",
+       "      <th>7</th>\n",
+       "      <th>8</th>\n",
+       "      <th>9</th>\n",
+       "      <th>10</th>\n",
+       "      <th>11</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>c103983_g1_i1_m.71422,</td>\n",
+       "      <td>A9V527</td>\n",
+       "      <td>0.251</td>\n",
+       "      <td>505</td>\n",
+       "      <td>370</td>\n",
+       "      <td>0</td>\n",
+       "      <td>65</td>\n",
+       "      <td>569</td>\n",
+       "      <td>66</td>\n",
+       "      <td>560</td>\n",
+       "      <td>3.201000e-14</td>\n",
+       "      <td>76</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                       0       1      2    3    4   5   6    7   8    9   \\\n",
+       "0  c103983_g1_i1_m.71422,  A9V527  0.251  505  370   0  65  569  66  560   \n",
+       "\n",
+       "             10  11  \n",
+       "0  3.201000e-14  76  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.read_csv(\"monosiga.m8\", sep=\"\\t\", header=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "8c324422-faed-4249-89da-cb1dafeac4d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "      <th>6</th>\n",
+       "      <th>7</th>\n",
+       "      <th>8</th>\n",
+       "      <th>9</th>\n",
+       "      <th>10</th>\n",
+       "      <th>11</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>c103983_g1_i1_m.71422,</td>\n",
+       "      <td>F2UGB0</td>\n",
+       "      <td>0.253</td>\n",
+       "      <td>490</td>\n",
+       "      <td>354</td>\n",
+       "      <td>0</td>\n",
+       "      <td>63</td>\n",
+       "      <td>552</td>\n",
+       "      <td>159</td>\n",
+       "      <td>633</td>\n",
+       "      <td>2.262000e-16</td>\n",
+       "      <td>83</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                       0       1      2    3    4   5   6    7    8    9   \\\n",
+       "0  c103983_g1_i1_m.71422,  F2UGB0  0.253  490  354   0  63  552  159  633   \n",
+       "\n",
+       "             10  11  \n",
+       "0  2.262000e-16  83  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.read_csv(\"salpingoeca.m8\", sep=\"\\t\", header=None)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8184c03c-4741-4499-a346-a31f4e4ed61c",
+   "metadata": {},
+   "source": [
+    "In both cases, the only relevant hit is c103983_g1, the gene that EggNOG v5.0 identifies as \"metal-dependent hydrolase - Proteobacteria\" and that MorF putatively identifies as an aminohydrolase."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analysis/revision-second_best_morpholog-run.ipynb b/analysis/revision-second_best_morpholog-run.ipynb
index 114266e..bc753c2 100644
--- a/analysis/revision-second_best_morpholog-run.ipynb
+++ b/analysis/revision-second_best_morpholog-run.ipynb
@@ -10,7 +10,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2022-12-21 15:28\n"
+      "2023-01-24 15:27\n"
      ]
     }
    ],
@@ -387,7 +387,7 @@
     {
      "data": {
       "text/plain": [
-       "<matplotlib.collections.PathCollection at 0x2cc05e740>"
+       "<matplotlib.collections.PathCollection at 0x2ceef8e80>"
       ]
      },
      "execution_count": 19,
@@ -729,7 +729,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 32,
    "id": "b7f15b68-ef6b-40ad-9368-f1d9a1220b2b",
    "metadata": {},
    "outputs": [
@@ -739,7 +739,7 @@
        "0.1088115396676074"
       ]
      },
-     "execution_count": 33,
+     "execution_count": 32,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -750,7 +750,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 33,
    "id": "db54d8dc-302f-4dc3-8abf-03c357796d49",
    "metadata": {},
    "outputs": [
@@ -760,7 +760,7 @@
        "0.07933521480087802"
       ]
      },
-     "execution_count": 38,
+     "execution_count": 33,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -769,6 +769,523 @@
     "np.sum(top10p_12[~exclude] < 3) / np.sum(top10p_12[~exclude] > 3)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "d1c33949-76e8-4bdd-bbee-99824d43570a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1.217579250720461"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(top10p_12[~exclude][top10p_12[~exclude] < 4])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "f3d9b8ec-e7fe-4708-9e90-5377a59b34e7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "query\n",
+       "132      0\n",
+       "142      3\n",
+       "170      0\n",
+       "215      0\n",
+       "252      1\n",
+       "        ..\n",
+       "41535    1\n",
+       "41596    0\n",
+       "41621    1\n",
+       "41703    3\n",
+       "41934    1\n",
+       "Length: 694, dtype: int64"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "top10p_12[~exclude][top10p_12[~exclude] < 4]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "614bc862-d3ae-45bb-bae4-0dd45e9ae3cf",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>query</th>\n",
+       "      <th>bit score</th>\n",
+       "      <th>uniprot</th>\n",
+       "      <th>eggNOG_OGs</th>\n",
+       "      <th>Preferred_name</th>\n",
+       "      <th>EC</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1798577</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>367</td>\n",
+       "      <td>Q22707</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n",
+       "      <td>DUSP11</td>\n",
+       "      <td>3.1.3.16,3.1.3.48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798580</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>356</td>\n",
+       "      <td>C0PFH1</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798581</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>347</td>\n",
+       "      <td>B7EQL6</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798582</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>347</td>\n",
+       "      <td>A0A1D6EGX8</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798583</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>344</td>\n",
+       "      <td>A0A0H5S9H8</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>3.1.3.16,3.1.3.48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798584</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>343</td>\n",
+       "      <td>Q6NXK5</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n",
+       "      <td>DUSP11</td>\n",
+       "      <td>3.1.3.16,3.1.3.48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798585</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>337</td>\n",
+       "      <td>Q2QWJ7</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798588</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>330</td>\n",
+       "      <td>O75319</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n",
+       "      <td>DUSP11</td>\n",
+       "      <td>3.1.3.16,3.1.3.48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798589</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>328</td>\n",
+       "      <td>Q4KM79</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n",
+       "      <td>DUSP11</td>\n",
+       "      <td>3.1.3.16,3.1.3.48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798590</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>323</td>\n",
+       "      <td>A0A3P7GI08</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>3.1.3.16,3.1.3.48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798592</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>319</td>\n",
+       "      <td>I1N9F9</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798595</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>296</td>\n",
+       "      <td>E9QD92</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n",
+       "      <td>DUSP11</td>\n",
+       "      <td>3.1.3.16,3.1.3.48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798596</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>288</td>\n",
+       "      <td>Q8GSD7</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798597</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>286</td>\n",
+       "      <td>A0A0R0KQX0</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798598</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>276</td>\n",
+       "      <td>F4IYM6</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798599</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>273</td>\n",
+       "      <td>Q8SX38</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n",
+       "      <td>DUSP11</td>\n",
+       "      <td>3.1.3.16,3.1.3.48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798600</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>272</td>\n",
+       "      <td>P34442</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>3.1.3.16,3.1.3.48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798601</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>270</td>\n",
+       "      <td>J9BD64</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...</td>\n",
+       "      <td>DUSP11</td>\n",
+       "      <td>3.1.3.16,3.1.3.48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798602</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>265</td>\n",
+       "      <td>K7MTE7</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798603</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>259</td>\n",
+       "      <td>A0A0N4U7Y1</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>3.1.3.16,3.1.3.48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798604</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>259</td>\n",
+       "      <td>K7KDH1</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798606</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>254</td>\n",
+       "      <td>Q6NY98</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n",
+       "      <td>RNGTT</td>\n",
+       "      <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798607</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>254</td>\n",
+       "      <td>I1KKA0</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798608</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>250</td>\n",
+       "      <td>K7K355</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798609</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>246</td>\n",
+       "      <td>O60942</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n",
+       "      <td>RNGTT</td>\n",
+       "      <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798610</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>238</td>\n",
+       "      <td>O55236</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n",
+       "      <td>RNGTT</td>\n",
+       "      <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798611</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>238</td>\n",
+       "      <td>Q9VY44</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n",
+       "      <td>RNGTT</td>\n",
+       "      <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798612</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>233</td>\n",
+       "      <td>A0A5K4FAB6</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n",
+       "      <td>RNGTT</td>\n",
+       "      <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798614</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>226</td>\n",
+       "      <td>Q17607</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n",
+       "      <td>RNGTT</td>\n",
+       "      <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798615</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>219</td>\n",
+       "      <td>D3ZH30</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n",
+       "      <td>RNGTT</td>\n",
+       "      <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798616</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>211</td>\n",
+       "      <td>A0A0N4UCR5</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n",
+       "      <td>RNGTT</td>\n",
+       "      <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1798618</th>\n",
+       "      <td>41596</td>\n",
+       "      <td>198</td>\n",
+       "      <td>A0A183XJR9</td>\n",
+       "      <td>COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...</td>\n",
+       "      <td>RNGTT</td>\n",
+       "      <td>1.6.5.3,1.6.99.3,2.7.7.50</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         query  bit score     uniprot  \\\n",
+       "1798577  41596        367      Q22707   \n",
+       "1798580  41596        356      C0PFH1   \n",
+       "1798581  41596        347      B7EQL6   \n",
+       "1798582  41596        347  A0A1D6EGX8   \n",
+       "1798583  41596        344  A0A0H5S9H8   \n",
+       "1798584  41596        343      Q6NXK5   \n",
+       "1798585  41596        337      Q2QWJ7   \n",
+       "1798588  41596        330      O75319   \n",
+       "1798589  41596        328      Q4KM79   \n",
+       "1798590  41596        323  A0A3P7GI08   \n",
+       "1798592  41596        319      I1N9F9   \n",
+       "1798595  41596        296      E9QD92   \n",
+       "1798596  41596        288      Q8GSD7   \n",
+       "1798597  41596        286  A0A0R0KQX0   \n",
+       "1798598  41596        276      F4IYM6   \n",
+       "1798599  41596        273      Q8SX38   \n",
+       "1798600  41596        272      P34442   \n",
+       "1798601  41596        270      J9BD64   \n",
+       "1798602  41596        265      K7MTE7   \n",
+       "1798603  41596        259  A0A0N4U7Y1   \n",
+       "1798604  41596        259      K7KDH1   \n",
+       "1798606  41596        254      Q6NY98   \n",
+       "1798607  41596        254      I1KKA0   \n",
+       "1798608  41596        250      K7K355   \n",
+       "1798609  41596        246      O60942   \n",
+       "1798610  41596        238      O55236   \n",
+       "1798611  41596        238      Q9VY44   \n",
+       "1798612  41596        233  A0A5K4FAB6   \n",
+       "1798614  41596        226      Q17607   \n",
+       "1798615  41596        219      D3ZH30   \n",
+       "1798616  41596        211  A0A0N4UCR5   \n",
+       "1798618  41596        198  A0A183XJR9   \n",
+       "\n",
+       "                                                eggNOG_OGs Preferred_name  \\\n",
+       "1798577  COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...         DUSP11   \n",
+       "1798580  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798581  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798582  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798583  COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...              -   \n",
+       "1798584  COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...         DUSP11   \n",
+       "1798585  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798588  COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...         DUSP11   \n",
+       "1798589  COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...         DUSP11   \n",
+       "1798590  COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...              -   \n",
+       "1798592  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798595  COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...         DUSP11   \n",
+       "1798596  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798597  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798598  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798599  COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...         DUSP11   \n",
+       "1798600  COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...              -   \n",
+       "1798601  COG5226@1|root,KOG2386@2759|Eukaryota,38WCH@33...         DUSP11   \n",
+       "1798602  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798603  COG5226@1|root,KOG2386@2759|Eukaryota,3A2XY@33...              -   \n",
+       "1798604  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798606  COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...          RNGTT   \n",
+       "1798607  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798608  COG5226@1|root,KOG2386@2759|Eukaryota,37QNS@33...              -   \n",
+       "1798609  COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...          RNGTT   \n",
+       "1798610  COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...          RNGTT   \n",
+       "1798611  COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...          RNGTT   \n",
+       "1798612  COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...          RNGTT   \n",
+       "1798614  COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...          RNGTT   \n",
+       "1798615  COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...          RNGTT   \n",
+       "1798616  COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...          RNGTT   \n",
+       "1798618  COG5226@1|root,KOG2386@2759|Eukaryota,38DZV@33...          RNGTT   \n",
+       "\n",
+       "                                EC  \n",
+       "1798577          3.1.3.16,3.1.3.48  \n",
+       "1798580                   2.7.7.50  \n",
+       "1798581                   2.7.7.50  \n",
+       "1798582                   2.7.7.50  \n",
+       "1798583          3.1.3.16,3.1.3.48  \n",
+       "1798584          3.1.3.16,3.1.3.48  \n",
+       "1798585                   2.7.7.50  \n",
+       "1798588          3.1.3.16,3.1.3.48  \n",
+       "1798589          3.1.3.16,3.1.3.48  \n",
+       "1798590          3.1.3.16,3.1.3.48  \n",
+       "1798592                   2.7.7.50  \n",
+       "1798595          3.1.3.16,3.1.3.48  \n",
+       "1798596                   2.7.7.50  \n",
+       "1798597                   2.7.7.50  \n",
+       "1798598                   2.7.7.50  \n",
+       "1798599          3.1.3.16,3.1.3.48  \n",
+       "1798600          3.1.3.16,3.1.3.48  \n",
+       "1798601          3.1.3.16,3.1.3.48  \n",
+       "1798602                   2.7.7.50  \n",
+       "1798603          3.1.3.16,3.1.3.48  \n",
+       "1798604                   2.7.7.50  \n",
+       "1798606  1.6.5.3,1.6.99.3,2.7.7.50  \n",
+       "1798607                   2.7.7.50  \n",
+       "1798608                   2.7.7.50  \n",
+       "1798609  1.6.5.3,1.6.99.3,2.7.7.50  \n",
+       "1798610  1.6.5.3,1.6.99.3,2.7.7.50  \n",
+       "1798611  1.6.5.3,1.6.99.3,2.7.7.50  \n",
+       "1798612  1.6.5.3,1.6.99.3,2.7.7.50  \n",
+       "1798614  1.6.5.3,1.6.99.3,2.7.7.50  \n",
+       "1798615  1.6.5.3,1.6.99.3,2.7.7.50  \n",
+       "1798616  1.6.5.3,1.6.99.3,2.7.7.50  \n",
+       "1798618  1.6.5.3,1.6.99.3,2.7.7.50  "
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "slim[slim[\"query\"] == 41596]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "3ee9ee24-9274-403e-a269-d43b13c63573",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "fig, ax = plt.subplots()\n",
+    "ax.hist(slim[slim[\"query\"] == 142][\"bit score\"], bins=50);"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 37,
diff --git a/analysis/suppl-horizontal_gene_transfer.ipynb b/analysis/suppl-horizontal_gene_transfer.ipynb
index d6a508d..a074e64 100644
--- a/analysis/suppl-horizontal_gene_transfer.ipynb
+++ b/analysis/suppl-horizontal_gene_transfer.ipynb
@@ -677,7 +677,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.10.8"
   }
  },
  "nbformat": 4,
-- 
GitLab