diff --git a/analysis/prepare.ipynb b/analysis/prepare.ipynb
index 20d03d9662297e1c7548c87e030c5f66fa507113..c5407256a30fd97e37257c20a7a95aa1b7107074 100644
--- a/analysis/prepare.ipynb
+++ b/analysis/prepare.ipynb
@@ -107,12 +107,120 @@
     "fs_swp = \"/scratch/npapadop/foldseek_results/swissprot_score.tsv\"\n",
     "# AlphaFold predictions\n",
     "structure_list = \"/g/arendt/npapadop/data/spongilla_af/best_models\"\n",
-    "metadata = \"/g/arendt/npapadop/data/spongilla_af/\""
+    "metadata = \"/g/arendt/npapadop/data/spongilla_af/best_model_metadata/\""
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "ddd1c39e-ade1-432d-b21d-ec12e43cfe8e",
+   "id": "de3a52aa-8ecc-44bd-9002-7f1412200dc9",
+   "metadata": {},
+   "source": [
+    "## 1. Multiple sequence alignments\n",
+    "\n",
+    "First, read the MSAs and extract the number of sequences in each, as well as the sequence length and the _Spongilla_ transcript name."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f300273-15c7-41f3-acb3-658129284f25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Glob once so that N and the loop below are guaranteed to see the same files.\n",
+    "msa_files = glob.glob(metadata + \"*.a3m\")\n",
+    "N = len(msa_files)\n",
+    "seq_id = [\"\"] * N\n",
+    "no_seqs = [0] * N\n",
+    "seq_length = [0] * N\n",
+    "gene_name = [\"\"] * N\n",
+    "\n",
+    "for i, alignment in enumerate(tqdm(msa_files)):\n",
+    "    try:\n",
+    "        with open(alignment, \"r\") as f:\n",
+    "            lines = f.readlines()\n",
+    "    except FileNotFoundError:\n",
+    "        # The file disappeared between glob and open: leave this row's placeholder values untouched.\n",
+    "        continue\n",
+    "    # The file stem (name up to the first dot) is used as the query ID.\n",
+    "    seq_id[i] = alignment.split(\"/\")[-1].split(\".\")[0]\n",
+    "    # Line 0 is read as '#<query length> ...' and line 1 as '><gene name>'.\n",
+    "    # The remaining lines are counted in pairs (presumably header + sequence; confirm for this a3m flavour).\n",
+    "    # Floor division keeps the count an integer instead of a float.\n",
+    "    no_seqs[i] = (len(lines) - 3) // 2\n",
+    "    # Store the length as a number rather than the raw text token.\n",
+    "    seq_length[i] = int(lines[0].rstrip()[1:].split()[0])\n",
+    "    gene_name[i] = lines[1].rstrip()[1:]\n",
+    "\n",
+    "sequence_info = pd.DataFrame(\n",
+    "    {\"query\": seq_id, \"MSA size\": no_seqs, \"query length\": seq_length, \"gene name\": gene_name}\n",
+    ").set_index(\"query\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4e946268-f439-4203-a815-48a5384ccf31",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Split each gene name once at '.'; the piece after the dot is used as the protein ID.\n",
+    "name_parts = sequence_info[\"gene name\"].str.split(\".\")\n",
+    "sequence_info[\"protein_id\"] = name_parts.str[1]\n",
+    "# The gene ID is the first two '_'-separated tokens of the piece before the dot.\n",
+    "transcript_tokens = name_parts.str[0].str.split(\"_\")\n",
+    "sequence_info[\"gene_id\"] = transcript_tokens.str[:2].str.join(\"_\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f1731c0-af6a-4caa-bc1f-b665b59c05c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sequence_info.to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/sequence_info.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c06f7868-dbc8-4760-9c3c-c9f8b6be58ea",
+   "metadata": {},
+   "source": [
+    "## 2. AlphaFold predictions\n",
+    "\n",
+    "Next, read the per-residue pLDDT score from AlphaFold and average it; then keep the best-scoring isoform per gene ID."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eed723f4-314d-4b34-89dd-fc339fd89505",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List the structure directory a single time and reuse the listing for both the count and the loop.\n",
+    "structure_files = os.listdir(structure_list)\n",
+    "N = len(structure_files)\n",
+    "proteins = []\n",
+    "scores = []\n",
+    "\n",
+    "for protein in tqdm(structure_files):\n",
+    "    full_name = protein.split(\".\")[0]\n",
+    "    # Each structure is expected to have a matching <full_name>_scores.json in the metadata folder.\n",
+    "    with open(metadata + full_name + \"_scores.json\", \"r\") as f:\n",
+    "        score = json.load(f)\n",
+    "    # The query ID is whatever precedes the first underscore of the file name.\n",
+    "    proteins.append(full_name.split(\"_\")[0])\n",
+    "    # Collapse the per-residue pLDDT values into one mean per structure.\n",
+    "    scores.append(np.mean(score[\"plddt\"]))\n",
+    "\n",
+    "# Attach the MSA-derived columns, then keep only the highest-pLDDT row per gene_id.\n",
+    "# NOTE(review): drop_duplicates treats missing gene_id values as equal to each other, so\n",
+    "# queries without a sequence_info match collapse to a single row; confirm this is intended.\n",
+    "alphafold = (\n",
+    "    pd.DataFrame({\"query\": proteins, \"plddt\": scores})\n",
+    "    .set_index(\"query\")\n",
+    "    .join(sequence_info)\n",
+    "    .sort_values('plddt', ascending=False)\n",
+    "    .drop_duplicates(['gene_id'])\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "78ef2d0f-3a3b-46a3-ae53-b5093a61a5f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "alphafold.to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/structure_predictions.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f0ed1ef2-591e-4465-abd3-c3de9d7cce3f",
    "metadata": {},
    "source": [
     "Read sequence information for _Spongilla_ and the summary of the structure predictions"
@@ -125,8 +233,223 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sequence_info = pd.read_csv(\"../data/sequence_info.csv\", index_col=\"query\")\n",
-    "alphafold = pd.read_csv(\"../data/structure_predictions.csv\", index_col=\"query\")"
+    "sequence_info = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/sequence_info.csv\", index_col=\"query\")\n",
+    "alphafold = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/structure_predictions.csv\", index_col=\"query\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "0426d007-88f1-4e7e-8177-abc425f8122e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>plddt</th>\n",
+       "      <th>MSA size</th>\n",
+       "      <th>query length</th>\n",
+       "      <th>gene name</th>\n",
+       "      <th>protein_id</th>\n",
+       "      <th>gene_id</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>query</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>20236</th>\n",
+       "      <td>98.147179</td>\n",
+       "      <td>7655.0</td>\n",
+       "      <td>78</td>\n",
+       "      <td>c114736_g1_i1_m.91624</td>\n",
+       "      <td>91624</td>\n",
+       "      <td>c114736_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8471</th>\n",
+       "      <td>98.115354</td>\n",
+       "      <td>16675.0</td>\n",
+       "      <td>127</td>\n",
+       "      <td>c103108_g2_i1_m.62395</td>\n",
+       "      <td>62395</td>\n",
+       "      <td>c103108_g2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10371</th>\n",
+       "      <td>98.018750</td>\n",
+       "      <td>2519.0</td>\n",
+       "      <td>352</td>\n",
+       "      <td>c103630_g1_i2_m.67428</td>\n",
+       "      <td>67428</td>\n",
+       "      <td>c103630_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29803</th>\n",
+       "      <td>97.815000</td>\n",
+       "      <td>3461.0</td>\n",
+       "      <td>120</td>\n",
+       "      <td>c91796_g1_i1_m.16975</td>\n",
+       "      <td>16975</td>\n",
+       "      <td>c91796_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20860</th>\n",
+       "      <td>97.811325</td>\n",
+       "      <td>14752.0</td>\n",
+       "      <td>151</td>\n",
+       "      <td>c2715_g1_i1_m.171</td>\n",
+       "      <td>171</td>\n",
+       "      <td>c2715_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26721</th>\n",
+       "      <td>26.866756</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>712</td>\n",
+       "      <td>c87685_g1_i1_m.11391</td>\n",
+       "      <td>11391</td>\n",
+       "      <td>c87685_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>32956</th>\n",
+       "      <td>26.821802</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>172</td>\n",
+       "      <td>c94712_g2_i1_m.22872</td>\n",
+       "      <td>22872</td>\n",
+       "      <td>c94712_g2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>31710</th>\n",
+       "      <td>26.688600</td>\n",
+       "      <td>165.0</td>\n",
+       "      <td>200</td>\n",
+       "      <td>c93707_g1_i1_m.20473</td>\n",
+       "      <td>20473</td>\n",
+       "      <td>c93707_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1202</th>\n",
+       "      <td>26.157910</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>201</td>\n",
+       "      <td>c100524_g1_i1_m.44523</td>\n",
+       "      <td>44523</td>\n",
+       "      <td>c100524_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1796</th>\n",
+       "      <td>24.582727</td>\n",
+       "      <td>10.0</td>\n",
+       "      <td>297</td>\n",
+       "      <td>c100770_g2_i1_m.45897</td>\n",
+       "      <td>45897</td>\n",
+       "      <td>c100770_g2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>29662 rows × 6 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           plddt  MSA size  query length              gene name  protein_id  \\\n",
+       "query                                                                         \n",
+       "20236  98.147179    7655.0            78  c114736_g1_i1_m.91624       91624   \n",
+       "8471   98.115354   16675.0           127  c103108_g2_i1_m.62395       62395   \n",
+       "10371  98.018750    2519.0           352  c103630_g1_i2_m.67428       67428   \n",
+       "29803  97.815000    3461.0           120   c91796_g1_i1_m.16975       16975   \n",
+       "20860  97.811325   14752.0           151      c2715_g1_i1_m.171         171   \n",
+       "...          ...       ...           ...                    ...         ...   \n",
+       "26721  26.866756       3.0           712   c87685_g1_i1_m.11391       11391   \n",
+       "32956  26.821802       1.0           172   c94712_g2_i1_m.22872       22872   \n",
+       "31710  26.688600     165.0           200   c93707_g1_i1_m.20473       20473   \n",
+       "1202   26.157910       2.0           201  c100524_g1_i1_m.44523       44523   \n",
+       "1796   24.582727      10.0           297  c100770_g2_i1_m.45897       45897   \n",
+       "\n",
+       "          gene_id  \n",
+       "query              \n",
+       "20236  c114736_g1  \n",
+       "8471   c103108_g2  \n",
+       "10371  c103630_g1  \n",
+       "29803   c91796_g1  \n",
+       "20860    c2715_g1  \n",
+       "...           ...  \n",
+       "26721   c87685_g1  \n",
+       "32956   c94712_g2  \n",
+       "31710   c93707_g1  \n",
+       "1202   c100524_g1  \n",
+       "1796   c100770_g2  \n",
+       "\n",
+       "[29662 rows x 6 columns]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "alphafold"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "7efb172b-6adb-4f50-84f7-bb63adecd3c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# protein_id will hold NaNs later, so switch it to the nullable Int64 dtype, which can store nulls.\n",
+    "sequence_info = sequence_info.astype({\"protein_id\": \"Int64\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "2ad95435-9023-4d97-a0ce-cd7612e57a66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the three Foldseek result tables (read_foldseek is defined elsewhere in this notebook).\n",
+    "pdb = read_foldseek(fs_pdb)\n",
+    "afdb = read_foldseek(fs_afdb)\n",
+    "swp = read_foldseek(fs_swp)\n",
+    "\n",
+    "# Cast every query column to int - presumably to match the integer query index of sequence_info/alphafold; confirm.\n",
+    "for hits in (pdb, afdb, swp):\n",
+    "    hits[\"query\"] = hits[\"query\"].values.astype(int)"
    ]
   },
   {
@@ -202,7 +525,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 11,
    "id": "47c101a2-47b5-40db-9693-f6af15031437",
    "metadata": {},
    "outputs": [],
@@ -215,17 +538,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 12,
    "id": "c31956e6-ed67-4be5-8faf-f7c0ff4d7c58",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(999,)"
+       "(998,)"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -236,7 +559,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 13,
    "id": "b3e5038f-c7c9-4cef-961a-4e5682236f1c",
    "metadata": {},
    "outputs": [],
@@ -249,7 +572,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 14,
    "id": "507753ca-7ef1-4e82-be1d-bbc23d7ec553",
    "metadata": {},
    "outputs": [
@@ -257,7 +580,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 3/3 [00:22<00:00,  7.43s/it]\n"
+      "100%|██████████| 3/3 [00:53<00:00, 17.97s/it]\n"
      ]
     }
    ],
@@ -277,16 +600,6 @@
     "        response[i] = f.read()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4f8d91dc-ce25-4acd-ad0e-189076590080",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "response"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "f50296d5-ac0f-48fc-80ef-85b952b843f4",
@@ -297,15 +610,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 15,
    "id": "0acd0a47-df91-4ffd-9e8d-bc8cb8882005",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 142581/142581 [00:00<00:00, 1999873.12it/s]\n"
+      "100%|██████████| 142581/142581 [00:00<00:00, 1929888.50it/s]\n"
      ]
     }
    ],
@@ -313,86 +628,3467 @@
     "pdb = enrich_from_uniprot(pdb, \"target\", \"uniprot\", uniprot_from=\"PDB_ID\", uniprot_to=\"ACC\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "07e0cb24-97e1-49d5-8c2f-1f757d5b2a5f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "418323"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pdb[\"uniprot\"].str.contains(\",\").sum()"
+   ]
+  },
   {
    "cell_type": "markdown",
-   "id": "b8f492f6-450d-484f-ba07-8a396ea6c314",
+   "id": "3f03690e-ac94-4502-b372-f77da44ba6f3",
    "metadata": {},
    "source": [
-    "Translate the UniProt IDs to gene names; whatever obtained a UniProt ID should have a gene name, and this will be our fallback option if emapper annotation is not present:"
+    "418323 rows in the pdb table have multiple entries in the uniprot column, separated by commas. I will split them so that each UniProt ID gets its own row, duplicating the remaining columns."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "id": "d37e7a82-8017-4b16-9419-1cb45d41858d",
+   "execution_count": 17,
+   "id": "6233210d-ba4b-4ca3-afb4-1b4203550298",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Split comma-separated UniProt accessions and give each accession its own row; all other columns are repeated.\n",
+    "# explode() works row by row, so rows are not multiplied further when index labels repeat (an index-based\n",
+    "# join would cross-join them), and rows with a missing uniprot value are kept unchanged.\n",
+    "# The original index labels are kept (repeated for split rows) and uniprot stays in its current column position.\n",
+    "pdb = pdb.assign(uniprot=pdb[\"uniprot\"].str.split(\",\")).explode(\"uniprot\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "574b2698-4bcc-4c9d-8db8-61e327362d3e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pdb[\"uniprot\"].str.contains(\",\").sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "8525155a-ffdc-47e4-8bb6-ce720f0dbc49",
    "metadata": {},
    "outputs": [
     {
-     "ename": "HTTPError",
-     "evalue": "HTTP Error 413: ",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mHTTPError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[0;32m/tmp/ipykernel_224/4090906344.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# pdb_seq = enrich_from_uniprot(pdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;31m# swp_seq = enrich_from_uniprot(swp, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mafdb_seq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menrich_from_uniprot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mafdb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"uniprot\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"gene name\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_from\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"ACC\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_to\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"GENENAME\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;32m/tmp/ipykernel_224/1426142881.py\u001b[0m in \u001b[0;36menrich_from_uniprot\u001b[0;34m(df, column_from, column_to, uniprot_from, uniprot_to)\u001b[0m\n\u001b[1;32m     31\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     32\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0menrich_from_uniprot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_from\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_to\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_from\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"PDB_ID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_to\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"ACC\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 33\u001b[0;31m     \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_from_uniprot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_from\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_from\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muniprot_from\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_to\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muniprot_to\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     34\u001b[0m     \u001b[0mdf_map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_id_map\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_from\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_to\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     35\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mon\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcolumn_from\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/tmp/ipykernel_224/1426142881.py\u001b[0m in \u001b[0;36mget_from_uniprot\u001b[0;34m(df, column, uniprot_from, uniprot_to)\u001b[0m\n\u001b[1;32m     13\u001b[0m     \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'utf-8'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m     \u001b[0mreq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m     \u001b[0;32mwith\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     16\u001b[0m         \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     17\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(url, data, timeout, cafile, capath, cadefault, context)\u001b[0m\n\u001b[1;32m    212\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    213\u001b[0m         \u001b[0mopener\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_opener\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 214\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mopener\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    216\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minstall_opener\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopener\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(self, fullurl, data, timeout)\u001b[0m\n\u001b[1;32m    521\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mprocessor\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_response\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    522\u001b[0m             \u001b[0mmeth\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprocessor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 523\u001b[0;31m             \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmeth\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    524\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    525\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36mhttp_response\u001b[0;34m(self, request, response)\u001b[0m\n\u001b[1;32m    630\u001b[0m         \u001b[0;31m# request was successfully received, understood, and accepted.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    631\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m200\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mcode\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m300\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 632\u001b[0;31m             response = self.parent.error(\n\u001b[0m\u001b[1;32m    633\u001b[0m                 'http', request, response, code, msg, hdrs)\n\u001b[1;32m    634\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36merror\u001b[0;34m(self, proto, *args)\u001b[0m\n\u001b[1;32m    559\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mhttp_err\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    560\u001b[0m             \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'default'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'http_error_default'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0morig_args\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 561\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_chain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    562\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    563\u001b[0m \u001b[0;31m# XXX probably also want an abstract factory that knows when it makes\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36m_call_chain\u001b[0;34m(self, chain, kind, meth_name, *args)\u001b[0m\n\u001b[1;32m    492\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mhandler\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mhandlers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    493\u001b[0m             \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhandler\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 494\u001b[0;31m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    495\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    496\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36mhttp_error_default\u001b[0;34m(self, req, fp, code, msg, hdrs)\u001b[0m\n\u001b[1;32m    639\u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0mHTTPDefaultErrorHandler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mBaseHandler\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    640\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mhttp_error_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhdrs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 641\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mHTTPError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfull_url\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhdrs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    642\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    643\u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0mHTTPRedirectHandler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mBaseHandler\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mHTTPError\u001b[0m: HTTP Error 413: "
-     ]
+     "data": {
+      "text/plain": [
+       "4600922"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "# pdb_seq = enrich_from_uniprot(pdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\n",
-    "# swp_seq = enrich_from_uniprot(swp, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\n",
-    "afdb_seq = enrich_from_uniprot(afdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")"
+    "len(pdb)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "b80e0e49-c947-45a5-b386-ae3a4631780b",
+   "id": "7cc67429-33c7-4596-b269-7d6f71aed6f4",
    "metadata": {},
    "source": [
-    "Translate the UniProt IDs to eggNOG IDs:"
+    "I will try to use the tool UNIMAPI to retrieve information for the UniProt IDs, including the sequences that we will need for a new emapper run. \n",
+    "UNIMAPI takes a CSV file as input, so I will export the UniProt IDs from the Foldseek result tables as CSV files.\n",
+    "\n",
+    "I just realized that I need to add the commas and make it a one-liner. Because I don't know how to do this in Python, I will just use regex and save the files as afdb_uniprotIDs.txt, etc."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "de9d5f80-9ac8-4a14-a628-1e99af934403",
+   "id": "f0c4592d-1465-4ff8-bf7a-b20ac08b2586",
    "metadata": {},
    "outputs": [],
    "source": [
-    "pdb_seq = enrich_from_uniprot(pdb_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")\n",
-    "swp_seq = enrich_from_uniprot(swp_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")\n",
-    "afdb_seq = enrich_from_uniprot(afdb_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")"
+    "# Write the uniprot column of each Foldseek table to its own file: one ID per line, no header, no index.\n",
+    "# NOTE(review): IDs are written once per hit, so the files contain repeats (see the value_counts cells); confirm that is wanted.\n",
+    "uniprot_exports = (\n",
+    "    (afdb, \"/g/arendt/Fabian/PhD/Computational/Spongefold/afdb_uniprotIDs.csv\"),\n",
+    "    (pdb, \"/g/arendt/Fabian/PhD/Computational/Spongefold/pdb_uniprotIDs.csv\"),\n",
+    "    (swp, \"/g/arendt/Fabian/PhD/Computational/Spongefold/swp_uniprotIDs.csv\"),\n",
+    ")\n",
+    "for hits, out_path in uniprot_exports:\n",
+    "    hits[\"uniprot\"].to_csv(out_path, index=False, index_label=False, header=False, sep=\",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "c4f36d35-8a75-4577-a948-15f9d6a76945",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Q8WZ42        13900\n",
+       "Q99996         3626\n",
+       "Q8WXI7         3612\n",
+       "Q9Y6V0         3004\n",
+       "Q15149         2975\n",
+       "              ...  \n",
+       "A0A1P6CI10        1\n",
+       "K7LJC4            1\n",
+       "K7LPC1            1\n",
+       "A0A0N7KCM8        1\n",
+       "P71601            1\n",
+       "Name: uniprot, Length: 524585, dtype: int64"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of rows per UniProt ID in the afdb table, most frequent first.\n",
+    "afdb[\"uniprot\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "33436cf6-558d-42a1-9e3e-25f2c86d9e0f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "E3WDQ9    2605\n",
+       "Q5RC05    2472\n",
+       "Q5M7C3    2382\n",
+       "P59597    2346\n",
+       "P02467    2107\n",
+       "          ... \n",
+       "Q6LN45       1\n",
+       "Q56647       1\n",
+       "B1JQI1       1\n",
+       "C5FM58       1\n",
+       "G3KIM4       1\n",
+       "Name: uniprot, Length: 405846, dtype: int64"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of rows per UniProt ID in the swp table, most frequent first.\n",
+    "swp[\"uniprot\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "a84a9095-e6ba-4902-b7b0-192005816f5c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "P01116    18010\n",
+       "P0CX51    10846\n",
+       "O13516    10730\n",
+       "P05756    10359\n",
+       "Q3E7X9    10323\n",
+       "          ...  \n",
+       "P80379        1\n",
+       "Q56691        1\n",
+       "Q7SIH1        1\n",
+       "Q63041        1\n",
+       "Q484B6        1\n",
+       "Name: uniprot, Length: 35260, dtype: int64"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of rows per UniProt ID in the pdb table, most frequent first.\n",
+    "pdb[\"uniprot\"].value_counts()"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "8ec832ac-8525-438d-819a-cab2232f574f",
+   "id": "49605efa-c6ef-4f62-8c65-9c5fb87e07b8",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "afdb contains 524585 unique uniprot IDs. \n",
+    "swp contains 405846 unique uniprot IDs.\n",
+    "pdb contains 35260 unique uniprot IDs.\n",
+    "\n",
+    "\n",
+    "tr -d '\\n' < afdb_uniprotIDs.txt > afdb_uniprotIDs_oneline.txt\n",
+    "tr -d '\\n' < pdb_uniprotIDs.txt > pdb_uniprotIDs_oneline.txt\n",
+    "tr -d '\\n' < swp_uniprotIDs.txt > swp_uniprotIDs_oneline.txt\n",
+    "\n",
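+    "For the pdb one-liner file, I additionally had to remove the quotation marks (\") because sometimes a pdb entry has multiple UniProt IDs. So in principle I had to delete the cases which didn't have UniProt IDs (\"\",) and afterwards delete the empty lines. After that I had to delete leftover quotation marks. Note that the output has to go to a temporary file first: redirecting tr into the same file it is reading from would truncate that file before it is read.\n",
+    "\n",
+    "tr -d '\"' < pdb_uniprotIDs_oneline.txt > pdb_uniprotIDs_oneline.tmp && mv pdb_uniprotIDs_oneline.tmp pdb_uniprotIDs_oneline.txt\n",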
+    "\n",
+    "After that, I can run the UPIMAPI command for each file from the terminal, retrieving specifically selected information.\n",
+    "\n",
+    "First, I will make use of the --fasta option, which just returns a fasta file for the sequences. This fasta file can then be used to run EggNOG mapper manually.\n",
+    "\n",
+    "upimapi.py -i afdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/afdb/ --fasta\n",
+    "upimapi.py -i pdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/pdb/ --fasta\n",
+    "upimapi.py -i swp_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/swp/ --fasta\n",
+    "\n",
+    "Fasta files are saved in the respective folders as: uniprotinfo.fasta\n",
+    "\n",
+    "After UPIMAPI has translated the UniProt IDs to fasta, it tries to check whether all UniProt IDs have a corresponding fasta sequence. However, while the translation happens using the unique UniProt IDs, for some reason it checks whether all UniProt IDs (12.5 million in the case of afdb) are represented. I got a timeout after 24 hours for afdb and swp; only pdb ran through. I will check how many \">\" are in the fasta files:\n",
+    "\n",
+    "afdb: 525613\n",
+    "swp: 405935\n",
+    "pdb: 35280\n",
+    "\n",
+    "How can there be more fasta entries than unique UniProt IDs...?\n",
+    "\n",
+    "For the pdb fasta I could immediately run emapper. For afdb and swp I had to split the fasta files into files of 100000 entries each (the maximum input size for emapper). I did this using the script fasta splitter (http://kirill-kryukov.com/study/tools/fasta-splitter/) with the following commands:\n",
+    "\n",
+    "perl fasta-splitter.pl --part-size 100000 ./afdb/uniprotinfo.fasta --nopad --measure count --out-dir ./afdb/fasta_split/\n",
+    "\n",
+    "perl fasta-splitter.pl --part-size 100000 ./swp/uniprotinfo.fasta --nopad --measure count --out-dir ./swp/fasta_split/\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "5e43cbad-cc59-4814-8ab6-bf1bae920a3e",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "Cross-reference the eggNOG IDs with the eggNOG annotation that gives a nice name/description for each (most) orthogroup:"
+    "# Load the emapper annotation table for the pdb sequences. The first 4 and the last 3 lines of\n",
+    "# the file are skipped (presumably emapper's '##' comment lines -- confirm against the file);\n",
+    "# skipfooter is only supported by the python parser engine.\n",
+    "pdb_emapper_path = \"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/pdb/emapper/pdb_emapper.tsv\"\n",
+    "pdb_emapper = pd.read_csv(pdb_emapper_path, sep='\\t', engine='python', skiprows=4, skipfooter=3)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
-   "id": "33c69bec-6a43-49c2-8ea8-3f011fcf86c4",
+   "execution_count": 21,
+   "id": "747a5c72-a19b-4afe-ae1c-b896c298d7ee",
    "metadata": {},
    "outputs": [],
    "source": [
-    "eggnog = pd.read_csv(\"../data/e5.og_annotations.tsv\", sep=\"\\t\", header=None)\n",
-    "eggnog.columns = [\"taxon\", \"orthogroup\", \"evidence\", \"name\"]\n",
-    "eggnog.dropna(inplace=True)\n",
-    "\n",
-    "eggnog.set_index(\"orthogroup\", inplace=True)"
+    "# Use the second '|'-separated field of the emapper query name as the UniProt ID.\n",
+    "query_fields = pdb_emapper[\"#query\"].str.split(\"|\")\n",
+    "pdb_emapper[\"uniprot\"] = query_fields.str.get(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "e3a291be-38e8-495e-b455-72b0770631fe",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "P62238        1\n",
+       "Q9Y324        1\n",
+       "D5DKI8        1\n",
+       "Q6TEK8        1\n",
+       "P04157        1\n",
+       "             ..\n",
+       "P04181        1\n",
+       "A0A1H6Q8Z5    1\n",
+       "P16038        1\n",
+       "Q4JB24        1\n",
+       "P38326        1\n",
+       "Name: uniprot, Length: 34551, dtype: int64"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of emapper rows per UniProt ID, most frequent first.\n",
+    "pdb_emapper[\"uniprot\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "50150ed8-4d19-4cad-9a7c-5528e3f622f2",
+   "metadata": {},
+   "source": [
+    "Although I have 35280 entries in the fasta file, emapper only scans through 34554 (see the tail of the emapper file). The number of unique UniProt IDs in the pdb emapper file is then 34551. The question is where the other ~700 entries went..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "c522cd0d-704a-486f-a528-a31cf09315de",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Attach the emapper annotation to every pdb hit. This is an inner join on the UniProt ID, so\n",
+    "# hits whose ID has no row in pdb_emapper are dropped from pdb_merge.\n",
+    "pdb_merge = pdb.merge(pdb_emapper, how=\"inner\", on=\"uniprot\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "7387e9ab-8037-4a6e-8a9a-ec18166247d8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "P01116        18010\n",
+       "P0CX51        10846\n",
+       "O13516        10730\n",
+       "P05756        10359\n",
+       "Q3E7X9        10323\n",
+       "              ...  \n",
+       "A0KKT0            1\n",
+       "Q06672            1\n",
+       "P86179            1\n",
+       "A0A090BWT0        1\n",
+       "Q484B6            1\n",
+       "Name: uniprot, Length: 34551, dtype: int64"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of merged rows per UniProt ID, most frequent first.\n",
+    "pdb_merge[\"uniprot\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "08775c42-6ab6-4ab0-8104-e9848d8029f4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4500383"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Total number of rows that remain after the merge with the emapper table.\n",
+    "pdb_merge.shape[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "0f0bd0c2-79de-44cf-95ec-3eeb7687be46",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['query',\n",
+       " 'target',\n",
+       " 'seq. id.',\n",
+       " 'alignment length',\n",
+       " 'no. mismatches',\n",
+       " 'no. gap open',\n",
+       " 'query start',\n",
+       " 'target start',\n",
+       " 'query end',\n",
+       " 'target end',\n",
+       " 'e value',\n",
+       " 'bit score',\n",
+       " 'uniprot',\n",
+       " '#query',\n",
+       " 'seed_ortholog',\n",
+       " 'evalue',\n",
+       " 'score',\n",
+       " 'eggNOG_OGs',\n",
+       " 'max_annot_lvl',\n",
+       " 'COG_category',\n",
+       " 'Description',\n",
+       " 'Preferred_name',\n",
+       " 'GOs',\n",
+       " 'EC',\n",
+       " 'KEGG_ko',\n",
+       " 'KEGG_Pathway',\n",
+       " 'KEGG_Module',\n",
+       " 'KEGG_Reaction',\n",
+       " 'KEGG_rclass',\n",
+       " 'BRITE',\n",
+       " 'KEGG_TC',\n",
+       " 'CAZy',\n",
+       " 'BiGG_Reaction',\n",
+       " 'PFAMs']"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Column names of the merged table, in order.\n",
+    "pdb_merge.columns.tolist()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fe4e7ac9-697d-4a67-a4ec-04ea44f56b48",
+   "metadata": {},
+   "source": [
+    "I will additionally run UPIMAPI with the pdb, afdb and swp inputs to retrieve information that is stored in UniProt.\n",
+    "\n",
+    "upimapi.py -i afdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/afdb/ -dbs \"evolutionary genealogy of genes: Non-supervised Orthologous Groups\"\n",
+    "\n",
+    "upimapi.py -i pdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/pdb/ -dbs \"evolutionary genealogy of genes: Non-supervised Orthologous Groups\"\n",
+    "\n",
+    "upimapi.py -i swp_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/swp/ -dbs \"evolutionary genealogy of genes: Non-supervised Orthologous Groups\"\n",
+    "\n",
+    "I will run these on the cluster. The bash scripts can be found in the respective folders."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "7dbb6daf-fbf9-46bf-9f4b-1ef6805ec2e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the tab-separated UPIMAPI result table for the pdb UniProt IDs.\n",
+    "pdb_upimapi_path = \"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/pdb/pdb_upimapi_mapping_eggnog.tsv\"\n",
+    "pdb_uniprot_info = pd.read_csv(pdb_upimapi_path, sep='\\t')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "cc007880-726c-457a-b078-e43d251eebc1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Rename 'Entry' to 'uniprot' (the column the later merge joins on) and keep only the listed\n",
+    "# annotation columns.\n",
+    "uniprot_info_columns = [\"uniprot\", \"Entry name\", \"Gene names\", \"Function [CC]\", \"Taxonomic lineage (PHYLUM)\"]\n",
+    "pdb_uniprot_info = pdb_uniprot_info.rename(columns={'Entry': 'uniprot'})[uniprot_info_columns]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "f17d7bc1-a6c4-4bd6-85f1-a95fb134901a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>uniprot</th>\n",
+       "      <th>Entry name</th>\n",
+       "      <th>Gene names</th>\n",
+       "      <th>Function [CC]</th>\n",
+       "      <th>Taxonomic lineage (PHYLUM)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>P21553</td>\n",
+       "      <td>CISY_THEAC</td>\n",
+       "      <td>gltA Ta0169</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Candidatus Thermoplasmatota</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Q61179</td>\n",
+       "      <td>IRF9_MOUSE</td>\n",
+       "      <td>Irf9 Isgf3g</td>\n",
+       "      <td>FUNCTION: Transcription factor that plays an e...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>O34873</td>\n",
+       "      <td>HMGCL_BACSU</td>\n",
+       "      <td>yngG BSU18230</td>\n",
+       "      <td>FUNCTION: Involved in the catabolism of branch...</td>\n",
+       "      <td>Firmicutes</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Q9BU02</td>\n",
+       "      <td>THTPA_HUMAN</td>\n",
+       "      <td>THTPA</td>\n",
+       "      <td>FUNCTION: Hydrolase highly specific for thiami...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Q9LLQ2</td>\n",
+       "      <td>P102B_LUPLU</td>\n",
+       "      <td>PR10.2B</td>\n",
+       "      <td>FUNCTION: Class II ribonuclease (RNase) (By si...</td>\n",
+       "      <td>Streptophyta</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35255</th>\n",
+       "      <td>Q7JZW2</td>\n",
+       "      <td>Q7JZW2_DROME</td>\n",
+       "      <td>RpS15 anon-EST:Posey137 anon-EST:Posey185 Dmel...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Arthropoda</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35256</th>\n",
+       "      <td>Q9HW09</td>\n",
+       "      <td>PANE_PSEAE</td>\n",
+       "      <td>panE PA4397</td>\n",
+       "      <td>FUNCTION: Catalyzes the NADPH-dependent reduct...</td>\n",
+       "      <td>Proteobacteria</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35257</th>\n",
+       "      <td>F2Z508</td>\n",
+       "      <td>F2Z508_PIG</td>\n",
+       "      <td>STMN4</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35258</th>\n",
+       "      <td>Q8ZM82</td>\n",
+       "      <td>IDI_SALTY</td>\n",
+       "      <td>idi STM3039</td>\n",
+       "      <td>FUNCTION: Catalyzes the 1,3-allylic rearrangem...</td>\n",
+       "      <td>Proteobacteria</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35259</th>\n",
+       "      <td>P12873</td>\n",
+       "      <td>RL29_BACSU</td>\n",
+       "      <td>rpmC BSU01240</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Firmicutes</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>35260 rows × 5 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      uniprot    Entry name  \\\n",
+       "0      P21553    CISY_THEAC   \n",
+       "1      Q61179    IRF9_MOUSE   \n",
+       "2      O34873   HMGCL_BACSU   \n",
+       "3      Q9BU02   THTPA_HUMAN   \n",
+       "4      Q9LLQ2   P102B_LUPLU   \n",
+       "...       ...           ...   \n",
+       "35255  Q7JZW2  Q7JZW2_DROME   \n",
+       "35256  Q9HW09    PANE_PSEAE   \n",
+       "35257  F2Z508    F2Z508_PIG   \n",
+       "35258  Q8ZM82     IDI_SALTY   \n",
+       "35259  P12873    RL29_BACSU   \n",
+       "\n",
+       "                                              Gene names  \\\n",
+       "0                                            gltA Ta0169   \n",
+       "1                                            Irf9 Isgf3g   \n",
+       "2                                          yngG BSU18230   \n",
+       "3                                                  THTPA   \n",
+       "4                                                PR10.2B   \n",
+       "...                                                  ...   \n",
+       "35255  RpS15 anon-EST:Posey137 anon-EST:Posey185 Dmel...   \n",
+       "35256                                        panE PA4397   \n",
+       "35257                                              STMN4   \n",
+       "35258                                        idi STM3039   \n",
+       "35259                                      rpmC BSU01240   \n",
+       "\n",
+       "                                           Function [CC]  \\\n",
+       "0                                                    NaN   \n",
+       "1      FUNCTION: Transcription factor that plays an e...   \n",
+       "2      FUNCTION: Involved in the catabolism of branch...   \n",
+       "3      FUNCTION: Hydrolase highly specific for thiami...   \n",
+       "4      FUNCTION: Class II ribonuclease (RNase) (By si...   \n",
+       "...                                                  ...   \n",
+       "35255                                                NaN   \n",
+       "35256  FUNCTION: Catalyzes the NADPH-dependent reduct...   \n",
+       "35257                                                NaN   \n",
+       "35258  FUNCTION: Catalyzes the 1,3-allylic rearrangem...   \n",
+       "35259                                                NaN   \n",
+       "\n",
+       "        Taxonomic lineage (PHYLUM)  \n",
+       "0      Candidatus Thermoplasmatota  \n",
+       "1                         Chordata  \n",
+       "2                       Firmicutes  \n",
+       "3                         Chordata  \n",
+       "4                     Streptophyta  \n",
+       "...                            ...  \n",
+       "35255                   Arthropoda  \n",
+       "35256               Proteobacteria  \n",
+       "35257                     Chordata  \n",
+       "35258               Proteobacteria  \n",
+       "35259                   Firmicutes  \n",
+       "\n",
+       "[35260 rows x 5 columns]"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pdb_uniprot_info"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "afb95431-1ec2-491d-83cf-b288322e7fd6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Add the UniProt columns retrieved by UPIMAPI (inner join on the UniProt ID). pdb_merge is\n",
+    "# overwritten here, so re-run the emapper merge cell first if this cell has to be executed again.\n",
+    "pdb_merge = pdb_merge.merge(pdb_uniprot_info, how=\"inner\", on=\"uniprot\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "cc0ab000-40e8-423a-8d75-6c3b59753150",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "P01116        18010\n",
+       "P0CX51        10846\n",
+       "O13516        10730\n",
+       "P05756        10359\n",
+       "Q3E7X9        10323\n",
+       "              ...  \n",
+       "A0KKT0            1\n",
+       "Q06672            1\n",
+       "P86179            1\n",
+       "A0A090BWT0        1\n",
+       "Q484B6            1\n",
+       "Name: uniprot, Length: 34551, dtype: int64"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of merged rows per UniProt ID after adding the UniProt columns, most frequent first.\n",
+    "pdb_merge[\"uniprot\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "8560bfda-d99c-463f-8ed5-f834449d252f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['query',\n",
+       " 'target',\n",
+       " 'seq. id.',\n",
+       " 'alignment length',\n",
+       " 'no. mismatches',\n",
+       " 'no. gap open',\n",
+       " 'query start',\n",
+       " 'target start',\n",
+       " 'query end',\n",
+       " 'target end',\n",
+       " 'e value',\n",
+       " 'bit score',\n",
+       " 'uniprot',\n",
+       " '#query',\n",
+       " 'seed_ortholog',\n",
+       " 'evalue',\n",
+       " 'score',\n",
+       " 'eggNOG_OGs',\n",
+       " 'max_annot_lvl',\n",
+       " 'COG_category',\n",
+       " 'Description',\n",
+       " 'Preferred_name',\n",
+       " 'GOs',\n",
+       " 'EC',\n",
+       " 'KEGG_ko',\n",
+       " 'KEGG_Pathway',\n",
+       " 'KEGG_Module',\n",
+       " 'KEGG_Reaction',\n",
+       " 'KEGG_rclass',\n",
+       " 'BRITE',\n",
+       " 'KEGG_TC',\n",
+       " 'CAZy',\n",
+       " 'BiGG_Reaction',\n",
+       " 'PFAMs',\n",
+       " 'Entry name',\n",
+       " 'Gene names',\n",
+       " 'Function [CC]',\n",
+       " 'Taxonomic lineage (PHYLUM)']"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Column names of the merged table after adding the UniProt columns, in order.\n",
+    "pdb_merge.columns.tolist()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "09ae8063-9c39-4654-8042-031148dca85d",
+   "metadata": {},
+   "source": [
+    "This list seems quite confusing, but in general it now contains the following information:\n",
+    " - Foldseek query number of that protein and target. The next columns all have to do with foldseek alignment quality, including evalue and bit score.\n",
+    " - uniprot ID of foldseek pdb, translated with the uniprot API.\n",
+    " - \"#query\" is the long query name of the fasta that UPIMAPI pulled out from uniprot using the uniprot ID. All columns up to PFAMs stem from the eggnog search using the fasta file.\n",
+    " - Entry name, Gene names, Function and Taxonomic lineage (PHYLUM) additionally stem from UPIMAPI retrieving additional information from uniprot."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "3d1e3600-dbe4-4185-86b1-fd439e04ff1c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>query</th>\n",
+       "      <th>target</th>\n",
+       "      <th>seq. id.</th>\n",
+       "      <th>alignment length</th>\n",
+       "      <th>no. mismatches</th>\n",
+       "      <th>no. gap open</th>\n",
+       "      <th>query start</th>\n",
+       "      <th>target start</th>\n",
+       "      <th>query end</th>\n",
+       "      <th>target end</th>\n",
+       "      <th>...</th>\n",
+       "      <th>KEGG_rclass</th>\n",
+       "      <th>BRITE</th>\n",
+       "      <th>KEGG_TC</th>\n",
+       "      <th>CAZy</th>\n",
+       "      <th>BiGG_Reaction</th>\n",
+       "      <th>PFAMs</th>\n",
+       "      <th>Entry name</th>\n",
+       "      <th>Gene names</th>\n",
+       "      <th>Function [CC]</th>\n",
+       "      <th>Taxonomic lineage (PHYLUM)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>4731</td>\n",
+       "      <td>5htk</td>\n",
+       "      <td>0.601</td>\n",
+       "      <td>251</td>\n",
+       "      <td>97</td>\n",
+       "      <td>3</td>\n",
+       "      <td>5</td>\n",
+       "      <td>254</td>\n",
+       "      <td>176</td>\n",
+       "      <td>424</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00152</td>\n",
+       "      <td>ko00000,ko00001,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>6PF2K,His_Phos_1</td>\n",
+       "      <td>F262_HUMAN</td>\n",
+       "      <td>PFKFB2</td>\n",
+       "      <td>FUNCTION: Synthesis and degradation of fructos...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>4731</td>\n",
+       "      <td>5htk</td>\n",
+       "      <td>0.629</td>\n",
+       "      <td>240</td>\n",
+       "      <td>87</td>\n",
+       "      <td>2</td>\n",
+       "      <td>9</td>\n",
+       "      <td>248</td>\n",
+       "      <td>181</td>\n",
+       "      <td>418</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00152</td>\n",
+       "      <td>ko00000,ko00001,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>6PF2K,His_Phos_1</td>\n",
+       "      <td>F262_HUMAN</td>\n",
+       "      <td>PFKFB2</td>\n",
+       "      <td>FUNCTION: Synthesis and degradation of fructos...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3214</td>\n",
+       "      <td>5htk</td>\n",
+       "      <td>0.127</td>\n",
+       "      <td>212</td>\n",
+       "      <td>128</td>\n",
+       "      <td>12</td>\n",
+       "      <td>135</td>\n",
+       "      <td>293</td>\n",
+       "      <td>1</td>\n",
+       "      <td>208</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00152</td>\n",
+       "      <td>ko00000,ko00001,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>6PF2K,His_Phos_1</td>\n",
+       "      <td>F262_HUMAN</td>\n",
+       "      <td>PFKFB2</td>\n",
+       "      <td>FUNCTION: Synthesis and degradation of fructos...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3214</td>\n",
+       "      <td>5htk</td>\n",
+       "      <td>0.132</td>\n",
+       "      <td>212</td>\n",
+       "      <td>127</td>\n",
+       "      <td>12</td>\n",
+       "      <td>135</td>\n",
+       "      <td>293</td>\n",
+       "      <td>1</td>\n",
+       "      <td>208</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00152</td>\n",
+       "      <td>ko00000,ko00001,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>6PF2K,His_Phos_1</td>\n",
+       "      <td>F262_HUMAN</td>\n",
+       "      <td>PFKFB2</td>\n",
+       "      <td>FUNCTION: Synthesis and degradation of fructos...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>3311</td>\n",
+       "      <td>5htk</td>\n",
+       "      <td>0.180</td>\n",
+       "      <td>211</td>\n",
+       "      <td>112</td>\n",
+       "      <td>11</td>\n",
+       "      <td>9</td>\n",
+       "      <td>182</td>\n",
+       "      <td>3</td>\n",
+       "      <td>189</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00152</td>\n",
+       "      <td>ko00000,ko00001,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>6PF2K,His_Phos_1</td>\n",
+       "      <td>F262_HUMAN</td>\n",
+       "      <td>PFKFB2</td>\n",
+       "      <td>FUNCTION: Synthesis and degradation of fructos...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4500378</th>\n",
+       "      <td>26834</td>\n",
+       "      <td>4c65</td>\n",
+       "      <td>0.157</td>\n",
+       "      <td>484</td>\n",
+       "      <td>270</td>\n",
+       "      <td>25</td>\n",
+       "      <td>7</td>\n",
+       "      <td>423</td>\n",
+       "      <td>6</td>\n",
+       "      <td>418</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>Amidohydro_1</td>\n",
+       "      <td>OTASE_ASPNC</td>\n",
+       "      <td>Am2 An14g02080</td>\n",
+       "      <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n",
+       "      <td>Ascomycota</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4500379</th>\n",
+       "      <td>26834</td>\n",
+       "      <td>4c5y</td>\n",
+       "      <td>0.142</td>\n",
+       "      <td>497</td>\n",
+       "      <td>265</td>\n",
+       "      <td>26</td>\n",
+       "      <td>7</td>\n",
+       "      <td>424</td>\n",
+       "      <td>7</td>\n",
+       "      <td>421</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>Amidohydro_1</td>\n",
+       "      <td>OTASE_ASPNC</td>\n",
+       "      <td>Am2 An14g02080</td>\n",
+       "      <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n",
+       "      <td>Ascomycota</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4500380</th>\n",
+       "      <td>26834</td>\n",
+       "      <td>4c5z</td>\n",
+       "      <td>0.158</td>\n",
+       "      <td>493</td>\n",
+       "      <td>269</td>\n",
+       "      <td>27</td>\n",
+       "      <td>2</td>\n",
+       "      <td>424</td>\n",
+       "      <td>5</td>\n",
+       "      <td>421</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>Amidohydro_1</td>\n",
+       "      <td>OTASE_ASPNA</td>\n",
+       "      <td>Am2 ASPNIDRAFT_41631</td>\n",
+       "      <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n",
+       "      <td>Ascomycota</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4500381</th>\n",
+       "      <td>26834</td>\n",
+       "      <td>4c5z</td>\n",
+       "      <td>0.155</td>\n",
+       "      <td>495</td>\n",
+       "      <td>267</td>\n",
+       "      <td>24</td>\n",
+       "      <td>2</td>\n",
+       "      <td>424</td>\n",
+       "      <td>6</td>\n",
+       "      <td>421</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>Amidohydro_1</td>\n",
+       "      <td>OTASE_ASPNA</td>\n",
+       "      <td>Am2 ASPNIDRAFT_41631</td>\n",
+       "      <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n",
+       "      <td>Ascomycota</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4500382</th>\n",
+       "      <td>26834</td>\n",
+       "      <td>5xgx</td>\n",
+       "      <td>0.129</td>\n",
+       "      <td>477</td>\n",
+       "      <td>250</td>\n",
+       "      <td>32</td>\n",
+       "      <td>5</td>\n",
+       "      <td>423</td>\n",
+       "      <td>1</td>\n",
+       "      <td>370</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>ko00000,ko01000,ko01002</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>Amidohydro_1,Amidohydro_3</td>\n",
+       "      <td>Q484B6_COLP3</td>\n",
+       "      <td>iadA CPS_1869</td>\n",
+       "      <td>FUNCTION: Catalyzes the hydrolytic cleavage of...</td>\n",
+       "      <td>Proteobacteria</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>4500383 rows × 38 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         query target  seq. id.  alignment length  no. mismatches  \\\n",
+       "0         4731   5htk     0.601               251              97   \n",
+       "1         4731   5htk     0.629               240              87   \n",
+       "2         3214   5htk     0.127               212             128   \n",
+       "3         3214   5htk     0.132               212             127   \n",
+       "4         3311   5htk     0.180               211             112   \n",
+       "...        ...    ...       ...               ...             ...   \n",
+       "4500378  26834   4c65     0.157               484             270   \n",
+       "4500379  26834   4c5y     0.142               497             265   \n",
+       "4500380  26834   4c5z     0.158               493             269   \n",
+       "4500381  26834   4c5z     0.155               495             267   \n",
+       "4500382  26834   5xgx     0.129               477             250   \n",
+       "\n",
+       "         no. gap open  query start  target start  query end  target end  ...  \\\n",
+       "0                   3            5           254        176         424  ...   \n",
+       "1                   2            9           248        181         418  ...   \n",
+       "2                  12          135           293          1         208  ...   \n",
+       "3                  12          135           293          1         208  ...   \n",
+       "4                  11            9           182          3         189  ...   \n",
+       "...               ...          ...           ...        ...         ...  ...   \n",
+       "4500378            25            7           423          6         418  ...   \n",
+       "4500379            26            7           424          7         421  ...   \n",
+       "4500380            27            2           424          5         421  ...   \n",
+       "4500381            24            2           424          6         421  ...   \n",
+       "4500382            32            5           423          1         370  ...   \n",
+       "\n",
+       "         KEGG_rclass                    BRITE KEGG_TC CAZy BiGG_Reaction  \\\n",
+       "0            RC00152  ko00000,ko00001,ko01000       -    -             -   \n",
+       "1            RC00152  ko00000,ko00001,ko01000       -    -             -   \n",
+       "2            RC00152  ko00000,ko00001,ko01000       -    -             -   \n",
+       "3            RC00152  ko00000,ko00001,ko01000       -    -             -   \n",
+       "4            RC00152  ko00000,ko00001,ko01000       -    -             -   \n",
+       "...              ...                      ...     ...  ...           ...   \n",
+       "4500378            -                        -       -    -             -   \n",
+       "4500379            -                        -       -    -             -   \n",
+       "4500380            -                        -       -    -             -   \n",
+       "4500381            -                        -       -    -             -   \n",
+       "4500382            -  ko00000,ko01000,ko01002       -    -             -   \n",
+       "\n",
+       "                             PFAMs    Entry name            Gene names  \\\n",
+       "0                 6PF2K,His_Phos_1    F262_HUMAN                PFKFB2   \n",
+       "1                 6PF2K,His_Phos_1    F262_HUMAN                PFKFB2   \n",
+       "2                 6PF2K,His_Phos_1    F262_HUMAN                PFKFB2   \n",
+       "3                 6PF2K,His_Phos_1    F262_HUMAN                PFKFB2   \n",
+       "4                 6PF2K,His_Phos_1    F262_HUMAN                PFKFB2   \n",
+       "...                            ...           ...                   ...   \n",
+       "4500378               Amidohydro_1   OTASE_ASPNC        Am2 An14g02080   \n",
+       "4500379               Amidohydro_1   OTASE_ASPNC        Am2 An14g02080   \n",
+       "4500380               Amidohydro_1   OTASE_ASPNA  Am2 ASPNIDRAFT_41631   \n",
+       "4500381               Amidohydro_1   OTASE_ASPNA  Am2 ASPNIDRAFT_41631   \n",
+       "4500382  Amidohydro_1,Amidohydro_3  Q484B6_COLP3         iadA CPS_1869   \n",
+       "\n",
+       "                                             Function [CC]  \\\n",
+       "0        FUNCTION: Synthesis and degradation of fructos...   \n",
+       "1        FUNCTION: Synthesis and degradation of fructos...   \n",
+       "2        FUNCTION: Synthesis and degradation of fructos...   \n",
+       "3        FUNCTION: Synthesis and degradation of fructos...   \n",
+       "4        FUNCTION: Synthesis and degradation of fructos...   \n",
+       "...                                                    ...   \n",
+       "4500378  FUNCTION: Carboxypeptidase that catalyzes the ...   \n",
+       "4500379  FUNCTION: Carboxypeptidase that catalyzes the ...   \n",
+       "4500380  FUNCTION: Carboxypeptidase that catalyzes the ...   \n",
+       "4500381  FUNCTION: Carboxypeptidase that catalyzes the ...   \n",
+       "4500382  FUNCTION: Catalyzes the hydrolytic cleavage of...   \n",
+       "\n",
+       "        Taxonomic lineage (PHYLUM)  \n",
+       "0                         Chordata  \n",
+       "1                         Chordata  \n",
+       "2                         Chordata  \n",
+       "3                         Chordata  \n",
+       "4                         Chordata  \n",
+       "...                            ...  \n",
+       "4500378                 Ascomycota  \n",
+       "4500379                 Ascomycota  \n",
+       "4500380                 Ascomycota  \n",
+       "4500381                 Ascomycota  \n",
+       "4500382             Proteobacteria  \n",
+       "\n",
+       "[4500383 rows x 38 columns]"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pdb_merge"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "81c73dbe-3b82-465e-a6c0-27a0465a003a",
+   "metadata": {},
+   "source": [
+    "Now let's do the same for afdb and swp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "414f25d8-73d5-4c87-9cc9-e0afbfd0644b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load every emapper annotation table found for the afdb hits.\n",
+    "path = \"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/afdb/emapper/\"\n",
+    "# `path` already ends with a slash; sort because glob returns matches in arbitrary order,\n",
+    "# which would otherwise make the row order of the concatenated table irreproducible\n",
+    "filenames = sorted(glob.glob(path + \"*.tsv\"))\n",
+    "if not filenames:\n",
+    "    # fail with a clear message instead of pd.concat's \"No objects to concatenate\"\n",
+    "    raise FileNotFoundError(f\"no emapper .tsv files found in {path}\")\n",
+    "\n",
+    "# drop the first 4 and last 3 lines of each file; skipfooter needs the python engine\n",
+    "dfs = [\n",
+    "    pd.read_csv(filename, sep='\\t', skiprows=4, skipfooter=3, engine='python')\n",
+    "    for filename in filenames\n",
+    "]\n",
+    "\n",
+    "# Concatenate all data into one DataFrame\n",
+    "afdb_emapper = pd.concat(dfs, ignore_index=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "ae989a35-adec-4930-a614-488a37029c05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# join key: the second '|'-separated field of the emapper query id\n",
+    "afdb_emapper[\"uniprot\"] = afdb_emapper[\"#query\"].str.split(\"|\").str.get(1)\n",
+    "\n",
+    "# inner join (the pandas default, spelled out): rows without a match on either side are dropped\n",
+    "afdb_merge = pd.merge(afdb, afdb_emapper, how=\"inner\", on=\"uniprot\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "8841c12f-8f85-45e8-a048-5d276b6c7bef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# tab-separated UPIMAPI mapping table for the afdb hits\n",
+    "afdb_uniprot_info = pd.read_csv(\n",
+    "    \"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/afdb/afdb_upimapi_mapping_eggnog.tsv\",\n",
+    "    sep='\\t',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "66c30db2-7cd5-4e13-a2c3-d2b2a5ef648d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# rename the accession column to the shared join key, then keep only the annotation columns\n",
+    "afdb_uniprot_info = afdb_uniprot_info.rename(columns={'Entry': 'uniprot'})[\n",
+    "    [\"uniprot\", \"Entry name\", \"Gene names\", \"Function [CC]\", \"Taxonomic lineage (PHYLUM)\"]\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "1d541f5d-7556-4095-9f31-d1f5c23cbbbd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# attach the UniProt annotation columns (inner join on the shared `uniprot` key)\n",
+    "# NOTE: this rebinds afdb_merge from itself; re-running the cell merges the same columns\n",
+    "# again and pandas will suffix the duplicates with _x/_y\n",
+    "afdb_merge = pd.merge(\n",
+    "    afdb_merge,\n",
+    "    afdb_uniprot_info,\n",
+    "    how=\"inner\",\n",
+    "    on=\"uniprot\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "552d4713-5a5c-450a-9f58-d21e6cd7d501",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load every emapper annotation table found for the swp hits.\n",
+    "path = \"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/swp/emapper/\"\n",
+    "# `path` already ends with a slash; sort because glob returns matches in arbitrary order,\n",
+    "# which would otherwise make the row order of the concatenated table irreproducible\n",
+    "filenames = sorted(glob.glob(path + \"*.tsv\"))\n",
+    "if not filenames:\n",
+    "    # fail with a clear message instead of pd.concat's \"No objects to concatenate\"\n",
+    "    raise FileNotFoundError(f\"no emapper .tsv files found in {path}\")\n",
+    "\n",
+    "# drop the first 4 and last 3 lines of each file; skipfooter needs the python engine\n",
+    "dfs = [\n",
+    "    pd.read_csv(filename, sep='\\t', skiprows=4, skipfooter=3, engine='python')\n",
+    "    for filename in filenames\n",
+    "]\n",
+    "\n",
+    "# Concatenate all data into one DataFrame\n",
+    "swp_emapper = pd.concat(dfs, ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "81f84c74-3372-4b27-84ab-2c31d1bb4126",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# join key: the second '|'-separated field of the emapper query id\n",
+    "swp_emapper[\"uniprot\"] = swp_emapper[\"#query\"].str.split(\"|\").str.get(1)\n",
+    "\n",
+    "# inner join (the pandas default, spelled out): rows without a match on either side are dropped\n",
+    "swp_merge = pd.merge(swp, swp_emapper, how=\"inner\", on=\"uniprot\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "9865cc34-8bd4-480f-9375-36f82401b07d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# tab-separated UPIMAPI mapping table for the swp hits\n",
+    "swp_uniprot_info = pd.read_csv(\n",
+    "    \"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/swp/swp_upimapi_mapping_eggnog.tsv\",\n",
+    "    sep='\\t',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "12cc6569-4954-4ee9-a352-ae4353f71aef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# rename the accession column to the shared join key, then keep only the annotation columns\n",
+    "swp_uniprot_info = swp_uniprot_info.rename(columns={'Entry': 'uniprot'})[\n",
+    "    [\"uniprot\", \"Entry name\", \"Gene names\", \"Function [CC]\", \"Taxonomic lineage (PHYLUM)\"]\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "f216f616-b9c6-4371-a2eb-9476584f24c0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# attach the UniProt annotation columns (inner join on the shared `uniprot` key)\n",
+    "# NOTE: this rebinds swp_merge from itself; re-running the cell merges the same columns\n",
+    "# again and pandas will suffix the duplicates with _x/_y\n",
+    "swp_merge = pd.merge(\n",
+    "    swp_merge,\n",
+    "    swp_uniprot_info,\n",
+    "    how=\"inner\",\n",
+    "    on=\"uniprot\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "39f10d29-f245-4685-a999-5440a165be3c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>query</th>\n",
+       "      <th>target</th>\n",
+       "      <th>seq. id.</th>\n",
+       "      <th>alignment length</th>\n",
+       "      <th>no. mismatches</th>\n",
+       "      <th>no. gap open</th>\n",
+       "      <th>query start</th>\n",
+       "      <th>target start</th>\n",
+       "      <th>query end</th>\n",
+       "      <th>target end</th>\n",
+       "      <th>...</th>\n",
+       "      <th>KEGG_rclass</th>\n",
+       "      <th>BRITE</th>\n",
+       "      <th>KEGG_TC</th>\n",
+       "      <th>CAZy</th>\n",
+       "      <th>BiGG_Reaction</th>\n",
+       "      <th>PFAMs</th>\n",
+       "      <th>Entry name</th>\n",
+       "      <th>Gene names</th>\n",
+       "      <th>Function [CC]</th>\n",
+       "      <th>Taxonomic lineage (PHYLUM)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>10000</td>\n",
+       "      <td>AF-O35840-F1</td>\n",
+       "      <td>0.251</td>\n",
+       "      <td>139</td>\n",
+       "      <td>91</td>\n",
+       "      <td>4</td>\n",
+       "      <td>4</td>\n",
+       "      <td>136</td>\n",
+       "      <td>161</td>\n",
+       "      <td>292</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00020,RC00037,RC00041,RC00055</td>\n",
+       "      <td>ko00000,ko00001,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>LCAT</td>\n",
+       "      <td>LCAT_GERGM</td>\n",
+       "      <td>LCAT</td>\n",
+       "      <td>FUNCTION: Central enzyme in the extracellular ...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1414</td>\n",
+       "      <td>AF-O35840-F1</td>\n",
+       "      <td>0.314</td>\n",
+       "      <td>343</td>\n",
+       "      <td>177</td>\n",
+       "      <td>9</td>\n",
+       "      <td>62</td>\n",
+       "      <td>397</td>\n",
+       "      <td>1</td>\n",
+       "      <td>292</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00020,RC00037,RC00041,RC00055</td>\n",
+       "      <td>ko00000,ko00001,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>LCAT</td>\n",
+       "      <td>LCAT_GERGM</td>\n",
+       "      <td>LCAT</td>\n",
+       "      <td>FUNCTION: Central enzyme in the extracellular ...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1413</td>\n",
+       "      <td>AF-O35840-F1</td>\n",
+       "      <td>0.350</td>\n",
+       "      <td>134</td>\n",
+       "      <td>84</td>\n",
+       "      <td>3</td>\n",
+       "      <td>44</td>\n",
+       "      <td>175</td>\n",
+       "      <td>160</td>\n",
+       "      <td>292</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00020,RC00037,RC00041,RC00055</td>\n",
+       "      <td>ko00000,ko00001,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>LCAT</td>\n",
+       "      <td>LCAT_GERGM</td>\n",
+       "      <td>LCAT</td>\n",
+       "      <td>FUNCTION: Central enzyme in the extracellular ...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>9999</td>\n",
+       "      <td>AF-O35840-F1</td>\n",
+       "      <td>0.235</td>\n",
+       "      <td>140</td>\n",
+       "      <td>99</td>\n",
+       "      <td>5</td>\n",
+       "      <td>73</td>\n",
+       "      <td>209</td>\n",
+       "      <td>158</td>\n",
+       "      <td>292</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00020,RC00037,RC00041,RC00055</td>\n",
+       "      <td>ko00000,ko00001,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>LCAT</td>\n",
+       "      <td>LCAT_GERGM</td>\n",
+       "      <td>LCAT</td>\n",
+       "      <td>FUNCTION: Central enzyme in the extracellular ...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>26188</td>\n",
+       "      <td>AF-O35840-F1</td>\n",
+       "      <td>0.414</td>\n",
+       "      <td>82</td>\n",
+       "      <td>47</td>\n",
+       "      <td>1</td>\n",
+       "      <td>74</td>\n",
+       "      <td>154</td>\n",
+       "      <td>158</td>\n",
+       "      <td>239</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00020,RC00037,RC00041,RC00055</td>\n",
+       "      <td>ko00000,ko00001,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>LCAT</td>\n",
+       "      <td>LCAT_GERGM</td>\n",
+       "      <td>LCAT</td>\n",
+       "      <td>FUNCTION: Central enzyme in the extracellular ...</td>\n",
+       "      <td>Chordata</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11935340</th>\n",
+       "      <td>7835</td>\n",
+       "      <td>AF-Q8DKL5-F1</td>\n",
+       "      <td>0.145</td>\n",
+       "      <td>48</td>\n",
+       "      <td>41</td>\n",
+       "      <td>0</td>\n",
+       "      <td>7</td>\n",
+       "      <td>54</td>\n",
+       "      <td>240</td>\n",
+       "      <td>287</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00004,RC00039,RC00041</td>\n",
+       "      <td>ko00000,ko00001,ko00002,ko01000,ko01004</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>FA_synthesis</td>\n",
+       "      <td>PLSX_THEVB</td>\n",
+       "      <td>plsX tlr0844</td>\n",
+       "      <td>FUNCTION: Catalyzes the reversible formation o...</td>\n",
+       "      <td>Cyanobacteria</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11935341</th>\n",
+       "      <td>7835</td>\n",
+       "      <td>AF-Q5N5X4-F1</td>\n",
+       "      <td>0.090</td>\n",
+       "      <td>44</td>\n",
+       "      <td>40</td>\n",
+       "      <td>0</td>\n",
+       "      <td>7</td>\n",
+       "      <td>50</td>\n",
+       "      <td>236</td>\n",
+       "      <td>279</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00004,RC00039,RC00041</td>\n",
+       "      <td>ko00000,ko00001,ko00002,ko01000,ko01004</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>FA_synthesis</td>\n",
+       "      <td>PLSX_SYNP6</td>\n",
+       "      <td>plsX syc0103_c</td>\n",
+       "      <td>FUNCTION: Catalyzes the reversible formation o...</td>\n",
+       "      <td>Cyanobacteria</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11935342</th>\n",
+       "      <td>8001</td>\n",
+       "      <td>AF-Q5F786-F1</td>\n",
+       "      <td>0.250</td>\n",
+       "      <td>24</td>\n",
+       "      <td>18</td>\n",
+       "      <td>0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>36</td>\n",
+       "      <td>220</td>\n",
+       "      <td>243</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>ko00000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>DUF711</td>\n",
+       "      <td>Y1297_NEIG1</td>\n",
+       "      <td>NGO1297</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Proteobacteria</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11935343</th>\n",
+       "      <td>8001</td>\n",
+       "      <td>AF-C5CE71-F1</td>\n",
+       "      <td>0.105</td>\n",
+       "      <td>104</td>\n",
+       "      <td>72</td>\n",
+       "      <td>7</td>\n",
+       "      <td>12</td>\n",
+       "      <td>109</td>\n",
+       "      <td>137</td>\n",
+       "      <td>225</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00002,RC00078</td>\n",
+       "      <td>ko00000,ko00001,ko00002,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>CobS</td>\n",
+       "      <td>COBS_KOSOT</td>\n",
+       "      <td>cobS Kole_0456</td>\n",
+       "      <td>FUNCTION: Joins adenosylcobinamide-GDP and alp...</td>\n",
+       "      <td>Thermotogae</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11935344</th>\n",
+       "      <td>8001</td>\n",
+       "      <td>AF-G3KIM4-F1</td>\n",
+       "      <td>0.033</td>\n",
+       "      <td>120</td>\n",
+       "      <td>71</td>\n",
+       "      <td>6</td>\n",
+       "      <td>23</td>\n",
+       "      <td>111</td>\n",
+       "      <td>161</td>\n",
+       "      <td>266</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RC00002,RC00818,RC01839</td>\n",
+       "      <td>ko00000,ko00001,ko00002,ko01000</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>HGD-D</td>\n",
+       "      <td>LCDA_ANAPI</td>\n",
+       "      <td>lcdA</td>\n",
+       "      <td>FUNCTION: Involved in the acrylate pathway for...</td>\n",
+       "      <td>Firmicutes</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>11935345 rows × 38 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          query        target  seq. id.  alignment length  no. mismatches  \\\n",
+       "0         10000  AF-O35840-F1     0.251               139              91   \n",
+       "1          1414  AF-O35840-F1     0.314               343             177   \n",
+       "2          1413  AF-O35840-F1     0.350               134              84   \n",
+       "3          9999  AF-O35840-F1     0.235               140              99   \n",
+       "4         26188  AF-O35840-F1     0.414                82              47   \n",
+       "...         ...           ...       ...               ...             ...   \n",
+       "11935340   7835  AF-Q8DKL5-F1     0.145                48              41   \n",
+       "11935341   7835  AF-Q5N5X4-F1     0.090                44              40   \n",
+       "11935342   8001  AF-Q5F786-F1     0.250                24              18   \n",
+       "11935343   8001  AF-C5CE71-F1     0.105               104              72   \n",
+       "11935344   8001  AF-G3KIM4-F1     0.033               120              71   \n",
+       "\n",
+       "          no. gap open  query start  target start  query end  target end  ...  \\\n",
+       "0                    4            4           136        161         292  ...   \n",
+       "1                    9           62           397          1         292  ...   \n",
+       "2                    3           44           175        160         292  ...   \n",
+       "3                    5           73           209        158         292  ...   \n",
+       "4                    1           74           154        158         239  ...   \n",
+       "...                ...          ...           ...        ...         ...  ...   \n",
+       "11935340             0            7            54        240         287  ...   \n",
+       "11935341             0            7            50        236         279  ...   \n",
+       "11935342             0           13            36        220         243  ...   \n",
+       "11935343             7           12           109        137         225  ...   \n",
+       "11935344             6           23           111        161         266  ...   \n",
+       "\n",
+       "                              KEGG_rclass  \\\n",
+       "0         RC00020,RC00037,RC00041,RC00055   \n",
+       "1         RC00020,RC00037,RC00041,RC00055   \n",
+       "2         RC00020,RC00037,RC00041,RC00055   \n",
+       "3         RC00020,RC00037,RC00041,RC00055   \n",
+       "4         RC00020,RC00037,RC00041,RC00055   \n",
+       "...                                   ...   \n",
+       "11935340          RC00004,RC00039,RC00041   \n",
+       "11935341          RC00004,RC00039,RC00041   \n",
+       "11935342                                -   \n",
+       "11935343                  RC00002,RC00078   \n",
+       "11935344          RC00002,RC00818,RC01839   \n",
+       "\n",
+       "                                            BRITE KEGG_TC CAZy BiGG_Reaction  \\\n",
+       "0                         ko00000,ko00001,ko01000       -    -             -   \n",
+       "1                         ko00000,ko00001,ko01000       -    -             -   \n",
+       "2                         ko00000,ko00001,ko01000       -    -             -   \n",
+       "3                         ko00000,ko00001,ko01000       -    -             -   \n",
+       "4                         ko00000,ko00001,ko01000       -    -             -   \n",
+       "...                                           ...     ...  ...           ...   \n",
+       "11935340  ko00000,ko00001,ko00002,ko01000,ko01004       -    -             -   \n",
+       "11935341  ko00000,ko00001,ko00002,ko01000,ko01004       -    -             -   \n",
+       "11935342                                  ko00000       -    -             -   \n",
+       "11935343          ko00000,ko00001,ko00002,ko01000       -    -             -   \n",
+       "11935344          ko00000,ko00001,ko00002,ko01000       -    -             -   \n",
+       "\n",
+       "                 PFAMs   Entry name      Gene names  \\\n",
+       "0                 LCAT   LCAT_GERGM            LCAT   \n",
+       "1                 LCAT   LCAT_GERGM            LCAT   \n",
+       "2                 LCAT   LCAT_GERGM            LCAT   \n",
+       "3                 LCAT   LCAT_GERGM            LCAT   \n",
+       "4                 LCAT   LCAT_GERGM            LCAT   \n",
+       "...                ...          ...             ...   \n",
+       "11935340  FA_synthesis   PLSX_THEVB    plsX tlr0844   \n",
+       "11935341  FA_synthesis   PLSX_SYNP6  plsX syc0103_c   \n",
+       "11935342        DUF711  Y1297_NEIG1         NGO1297   \n",
+       "11935343          CobS   COBS_KOSOT  cobS Kole_0456   \n",
+       "11935344         HGD-D   LCDA_ANAPI            lcdA   \n",
+       "\n",
+       "                                              Function [CC]  \\\n",
+       "0         FUNCTION: Central enzyme in the extracellular ...   \n",
+       "1         FUNCTION: Central enzyme in the extracellular ...   \n",
+       "2         FUNCTION: Central enzyme in the extracellular ...   \n",
+       "3         FUNCTION: Central enzyme in the extracellular ...   \n",
+       "4         FUNCTION: Central enzyme in the extracellular ...   \n",
+       "...                                                     ...   \n",
+       "11935340  FUNCTION: Catalyzes the reversible formation o...   \n",
+       "11935341  FUNCTION: Catalyzes the reversible formation o...   \n",
+       "11935342                                                NaN   \n",
+       "11935343  FUNCTION: Joins adenosylcobinamide-GDP and alp...   \n",
+       "11935344  FUNCTION: Involved in the acrylate pathway for...   \n",
+       "\n",
+       "         Taxonomic lineage (PHYLUM)  \n",
+       "0                          Chordata  \n",
+       "1                          Chordata  \n",
+       "2                          Chordata  \n",
+       "3                          Chordata  \n",
+       "4                          Chordata  \n",
+       "...                             ...  \n",
+       "11935340              Cyanobacteria  \n",
+       "11935341              Cyanobacteria  \n",
+       "11935342             Proteobacteria  \n",
+       "11935343                Thermotogae  \n",
+       "11935344                 Firmicutes  \n",
+       "\n",
+       "[11935345 rows x 38 columns]"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "swp_merge"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "511b875a-bdc8-43f9-8e33-34ad491d0487",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "alphafold['query'] = alphafold.index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "f4407b86-a8b0-4ef1-8c98-3309c0084b15",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "alphafold.reset_index(drop=True, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5df8b919-554a-4df0-9e37-1e70f974f3e6",
+   "metadata": {},
+   "source": [
+    "Merge tables with alphafold results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "b43d8449-d8be-468b-96b4-ee72531cb3ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# inner join of the annotated pdb hits with the alphafold table on `query`\n",
+    "pdb_res = pd.merge(\n",
+    "    pdb_merge,\n",
+    "    alphafold,\n",
+    "    how=\"inner\",\n",
+    "    on=\"query\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "22c0968a-140f-4ce4-a65a-cc4d668a424a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "c3376_g1      6368\n",
+       "c105400_g1    6368\n",
+       "c94782_g1     6076\n",
+       "c43224_g1     6002\n",
+       "c89761_g1     5787\n",
+       "              ... \n",
+       "c87906_g2        1\n",
+       "c111030_g1       1\n",
+       "c99629_g1        1\n",
+       "c96209_g1        1\n",
+       "c74333_g1        1\n",
+       "Name: gene_id, Length: 10157, dtype: int64"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# number of rows per gene_id, most frequent first\n",
+    "pdb_res[\"gene_id\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b8f492f6-450d-484f-ba07-8a396ea6c314",
+   "metadata": {},
+   "source": [
+    "Translate the UniProt IDs to gene names; whatever obtained a UniProt ID should have a gene name, and this will be our fallback option if emapper annotation is not present:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "id": "a567e55d-761f-4ca7-a2c1-8a77f24749a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# inner join of the annotated afdb hits with the alphafold table on `query`\n",
+    "afdb_res = pd.merge(\n",
+    "    afdb_merge,\n",
+    "    alphafold,\n",
+    "    how=\"inner\",\n",
+    "    on=\"query\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "7dec1532-b05e-496c-aa06-0dfa1b70e637",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "c102030_g2    330\n",
+       "c98824_g1     328\n",
+       "c96693_g1     326\n",
+       "c106372_g1    326\n",
+       "c104839_g2    325\n",
+       "             ... \n",
+       "c95444_g1      38\n",
+       "c44058_g1      34\n",
+       "c105808_g1     21\n",
+       "c112778_g1      5\n",
+       "c78729_g1       5\n",
+       "Name: gene_id, Length: 29386, dtype: int64"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of afdb_res rows per gene_id (value_counts sorts by count, descending).\n",
+    "afdb_res[\"gene_id\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "id": "dc9c73b6-ec0c-4744-9d3c-f34a46d86908",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Inner-join swp_merge with the `alphafold` table on their shared 'query' column.\n",
+    "swp_res = pd.merge(left=swp_merge, right=alphafold, how=\"inner\", on=\"query\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "0ccb5e8c-0ecf-474a-8464-71700626ccdd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "c94230_g1     311\n",
+       "c103036_g1    310\n",
+       "c94975_g1     308\n",
+       "c105925_g1    308\n",
+       "c103624_g1    307\n",
+       "             ... \n",
+       "c101908_g2     10\n",
+       "c44058_g1       9\n",
+       "c95444_g1       7\n",
+       "c103292_g1      6\n",
+       "c105808_g1      4\n",
+       "Name: gene_id, Length: 29385, dtype: int64"
+      ]
+     },
+     "execution_count": 56,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of swp_res rows per gene_id (value_counts sorts by count, descending).\n",
+    "swp_res[\"gene_id\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "cdda4572-1cf5-4fd1-9c3b-5cf5f50c3e2f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['query',\n",
+       " 'target',\n",
+       " 'seq. id.',\n",
+       " 'alignment length',\n",
+       " 'no. mismatches',\n",
+       " 'no. gap open',\n",
+       " 'query start',\n",
+       " 'target start',\n",
+       " 'query end',\n",
+       " 'target end',\n",
+       " 'e value',\n",
+       " 'bit score',\n",
+       " 'uniprot',\n",
+       " '#query',\n",
+       " 'seed_ortholog',\n",
+       " 'evalue',\n",
+       " 'score',\n",
+       " 'eggNOG_OGs',\n",
+       " 'max_annot_lvl',\n",
+       " 'COG_category',\n",
+       " 'Description',\n",
+       " 'Preferred_name',\n",
+       " 'GOs',\n",
+       " 'EC',\n",
+       " 'KEGG_ko',\n",
+       " 'KEGG_Pathway',\n",
+       " 'KEGG_Module',\n",
+       " 'KEGG_Reaction',\n",
+       " 'KEGG_rclass',\n",
+       " 'BRITE',\n",
+       " 'KEGG_TC',\n",
+       " 'CAZy',\n",
+       " 'BiGG_Reaction',\n",
+       " 'PFAMs',\n",
+       " 'Entry name',\n",
+       " 'Gene names',\n",
+       " 'Function [CC]',\n",
+       " 'Taxonomic lineage (PHYLUM)',\n",
+       " 'plddt',\n",
+       " 'MSA size',\n",
+       " 'query length',\n",
+       " 'gene name',\n",
+       " 'protein_id',\n",
+       " 'gene_id']"
+      ]
+     },
+     "execution_count": 57,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Show every column label of the merged PDB table as a plain Python list.\n",
+    "[column for column in pdb_res.columns]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "94bd0480-b089-48fe-8841-9f839beaf7b8",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "Before I add the eggNOG information from the sponge proteome, I need to drop and rename some of the columns."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "b866d26f-94b5-4875-ba4f-45003d74ce1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prefix the search-hit columns with 'fs_' and the target's annotation columns with\n",
+    "# 'fs_target_' (presumably so they cannot be confused with the sponge proteome's own\n",
+    "# eggNOG columns that are added afterwards). The table is modified in place.\n",
+    "pdb_res.rename(\n",
+    "    columns={\n",
+    "        'query': 'fs_query',\n",
+    "        'target': 'fs_target',\n",
+    "        'e value': 'fs_e value',\n",
+    "        'bit score': 'fs_bit score',\n",
+    "        'uniprot': 'fs_target_uniprot',\n",
+    "        'evalue': 'fs_target_eggnog_evalue',\n",
+    "        'score': 'fs_target_eggnog_score',\n",
+    "        'eggNOG_OGs': 'fs_target_eggnogOGs',\n",
+    "        'max_annot_lvl': 'fs_target_max_annot_lvl',\n",
+    "        'COG_category': 'fs_target_COG_category',\n",
+    "        'Description': 'fs_target_Description',\n",
+    "        'Preferred_name': 'fs_target_Preferred_name',\n",
+    "        'GOs': 'fs_target_GOs',\n",
+    "        'Entry name': 'fs_target_Entry name',\n",
+    "        'Gene names': 'fs_target_Gene names',\n",
+    "        'Function [CC]': 'fs_target_Function [CC]',\n",
+    "        'Taxonomic lineage (PHYLUM)': 'fs_target_Taxonomic lineage (PHYLUM)',\n",
+    "        'PFAMs': 'fs_target_PFAMs',\n",
+    "    },\n",
+    "    inplace=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "3b7c0cc2-a740-42cb-bf08-09154963e4fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove the alignment-coordinate columns and the target annotation columns that are\n",
+    "# not used further. In-place: re-running this cell raises KeyError because the\n",
+    "# columns are already gone.\n",
+    "pdb_res.drop(\n",
+    "    columns=[\n",
+    "        'no. mismatches',\n",
+    "        'no. gap open',\n",
+    "        'query start',\n",
+    "        'target start',\n",
+    "        'query end',\n",
+    "        'target end',\n",
+    "        '#query',\n",
+    "        'seed_ortholog',\n",
+    "        'EC',\n",
+    "        'KEGG_ko',\n",
+    "        'KEGG_Pathway',\n",
+    "        'KEGG_Module',\n",
+    "        'KEGG_Reaction',\n",
+    "        'KEGG_rclass',\n",
+    "        'BRITE',\n",
+    "        'KEGG_TC',\n",
+    "        'CAZy',\n",
+    "        'BiGG_Reaction',\n",
+    "    ],\n",
+    "    inplace=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "0d510c96-ebaf-48f0-aa59-e6070dcecba0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prefix the search-hit columns with 'fs_' and the target's annotation columns with\n",
+    "# 'fs_target_' (presumably so they cannot be confused with the sponge proteome's own\n",
+    "# eggNOG columns that are added afterwards). The table is modified in place.\n",
+    "afdb_res.rename(\n",
+    "    columns={\n",
+    "        'query': 'fs_query',\n",
+    "        'target': 'fs_target',\n",
+    "        'e value': 'fs_e value',\n",
+    "        'bit score': 'fs_bit score',\n",
+    "        'uniprot': 'fs_target_uniprot',\n",
+    "        'evalue': 'fs_target_eggnog_evalue',\n",
+    "        'score': 'fs_target_eggnog_score',\n",
+    "        'eggNOG_OGs': 'fs_target_eggnogOGs',\n",
+    "        'max_annot_lvl': 'fs_target_max_annot_lvl',\n",
+    "        'COG_category': 'fs_target_COG_category',\n",
+    "        'Description': 'fs_target_Description',\n",
+    "        'Preferred_name': 'fs_target_Preferred_name',\n",
+    "        'GOs': 'fs_target_GOs',\n",
+    "        'Entry name': 'fs_target_Entry name',\n",
+    "        'Gene names': 'fs_target_Gene names',\n",
+    "        'Function [CC]': 'fs_target_Function [CC]',\n",
+    "        'Taxonomic lineage (PHYLUM)': 'fs_target_Taxonomic lineage (PHYLUM)',\n",
+    "        'PFAMs': 'fs_target_PFAMs',\n",
+    "    },\n",
+    "    inplace=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "id": "d384509d-212c-4ad7-9fe8-897d7048cdf0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove the alignment-coordinate columns and the target annotation columns that are\n",
+    "# not used further. In-place: re-running this cell raises KeyError because the\n",
+    "# columns are already gone.\n",
+    "afdb_res.drop(\n",
+    "    columns=[\n",
+    "        'no. mismatches',\n",
+    "        'no. gap open',\n",
+    "        'query start',\n",
+    "        'target start',\n",
+    "        'query end',\n",
+    "        'target end',\n",
+    "        '#query',\n",
+    "        'seed_ortholog',\n",
+    "        'EC',\n",
+    "        'KEGG_ko',\n",
+    "        'KEGG_Pathway',\n",
+    "        'KEGG_Module',\n",
+    "        'KEGG_Reaction',\n",
+    "        'KEGG_rclass',\n",
+    "        'BRITE',\n",
+    "        'KEGG_TC',\n",
+    "        'CAZy',\n",
+    "        'BiGG_Reaction',\n",
+    "    ],\n",
+    "    inplace=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "id": "78d517da-65a2-4ac6-a007-d3d3c6c3ddae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prefix the search-hit columns with 'fs_' and the target's annotation columns with\n",
+    "# 'fs_target_' (presumably so they cannot be confused with the sponge proteome's own\n",
+    "# eggNOG columns that are added afterwards). The table is modified in place.\n",
+    "swp_res.rename(\n",
+    "    columns={\n",
+    "        'query': 'fs_query',\n",
+    "        'target': 'fs_target',\n",
+    "        'e value': 'fs_e value',\n",
+    "        'bit score': 'fs_bit score',\n",
+    "        'uniprot': 'fs_target_uniprot',\n",
+    "        'evalue': 'fs_target_eggnog_evalue',\n",
+    "        'score': 'fs_target_eggnog_score',\n",
+    "        'eggNOG_OGs': 'fs_target_eggnogOGs',\n",
+    "        'max_annot_lvl': 'fs_target_max_annot_lvl',\n",
+    "        'COG_category': 'fs_target_COG_category',\n",
+    "        'Description': 'fs_target_Description',\n",
+    "        'Preferred_name': 'fs_target_Preferred_name',\n",
+    "        'GOs': 'fs_target_GOs',\n",
+    "        'Entry name': 'fs_target_Entry name',\n",
+    "        'Gene names': 'fs_target_Gene names',\n",
+    "        'Function [CC]': 'fs_target_Function [CC]',\n",
+    "        'Taxonomic lineage (PHYLUM)': 'fs_target_Taxonomic lineage (PHYLUM)',\n",
+    "        'PFAMs': 'fs_target_PFAMs',\n",
+    "    },\n",
+    "    inplace=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "id": "00cdb496-4bd6-40ac-a22d-ac1f5e4ba960",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Remove the alignment-coordinate columns and the target annotation columns that are\n",
+    "# not used further. In-place: re-running this cell raises KeyError because the\n",
+    "# columns are already gone.\n",
+    "swp_res.drop(\n",
+    "    columns=[\n",
+    "        'no. mismatches',\n",
+    "        'no. gap open',\n",
+    "        'query start',\n",
+    "        'target start',\n",
+    "        'query end',\n",
+    "        'target end',\n",
+    "        '#query',\n",
+    "        'seed_ortholog',\n",
+    "        'EC',\n",
+    "        'KEGG_ko',\n",
+    "        'KEGG_Pathway',\n",
+    "        'KEGG_Module',\n",
+    "        'KEGG_Reaction',\n",
+    "        'KEGG_rclass',\n",
+    "        'BRITE',\n",
+    "        'KEGG_TC',\n",
+    "        'CAZy',\n",
+    "        'BiGG_Reaction',\n",
+    "    ],\n",
+    "    inplace=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "id": "808506eb-e776-422f-8c5f-de068d8ba97b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['fs_query',\n",
+       " 'fs_target',\n",
+       " 'seq. id.',\n",
+       " 'alignment length',\n",
+       " 'fs_e value',\n",
+       " 'fs_bit score',\n",
+       " 'fs_target_uniprot',\n",
+       " 'fs_target_eggnog_evalue',\n",
+       " 'fs_target_eggnog_score',\n",
+       " 'fs_target_eggnogOGs',\n",
+       " 'fs_target_max_annot_lvl',\n",
+       " 'fs_target_COG_category',\n",
+       " 'fs_target_Description',\n",
+       " 'fs_target_Preferred_name',\n",
+       " 'fs_target_GOs',\n",
+       " 'fs_target_PFAMs',\n",
+       " 'fs_target_Entry name',\n",
+       " 'fs_target_Gene names',\n",
+       " 'fs_target_Function [CC]',\n",
+       " 'fs_target_Taxonomic lineage (PHYLUM)',\n",
+       " 'plddt',\n",
+       " 'MSA size',\n",
+       " 'query length',\n",
+       " 'gene name',\n",
+       " 'protein_id',\n",
+       " 'gene_id']"
+      ]
+     },
+     "execution_count": 69,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Show the column labels that remain after renaming and dropping, as a plain Python list.\n",
+    "[column for column in afdb_res.columns]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "74a8129b-5ae0-4b7f-9a26-7bf0947670b5",
+   "metadata": {},
+   "source": [
+    "Now the question is how to subset these huge tables a little further so that we can work with them properly. The best way would be to filter on `fs_bit score` and/or `fs_target_eggnogOGs`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d37e7a82-8017-4b16-9419-1cb45d41858d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#pdb_seq = enrich_from_uniprot(pdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\n",
+    "# swp_seq = enrich_from_uniprot(swp, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\n",
+    "#afdb_seq = enrich_from_uniprot(afdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de9d5f80-9ac8-4a14-a628-1e99af934403",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#pdb_seq = enrich_from_uniprot(pdb_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")\n",
+    "#swp_seq = enrich_from_uniprot(swp_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")\n",
+    "#afdb_seq = enrich_from_uniprot(afdb_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8ec832ac-8525-438d-819a-cab2232f574f",
+   "metadata": {},
+   "source": [
+    "Cross-reference the eggNOG IDs with the eggNOG annotation, which gives a nice name/description for most orthogroups:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "33c69bec-6a43-49c2-8ea8-3f011fcf86c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the tab-separated eggNOG annotation table of the Spongilla proteome.\n",
+    "# The first 4 and last 3 lines of the file are skipped (presumably emapper's\n",
+    "# comment header/footer -- verify against the file). skipfooter is only supported\n",
+    "# by the Python parser, hence engine='python'.\n",
+    "eggnog = pd.read_csv(\n",
+    "    \"/g/arendt/Fabian/PhD/Computational/Spongefold/spongilla_eggnog.tsv\",\n",
+    "    sep=\"\\t\",\n",
+    "    skiprows=4,\n",
+    "    skipfooter=3,\n",
+    "    engine='python',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "id": "4247ce9f-18b7-4fa1-a6a6-58218ef6e797",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>#query</th>\n",
+       "      <th>seed_ortholog</th>\n",
+       "      <th>evalue</th>\n",
+       "      <th>score</th>\n",
+       "      <th>eggNOG_OGs</th>\n",
+       "      <th>max_annot_lvl</th>\n",
+       "      <th>COG_category</th>\n",
+       "      <th>Description</th>\n",
+       "      <th>Preferred_name</th>\n",
+       "      <th>GOs</th>\n",
+       "      <th>...</th>\n",
+       "      <th>KEGG_ko</th>\n",
+       "      <th>KEGG_Pathway</th>\n",
+       "      <th>KEGG_Module</th>\n",
+       "      <th>KEGG_Reaction</th>\n",
+       "      <th>KEGG_rclass</th>\n",
+       "      <th>BRITE</th>\n",
+       "      <th>KEGG_TC</th>\n",
+       "      <th>CAZy</th>\n",
+       "      <th>BiGG_Reaction</th>\n",
+       "      <th>PFAMs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>c100000_g1_i1_m.41809</td>\n",
+       "      <td>400682.PAC_15712888</td>\n",
+       "      <td>6.690000e-72</td>\n",
+       "      <td>242.0</td>\n",
+       "      <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>I</td>\n",
+       "      <td>CRAL/TRIO domain</td>\n",
+       "      <td>MOSPD2</td>\n",
+       "      <td>-</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>CRAL_TRIO,Motile_Sperm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>c100000_g1_i2_m.41814</td>\n",
+       "      <td>400682.PAC_15712888</td>\n",
+       "      <td>1.640000e-13</td>\n",
+       "      <td>77.8</td>\n",
+       "      <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>I</td>\n",
+       "      <td>CRAL/TRIO domain</td>\n",
+       "      <td>MOSPD2</td>\n",
+       "      <td>-</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>CRAL_TRIO,Motile_Sperm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>c100000_g2_i1_m.41818</td>\n",
+       "      <td>400682.PAC_15712888</td>\n",
+       "      <td>3.350000e-33</td>\n",
+       "      <td>135.0</td>\n",
+       "      <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>I</td>\n",
+       "      <td>CRAL/TRIO domain</td>\n",
+       "      <td>MOSPD2</td>\n",
+       "      <td>-</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>CRAL_TRIO,Motile_Sperm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>c100001_g1_i2_m.41826</td>\n",
+       "      <td>400682.PAC_15716590</td>\n",
+       "      <td>5.880000e-48</td>\n",
+       "      <td>176.0</td>\n",
+       "      <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>A</td>\n",
+       "      <td>RNA secondary structure unwinding</td>\n",
+       "      <td>DDX46</td>\n",
+       "      <td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>ko:K12811</td>\n",
+       "      <td>ko03040,map03040</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>ko00000,ko00001,ko01000,ko03041</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>DEAD,Helicase_C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>c100001_g2_i1_m.41829</td>\n",
+       "      <td>400682.PAC_15716590</td>\n",
+       "      <td>1.220000e-305</td>\n",
+       "      <td>868.0</td>\n",
+       "      <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>A</td>\n",
+       "      <td>RNA secondary structure unwinding</td>\n",
+       "      <td>DDX46</td>\n",
+       "      <td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>ko:K12811</td>\n",
+       "      <td>ko03040,map03040</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>ko00000,ko00001,ko01000,ko03041</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>DEAD,Helicase_C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24193</th>\n",
+       "      <td>c99995_g1_i1_m.41796</td>\n",
+       "      <td>45351.EDO40823</td>\n",
+       "      <td>3.090000e-120</td>\n",
+       "      <td>352.0</td>\n",
+       "      <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>L</td>\n",
+       "      <td>protein-DNA loading ATPase activity</td>\n",
+       "      <td>RFC3</td>\n",
+       "      <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>ko:K10756</td>\n",
+       "      <td>ko03030,ko03420,ko03430,map03030,map03420,map0...</td>\n",
+       "      <td>M00289,M00295</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>ko00000,ko00001,ko00002,ko03032,ko03036,ko03400</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>DNA_pol3_delta2,Rep_fac_C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24194</th>\n",
+       "      <td>c99995_g2_i1_m.41797</td>\n",
+       "      <td>109478.XP_005883212.1</td>\n",
+       "      <td>1.570000e-71</td>\n",
+       "      <td>226.0</td>\n",
+       "      <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>L</td>\n",
+       "      <td>protein-DNA loading ATPase activity</td>\n",
+       "      <td>RFC3</td>\n",
+       "      <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>ko:K10756</td>\n",
+       "      <td>ko03030,ko03420,ko03430,map03030,map03420,map0...</td>\n",
+       "      <td>M00289,M00295</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>ko00000,ko00001,ko00002,ko03032,ko03036,ko03400</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>DNA_pol3_delta2,Rep_fac_C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24195</th>\n",
+       "      <td>c99997_g2_i1_m.41801</td>\n",
+       "      <td>7739.XP_002612114.1</td>\n",
+       "      <td>1.150000e-205</td>\n",
+       "      <td>583.0</td>\n",
+       "      <td>COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>C</td>\n",
+       "      <td>FAD binding domain</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>FAD_binding_3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24196</th>\n",
+       "      <td>c99998_g1_i1_m.41804</td>\n",
+       "      <td>400682.PAC_15712215</td>\n",
+       "      <td>2.240000e-15</td>\n",
+       "      <td>89.7</td>\n",
+       "      <td>COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>T</td>\n",
+       "      <td>positive regulation of MDA-5 signaling pathway</td>\n",
+       "      <td>ANKRD17</td>\n",
+       "      <td>GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>ko:K16726</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>ko00000,ko03036</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>Ank_2,Ank_4,KH_1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24197</th>\n",
+       "      <td>c99999_g1_i1_m.41806</td>\n",
+       "      <td>400682.PAC_15718953</td>\n",
+       "      <td>5.030000e-29</td>\n",
+       "      <td>111.0</td>\n",
+       "      <td>2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>24198 rows × 21 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                      #query          seed_ortholog         evalue  score  \\\n",
+       "0      c100000_g1_i1_m.41809    400682.PAC_15712888   6.690000e-72  242.0   \n",
+       "1      c100000_g1_i2_m.41814    400682.PAC_15712888   1.640000e-13   77.8   \n",
+       "2      c100000_g2_i1_m.41818    400682.PAC_15712888   3.350000e-33  135.0   \n",
+       "3      c100001_g1_i2_m.41826    400682.PAC_15716590   5.880000e-48  176.0   \n",
+       "4      c100001_g2_i1_m.41829    400682.PAC_15716590  1.220000e-305  868.0   \n",
+       "...                      ...                    ...            ...    ...   \n",
+       "24193   c99995_g1_i1_m.41796         45351.EDO40823  3.090000e-120  352.0   \n",
+       "24194   c99995_g2_i1_m.41797  109478.XP_005883212.1   1.570000e-71  226.0   \n",
+       "24195   c99997_g2_i1_m.41801    7739.XP_002612114.1  1.150000e-205  583.0   \n",
+       "24196   c99998_g1_i1_m.41804    400682.PAC_15712215   2.240000e-15   89.7   \n",
+       "24197   c99999_g1_i1_m.41806    400682.PAC_15718953   5.030000e-29  111.0   \n",
+       "\n",
+       "                                              eggNOG_OGs  max_annot_lvl  \\\n",
+       "0      COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...  33208|Metazoa   \n",
+       "1      COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...  33208|Metazoa   \n",
+       "2      COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...  33208|Metazoa   \n",
+       "3      COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...  33208|Metazoa   \n",
+       "4      COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...  33208|Metazoa   \n",
+       "...                                                  ...            ...   \n",
+       "24193  COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...  33208|Metazoa   \n",
+       "24194  COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...  33208|Metazoa   \n",
+       "24195  COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315...  33208|Metazoa   \n",
+       "24196  COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk...  33208|Metazoa   \n",
+       "24197  2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|...  33208|Metazoa   \n",
+       "\n",
+       "      COG_category                                     Description  \\\n",
+       "0                I                                CRAL/TRIO domain   \n",
+       "1                I                                CRAL/TRIO domain   \n",
+       "2                I                                CRAL/TRIO domain   \n",
+       "3                A               RNA secondary structure unwinding   \n",
+       "4                A               RNA secondary structure unwinding   \n",
+       "...            ...                                             ...   \n",
+       "24193            L             protein-DNA loading ATPase activity   \n",
+       "24194            L             protein-DNA loading ATPase activity   \n",
+       "24195            C                              FAD binding domain   \n",
+       "24196            T  positive regulation of MDA-5 signaling pathway   \n",
+       "24197            -                                               -   \n",
+       "\n",
+       "      Preferred_name                                                GOs  ...  \\\n",
+       "0             MOSPD2                                                  -  ...   \n",
+       "1             MOSPD2                                                  -  ...   \n",
+       "2             MOSPD2                                                  -  ...   \n",
+       "3              DDX46  GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...  ...   \n",
+       "4              DDX46  GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...  ...   \n",
+       "...              ...                                                ...  ...   \n",
+       "24193           RFC3  GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...  ...   \n",
+       "24194           RFC3  GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...  ...   \n",
+       "24195              -                                                  -  ...   \n",
+       "24196        ANKRD17  GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO...  ...   \n",
+       "24197              -                                                  -  ...   \n",
+       "\n",
+       "         KEGG_ko                                       KEGG_Pathway  \\\n",
+       "0              -                                                  -   \n",
+       "1              -                                                  -   \n",
+       "2              -                                                  -   \n",
+       "3      ko:K12811                                   ko03040,map03040   \n",
+       "4      ko:K12811                                   ko03040,map03040   \n",
+       "...          ...                                                ...   \n",
+       "24193  ko:K10756  ko03030,ko03420,ko03430,map03030,map03420,map0...   \n",
+       "24194  ko:K10756  ko03030,ko03420,ko03430,map03030,map03420,map0...   \n",
+       "24195          -                                                  -   \n",
+       "24196  ko:K16726                                                  -   \n",
+       "24197          -                                                  -   \n",
+       "\n",
+       "         KEGG_Module KEGG_Reaction KEGG_rclass  \\\n",
+       "0                  -             -           -   \n",
+       "1                  -             -           -   \n",
+       "2                  -             -           -   \n",
+       "3                  -             -           -   \n",
+       "4                  -             -           -   \n",
+       "...              ...           ...         ...   \n",
+       "24193  M00289,M00295             -           -   \n",
+       "24194  M00289,M00295             -           -   \n",
+       "24195              -             -           -   \n",
+       "24196              -             -           -   \n",
+       "24197              -             -           -   \n",
+       "\n",
+       "                                                 BRITE KEGG_TC CAZy  \\\n",
+       "0                                                    -       -    -   \n",
+       "1                                                    -       -    -   \n",
+       "2                                                    -       -    -   \n",
+       "3                      ko00000,ko00001,ko01000,ko03041       -    -   \n",
+       "4                      ko00000,ko00001,ko01000,ko03041       -    -   \n",
+       "...                                                ...     ...  ...   \n",
+       "24193  ko00000,ko00001,ko00002,ko03032,ko03036,ko03400       -    -   \n",
+       "24194  ko00000,ko00001,ko00002,ko03032,ko03036,ko03400       -    -   \n",
+       "24195                                                -       -    -   \n",
+       "24196                                  ko00000,ko03036       -    -   \n",
+       "24197                                                -       -    -   \n",
+       "\n",
+       "      BiGG_Reaction                      PFAMs  \n",
+       "0                 -     CRAL_TRIO,Motile_Sperm  \n",
+       "1                 -     CRAL_TRIO,Motile_Sperm  \n",
+       "2                 -     CRAL_TRIO,Motile_Sperm  \n",
+       "3                 -            DEAD,Helicase_C  \n",
+       "4                 -            DEAD,Helicase_C  \n",
+       "...             ...                        ...  \n",
+       "24193             -  DNA_pol3_delta2,Rep_fac_C  \n",
+       "24194             -  DNA_pol3_delta2,Rep_fac_C  \n",
+       "24195             -              FAD_binding_3  \n",
+       "24196             -           Ank_2,Ank_4,KH_1  \n",
+       "24197             -                          -  \n",
+       "\n",
+       "[24198 rows x 21 columns]"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Derive identifiers from the emapper query name, e.g. 'c100000_g1_i1_m.41809':\n",
+    "# the first two '_'-separated fields form the gene id ('c100000_g1') and the text\n",
+    "# after the '.' is the protein id ('41809').\n",
+    "eggnog[\"gene_id\"] = eggnog[\"#query\"].str.split(\"_\").str[:2].str.join(\"_\")\n",
+    "# The previous line here was not valid Python (dangling '.' after split('_')), which\n",
+    "# made the whole cell fail with a SyntaxError.\n",
+    "eggnog[\"protein_id\"] = eggnog[\"#query\"].str.split(\".\").str[1]\n",
+    "# Display the derived gene ids as a quick sanity check.\n",
+    "eggnog[\"#query\"].str.split(\"_\").str[:2].str.join(\"_\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "id": "f3756118-c34c-4e6d-8954-c94473601758",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split the emapper query name (e.g. 'c100000_g1_i1_m.41809') into a gene id\n",
+    "# ('c100000_g1': first two '_'-separated fields) and a protein id ('41809': text after the '.').\n",
+    "eggnog[\"gene_id\"] = eggnog[\"#query\"].str.split(\"_\").str[:2].str.join(\"_\")\n",
+    "eggnog[\"protein_id\"] = eggnog[\"#query\"].str.split(\".\").str[1]\n",
+    "\n",
+    "# Drop the raw query name and the annotation columns that are not used further.\n",
+    "# In-place: '#query' is gone afterwards, so this cell cannot be re-run without reloading eggnog.\n",
+    "eggnog.drop(\n",
+    "    columns=[\n",
+    "        '#query',\n",
+    "        'seed_ortholog',\n",
+    "        'EC',\n",
+    "        'KEGG_ko',\n",
+    "        'KEGG_Pathway',\n",
+    "        'KEGG_Module',\n",
+    "        'KEGG_Reaction',\n",
+    "        'KEGG_rclass',\n",
+    "        'BRITE',\n",
+    "        'KEGG_TC',\n",
+    "        'CAZy',\n",
+    "        'BiGG_Reaction',\n",
+    "    ],\n",
+    "    inplace=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "18507e2d-7055-44d7-ac36-0d439c05e34c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>evalue</th>\n",
+       "      <th>score</th>\n",
+       "      <th>eggNOG_OGs</th>\n",
+       "      <th>max_annot_lvl</th>\n",
+       "      <th>COG_category</th>\n",
+       "      <th>Description</th>\n",
+       "      <th>Preferred_name</th>\n",
+       "      <th>GOs</th>\n",
+       "      <th>PFAMs</th>\n",
+       "      <th>gene_id</th>\n",
+       "      <th>protein_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>6.690000e-72</td>\n",
+       "      <td>242.0</td>\n",
+       "      <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>I</td>\n",
+       "      <td>CRAL/TRIO domain</td>\n",
+       "      <td>MOSPD2</td>\n",
+       "      <td>-</td>\n",
+       "      <td>CRAL_TRIO,Motile_Sperm</td>\n",
+       "      <td>c100000_g1</td>\n",
+       "      <td>41809</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1.640000e-13</td>\n",
+       "      <td>77.8</td>\n",
+       "      <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>I</td>\n",
+       "      <td>CRAL/TRIO domain</td>\n",
+       "      <td>MOSPD2</td>\n",
+       "      <td>-</td>\n",
+       "      <td>CRAL_TRIO,Motile_Sperm</td>\n",
+       "      <td>c100000_g1</td>\n",
+       "      <td>41814</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3.350000e-33</td>\n",
+       "      <td>135.0</td>\n",
+       "      <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>I</td>\n",
+       "      <td>CRAL/TRIO domain</td>\n",
+       "      <td>MOSPD2</td>\n",
+       "      <td>-</td>\n",
+       "      <td>CRAL_TRIO,Motile_Sperm</td>\n",
+       "      <td>c100000_g2</td>\n",
+       "      <td>41818</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>5.880000e-48</td>\n",
+       "      <td>176.0</td>\n",
+       "      <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>A</td>\n",
+       "      <td>RNA secondary structure unwinding</td>\n",
+       "      <td>DDX46</td>\n",
+       "      <td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n",
+       "      <td>DEAD,Helicase_C</td>\n",
+       "      <td>c100001_g1</td>\n",
+       "      <td>41826</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1.220000e-305</td>\n",
+       "      <td>868.0</td>\n",
+       "      <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>A</td>\n",
+       "      <td>RNA secondary structure unwinding</td>\n",
+       "      <td>DDX46</td>\n",
+       "      <td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n",
+       "      <td>DEAD,Helicase_C</td>\n",
+       "      <td>c100001_g2</td>\n",
+       "      <td>41829</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24193</th>\n",
+       "      <td>3.090000e-120</td>\n",
+       "      <td>352.0</td>\n",
+       "      <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>L</td>\n",
+       "      <td>protein-DNA loading ATPase activity</td>\n",
+       "      <td>RFC3</td>\n",
+       "      <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n",
+       "      <td>DNA_pol3_delta2,Rep_fac_C</td>\n",
+       "      <td>c99995_g1</td>\n",
+       "      <td>41796</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24194</th>\n",
+       "      <td>1.570000e-71</td>\n",
+       "      <td>226.0</td>\n",
+       "      <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>L</td>\n",
+       "      <td>protein-DNA loading ATPase activity</td>\n",
+       "      <td>RFC3</td>\n",
+       "      <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n",
+       "      <td>DNA_pol3_delta2,Rep_fac_C</td>\n",
+       "      <td>c99995_g2</td>\n",
+       "      <td>41797</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24195</th>\n",
+       "      <td>1.150000e-205</td>\n",
+       "      <td>583.0</td>\n",
+       "      <td>COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>C</td>\n",
+       "      <td>FAD binding domain</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>FAD_binding_3</td>\n",
+       "      <td>c99997_g2</td>\n",
+       "      <td>41801</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24196</th>\n",
+       "      <td>2.240000e-15</td>\n",
+       "      <td>89.7</td>\n",
+       "      <td>COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>T</td>\n",
+       "      <td>positive regulation of MDA-5 signaling pathway</td>\n",
+       "      <td>ANKRD17</td>\n",
+       "      <td>GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO...</td>\n",
+       "      <td>Ank_2,Ank_4,KH_1</td>\n",
+       "      <td>c99998_g1</td>\n",
+       "      <td>41804</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24197</th>\n",
+       "      <td>5.030000e-29</td>\n",
+       "      <td>111.0</td>\n",
+       "      <td>2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|...</td>\n",
+       "      <td>33208|Metazoa</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>-</td>\n",
+       "      <td>c99999_g1</td>\n",
+       "      <td>41806</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>24198 rows × 11 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              evalue  score  \\\n",
+       "0       6.690000e-72  242.0   \n",
+       "1       1.640000e-13   77.8   \n",
+       "2       3.350000e-33  135.0   \n",
+       "3       5.880000e-48  176.0   \n",
+       "4      1.220000e-305  868.0   \n",
+       "...              ...    ...   \n",
+       "24193  3.090000e-120  352.0   \n",
+       "24194   1.570000e-71  226.0   \n",
+       "24195  1.150000e-205  583.0   \n",
+       "24196   2.240000e-15   89.7   \n",
+       "24197   5.030000e-29  111.0   \n",
+       "\n",
+       "                                              eggNOG_OGs  max_annot_lvl  \\\n",
+       "0      COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...  33208|Metazoa   \n",
+       "1      COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...  33208|Metazoa   \n",
+       "2      COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...  33208|Metazoa   \n",
+       "3      COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...  33208|Metazoa   \n",
+       "4      COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...  33208|Metazoa   \n",
+       "...                                                  ...            ...   \n",
+       "24193  COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...  33208|Metazoa   \n",
+       "24194  COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...  33208|Metazoa   \n",
+       "24195  COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315...  33208|Metazoa   \n",
+       "24196  COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk...  33208|Metazoa   \n",
+       "24197  2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|...  33208|Metazoa   \n",
+       "\n",
+       "      COG_category                                     Description  \\\n",
+       "0                I                                CRAL/TRIO domain   \n",
+       "1                I                                CRAL/TRIO domain   \n",
+       "2                I                                CRAL/TRIO domain   \n",
+       "3                A               RNA secondary structure unwinding   \n",
+       "4                A               RNA secondary structure unwinding   \n",
+       "...            ...                                             ...   \n",
+       "24193            L             protein-DNA loading ATPase activity   \n",
+       "24194            L             protein-DNA loading ATPase activity   \n",
+       "24195            C                              FAD binding domain   \n",
+       "24196            T  positive regulation of MDA-5 signaling pathway   \n",
+       "24197            -                                               -   \n",
+       "\n",
+       "      Preferred_name                                                GOs  \\\n",
+       "0             MOSPD2                                                  -   \n",
+       "1             MOSPD2                                                  -   \n",
+       "2             MOSPD2                                                  -   \n",
+       "3              DDX46  GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...   \n",
+       "4              DDX46  GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...   \n",
+       "...              ...                                                ...   \n",
+       "24193           RFC3  GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...   \n",
+       "24194           RFC3  GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...   \n",
+       "24195              -                                                  -   \n",
+       "24196        ANKRD17  GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO...   \n",
+       "24197              -                                                  -   \n",
+       "\n",
+       "                           PFAMs     gene_id protein_id  \n",
+       "0         CRAL_TRIO,Motile_Sperm  c100000_g1      41809  \n",
+       "1         CRAL_TRIO,Motile_Sperm  c100000_g1      41814  \n",
+       "2         CRAL_TRIO,Motile_Sperm  c100000_g2      41818  \n",
+       "3                DEAD,Helicase_C  c100001_g1      41826  \n",
+       "4                DEAD,Helicase_C  c100001_g2      41829  \n",
+       "...                          ...         ...        ...  \n",
+       "24193  DNA_pol3_delta2,Rep_fac_C   c99995_g1      41796  \n",
+       "24194  DNA_pol3_delta2,Rep_fac_C   c99995_g2      41797  \n",
+       "24195              FAD_binding_3   c99997_g2      41801  \n",
+       "24196           Ank_2,Ank_4,KH_1   c99998_g1      41804  \n",
+       "24197                          -   c99999_g1      41806  \n",
+       "\n",
+       "[24198 rows x 11 columns]"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eggnog"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "id": "60c99abb-679f-43a9-af6f-0fb37159f68a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['evalue',\n",
+       " 'seq_eggnog_score',\n",
+       " 'seq_eggnogOGs',\n",
+       " 'seq_max_annot_lvl',\n",
+       " 'seq_COG_category',\n",
+       " 'seq_Description',\n",
+       " 'seq_Preferred_name',\n",
+       " 'seq_GOs',\n",
+       " 'seq_PFAMs',\n",
+       " 'gene_id',\n",
+       " 'protein_id']"
+      ]
+     },
+     "execution_count": 93,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Column names of the eggnog table as a plain Python list\n",
+    "eggnog.columns.tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "id": "06223fb1-1492-4723-b621-3d9a13b0b3fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prefix the eggNOG annotation columns with \"seq_\".\n",
+    "# The e-value column is called 'evalue' in this table; the previous key 'e value'\n",
+    "# matched no column, so rename() silently left it unprefixed.\n",
+    "eggnog.rename(columns={'evalue': 'seq_e value',\n",
+    "                       'score': 'seq_eggnog_score',\n",
+    "                       'eggNOG_OGs': 'seq_eggnogOGs',\n",
+    "                       'max_annot_lvl': 'seq_max_annot_lvl',\n",
+    "                       'COG_category': 'seq_COG_category',\n",
+    "                       'Description': 'seq_Description',\n",
+    "                       'Preferred_name': 'seq_Preferred_name',\n",
+    "                       'GOs': 'seq_GOs',\n",
+    "                       'PFAMs': 'seq_PFAMs'},\n",
+    "              inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "10d606c3-1bc9-4c78-81c7-279d607b8e90",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>fs_query</th>\n",
+       "      <th>fs_target</th>\n",
+       "      <th>seq. id.</th>\n",
+       "      <th>alignment length</th>\n",
+       "      <th>fs_e value</th>\n",
+       "      <th>fs_bit score</th>\n",
+       "      <th>fs_target_uniprot</th>\n",
+       "      <th>fs_target_eggnog_evalue</th>\n",
+       "      <th>fs_target_eggnog_score</th>\n",
+       "      <th>fs_target_eggnogOGs</th>\n",
+       "      <th>...</th>\n",
+       "      <th>fs_target_Entry name</th>\n",
+       "      <th>fs_target_Gene names</th>\n",
+       "      <th>fs_target_Function [CC]</th>\n",
+       "      <th>fs_target_Taxonomic lineage (PHYLUM)</th>\n",
+       "      <th>plddt</th>\n",
+       "      <th>MSA size</th>\n",
+       "      <th>query length</th>\n",
+       "      <th>gene name</th>\n",
+       "      <th>protein_id</th>\n",
+       "      <th>gene_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1442503</th>\n",
+       "      <td>41612</td>\n",
+       "      <td>AF-Q6NYT3-F1</td>\n",
+       "      <td>0.173</td>\n",
+       "      <td>156</td>\n",
+       "      <td>1.632</td>\n",
+       "      <td>90</td>\n",
+       "      <td>Q6NYT3</td>\n",
+       "      <td>1.470000e-217</td>\n",
+       "      <td>600.0</td>\n",
+       "      <td>28N8T@1|root,2QUU4@2759|Eukaryota,39TGP@33154|...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>IER5L_DANRE</td>\n",
+       "      <td>ier5l si:ch211-208h16.10 zgc:77455</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Chordata</td>\n",
+       "      <td>63.615046</td>\n",
+       "      <td>747.0</td>\n",
+       "      <td>218</td>\n",
+       "      <td>c99854_g1_i2_m.41041</td>\n",
+       "      <td>41041</td>\n",
+       "      <td>c99854_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1442600</th>\n",
+       "      <td>41612</td>\n",
+       "      <td>AF-Q8CCI5-F1</td>\n",
+       "      <td>0.141</td>\n",
+       "      <td>155</td>\n",
+       "      <td>6.330</td>\n",
+       "      <td>68</td>\n",
+       "      <td>Q8CCI5</td>\n",
+       "      <td>8.230000e-132</td>\n",
+       "      <td>377.0</td>\n",
+       "      <td>KOG4477@1|root,KOG4477@2759|Eukaryota,39UAE@33...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>RYBP_MOUSE</td>\n",
+       "      <td>Rybp Dedaf</td>\n",
+       "      <td>FUNCTION: Component of a Polycomb group (PcG) ...</td>\n",
+       "      <td>Chordata</td>\n",
+       "      <td>63.615046</td>\n",
+       "      <td>747.0</td>\n",
+       "      <td>218</td>\n",
+       "      <td>c99854_g1_i2_m.41041</td>\n",
+       "      <td>41041</td>\n",
+       "      <td>c99854_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1442521</th>\n",
+       "      <td>41612</td>\n",
+       "      <td>AF-I1MQL7-F1</td>\n",
+       "      <td>0.154</td>\n",
+       "      <td>226</td>\n",
+       "      <td>10.360</td>\n",
+       "      <td>60</td>\n",
+       "      <td>I1MQL7</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>947.0</td>\n",
+       "      <td>KOG0724@1|root,KOG0724@2759|Eukaryota,37Q80@33...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>I1MQL7_SOYBN</td>\n",
+       "      <td>778089 GLYMA_16G217700</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Streptophyta</td>\n",
+       "      <td>63.615046</td>\n",
+       "      <td>747.0</td>\n",
+       "      <td>218</td>\n",
+       "      <td>c99854_g1_i2_m.41041</td>\n",
+       "      <td>41041</td>\n",
+       "      <td>c99854_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1442614</th>\n",
+       "      <td>41612</td>\n",
+       "      <td>AF-Q9XUQ0-F1</td>\n",
+       "      <td>0.140</td>\n",
+       "      <td>235</td>\n",
+       "      <td>11.020</td>\n",
+       "      <td>59</td>\n",
+       "      <td>Q9XUQ0</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>961.0</td>\n",
+       "      <td>2ASSX@1|root,2RZQJ@2759|Eukaryota,39UW6@33154|...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Q9XUQ0_CAEEL</td>\n",
+       "      <td>pqn-67 CELE_T16G1.1 T16G1.1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Nematoda (roundworms)</td>\n",
+       "      <td>63.615046</td>\n",
+       "      <td>747.0</td>\n",
+       "      <td>218</td>\n",
+       "      <td>c99854_g1_i2_m.41041</td>\n",
+       "      <td>41041</td>\n",
+       "      <td>c99854_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1442500</th>\n",
+       "      <td>41612</td>\n",
+       "      <td>AF-C1H5I7-F1</td>\n",
+       "      <td>0.119</td>\n",
+       "      <td>151</td>\n",
+       "      <td>11.020</td>\n",
+       "      <td>59</td>\n",
+       "      <td>C1H5I7</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>1177.0</td>\n",
+       "      <td>2CNBF@1|root,2QUZZ@2759|Eukaryota,39NHE@33154|...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>C1H5I7_PARBA</td>\n",
+       "      <td>PAAG_06028</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Ascomycota</td>\n",
+       "      <td>63.615046</td>\n",
+       "      <td>747.0</td>\n",
+       "      <td>218</td>\n",
+       "      <td>c99854_g1_i2_m.41041</td>\n",
+       "      <td>41041</td>\n",
+       "      <td>c99854_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1442707</th>\n",
+       "      <td>41612</td>\n",
+       "      <td>AF-Q0J235-F1</td>\n",
+       "      <td>0.107</td>\n",
+       "      <td>140</td>\n",
+       "      <td>481.100</td>\n",
+       "      <td>-17</td>\n",
+       "      <td>Q0J235</td>\n",
+       "      <td>4.250000e-263</td>\n",
+       "      <td>734.0</td>\n",
+       "      <td>28J99@1|root,2QQZR@2759|Eukaryota,37SB3@33090|...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>ROLL9_ORYSJ</td>\n",
+       "      <td>RL9 SLL1 Os09g0395300 LOC_Os09g23200 B1040D06.24</td>\n",
+       "      <td>FUNCTION: Probable transcription factor that r...</td>\n",
+       "      <td>Streptophyta</td>\n",
+       "      <td>63.615046</td>\n",
+       "      <td>747.0</td>\n",
+       "      <td>218</td>\n",
+       "      <td>c99854_g1_i2_m.41041</td>\n",
+       "      <td>41041</td>\n",
+       "      <td>c99854_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1442619</th>\n",
+       "      <td>41612</td>\n",
+       "      <td>AF-P54258-F1</td>\n",
+       "      <td>0.160</td>\n",
+       "      <td>150</td>\n",
+       "      <td>481.100</td>\n",
+       "      <td>-17</td>\n",
+       "      <td>P54258</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>1649.0</td>\n",
+       "      <td>KOG2133@1|root,KOG2133@2759|Eukaryota,39T1J@33...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>ATN1_RAT</td>\n",
+       "      <td>Atn1 Drpla</td>\n",
+       "      <td>FUNCTION: Transcriptional corepressor. Recruit...</td>\n",
+       "      <td>Chordata</td>\n",
+       "      <td>63.615046</td>\n",
+       "      <td>747.0</td>\n",
+       "      <td>218</td>\n",
+       "      <td>c99854_g1_i2_m.41041</td>\n",
+       "      <td>41041</td>\n",
+       "      <td>c99854_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1442625</th>\n",
+       "      <td>41612</td>\n",
+       "      <td>AF-Q8IM56-F1</td>\n",
+       "      <td>0.121</td>\n",
+       "      <td>148</td>\n",
+       "      <td>481.100</td>\n",
+       "      <td>-19</td>\n",
+       "      <td>Q8IM56</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>2878.0</td>\n",
+       "      <td>2CMI8@1|root,2QQEE@2759|Eukaryota,3YC5G@5794|A...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Q8IM56_PLAF7</td>\n",
+       "      <td>PF3D7_1403800</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Apicomplexa</td>\n",
+       "      <td>63.615046</td>\n",
+       "      <td>747.0</td>\n",
+       "      <td>218</td>\n",
+       "      <td>c99854_g1_i2_m.41041</td>\n",
+       "      <td>41041</td>\n",
+       "      <td>c99854_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1442455</th>\n",
+       "      <td>41612</td>\n",
+       "      <td>AF-P13611-F8</td>\n",
+       "      <td>0.192</td>\n",
+       "      <td>151</td>\n",
+       "      <td>481.100</td>\n",
+       "      <td>-21</td>\n",
+       "      <td>P13611</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>6472.0</td>\n",
+       "      <td>28IZN@1|root,2QRBE@2759|Eukaryota,38FU8@33154|...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>CSPG2_HUMAN</td>\n",
+       "      <td>VCAN CSPG2</td>\n",
+       "      <td>FUNCTION: May play a role in intercellular sig...</td>\n",
+       "      <td>Chordata</td>\n",
+       "      <td>63.615046</td>\n",
+       "      <td>747.0</td>\n",
+       "      <td>218</td>\n",
+       "      <td>c99854_g1_i2_m.41041</td>\n",
+       "      <td>41041</td>\n",
+       "      <td>c99854_g1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1442557</th>\n",
+       "      <td>41612</td>\n",
+       "      <td>AF-Q96KW2-F1</td>\n",
+       "      <td>0.105</td>\n",
+       "      <td>247</td>\n",
+       "      <td>481.100</td>\n",
+       "      <td>-26</td>\n",
+       "      <td>Q96KW2</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>1989.0</td>\n",
+       "      <td>28YNY@1|root,2RWWZ@2759|Eukaryota,3AER2@33154|...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>P12L2_HUMAN</td>\n",
+       "      <td>POM121L2 POM121L</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Chordata</td>\n",
+       "      <td>63.615046</td>\n",
+       "      <td>747.0</td>\n",
+       "      <td>218</td>\n",
+       "      <td>c99854_g1_i2_m.41041</td>\n",
+       "      <td>41041</td>\n",
+       "      <td>c99854_g1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>265 rows × 26 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         fs_query     fs_target  seq. id.  alignment length  fs_e value  \\\n",
+       "1442503     41612  AF-Q6NYT3-F1     0.173               156       1.632   \n",
+       "1442600     41612  AF-Q8CCI5-F1     0.141               155       6.330   \n",
+       "1442521     41612  AF-I1MQL7-F1     0.154               226      10.360   \n",
+       "1442614     41612  AF-Q9XUQ0-F1     0.140               235      11.020   \n",
+       "1442500     41612  AF-C1H5I7-F1     0.119               151      11.020   \n",
+       "...           ...           ...       ...               ...         ...   \n",
+       "1442707     41612  AF-Q0J235-F1     0.107               140     481.100   \n",
+       "1442619     41612  AF-P54258-F1     0.160               150     481.100   \n",
+       "1442625     41612  AF-Q8IM56-F1     0.121               148     481.100   \n",
+       "1442455     41612  AF-P13611-F8     0.192               151     481.100   \n",
+       "1442557     41612  AF-Q96KW2-F1     0.105               247     481.100   \n",
+       "\n",
+       "         fs_bit score fs_target_uniprot  fs_target_eggnog_evalue  \\\n",
+       "1442503            90            Q6NYT3            1.470000e-217   \n",
+       "1442600            68            Q8CCI5            8.230000e-132   \n",
+       "1442521            60            I1MQL7             0.000000e+00   \n",
+       "1442614            59            Q9XUQ0             0.000000e+00   \n",
+       "1442500            59            C1H5I7             0.000000e+00   \n",
+       "...               ...               ...                      ...   \n",
+       "1442707           -17            Q0J235            4.250000e-263   \n",
+       "1442619           -17            P54258             0.000000e+00   \n",
+       "1442625           -19            Q8IM56             0.000000e+00   \n",
+       "1442455           -21            P13611             0.000000e+00   \n",
+       "1442557           -26            Q96KW2             0.000000e+00   \n",
+       "\n",
+       "         fs_target_eggnog_score  \\\n",
+       "1442503                   600.0   \n",
+       "1442600                   377.0   \n",
+       "1442521                   947.0   \n",
+       "1442614                   961.0   \n",
+       "1442500                  1177.0   \n",
+       "...                         ...   \n",
+       "1442707                   734.0   \n",
+       "1442619                  1649.0   \n",
+       "1442625                  2878.0   \n",
+       "1442455                  6472.0   \n",
+       "1442557                  1989.0   \n",
+       "\n",
+       "                                       fs_target_eggnogOGs  ...  \\\n",
+       "1442503  28N8T@1|root,2QUU4@2759|Eukaryota,39TGP@33154|...  ...   \n",
+       "1442600  KOG4477@1|root,KOG4477@2759|Eukaryota,39UAE@33...  ...   \n",
+       "1442521  KOG0724@1|root,KOG0724@2759|Eukaryota,37Q80@33...  ...   \n",
+       "1442614  2ASSX@1|root,2RZQJ@2759|Eukaryota,39UW6@33154|...  ...   \n",
+       "1442500  2CNBF@1|root,2QUZZ@2759|Eukaryota,39NHE@33154|...  ...   \n",
+       "...                                                    ...  ...   \n",
+       "1442707  28J99@1|root,2QQZR@2759|Eukaryota,37SB3@33090|...  ...   \n",
+       "1442619  KOG2133@1|root,KOG2133@2759|Eukaryota,39T1J@33...  ...   \n",
+       "1442625  2CMI8@1|root,2QQEE@2759|Eukaryota,3YC5G@5794|A...  ...   \n",
+       "1442455  28IZN@1|root,2QRBE@2759|Eukaryota,38FU8@33154|...  ...   \n",
+       "1442557  28YNY@1|root,2RWWZ@2759|Eukaryota,3AER2@33154|...  ...   \n",
+       "\n",
+       "        fs_target_Entry name  \\\n",
+       "1442503          IER5L_DANRE   \n",
+       "1442600           RYBP_MOUSE   \n",
+       "1442521         I1MQL7_SOYBN   \n",
+       "1442614         Q9XUQ0_CAEEL   \n",
+       "1442500         C1H5I7_PARBA   \n",
+       "...                      ...   \n",
+       "1442707          ROLL9_ORYSJ   \n",
+       "1442619             ATN1_RAT   \n",
+       "1442625         Q8IM56_PLAF7   \n",
+       "1442455          CSPG2_HUMAN   \n",
+       "1442557          P12L2_HUMAN   \n",
+       "\n",
+       "                                     fs_target_Gene names  \\\n",
+       "1442503                ier5l si:ch211-208h16.10 zgc:77455   \n",
+       "1442600                                        Rybp Dedaf   \n",
+       "1442521                            778089 GLYMA_16G217700   \n",
+       "1442614                       pqn-67 CELE_T16G1.1 T16G1.1   \n",
+       "1442500                                        PAAG_06028   \n",
+       "...                                                   ...   \n",
+       "1442707  RL9 SLL1 Os09g0395300 LOC_Os09g23200 B1040D06.24   \n",
+       "1442619                                        Atn1 Drpla   \n",
+       "1442625                                     PF3D7_1403800   \n",
+       "1442455                                        VCAN CSPG2   \n",
+       "1442557                                  POM121L2 POM121L   \n",
+       "\n",
+       "                                   fs_target_Function [CC]  \\\n",
+       "1442503                                                NaN   \n",
+       "1442600  FUNCTION: Component of a Polycomb group (PcG) ...   \n",
+       "1442521                                                NaN   \n",
+       "1442614                                                NaN   \n",
+       "1442500                                                NaN   \n",
+       "...                                                    ...   \n",
+       "1442707  FUNCTION: Probable transcription factor that r...   \n",
+       "1442619  FUNCTION: Transcriptional corepressor. Recruit...   \n",
+       "1442625                                                NaN   \n",
+       "1442455  FUNCTION: May play a role in intercellular sig...   \n",
+       "1442557                                                NaN   \n",
+       "\n",
+       "        fs_target_Taxonomic lineage (PHYLUM)      plddt MSA size query length  \\\n",
+       "1442503                             Chordata  63.615046    747.0          218   \n",
+       "1442600                             Chordata  63.615046    747.0          218   \n",
+       "1442521                         Streptophyta  63.615046    747.0          218   \n",
+       "1442614                Nematoda (roundworms)  63.615046    747.0          218   \n",
+       "1442500                           Ascomycota  63.615046    747.0          218   \n",
+       "...                                      ...        ...      ...          ...   \n",
+       "1442707                         Streptophyta  63.615046    747.0          218   \n",
+       "1442619                             Chordata  63.615046    747.0          218   \n",
+       "1442625                          Apicomplexa  63.615046    747.0          218   \n",
+       "1442455                             Chordata  63.615046    747.0          218   \n",
+       "1442557                             Chordata  63.615046    747.0          218   \n",
+       "\n",
+       "                    gene name protein_id    gene_id  \n",
+       "1442503  c99854_g1_i2_m.41041      41041  c99854_g1  \n",
+       "1442600  c99854_g1_i2_m.41041      41041  c99854_g1  \n",
+       "1442521  c99854_g1_i2_m.41041      41041  c99854_g1  \n",
+       "1442614  c99854_g1_i2_m.41041      41041  c99854_g1  \n",
+       "1442500  c99854_g1_i2_m.41041      41041  c99854_g1  \n",
+       "...                       ...        ...        ...  \n",
+       "1442707  c99854_g1_i2_m.41041      41041  c99854_g1  \n",
+       "1442619  c99854_g1_i2_m.41041      41041  c99854_g1  \n",
+       "1442625  c99854_g1_i2_m.41041      41041  c99854_g1  \n",
+       "1442455  c99854_g1_i2_m.41041      41041  c99854_g1  \n",
+       "1442557  c99854_g1_i2_m.41041      41041  c99854_g1  \n",
+       "\n",
+       "[265 rows x 26 columns]"
+      ]
+     },
+     "execution_count": 66,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Spot-check one gene: list every afdb_res row whose gene_id matches 'c99854_g1',\n",
+    "# highest 'fs_bit score' first. str.contains does a pattern search, so longer ids\n",
+    "# that merely contain this text are included as well.\n",
+    "(\n",
+    "    afdb_res\n",
+    "    .loc[afdb_res['gene_id'].str.contains(\"c99854_g1\")]\n",
+    "    .sort_values(by=\"fs_bit score\", ascending=False)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "id": "2c216345-b501-419c-b3a4-2a71894a5899",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "KeyError",
+     "evalue": "'gene_id'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+      "\u001b[0;32m/tmp/ipykernel_77/3048031996.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mafdb_res\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mafdb_merge\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meggnog\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mon\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"gene_id\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36mmerge\u001b[0;34m(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[1;32m    105\u001b[0m     \u001b[0mvalidate\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    106\u001b[0m ) -> DataFrame:\n\u001b[0;32m--> 107\u001b[0;31m     op = _MergeOperation(\n\u001b[0m\u001b[1;32m    108\u001b[0m         \u001b[0mleft\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    109\u001b[0m         \u001b[0mright\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[1;32m    698\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mright_join_keys\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    699\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin_names\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 700\u001b[0;31m         ) = self._get_merge_keys()\n\u001b[0m\u001b[1;32m    701\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    702\u001b[0m         \u001b[0;31m# validate the merge keys dtypes. We may need to coerce\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36m_get_merge_keys\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1103\u001b[0m                         \u001b[0mright_keys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1104\u001b[0m                     \u001b[0;32mif\u001b[0m \u001b[0mlk\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1105\u001b[0;31m                         \u001b[0mleft_keys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mleft\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_label_or_level_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1106\u001b[0m                         \u001b[0mjoin_names\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1107\u001b[0m                     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_get_label_or_level_values\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m   1774\u001b[0m             \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maxes\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_level_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1775\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1776\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1777\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1778\u001b[0m         \u001b[0;31m# Check for duplicates\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mKeyError\u001b[0m: 'gene_id'"
+     ]
+    }
+   ],
+   "source": [
+    "afdb_res = pd.merge(afdb_merge, eggnog, on=\"gene_id\")"
    ]
   },
   {