From 9d321e6af9c98c45b4e501745ba734e591f8f292 Mon Sep 17 00:00:00 2001
From: Nikolaos Papadopoulos <npapadop@embl.de>
Date: Wed, 17 Aug 2022 12:56:22 +0200
Subject: [PATCH] fixed ModelArchive bugs (protein names/models, protein query
 length); added AFDB self comparison as per editors' request

---
 analysis/suppl-ModelArchive_metadata.ipynb    | 217 +++--
 analysis/suppl-ModelArchive_preparation.ipynb |   2 +-
 analysis/suppl-compare_seq_profile.ipynb      | 898 ++++++++++++++++++
 scripts/fs_self_search_afdb.sh                |  23 +
 4 files changed, 1064 insertions(+), 76 deletions(-)
 create mode 100644 analysis/suppl-compare_seq_profile.ipynb
 create mode 100755 scripts/fs_self_search_afdb.sh

diff --git a/analysis/suppl-ModelArchive_metadata.ipynb b/analysis/suppl-ModelArchive_metadata.ipynb
index f30f805..d59fb11 100644
--- a/analysis/suppl-ModelArchive_metadata.ipynb
+++ b/analysis/suppl-ModelArchive_metadata.ipynb
@@ -10,7 +10,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2022-07-28 08:02\n"
+      "2022-08-17 11:07\n"
      ]
     }
    ],
@@ -41,43 +41,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd"
+    "import pandas as pd\n",
+    "import numpy as np"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "id": "d7d72594",
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "FileNotFoundError",
-     "evalue": "[Errno 2] No such file or directory: '../ol_data/Slacustris_eggnog.tsv'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
-      "\u001b[0;32m/tmp/ipykernel_136/1368120052.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msequence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../ol_data/Slacustris_eggnog.tsv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'\\t'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mstructure\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_parquet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../old_data/structure_annotation.parquet'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    309\u001b[0m                     \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstacklevel\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    310\u001b[0m                 )\n\u001b[0;32m--> 311\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    312\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    313\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m    584\u001b[0m     \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    585\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    587\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    588\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    480\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    481\u001b[0m     \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 482\u001b[0;31m     \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    483\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    484\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m    809\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"has_index_names\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"has_index_names\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    810\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 811\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    812\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    813\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m   1038\u001b[0m             )\n\u001b[1;32m   1039\u001b[0m         \u001b[0;31m# error: Too many arguments for \"ParserBase\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1040\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mmapping\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# type: ignore[call-arg]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1041\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1042\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_failover_to_python\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m     49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     50\u001b[0m         \u001b[0;31m# open handles\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_open_handles\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     52\u001b[0m         \u001b[0;32massert\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandles\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     53\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/base_parser.py\u001b[0m in \u001b[0;36m_open_handles\u001b[0;34m(self, src, kwds)\u001b[0m\n\u001b[1;32m    220\u001b[0m         \u001b[0mLet\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mreaders\u001b[0m \u001b[0mopen\u001b[0m \u001b[0mIOHandles\u001b[0m \u001b[0mafter\u001b[0m \u001b[0mthey\u001b[0m \u001b[0mare\u001b[0m \u001b[0mdone\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtheir\u001b[0m \u001b[0mpotential\u001b[0m \u001b[0mraises\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    221\u001b[0m         \"\"\"\n\u001b[0;32m--> 222\u001b[0;31m         self.handles = get_handle(\n\u001b[0m\u001b[1;32m    223\u001b[0m             \u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    224\u001b[0m             \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/common.py\u001b[0m in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m    699\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoding\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    700\u001b[0m             \u001b[0;31m# Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 701\u001b[0;31m             handle = open(\n\u001b[0m\u001b[1;32m    702\u001b[0m                 \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    703\u001b[0m                 \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../ol_data/Slacustris_eggnog.tsv'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "sequence = pd.read_csv('../old_data/Slacustris_eggnog.tsv', sep='\\t')\n",
-    "structure = pd.read_parquet('../old_data/structure_annotation.parquet')"
+    "sequence = pd.read_csv('../data/Slacustris_eggnog.tsv', sep='\\t')\n",
+    "structure = pd.read_parquet('../data/structure_annotation.parquet')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "6532f2fc",
    "metadata": {},
    "outputs": [],
@@ -87,39 +68,65 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
+   "id": "98d46a4b-3ee2-48b0-9100-cbdfaf5a863f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged['plddt'].replace('-', np.NaN, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6bec4cc1-3dff-48fe-91a6-0546f94be850",
+   "metadata": {},
+   "source": [
+    "amend the fact that we counted * characters when calculating sequence length:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "741d43fc-b176-4d23-9770-41348c1b2725",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged['query length'] = merged['query length'] - merged['has_end']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
    "id": "4a137f30",
    "metadata": {},
    "outputs": [],
    "source": [
     "def generate_description(row):\n",
-    "    desc = f\"Preferred name [EggNOG]: {row['Preferred_name_seq']}\\n\" + \\\n",
-    "        f\"Description [EggNOG]: {row['Description_seq']}\\n\" + \\\n",
-    "        f\"CoFFE preferred name [EggNOG]: {row['Preferred_name_str']}\\n\" + \\\n",
-    "        f\"CoFFE description [EggNOG]: {row['Description_str']}\\n\" + \\\n",
-    "        f\"CoFFE UniProt function: {row['Function [CC]']}\\n\\n\" + \\\n",
-    "        f\"isoform ID: {row['isoform']}\\n\" + \\\n",
+    "    desc = f\"isoform ID: {row['isoform']}\\n\" + \\\n",
     "        f\"peptide length: {row['query length']}\\n\" + \\\n",
-    "        f\"input MSA size [MMseqs2]: {row['MSA size']}\\n\" + \\\n",
+    "        f\"input MSA size [MMseqs2]: {row['MSA size']:.0f}\\n\" + \\\n",
     "        f\"gene ID: {row['gene_id_str']}\\n\" + \\\n",
     "        f\"protein ID: {row['protein_id_str']}\\n\" + \\\n",
     "        f\"mRNA has start codon [TransDecoder]: {row['has_start']}\\n\" + \\\n",
     "        f\"mRNA has stop codon [TransDecoder]: {row['has_end']}\\n\\n\" + \\\n",
-    "        f\"average pLDDT [ColabFold]: {row['plddt']}\\n\\n\" + \\\n",
+    "        f\"Preferred name [EggNOG]: {row['Preferred_name_seq']}\\n\" + \\\n",
+    "        f\"Description [EggNOG]: {row['Description_seq']}\\n\" + \\\n",
+    "        f\"CoFFE preferred name [EggNOG]: {row['Preferred_name_str']}\\n\" + \\\n",
+    "        f\"CoFFE description [EggNOG]: {row['Description_str']}\\n\" + \\\n",
+    "        f\"CoFFE UniProt function: {row['Function [CC]']}\\n\\n\" + \\\n",
+    "        f\"average pLDDT [ColabFold]: {row['plddt']:.2f}\\n\\n\" + \\\n",
     "        f\"best structural hit [FoldSeek]: {row['target']}\\n\" + \\\n",
     "        f\"structural state %identity [FoldSeek]: {row['seq. id.']}\\n\" + \\\n",
     "        f\"bit score [FoldSeek]: {row['bit score']}\\n\\n\" + \\\n",
     "        f\"UniProt ID of best structural hit (CoFFE): {row['uniprot']}\\n\" + \\\n",
     "        f\"CoFFE Emapper bit score [EggNOG]: {row['score_str']}\\n\" + \\\n",
-    "        f\"CoFFE orthogroups [EggNOG]: {row['eggNOG_OGs_str']}\\n\" + \\\n",
+    "        f\"CoFFE orthogroups [EggNOG]: {row['eggNOG_OGs_str'].replace(',', ', ')}\\n\" + \\\n",
     "        f\"CoFFE maximum annotation level [EggNOG]: {row['max_annot_lvl_str']}\\n\" + \\\n",
-    "#         f\"CoFFE GO terms [EggNOG]: {row['GOs_str']}\\n\" + \\\n",
-    "        f\"CoFFE PFAM domains [EggNOG]: {row['PFAMs_str']}\\n\\n\" + \\\n",
+    "        f\"CoFFE PFAM domains [EggNOG]: {row['PFAMs_str'].replace(',', ', ')}\\n\\n\" + \\\n",
     "        f\"best sequence hit bit score [Emapper]: {row['score_seq']}\\n\" + \\\n",
-    "        f\"Orthogroups [EggNOG]: {row['eggNOG_OGs_seq']}\\n\" + \\\n",
+    "        f\"Orthogroups [EggNOG]: {row['eggNOG_OGs_seq'].replace(',', ', ')}\\n\" + \\\n",
     "        f\"Maximum annotation level [EggNOG]: {row['max_annot_lvl_seq']}\\n\" + \\\n",
-    "#         f\"Emapper GO terms [EggNOG]: {row['GOs_seq']}\\n\" + \\\n",
-    "        f\"PFAM domains [EggNOG]: {row['PFAMs_seq']}\\n\"\n",
+    "        f\"PFAM domains [EggNOG]: {row['PFAMs_seq'].replace(',', ', ')}\\n\"\n",
     "    return desc"
    ]
   },
@@ -133,7 +140,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "id": "935a770c",
    "metadata": {},
    "outputs": [
@@ -141,44 +148,42 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Emapper preferred name [EggNOG]: -\n",
-      "Emapper description [EggNOG]: -\n",
-      "CoFFE preferred name [EggNOG]: PLA2G15\n",
-      "CoFFE description [EggNOG]: Lecithin:cholesterol acyltransferase\n",
-      "CoFFE UniProt function: -\n",
-      "\n",
-      "isoform ID: c103531_g3_i1\n",
-      "peptide length: 202\n",
-      "input MSA size [MMseqs2]: 1924.0\n",
-      "gene ID: c103531_g3\n",
-      "protein ID: 66482\n",
+      "isoform ID: c12419_g1_i1\n",
+      "peptide length: 125\n",
+      "input MSA size [MMseqs2]: 973\n",
+      "gene ID: c12419_g1\n",
+      "protein ID: 1183\n",
       "mRNA has start codon [TransDecoder]: False\n",
-      "mRNA has stop codon [TransDecoder]: True\n",
+      "mRNA has stop codon [TransDecoder]: False\n",
+      "\n",
+      "Preferred name [EggNOG]: DBR1\n",
+      "Description [EggNOG]: Debranching enzyme homolog 1 (S. cerevisiae)\n",
+      "CoFFE preferred name [EggNOG]: USE1\n",
+      "CoFFE description [EggNOG]: Unconventional SNARE in the ER 1 homolog (S. cerevisiae)\n",
+      "CoFFE UniProt function: FUNCTION: SNARE that may be involved in targeting and fusion of Golgi-derived retrograde transport vesicles with the ER. {ECO:0000250}.\n",
       "\n",
-      "average pLDDT [ColabFold]: 88.31183168316831\n",
+      "average pLDDT [ColabFold]: 64.27\n",
       "\n",
-      "best structural hit [FoldSeek]: AF-A0A077YYC2-F1\n",
-      "structural state %identity [FoldSeek]: 0.295\n",
-      "bit score [FoldSeek]: 659.0\n",
+      "best structural hit [FoldSeek]: AF-Q7ZTY7-F1\n",
+      "structural state %identity [FoldSeek]: 0.134\n",
+      "bit score [FoldSeek]: 73.0\n",
       "\n",
-      "UniProt ID of best structural hit (CoFFE): A0A077YYC2\n",
-      "CoFFE Emapper bit score [EggNOG]: 526.0\n",
-      "CoFFE orthogroups [EggNOG]: KOG2369@1|root,KOG3236@1|root,KOG2369@2759|Eukaryota,KOG3236@2759|Eukaryota,38DE3@33154|Opisthokonta,3BDIK@33208|Metazoa,3CTGA@33213|Bilateria,40D2W@6231|Nematoda\n",
+      "UniProt ID of best structural hit (CoFFE): Q7ZTY7\n",
+      "CoFFE Emapper bit score [EggNOG]: 487.0\n",
+      "CoFFE orthogroups [EggNOG]: KOG2678@1|root, KOG2678@2759|Eukaryota, 38T50@33154|Opisthokonta, 3BE5T@33208|Metazoa, 3CWSZ@33213|Bilateria, 4852S@7711|Chordata, 48XPJ@7742|Vertebrata, 49UTR@7898|Actinopterygii\n",
       "CoFFE maximum annotation level [EggNOG]: 33208|Metazoa\n",
-      "CoFFE GO terms [EggNOG]: GO:0000323,GO:0003674,GO:0003824,GO:0004620,GO:0004622,GO:0004623,GO:0005488,GO:0005543,GO:0005575,GO:0005576,GO:0005615,GO:0005622,GO:0005623,GO:0005634,GO:0005654,GO:0005737,GO:0005764,GO:0005773,GO:0006082,GO:0006629,GO:0006631,GO:0006643,GO:0006644,GO:0006650,GO:0006665,GO:0006672,GO:0006793,GO:0006796,GO:0006807,GO:0008150,GO:0008152,GO:0008289,GO:0008374,GO:0009056,GO:0009062,GO:0009395,GO:0009987,GO:0016042,GO:0016054,GO:0016298,GO:0016740,GO:0016746,GO:0016747,GO:0016787,GO:0016788,GO:0019637,GO:0019752,GO:0031974,GO:0031981,GO:0032787,GO:0034638,GO:0034641,GO:0043167,GO:0043168,GO:0043226,GO:0043227,GO:0043229,GO:0043231,GO:0043233,GO:0043436,GO:0043603,GO:0044237,GO:0044238,GO:0044242,GO:0044248,GO:0044255,GO:0044281,GO:0044282,GO:0044421,GO:0044422,GO:0044424,GO:0044428,GO:0044444,GO:0044446,GO:0044464,GO:0046337,GO:0046338,GO:0046395,GO:0046434,GO:0046470,GO:0046475,GO:0046486,GO:0046503,GO:0047499,GO:0052689,GO:0070013,GO:0071704,GO:0072329,GO:0097164,GO:1901564,GO:1901565,GO:1901575\n",
-      "CoFFE PFAM domains [EggNOG]: LCAT\n",
+      "CoFFE PFAM domains [EggNOG]: Use1\n",
       "\n",
-      "best sequence hit bit score [Emapper]: -\n",
-      "Emapper orthogroups [EggNOG]: -\n",
-      "Emapper maximum annotation level [EggNOG]: -\n",
-      "Emapper GO terms [EggNOG]: -\n",
-      "Emapper PFAM domains [EggNOG]: -\n",
+      "best sequence hit bit score [Emapper]: 453.0\n",
+      "Orthogroups [EggNOG]: KOG2863@1|root, KOG2863@2759|Eukaryota, 38B4T@33154|Opisthokonta, 3BA79@33208|Metazoa, 3CU1D@33213|Bilateria, 47Z8A@7711|Chordata, 48Y57@7742|Vertebrata, 49SNU@7898|Actinopterygii\n",
+      "Maximum annotation level [EggNOG]: 33208|Metazoa\n",
+      "PFAM domains [EggNOG]: DBR1, Metallophos\n",
       "\n"
      ]
     }
    ],
    "source": [
-    "print(generate_description(merged.loc[10001]))"
+    "print(generate_description(merged.loc[20360]))"
    ]
   },
   {
@@ -191,7 +196,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "id": "1c5133d7",
    "metadata": {},
    "outputs": [],
@@ -209,7 +214,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
    "id": "fa278548",
    "metadata": {},
    "outputs": [],
@@ -221,12 +226,74 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
    "id": "1f718df1",
    "metadata": {},
    "outputs": [],
    "source": [
-    "merged.to_csv('/g/arendt/npapadop/data/spongfold_publish/model_archive_metadata.csv')"
+    "merged.to_csv('../data/model_archive_metadata.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "69a67bca-9839-4711-95f3-313f65cdab04",
+   "metadata": {},
+   "source": [
+    "We also need an overview table that lists all the proteins. We are going to use the same columns as for the description but save them as a table instead:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "1858268f-5bed-467a-89ac-a203fbdd1942",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "keep = ['isoform', 'query length', 'MSA size', 'gene_id_str',\n",
+    "        'protein_id_str', 'has_start', 'has_end',\n",
+    "        'Preferred_name_seq', 'Description_seq', 'Preferred_name_str',\n",
+    "        'Description_str', 'Function [CC]', 'plddt', 'target',\n",
+    "        'seq. id.', 'bit score', 'uniprot', 'score_str',\n",
+    "        'eggNOG_OGs_str', 'max_annot_lvl_str', 'PFAMs_str',\n",
+    "        'score_seq', 'eggNOG_OGs_seq', 'max_annot_lvl_seq',\n",
+    "        'PFAMs_seq']\n",
+    "\n",
+    "columns = ['isoform ID', 'peptide length', 'input MSA size [MMseqs2]', 'gene ID',\n",
+    "           'protein ID', 'mRNA has start codon [TransDecoder]',\n",
+    "           'mRNA has stop codon [TransDecoder]',\n",
+    "           'Preferred name [EggNOG]', 'Description [EggNOG]',\n",
+    "           'CoFFE preferred name [EggNOG]', 'CoFFE description [EggNOG]',\n",
+    "           'CoFFE UniProt function', 'average pLDDT [ColabFold]',\n",
+    "           'best structural hit [FoldSeek]', 'structural state %identity [FoldSeek]',\n",
+    "           'bit score [FoldSeek]', 'UniProt ID of best structural hit (CoFFE)',\n",
+    "           'CoFFE Emapper bit score [EggNOG]', 'CoFFE orthogroups [EggNOG]',\n",
+    "           'CoFFE maximum annotation level [EggNOG]', 'CoFFE PFAM domains [EggNOG]',\n",
+    "           'best sequence hit bit score [Emapper]', 'Orthogroups [EggNOG]',\n",
+    "           'Maximum annotation level [EggNOG]', 'PFAM domains [EggNOG]']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "76b04936-1189-47ac-9a19-cabd1bea79cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged = structure.join(sequence, on='protein_id', lsuffix='_str', rsuffix='_seq').fillna('-')\n",
+    "merged['query length'] = merged['query length'] - merged['has_end']\n",
+    "overview = merged[keep]\n",
+    "overview.columns = columns\n",
+    "overview.reset_index(inplace=True, drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "4f5ce19c-b264-4652-bcac-ef92651d5c02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "overview.to_csv('../data/model_archive_overview.csv')"
    ]
   }
  ],
@@ -246,7 +313,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.13"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,
diff --git a/analysis/suppl-ModelArchive_preparation.ipynb b/analysis/suppl-ModelArchive_preparation.ipynb
index 518b90f..a673989 100644
--- a/analysis/suppl-ModelArchive_preparation.ipynb
+++ b/analysis/suppl-ModelArchive_preparation.ipynb
@@ -10,7 +10,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2022-06-08 14:02\n"
+      "2022-08-16 07:52\n"
      ]
     }
    ],
diff --git a/analysis/suppl-compare_seq_profile.ipynb b/analysis/suppl-compare_seq_profile.ipynb
new file mode 100644
index 0000000..f51fa7d
--- /dev/null
+++ b/analysis/suppl-compare_seq_profile.ipynb
@@ -0,0 +1,898 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "273c6a8d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2022-07-29 15:06\n"
+     ]
+    }
+   ],
+   "source": [
+    "from datetime import datetime, timezone\n",
+    "import pytz\n",
+    "\n",
+    "utc_dt = datetime.now(timezone.utc) # UTC time\n",
+    "dt = utc_dt.astimezone()\n",
+    "tz = pytz.timezone('Europe/Berlin')\n",
+    "berlin_now = datetime.now(tz)\n",
+    "print(f'{berlin_now:%Y-%m-%d %H:%M}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "09defaef-09b9-4d8e-84a1-dc3c7a56b80a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "from os.path import exists\n",
+    "from tqdm import tqdm\n",
+    "import requests\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8aaa0695",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "structural_annotation = pd.read_parquet('../old_data/structure_annotation.parquet')\n",
+    "sequence_annotation = pd.read_csv('../old_data/Slacustris_eggnog.tsv', sep='\\t')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "95457088",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mmseqs = pd.read_csv('../data/cfres_mmseqs_s75_e1.m8', sep=\"\\s+\", header=None)\n",
+    "mmseqs.columns = [\"query\", \"target\", \"seq. id.\", \"alignment length\", \"no. mismatches\",\n",
+    "                       \"no. gap open\", \"query start\", \"query end\", \"target start\", \"target end\",\n",
+    "                       \"e value\", \"bit score\"]\n",
+    "\n",
+    "mmseqs['gene_id'] = mmseqs['query'].str.split('_').str[:2].apply(lambda x: '_'.join(x))\n",
+    "mmseqs['normalized bit score'] = mmseqs['bit score'] / mmseqs['alignment length']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "093bff38",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def best_bit_score(df, sort_by='normalized bit score', tiebreak='alignment length'):\n",
+    "    have_max = df[sort_by] == np.max(df[sort_by])\n",
+    "    max_ali = df[have_max][tiebreak] == np.max(df[have_max][tiebreak])\n",
+    "    return df[have_max][max_ali].index.values[0]\n",
+    "\n",
+    "def keep_best(df, groupby='gene_id'):\n",
+    "    df.reset_index(inplace=True)\n",
+    "    idx = df.groupby(groupby).apply(best_bit_score)\n",
+    "    res = df.loc[idx].copy()\n",
+    "    return res"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "042ff08e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mmseqs_filtered = keep_best(mmseqs)\n",
+    "del mmseqs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "9645fec8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hhblits = pd.read_csv('../data/cfres_hhblits_e1.m8', sep=\"\\s+\", header=None)\n",
+    "hhblits.columns = [\"query\", \"target\", \"seq. id.\", \"alignment length\", \"no. mismatches\",\n",
+    "                       \"no. gap open\", \"query start\", \"query end\", \"target start\", \"target end\",\n",
+    "                       \"e value\", \"bit score\"]\n",
+    "\n",
+    "hhblits['gene_id'] = hhblits['query'].str.split('_').str[:2].apply(lambda x: '_'.join(x))\n",
+    "hhblits['normalized bit score'] = hhblits['bit score'] / hhblits['alignment length']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "7e9d0601",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hhblits_filtered = keep_best(hhblits)\n",
+    "del hhblits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "da58c74f-5cd2-435a-8281-6195ed7f15af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mmseqs_filtered.drop(columns=['index'], inplace=True)\n",
+    "hhblits_filtered.drop(columns=['index'], inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "260c26ec",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['evalue', 'score', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',\n",
+       "       'Description', 'Preferred_name', 'GOs', 'PFAMs', 'gene_id',\n",
+       "       'protein_id'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sequence_annotation.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "c92b4a01",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "structural_annotation['normalized bit score'] = structural_annotation['bit score'] / structural_annotation['alignment length']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "5ceaaff2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<matplotlib.legend.Legend at 0x7f7975465070>"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZdUlEQVR4nO3df3BV5b3v8feXgAUUxJLcthJs0jOIAgkBg3oLF+GgGNCRudoZRUFOW0qZi0due0XwtMcWOu30jJxbD1WhDKWWuVSoSCn3CtpSsIK1hx+6G34oGmlKQlqNoShyoPzwe//ITs4m7JCdZO2svdf+vGYyZK317LW/e0/45Mmzn/Usc3dERCT7dQu7ABERCYYCXUQkIhToIiIRoUAXEYkIBbqISER0D+uJ8/PzvaioKKynFxHJSnv27Hnf3QuSHQst0IuKiti9e3dYTy8ikpXM7E+tHdOQi4hIRCjQRUQiQoEuIhIRoY2hi0juOHPmDLW1tZw6dSrsUrJGz549KSwspEePHik/RoEuImlXW1tLnz59KCoqwszCLifjuTsNDQ3U1tZSXFyc8uM05CIiaXfq1Cn69++vME+RmdG/f/92/0WjQBeRLqEwb5+OvF8KdBGRiNAYuoh0uS0H3g30fDcP+VSg58tW6qFfzMHNYVcgIpIyBbqI5ITq6mquueYaZs6cybBhw7jvvvvYsmULo0ePZtCgQezcuZNvf/vbzJgxg4kTJ1JUVMT69et5+OGHKSkpoaKigjNnzgCwYMEChgwZQmlpKQ899BAA9fX13HXXXYwaNYpRo0bxyiuvANDQ0MDEiRMZMWIEX/3qV/nsZz/L+++/z4kTJ7jtttsYPnw4w4YNY+3atZ1+jQp0EckZVVVVzJ07l8rKSt58801+9rOfsWPHDhYvXsz3vvc9AN555x2ef/55fvnLXzJt2jTGjx/P3r176dWrF88//zxHjx7lF7/4Bfv376eyspJvfvObAMydO5evfe1r7Nq1i+eee46ZM2cCsHDhQsaMGcPrr7/OHXfcweHDhwF44YUXuPLKK/nDH/7Avn37qKio6PTrU6CLSM4oLi6mpKSEbt26MXToUCZMmICZUVJSQnV1NQCTJk2iR48elJSUcO7cueagbWrTt29fevbsycyZM1m/fj29e/cGYMuWLTzwwAOUlZVxxx138OGHH3L8+HFefvllpk2bBsBtt93GFVdc0Xy+LVu2MH/+fLZv387ll1/e6denQL+IWM2xsEsQkQB94hOfaP6+W7duzdvdunXj7Nmz57Xp1q0bPXr0aJ4+2NSme/fu7Ny5k7vuuosNGzY0B/7HH3/Mq6++SiwWIxaLceTIEfr06QMkn4J49dVXs2fPHkpKSnjkkUdYtGhRp1+fAr01+kBURJL46KOP+OCDD5g8eTKPP/44sVgMgIkTJ/LEE080t2vaP3bsWFavXg3A5s2b+etf/wpAXV0dvXv3Ztq0aTz00EO89tprna5N0xZFpMtl8zTD48ePM2XKFE6dOoW784Mf/ACAJUuWMGfOHEpLSzl79ixjx45l2bJlfOtb32Lq1KmMHDmSm266iauuugqAvXv3Mm/evOa/BJYuXdrp2szdO32SjigvL/eMvsHFwc3Eao5RdvPUsCsRyXpvvPEG1157bdhlZISmm/vk5+e32TbZ+2Zme9y9PFn7NodczGylmb1nZvtaOX6fmVXGv35nZsPbrFJERAKXyhj608DF5tP8EbjJ3UuB7wDLA6hLRCSSqqurU+qdd0SbY+ju/rKZFV3k+O8SNn8PFAZQl4iItFPQs1y+DLQ6PcTMZpnZbjPbXV9fH/BTB0tTFkUk2wQW6GY2nsZAn99aG3df7u7l7l5eUFAQ1FN3maAXFBIRCVIg0xbNrBRYAUxy94YgzikiIu3T6UA3s6uA9cB0d3+r8yVlrvy6rTBE0xhFOi3oC/cGT2qzSXV1Nbfffjv79p0/YW/cuHEsXryY8vLzZwI+/fTT7N69+7yLhZpcdtllfPTRR9TV1fHggw+ybt06YrEYdXV1TJ48uXOvpRNSmbb4DPAqMNjMas3sy2Y228xmx5s8CvQHnjKzmJll8OTyTtCVoyLSwpVXXsm6deuAxitDN23aFGo9bQa6u09198+4ew93L3T3H7v7MndfFj8+092vcPey+FfSCe/Z5LyxcgW5SGScO3eOr3zlKwwdOpSJEydy8uRJAJ599lmuv/56rr76arZv397cvqamhoqKCgYPHszChQsvOF91dTXDhg3j9OnTPProo6xdu5aysjLWrl3Lb3/7W8rKyigrK2PEiBEcP3487a9Pa7k0aQrug5sbh1biNNtFJDrefvtt5syZw/79++nXrx/PPfccAGfPnmXnzp08/vjj5wX3zp07Wb16NbFYjGeffZbWrm6/5JJLWLRoEXfffTexWIy7776bxYsX8+STTxKLxdi+fTu9evVK++tToItIziguLqasrAyA6667rnnJ3DvvvPOCfQC33HIL/fv3p1evXtx5553s2LEj5ecaPXo0X//611myZAnHjh2je/f0L52lQBeRnJG4fG5eXt4FS+Ym7oMLl71NtgxuaxYsWMCKFSs4efIkN954I2+++WZnSk+JAl1EpBW//vWvOXr0KCdPnmTDhg2MHj261bZ9+vQ5b5z8nXfeoaSkhPnz51NeXt4lga7lc0Wk66UwzTATjBkzhunTp1NVVcW99957wdTGROPHj+f73/8+ZWVlPPLII+zYsYNt27aRl5fHkCFDmDQp/a9Zy+c2Obi58YcsvmxuorKbp2o5XZFO0PK5HRP48rnSSLNdRCTTKdBFRCJCgZ5IFxGJSBZToIuIRIQCXUQkIhToIiIRoXno7bTlwLvcPORTYZchktVeqnkp0PONGzgusHPV19dz++23c/r0aZYsWcJf/vIXHn30UT796U+zbdu2wJ4nHRToCWI1xygb2C/sMkQkRL/5zW+45ppr+OlPfwpARUUFTz31FOPHjw+5srZpyEVEcsKqVasoLS1l+PDhTJ8+nT/96U9MmDCB0tJSJkyYwOHDh4nFYjz88MNs2rSJsrIyFi5cyI4dO5g9ezbz5s3j3LlzzJs3j1GjRlFaWsqPfvSjsF/WedRD74imq0pFJCvs37+f7373u7zyyivk5+dz9OhRZsyYwf3338+MGTNYuXIlDz74IBs2bGDRokXn3alo27ZtzXc0Wr58OZdffjm7du3ib3/7G6NHj2bixIkUFxeH/AobqYcepytBRaJr69atfOELXyA/Px+AT37yk7z66qvce++9AEyfPj2lpXF/9atfsWrVKsrKyrjhhhtoaGjg7bffTmvt7aEeejvl120FjbOLZBV3b3Pp21SWxnV3fvjDH3LrrbcGVVqg1EMXkcibMGECP//5z2loaADg6NGjfP7zn2fNmjUArF69mjFjxrR5nltvvZWlS5dy5swZAN566y1OnDiRvsLbST30FMS2PBN2CSKREuQ0w1QMHTqUb3zjG9x0003k5eUxYsQIlixZwpe+9CUee+wxCgoK+MlPftLmeWbOnEl1dTUjR47E3SkoKGDDhg3pfwEp0vK5cU2hXTawX5vj6WUD++lDUZF20PK5HaPlc0VEcpQCXUQkIhToItIlwhrezVYdeb/aDHQzW2lm75nZvlaOm5ktMbMqM6s0s5HtriKLbTnwbtgliGS8nj170tDQoFBPkbvT0NBAz5492/W4VGa5PA08Aaxq5fgkYFD86wZgafxfEREACgsLqa2tpb6+PuxSskbPnj0pLCxs12PaDHR3f9nMii7SZAqwyht/9f7ezPqZ2Wfc/c/tqkREIqtHjx4Zc3l8lAUxhj4AqEnYro3vu4CZzTKz3Wa2O1N/U2sJABHJVkEEerLrZZMOlLn7cncvd/fygoKCAJ5aRESaBBHotcDAhO1CoC6A83aZznywmV+3NcBKREQ6LohA3wjcH5/tciPwQc6Mnx/cHHYFIiLN2vxQ1MyeAcYB+WZWC3wL6AHg7suATcBkoAr4D+CL6So2LQ5uJr/uWNhViIh0WiqzXKa2cdyBOYFVJCIiHaIrRUVEIkKB3gma4igimUSB3gEKchHJRAp0EZGIUKCLiESEAl1EJCJyPtA1Hi4iUZHzgS4iEhUKdBGRiFCgB0FruohIBlCgd5BuPScimUaBLiISEQp0EZGIUKCLiESEAl1EJCIU6CIiEaFAD4CuNhWRTKBAFxGJCAW6iEhE5Hagd+IKz/y6rQEWIiLSebkd6AHSlaMiEjYFuohIROR0oGt2iohESUqBbmYVZnbQzKrMbEGS45eb2f81sz+Y2X4z+2LwpYqIyMW0Gehmlgc8CUwChgBTzWxIi2ZzgAPuPhwYB/yrmV0ScK0iInIRqfTQrweq3P2Qu58G1gBTWrRxoI+ZGXAZcBQ4G2ilIiJyUakE+gCgJmG7Nr4v0RPAtUAdsBeY6+4fB1JhmmhWiohETSqBbkn2eYvtW4EYcCVQBjxhZn0vOJHZLDPbbWa76+vr21lqsDSPXESiJpVArwUGJmwX0tgTT/RFYL03qgL+CFzT8kTuvtzdy929vKCgoKM1Zy7dik5EQpRKoO8CBplZcfyDznuAjS3aHAYmAJjZp4DBwKEgCxURkYvr3lYDdz9rZg8ALwJ5wEp3329ms+PHlwHfAZ42s700DtHMd/f301h3xsmv2woD+4VdhojksDYDHcDdNwGbWuxblvB9HTAx2NJERKQ9cvpK0aDpylMRCZMCXUQkIhToIiIRoUBPB01fFJEQ5GSg6ypREYminAx0EZEoUqAHTL1/EQmLAl1EJCIU6CIiEaFAFxGJCAW6iEhEKNBFRCJCgS4iEhEKdBGRiFCgi4hEhAJdRCQiFOgiIhGRc4G+5cC7jbeLSyPd6EJEwpBzgS4iElUKdBGRiFCgi4hEhAI9YOkenxcRaY0CXUQkIhToIiIRkVKgm1mFmR00syozW9BKm3FmFjOz/Wb222DLDI6GREQkqrq31cDM8oAngVuAWmCXmW109wMJbfoBTwEV7n7YzP5LmuoVEZFWpNJDvx6ocvdD7n4aWANMadHmXmC9ux8GcPf3gi0z++jeoiLS1VIJ9AFATcJ2bXxfoquBK8zsJTPbY2b3JzuRmc0ys91mtru+vr5jFYuISFKpBLol2ecttrsD1wG3AbcC/2xmV1/wIPfl7l7u7uUFBQXtLrbTDm7usqfSWL2IdLU2x9Bp7JEPTNguBOqStHnf3U8AJ8zsZWA48FYgVYqISJtS6aHvAgaZWbGZXQLcA2xs0eaXwH8zs+5m1hu4AXgj2FJFRORi2uyhu/tZM3sAeBHIA1a6+34zmx0/vszd3zCzF4BK4GNghbvvS2fhIiJyvlSGXHD3TcCmFvuWtdh+DHgsuNKCp2VtRSTKdKWoiEhEKNBFRCJCgZ5uXThVUkRymwI9nQ5u1ri9iHQZBbqISEQo0NNIvXMR6UoKdBGRiEhpHnqUvXby7aT7R/Ya1MWViIh0jnroIiIRoUAXEYmInBxyaW2YRUQkm6mHLiISEQr0rqCrRUWkCyjQRUQiQoEuIhIRCvQuoCtGRaQrKNBFRCIiZ6YtbjnwLvntaJ84tVFXjYpINlAPXUQkIhToXUnTF0UkjXIm0PPrtoZdgohIWuVMoIuIRJ0CvatouEVE0ixnZrlkwoJcsZpjlA0OuwoRiaqUeuhmVmFmB82syswWXKTdKDM7Z2ZfCK7EAKh3LCI5oM0eupnlAU8CtwC1wC4z2+juB5K0+xfgxXQUGqYg5qTralERSbdUeujXA1XufsjdTwNrgClJ2v0j8BzwXoD1BUJhKiK5IJVAHwDUJGzXxvc1M7MBwH8Hll3sRGY2y8x2m9nu+vr69tYqIiIXkUqgW5J93mL7cWC+u5+72Incfbm7l7t7eUFBQYoliohIKlKZ5VILDEzYLgTqWrQpB9aYGUA+MNnMzrr7hiCKFBGRtqXSQ98FDDKzYjO7BLgH2JjYwN2L3b3I3YuAdcD/UJgnt+XAu2GXICIR1WYP3d3PmtkDNM5eyQNWuvt+M5sdP37RcXMREekaKV1Y5O6bgE0t9iUNcnf/h86XFXEHN8PgSWFXISIRo0v/Q6BplCKSDpEP9NiWZ8IuQUSkS0Q+0EVEcoUCXUQkIhToIiIRkTPL5walswt16c5JIpIu6qGHRUv6ikjAFOgiIhER6SGXl2pe4lAG3KlIRKQrqIcuIhIRCnQRkYiI9JBLrOYYfdN4/iBuTSciEhT10EVEIiLSgd63oTLsElqlBbpEJGiRDnQRkVyiQBcRiYjIBno23OotG2oUkewR2UDPFgp1EQmKAl1EJCIU6AF57eTbzV/tkbj6onrrItIZkb2wKL9uK4fDLiJVzSsvjgy1DBHJbuqhZwDNSReRICjQM4RCXUQ6K6VAN7MKMztoZlVmtiDJ8fvMrDL+9TszGx58qdkj1fF03b1IRILUZqCbWR7wJDAJGAJMNbMhLZr9EbjJ3UuB7wDLgy40FyjgRaQzUumhXw9Uufshdz8NrAGmJDZw99+5+1/jm78HCoMtU0RE2pLKLJcBQE3Cdi1ww0XafxlIesNMM5sFzAK46qqrUiyxfV6qeQkgY+5U1KEldg9uhsGT0lSRiERVKj10S7LPkzY0G09joM9Pdtzdl7t7ubuXFxQUpF5lDtFcdBHpqFQCvRYYmLBdCNS1bGRmpcAKYIq7NwRTXm7SjBcR6YhUhlx2AYPMrBg4AtwD3JvYwMyuAtYD0939rcCrjIjWZr3obkciEoQ2A93dz5rZA8CLQB6w0t33m9ns+PFlwKNAf+ApMwM46+7l6StbRERaSunSf3ffBGxqsW9ZwvczgZnBlpY7zvvgNGEwq+kDXoBxA8d1WT0ikp0iu5ZLtmoK92M1L523P5VwT2zTUuJjWmunXxoi2S2SgR6rOUbfsItIo4703C8W9kG0b+0XRnt/+eiXikjHRTLQc0l7gzdd50rlsUHWKiIXilyga8pfZlGIi3SdyK222LehMuwSRERCEbkeOkQn1A/FXuZzZWPDLqNLpTK2rtk/IslFMtAj4ciesCvIKK0FvcJd5D8p0DPUofoTYZeQ1RT0kosU6BJ5qc7PF8l2WR/omkUhTfSzILku6wM98o7sgQHXhV1FTtAwjWS7aAV6BD9IPFR/gs+hUO9qCnfJRtEK9Ig6VH+CDz8+RtnAfmGXEjmducJVQS+ZRoGeJfo2VEK3S9VTz1Dq0UsmiFSgR32q36H6E3xuQNhVSBPNjZdME6lAzwWxGg29ZBOFu3SlyAT6odjLYZfQJfo2VMLA3FoOIIo0Li/pEJlAzyXqpWcnfQAr6aZAF8kyGsaR1ijQs1CyGS/qtUebPoCVVEQj0CN4QVG7HNkD/F3YVUjIFO6S9YEeqzlG34ZoT1dMpuliI4CyyN2mRDorlfF6hX70ZH2gR+VmFh3Rt6GSD/uXNn/PwLEaepGUXaxHrw9ns1PWB7okiA+95OKdjqRzUl2pUkGf2VIKdDOrAP4NyANWuPv3Wxy3+PHJwH8A/+DurwVcazMtk/qf+jZUQsGlQPxK2f6N+5tulq3eunSF1nr77d0vndNmoJtZHvAkcAtQC+wys43ufiCh2SRgUPzrBmBp/N+0idUco6zbO+l8iqyRbMmD5qGobpcS+/jvGoP9yJ7Gsff4MI3CXtIhlRk5qexX0LdfKj3064Eqdz8EYGZrgClAYqBPAVa5uwO/N7N+ZvYZd/9z4BXH9W2o5FC6Tp7FWn6mcKj+BH2p5FDDhW0ONcCH/UsV7JKRgvpLvLW/DFprk81SCfQBQE3Cdi0X9r6TtRkAnBfoZjYLmBXf/MjMDrarWsgH3m/nY3KB3pfk9L5cSO9Jctn0vny2tQOpBLol2ecdaIO7LweWp/CcyQsx2+3u5R19fFTpfUlO78uF9J4kF5X3JZUZzLXAwITtQqCuA21ERCSNUgn0XcAgMys2s0uAe4CNLdpsBO63RjcCH6Rz/FxERC7U5pCLu581sweAF2mctrjS3feb2ez48WXAJhqnLFbROG3xi2mqt8PDNRGn9yU5vS8X0nuSXCTeF2ucmCIiItlOq4CIiESEAl1EJCKyJtDNrMLMDppZlZktCLuesJnZQDPbZmZvmNl+M5sbdk2ZxMzyzOx1M/t/YdeSKeIX/K0zszfjPzf/NeyawmZmX4v//9lnZs+YWc+wa+qMrAj0hOUHJgFDgKlmNiTcqkJ3Fvhf7n4tcCMwR+/JeeYCb4RdRIb5N+AFd78GGE6Ovz9mNgB4ECh392E0Tvq4J9yqOicrAp2E5Qfc/TTQtPxAznL3PzctgObux2n8zzkg3Koyg5kVArcBK8KuJVOYWV9gLPBjAHc/7e7HQi0qM3QHeplZd6A3WX79TLYEemtLCwhgZkXACODfQy4lUzwOPAx8HHIdmeRzQD3wk/hQ1AozuzTsosLk7keAxcBhGpcp+cDdfxVuVZ2TLYGe0tICucjMLgOeA/6nu38Ydj1hM7PbgffcPcfvS3iB7sBIYKm7jwBOADn9WZSZXUHjX/rFwJXApWY2LdyqOidbAl1LCyRhZj1oDPPV7r4+7HoyxGjgDjOrpnFo7u/N7P+EW1JGqAVq3b3pr7h1NAZ8LrsZ+KO717v7GWA98PmQa+qUbAn0VJYfyCnxm4r8GHjD3f932PVkCnd/xN0L3b2Ixp+Tre6e1b2uILj7X4AaMxsc3zWB85fAzkWHgRvNrHf8/9MEsvyD4qy4BV1ryw+EXFbYRgPTgb1mFovv+yd33xReSZLh/hFYHe8UHSJ9S3RkBXf/dzNbB7xG46yx18nyJQB06b+ISERky5CLiIi0QYEuIhIRCnQRkYhQoIuIRIQCXUQkIhToIiIRoUAXEYmI/w9iflt0knsXdQAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<Figure size 432x288 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "fig, ax = plt.subplots()\n",
+    "ax.hist(mmseqs_filtered['normalized bit score'], bins=100, label='mmseqs', alpha=0.3, density=True);\n",
+    "ax.hist(hhblits_filtered['normalized bit score'], bins=100, label='hhblits', alpha=0.3, density=True);\n",
+    "ax.hist(structural_annotation['normalized bit score'], bins=100, label='coffe', alpha=0.3, density=True);\n",
+    "ax.legend()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "7649fb1a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unique_up_id = pd.concat([hhblits_filtered['target'].drop_duplicates(),\n",
+    "                          mmseqs_filtered['target'].drop_duplicates()])\n",
+    "unique_up_id.drop_duplicates(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "e8d0cfb7-c86d-41a6-99e7-d514bc1f3873",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "148"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "request_size = 500\n",
+    "no_chunks = np.ceil((len(unique_up_id) / request_size) * 3).astype(int)\n",
+    "no_chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "85231f72-12ad-4cc7-897a-544e0be27228",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 148/148 [01:45<00:00,  1.41it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "request_size = 500\n",
+    "no_chunks = np.ceil((len(unique_up_id) / request_size) * 3).astype(int)\n",
+    "\n",
+    "with open('../data/profile_sequences.fasta', 'w') as result:\n",
+    "    for i in tqdm(range(no_chunks)):\n",
+    "        a = (i * request_size) // 3\n",
+    "        b = ((i+1) * request_size) // 3\n",
+    "        chunk = unique_up_id[a:b]\n",
+    "        bait50 = ['UniRef50_' + c for c in chunk]\n",
+    "        bait90 = ['UniRef90_' + c for c in chunk]\n",
+    "        bait100 = ['UniRef100_' + c for c in chunk]\n",
+    "        expanded_chunk = bait50 + bait90 + bait100\n",
+    "        url = f\"https://rest.uniprot.org/uniref/ids?ids={','.join(expanded_chunk)}&format=fasta\"\n",
+    "        response = requests.get(url)\n",
+    "        result.write(response.text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3d17ee4-6176-443a-b320-a54aaa7ff8cd",
+   "metadata": {},
+   "source": [
+    "(wait for EggNOG-mapper to annotate the sequences)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "eb0c48d8-ef71-440f-8ea0-6e7f3118f957",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file = '/g/arendt/npapadop/data/spongfold_publish/MM_ffr33uy6.emapper.annotations.tsv'\n",
+    "sensitive = pd.read_csv(file, sep='\\t', skiprows=4, skipfooter=3, engine='python')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "0a97dfb7-4783-44fa-a330-f07b0585fcf8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test = sensitive['#query'].str.split('_').str[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "942d4a2d-2a8b-44ff-89b4-8525f3ad926e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "A0A6A6LEY7    3\n",
+       "A0A1X7U3S9    3\n",
+       "A0A3N5U8U0    3\n",
+       "A0A6J0B743    3\n",
+       "A0A482W475    3\n",
+       "             ..\n",
+       "A0A6I8SCD7    1\n",
+       "A0A1A8MN43    1\n",
+       "A0A0C2FDV2    1\n",
+       "A0A2J8INC7    1\n",
+       "A0A0K0D6S2    1\n",
+       "Name: #query, Length: 20360, dtype: int64"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "909804dd-4006-4dcc-a4ba-308a5b3975d0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>#query</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>seed_ortholog</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>evalue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>score</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>eggNOG_OGs</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max_annot_lvl</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>COG_category</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Description</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Preferred_name</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GOs</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>EC</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_ko</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_Pathway</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_Module</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_Reaction</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_rclass</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>BRITE</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_TC</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>CAZy</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>BiGG_Reaction</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PFAMs</th>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: []\n",
+       "Index: [#query, seed_ortholog, evalue, score, eggNOG_OGs, max_annot_lvl, COG_category, Description, Preferred_name, GOs, EC, KEGG_ko, KEGG_Pathway, KEGG_Module, KEGG_Reaction, KEGG_rclass, BRITE, KEGG_TC, CAZy, BiGG_Reaction, PFAMs]"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sensitive[sensitive['#query'].str.contains('UPI00005B2EF3')].T"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6d1eb54-d942-465e-a868-517d442fe1dd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "1c156c27-d2ec-47d6-b764-2b54816922a0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sensitive['uniprot'] = sensitive['#query'].str.split('_').str[1]\n",
+    "sensitive.reset_index(drop=True, inplace=True)\n",
+    "# remove unnecessary columns\n",
+    "dead_weight = ['#query', 'seed_ortholog', 'EC', 'KEGG_ko', 'KEGG_Pathway',\n",
+    "               'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE',\n",
+    "               'KEGG_TC', 'CAZy', 'BiGG_Reaction']\n",
+    "sensitive.drop(dead_weight, axis=1, inplace=True)\n",
+    "# convert rest to categorical to save space\n",
+    "to_categorical = ['uniprot', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',\n",
+    "                  'Description', 'Preferred_name', 'GOs', 'PFAMs']\n",
+    "sensitive[to_categorical] = sensitive[to_categorical].astype(\"category\")\n",
+    "# finally save in parquet format\n",
+    "sensitive.drop_duplicates(inplace=True)\n",
+    "sensitive.to_parquet('../data/uniprot_profiles.parquet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "d68690ea-060a-480f-802c-aac6baae570d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(20360, 10)"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sensitive.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "f60ac7bf-f031-4bd6-99f3-eb3117b50cf1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "7417224       A0A409VGI9\n",
+       "7453017       A0A6A6LEY7\n",
+       "3711344       A0A1A8QDB6\n",
+       "5432158       A0A1X7UJF6\n",
+       "1658532           S9X2M4\n",
+       "               ...      \n",
+       "144364        A0A2F0BPH8\n",
+       "144949            L1J317\n",
+       "4535244       A0A0K0D6S2\n",
+       "2883206    UPI0003F0A19A\n",
+       "2883636       A0A7S2RZY7\n",
+       "Name: target, Length: 24540, dtype: object"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "unique_up_id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "fd08dceb-c307-4ad6-bd23-91b70aaf7ef4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>query</th>\n",
+       "      <th>target</th>\n",
+       "      <th>seq. id.</th>\n",
+       "      <th>alignment length</th>\n",
+       "      <th>no. mismatches</th>\n",
+       "      <th>no. gap open</th>\n",
+       "      <th>query start</th>\n",
+       "      <th>query end</th>\n",
+       "      <th>target start</th>\n",
+       "      <th>target end</th>\n",
+       "      <th>e value</th>\n",
+       "      <th>bit score</th>\n",
+       "      <th>gene_id</th>\n",
+       "      <th>normalized bit score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>3537669</th>\n",
+       "      <td>c100005_g1_i4_m.41851</td>\n",
+       "      <td>UPI000C6D6B72</td>\n",
+       "      <td>0.262</td>\n",
+       "      <td>138</td>\n",
+       "      <td>100</td>\n",
+       "      <td>0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>150</td>\n",
+       "      <td>7</td>\n",
+       "      <td>142</td>\n",
+       "      <td>1.464000e-04</td>\n",
+       "      <td>53</td>\n",
+       "      <td>c100005_g1</td>\n",
+       "      <td>0.384058</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3711020</th>\n",
+       "      <td>c100012_g4_i1_m.41921</td>\n",
+       "      <td>A0A4V1IVK5</td>\n",
+       "      <td>0.681</td>\n",
+       "      <td>133</td>\n",
+       "      <td>41</td>\n",
+       "      <td>0</td>\n",
+       "      <td>32</td>\n",
+       "      <td>164</td>\n",
+       "      <td>73</td>\n",
+       "      <td>203</td>\n",
+       "      <td>3.106000e-49</td>\n",
+       "      <td>183</td>\n",
+       "      <td>c100012_g4</td>\n",
+       "      <td>1.375940</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3108519</th>\n",
+       "      <td>c100014_g2_i1_m.41927</td>\n",
+       "      <td>A0A1D1W000</td>\n",
+       "      <td>0.456</td>\n",
+       "      <td>59</td>\n",
+       "      <td>32</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>62</td>\n",
+       "      <td>118</td>\n",
+       "      <td>176</td>\n",
+       "      <td>4.196000e-04</td>\n",
+       "      <td>52</td>\n",
+       "      <td>c100014_g2</td>\n",
+       "      <td>0.881356</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7530038</th>\n",
+       "      <td>c100023_g1_i1_m.41965</td>\n",
+       "      <td>UPI0009E61F51</td>\n",
+       "      <td>0.342</td>\n",
+       "      <td>90</td>\n",
+       "      <td>57</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>92</td>\n",
+       "      <td>2743</td>\n",
+       "      <td>2829</td>\n",
+       "      <td>4.556000e-04</td>\n",
+       "      <td>53</td>\n",
+       "      <td>c100023_g1</td>\n",
+       "      <td>0.588889</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5277692</th>\n",
+       "      <td>c100036_g1_i4_m.42040</td>\n",
+       "      <td>A0A7S3BMQ4</td>\n",
+       "      <td>0.523</td>\n",
+       "      <td>99</td>\n",
+       "      <td>46</td>\n",
+       "      <td>0</td>\n",
+       "      <td>71</td>\n",
+       "      <td>169</td>\n",
+       "      <td>1</td>\n",
+       "      <td>98</td>\n",
+       "      <td>2.166000e-19</td>\n",
+       "      <td>100</td>\n",
+       "      <td>c100036_g1</td>\n",
+       "      <td>1.010101</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8219227</th>\n",
+       "      <td>c99972_g1_i2_m.41694</td>\n",
+       "      <td>UPI0009E28D88</td>\n",
+       "      <td>0.466</td>\n",
+       "      <td>63</td>\n",
+       "      <td>33</td>\n",
+       "      <td>0</td>\n",
+       "      <td>8</td>\n",
+       "      <td>70</td>\n",
+       "      <td>392</td>\n",
+       "      <td>453</td>\n",
+       "      <td>1.130000e-05</td>\n",
+       "      <td>57</td>\n",
+       "      <td>c99972_g1</td>\n",
+       "      <td>0.904762</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6087296</th>\n",
+       "      <td>c99980_g3_i1_m.41746</td>\n",
+       "      <td>UPI00005B2EF3</td>\n",
+       "      <td>0.434</td>\n",
+       "      <td>62</td>\n",
+       "      <td>35</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>63</td>\n",
+       "      <td>7</td>\n",
+       "      <td>68</td>\n",
+       "      <td>4.330000e-05</td>\n",
+       "      <td>51</td>\n",
+       "      <td>c99980_g3</td>\n",
+       "      <td>0.822581</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3238079</th>\n",
+       "      <td>c99987_g1_i2_m.41772</td>\n",
+       "      <td>A0A1X7URN4</td>\n",
+       "      <td>0.529</td>\n",
+       "      <td>220</td>\n",
+       "      <td>97</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>221</td>\n",
+       "      <td>37</td>\n",
+       "      <td>243</td>\n",
+       "      <td>1.868000e-61</td>\n",
+       "      <td>221</td>\n",
+       "      <td>c99987_g1</td>\n",
+       "      <td>1.004545</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>574005</th>\n",
+       "      <td>c99993_g2_i1_m.41786</td>\n",
+       "      <td>UPI00177B4952</td>\n",
+       "      <td>0.663</td>\n",
+       "      <td>137</td>\n",
+       "      <td>41</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>137</td>\n",
+       "      <td>23</td>\n",
+       "      <td>145</td>\n",
+       "      <td>1.384000e-49</td>\n",
+       "      <td>183</td>\n",
+       "      <td>c99993_g2</td>\n",
+       "      <td>1.335766</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>574028</th>\n",
+       "      <td>c99993_g3_i1_m.41790</td>\n",
+       "      <td>UPI00106CF215</td>\n",
+       "      <td>0.265</td>\n",
+       "      <td>133</td>\n",
+       "      <td>95</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>135</td>\n",
+       "      <td>93</td>\n",
+       "      <td>222</td>\n",
+       "      <td>5.938000e-04</td>\n",
+       "      <td>52</td>\n",
+       "      <td>c99993_g3</td>\n",
+       "      <td>0.390977</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3347 rows × 14 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                         query         target  seq. id.  alignment length  \\\n",
+       "3537669  c100005_g1_i4_m.41851  UPI000C6D6B72     0.262               138   \n",
+       "3711020  c100012_g4_i1_m.41921     A0A4V1IVK5     0.681               133   \n",
+       "3108519  c100014_g2_i1_m.41927     A0A1D1W000     0.456                59   \n",
+       "7530038  c100023_g1_i1_m.41965  UPI0009E61F51     0.342                90   \n",
+       "5277692  c100036_g1_i4_m.42040     A0A7S3BMQ4     0.523                99   \n",
+       "...                        ...            ...       ...               ...   \n",
+       "8219227   c99972_g1_i2_m.41694  UPI0009E28D88     0.466                63   \n",
+       "6087296   c99980_g3_i1_m.41746  UPI00005B2EF3     0.434                62   \n",
+       "3238079   c99987_g1_i2_m.41772     A0A1X7URN4     0.529               220   \n",
+       "574005    c99993_g2_i1_m.41786  UPI00177B4952     0.663               137   \n",
+       "574028    c99993_g3_i1_m.41790  UPI00106CF215     0.265               133   \n",
+       "\n",
+       "         no. mismatches  no. gap open  query start  query end  target start  \\\n",
+       "3537669             100             0           13        150             7   \n",
+       "3711020              41             0           32        164            73   \n",
+       "3108519              32             0            4         62           118   \n",
+       "7530038              57             0            3         92          2743   \n",
+       "5277692              46             0           71        169             1   \n",
+       "...                 ...           ...          ...        ...           ...   \n",
+       "8219227              33             0            8         70           392   \n",
+       "6087296              35             0            2         63             7   \n",
+       "3238079              97             0            2        221            37   \n",
+       "574005               41             0            1        137            23   \n",
+       "574028               95             0            3        135            93   \n",
+       "\n",
+       "         target end       e value  bit score     gene_id  normalized bit score  \n",
+       "3537669         142  1.464000e-04         53  c100005_g1              0.384058  \n",
+       "3711020         203  3.106000e-49        183  c100012_g4              1.375940  \n",
+       "3108519         176  4.196000e-04         52  c100014_g2              0.881356  \n",
+       "7530038        2829  4.556000e-04         53  c100023_g1              0.588889  \n",
+       "5277692          98  2.166000e-19        100  c100036_g1              1.010101  \n",
+       "...             ...           ...        ...         ...                   ...  \n",
+       "8219227         453  1.130000e-05         57   c99972_g1              0.904762  \n",
+       "6087296          68  4.330000e-05         51   c99980_g3              0.822581  \n",
+       "3238079         243  1.868000e-61        221   c99987_g1              1.004545  \n",
+       "574005          145  1.384000e-49        183   c99993_g2              1.335766  \n",
+       "574028          222  5.938000e-04         52   c99993_g3              0.390977  \n",
+       "\n",
+       "[3347 rows x 14 columns]"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "hhblits_filtered[~hhblits_filtered['target'].isin(sensitive['uniprot'])]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c0f35e75-1a3f-4bab-8afe-160d3347738f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mmseqs_filtered"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f99bd75f-ae5a-4d00-8a01-5588c711f45c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "foldseek_full = pd.read_parquet('../old_data/fs_targets.parquet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1359934-a10c-4301-9ffa-000acb4159e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mmseqs_filtered = mmseqs_filtered.merge(sensitive, on='uniprot', how='left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72eed180-8cb0-4f76-b316-f5717f56b026",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "hhblits = hhblits.merge(sensitive, on='uniprot', how='left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5b356e4-10f9-4782-a39c-4f19a221ebdb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/scripts/fs_self_search_afdb.sh b/scripts/fs_self_search_afdb.sh
new file mode 100755
index 0000000..a586869
--- /dev/null
+++ b/scripts/fs_self_search_afdb.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -ex
+#SBATCH -J fs_self_afdb
+#SBATCH -t 10:00:00
+#SBATCH -c 32
+#SBATCH -N 1
+#SBATCH --mem=128G
+#SBATCH -o /g/arendt/npapadop/cluster/fs_self.out
+#SBATCH -e /g/arendt/npapadop/cluster/fs_self.err
+
+module load GCC
+module load bzip2
+
+FOLDSEEK="/g/arendt/npapadop/repos/foldseek/build/bin/foldseek"
+QUERYDB="/scratch/npapadop/foldseek_target/afdb"
+TARGETDB="/scratch/npapadop/foldseek_target/afdb"
+ALIGNMENT="/scratch/npapadop/foldseek_results/self/self"
+
+"$FOLDSEEK" search "$QUERYDB" "$TARGETDB" "$ALIGNMENT" tmpFolder -a --threads 16
+"$FOLDSEEK" aln2tmscore "$QUERYDB" "$TARGETDB" "$ALIGNMENT" "$ALIGNMENT"_aln_tmscore
+"$FOLDSEEK" createtsv "$QUERYDB" "$TARGETDB" "$ALIGNMENT"_aln_tmscore "$ALIGNMENT"_aln_tmscore.tsv
+"$FOLDSEEK" convertalis "$QUERYDB" "$TARGETDB" "$ALIGNMENT" "$ALIGNMENT"_score.tsv
+
+module unload
-- 
GitLab