diff --git a/analysis/suppl-get_species.ipynb b/analysis/suppl-get_species.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..ae08f18182fe0e31df34ef451b730d9030eea563
--- /dev/null
+++ b/analysis/suppl-get_species.ipynb
@@ -0,0 +1,158 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "5d30842b-ecc2-4461-87f2-3940b28b8167",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f0254af0-b87f-4550-aa70-2e285c5f85b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "afdb = pd.read_csv('/Users/npapadop/Documents/data/coffe/self_score.tsv', sep='\\t', header=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9920841f-ca20-4e91-bcd2-c9caf0d59a44",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "afdb.columns = ['query', 'target', 'perc_id', 'ali_length', 'no_mismatch', 'no_gapopen',\n",
+    "                'q_start', 'q_end', 't_start', 't_end', 'eval', 'bit']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "efa843cc-2af9-43fe-8164-5ef85d438359",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "afdb['query'] = afdb['query'].str.split('-').str[1]\n",
+    "afdb['target'] = afdb['target'].str.split('-').str[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "aaefa6a1-6fd0-4775-a300-9871359005a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unique_up_id = pd.concat([afdb['query'], afdb['target']])\n",
+    "unique_up_id.drop_duplicates(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "5e9ad3ce-400e-4b45-b782-11078d0cf492",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unique_up_id = unique_up_id.values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "52171df1-5a9b-4105-b8e7-b6186831a0d5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests, sys\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ce2575ad-af72-4036-af92-6f53920bb64d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|███████████████████████████████████████████████████████████████████████████| 5604/5604 [3:03:46<00:00,  1.97s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "request_size = 100\n",
+    "no_chunks = np.ceil(len(unique_up_id) / request_size).astype(int)\n",
+    "\n",
+    "accessions = []\n",
+    "species_list = []\n",
+    "\n",
+    "for i in tqdm(range(no_chunks)):\n",
+    "    a = i * request_size\n",
+    "    b = (i+1) * request_size\n",
+    "    chunk = [str(c) for c in unique_up_id[a:b]]\n",
+    "    url = f\"https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession={','.join(chunk)}\"\n",
+    "    r = requests.get(url, headers={ \"Accept\" : \"application/json\"})\n",
+    "\n",
+    "    if not r.ok:\n",
+    "      r.raise_for_status()\n",
+    "      break\n",
+    "    for response in r.json():\n",
+    "        accession = response['accession']\n",
+    "        species = response['organism']['names'][0]['value']\n",
+    "        accessions.append(accession)\n",
+    "        species_list.append(species)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "9e427ed7-1af4-4b35-80f0-232b58c5d0ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "annot = pd.DataFrame({'id': accessions, 'species': species_list})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "7fb82b4d-8e14-4fa6-a8ea-079a8de11e4e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "annot.to_csv('/Users/npapadop/Documents/data/coffe/afdb_proteomes_species.tsv', header=True, sep='\\t')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analysis/suppl-model_species_validation.ipynb b/analysis/suppl-model_species_validation.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..0c82da699f59f870375812432f3bad027fa9e89e
--- /dev/null
+++ b/analysis/suppl-model_species_validation.ipynb
@@ -0,0 +1,209 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7bcc7b18-f2b0-45c7-9a1d-534689b52950",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from matplotlib import pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "78c429cf-dabc-4350-b2db-76ca32b4d88c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "afdb = pd.read_csv('/Users/npapadop/Documents/data/coffe/self_score.tsv', sep='\\t', header=None)\n",
+    "\n",
+    "afdb.columns = ['query', 'target', 'perc_id', 'ali_length', 'no_mismatch', 'no_gapopen',\n",
+    "                'q_start', 'q_end', 't_start', 't_end', 'eval', 'bit']\n",
+    "\n",
+    "afdb['query'] = afdb['query'].str.split('-').str[1]\n",
+    "afdb['target'] = afdb['target'].str.split('-').str[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a01e36ac-e35d-4d7b-9747-22c7f4c1e52a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "annot = pd.read_csv('/Users/npapadop/Documents/data/coffe/afdb_proteomes_species.tsv', sep='\\t', index_col=0)\n",
+    "\n",
+    "uniprot_to_eggnog = pd.read_csv('/Users/npapadop/Documents/data/coffe/uni2egg_euk.Nov2018.tsv', sep='\\t', header=None)\n",
+    "uniprot_to_eggnog.columns = ['uniprot', 'orthogroup']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1fb78a25-0642-49a1-b7b4-2cf096a662ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def non_conspecific_hits(alignment, annotation, species):\n",
+    "    belongs = annotation['species'] == species\n",
+    "    query_belongs = alignment['query'].isin(annotation['id'][belongs])\n",
+    "    target_belongs = alignment['target'].isin(annotation['id'][belongs])\n",
+    "    return query_belongs & ~target_belongs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4da45e3b-9ffd-419a-b812-a492872a15ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def best_match(df, sort_by='bit', tiebreak='ali_length'):\n",
+    "    have_max = df[sort_by] == np.max(df[sort_by])\n",
+    "    max_ali = df[have_max][tiebreak] == np.max(df[have_max][tiebreak])\n",
+    "    return df[have_max][max_ali].index.values[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "2f3771b7-91f7-4399-aa89-72ba1eef1209",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def subset_comparable(annotation, indices, reference):\n",
+    "    species_df = annotation.loc[indices].copy()\n",
+    "\n",
+    "    species_df['query_og'] = species_df.join(reference.set_index('uniprot'), on='query')['orthogroup']\n",
+    "    species_df['target_og'] = species_df.join(reference.set_index('uniprot'), on='target')['orthogroup']\n",
+    "\n",
+    "    query_available = ~species_df['query_og'].isnull()\n",
+    "    target_available = ~species_df['target_og'].isnull()\n",
+    "    \n",
+    "    return species_df[query_available & target_available]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "596af8e7-d2b9-4a34-b9e4-976fec4790b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def perc_agreement(df, df_available):\n",
+    "    if len(df_available) == 0:\n",
+    "        return -0\n",
+    "    perc = np.sum(df_available['query_og'] == df_available['target_og']) / len(df_available)\n",
+    "    return perc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "4170a709-fbc7-4903-955d-091f776689c6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def analyse_species(species, alignment, annotation, reference):\n",
+    "    is_species = non_conspecific_hits(alignment, annotation, species)\n",
+    "    species_df = alignment[is_species].groupby('query').apply(best_match)\n",
+    "    species_av = subset_comparable(alignment, species_df, reference)\n",
+    "    perc = perc_agreement(species_df, species_av)\n",
+    "    return [len(species_df), len(species_av), perc]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "f824d534-adbe-48ed-9dd4-99ff8a1efac0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Ajellomyces capsulatus (strain G186AR / H82 / ATCC MYA-2454 / RMSCC 2432)\t6551\t0\t 0.00%\n",
+      "Arabidopsis thaliana\t22748\t18361\t 90.63%\n",
+      "Brugia malayi\t7254\t0\t 0.00%\n",
+      "Caenorhabditis elegans\t13993\t1824\t 55.70%\n",
+      "Campylobacter jejuni subsp. jejuni serotype O:2 (strain ATCC 700819 / NCTC 11168)\t1420\t0\t 0.00%\n",
+      "Candida albicans (strain SC5314 / ATCC MYA-2876)\t4725\t3480\t 89.71%\n",
+      "Cladophialophora carrionii\t9194\t493\t 67.14%\n",
+      "Danio rerio\t22041\t14807\t 89.19%\n",
+      "Dictyostelium discoideum\t7233\t4696\t 67.59%\n",
+      "Dracunculus medinensis\t8443\t0\t 0.00%\n",
+      "Drosophila melanogaster\t9480\t6943\t 81.75%\n",
+      "Enterococcus faecium\t2185\t0\t 0.00%\n",
+      "Escherichia coli (strain K12)\t4022\t0\t 0.00%\n",
+      "Fonsecaea pedrosoi CBS 271.37\t10960\t0\t 0.00%\n",
+      "Glycine max\t44403\t31595\t 90.56%\n",
+      "Haemophilus influenzae (strain ATCC 51907 / DSM 11121 / KW20 / Rd)\t1512\t0\t 0.00%\n",
+      "Helicobacter pylori (strain ATCC 700392 / 26695)\t1202\t0\t 0.00%\n",
+      "Homo sapiens\t18775\t16666\t 96.65%\n",
+      "Klebsiella pneumoniae subsp. pneumoniae (strain HS11286)\t4794\t0\t 0.00%\n",
+      "Leishmania infantum\t6001\t5481\t 85.66%\n",
+      "Madurella mycetomatis\t7779\t0\t 0.00%\n",
+      "Methanocaldococcus jannaschii (strain ATCC 43067 / DSM 2661 / JAL-1 / JCM 10045 / NBRC 100440)\t1274\t0\t 0.00%\n",
+      "Mus musculus\t20581\t17718\t 98.40%\n",
+      "Mycobacterium leprae (strain TN)\t1454\t0\t 0.00%\n",
+      "Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)\t3554\t0\t 0.00%\n",
+      "Mycobacterium ulcerans str. Harvey\t5725\t0\t 0.00%\n",
+      "Neisseria gonorrhoeae (strain ATCC 700825 / FA 1090)\t1538\t0\t 0.00%\n",
+      "Nocardia brasiliensis ATCC 700358\t6924\t0\t 0.00%\n",
+      "Onchocerca volvulus\t8594\t0\t 0.00%\n",
+      "Oryza sativa subsp. japonica\t24200\t17051\t 91.89%\n",
+      "Paracoccidioides lutzii (strain ATCC MYA-826 / Pb01)\t6278\t568\t 72.18%\n",
+      "Plasmodium falciparum (isolate 3D7)\t2687\t0\t 0.00%\n",
+      "Pseudomonas aeruginosa (strain ATCC 15692 / DSM 22644 / CIP 104116 / JCM 14847 / LMG 12228 / 1C / PRS 101 / PAO1)\t4819\t0\t 0.00%\n",
+      "Rattus norvegicus\t18492\t16229\t 98.72%\n",
+      "Saccharomyces cerevisiae (strain ATCC 204508 / S288c)\t4610\t3712\t 88.93%\n",
+      "Salmonella typhimurium (strain LT2 / SGSC1412 / ATCC 700720)\t4158\t0\t 0.00%\n",
+      "Schistosoma mansoni\t9545\t1395\t 92.83%\n",
+      "Schizosaccharomyces pombe (strain 972 / ATCC 24843)\t4274\t2466\t 89.98%\n",
+      "Shigella dysenteriae serotype 1 (strain Sd197)\t3546\t0\t 0.00%\n",
+      "Sporothrix schenckii (strain ATCC 58251 / de Perez 2211183)\t6921\t1442\t 69.97%\n",
+      "Staphylococcus aureus (strain NCTC 8325 / PS 47)\t2087\t0\t 0.00%\n",
+      "Streptococcus pneumoniae (strain ATCC BAA-255 / R6)\t1665\t0\t 0.00%\n",
+      "Strongyloides stercoralis\t9555\t0\t 0.00%\n",
+      "Trichuris trichiura\t7632\t0\t 0.00%\n",
+      "Trypanosoma brucei brucei (strain 927/4 GUTat10.1)\t6509\t5703\t 93.07%\n",
+      "Trypanosoma cruzi (strain CL Brener)\t13704\t11352\t 89.21%\n",
+      "Wuchereria bancrofti\t9467\t0\t 0.00%\n",
+      "Zea mays\t30034\t18601\t 91.83%\n"
+     ]
+    }
+   ],
+   "source": [
+    "for species in np.unique(annot['species']):\n",
+    "    total_proteins, have_og, og_agreement = analyse_species(species, afdb, annot, uniprot_to_eggnog)\n",
+    "    print(f'{species}\\t{total_proteins}\\t{have_og}\\t{og_agreement * 100: .2f}%')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analysis/suppl-struct_seq_agreement.ipynb b/analysis/suppl-struct_seq_agreement.ipynb
index dbddab5cd11cef99492f7d01807e3f2c0632b1c1..5d754cc5ca66391dae1508ad73fa5d9356407a46 100644
--- a/analysis/suppl-struct_seq_agreement.ipynb
+++ b/analysis/suppl-struct_seq_agreement.ipynb
@@ -663,7 +663,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.9.13"
   }
  },
  "nbformat": 4,