diff --git a/analysis/review-proteome_coverage.ipynb b/analysis/revision-proteome_coverage.ipynb
similarity index 100%
rename from analysis/review-proteome_coverage.ipynb
rename to analysis/revision-proteome_coverage.ipynb
diff --git a/analysis/revision-remote_species.ipynb b/analysis/revision-remote_species.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..3de2f255d2ee3a5081aeb969efbacee500475b9d
--- /dev/null
+++ b/analysis/revision-remote_species.ipynb
@@ -0,0 +1,394 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "56159bf4-7e59-4416-a5ed-8054179cd39d",
+   "metadata": {},
+   "source": [
+    "How good is protein structure at finding conserved function? To find out, we tried to compare enzyme function between distantly related species, to see how often function was correctly annotated by structural similarity. In particular, we were interested in cases where orthology via sequence similarity was no longer detectable but the annotated protein function was still similar. Enzymes are an interesting test case, since they have an easily accessible function description (EC number).\n",
+    "\n",
+    "The detailed strategy I am going to pursue here:\n",
+    "\n",
+    "1. take the proteome of a species distantly related to human (yeast, arabidopsis)\n",
+    "2. search it with Foldseek against the AlphaFold proteomes database (`Alphafold/Proteome`)\n",
+    "3. keep only significant human hits\n",
+    "4. remove query/target pairs that clearly share evolutionary history (same root orthogroup/same most specific orthogroup)\n",
+    "5. for all remaining cases: only keep enzymes (valid EC is available)\n",
+    "6. calculate how many digits of the EC overlap\n",
+    "7. plot the results\n",
+    "\n",
+    "```\n",
+    "# download Alphafold proteomes:\n",
+    "\n",
+    "> foldseek databases Alphafold/Proteome proteomes tmp/\n",
+    "\n",
+    "# download and untar yeast protein structures, then:\n",
+    "\n",
+    "> foldseek createdb UP000002311_559292_YEAST_v4/*.pdb.gz yeast\n",
+    "> foldseek search yeast proteomes annot_yeast tmp\n",
+    "> foldseek convertalis yeast proteomes annot_yeast yeast.m8\n",
+    "```\n",
+    "\n",
+    "(same for _Arabidopsis_)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d46b79e8-e6a3-4503-89a4-687527ce3256",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "from matplotlib import pyplot as plt\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "11891bb5-7525-46ac-b944-49464fa4e5dd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def keep_term(row, term=\"root\"):\n",
+    "    \"\"\"\n",
+    "    Pick one EggNOG orthogroup out of a comma-separated list.\n",
+    "\n",
+    "    `row` is split on commas and the first entry that contains `term`\n",
+    "    as a substring is returned; None is returned when no entry matches.\n",
+    "    \"\"\"\n",
+    "    groups = np.array(row.split(','))\n",
+    "    matches = np.fromiter((term in og for og in groups), dtype=bool, count=len(groups))\n",
+    "    hits = groups[matches]\n",
+    "    if len(hits) == 0:\n",
+    "        return None\n",
+    "    return hits[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8d4c0fd8-6c3c-4e8f-8e3d-36b4abc2a40a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compare_EC(x, slot1=\"query_EC\", slot2=\"target_EC\"):\n",
+    "    \"\"\"\n",
+    "    Count the leading EC levels shared by two annotations.\n",
+    "\n",
+    "    EC numbers are hierarchical (class.subclass.sub-subclass.serial), so two\n",
+    "    numbers only agree down to the first level at which they differ. Levels\n",
+    "    are therefore compared from the left and counting stops at the first\n",
+    "    mismatch or at an unspecified level (\"-\"); a coincidence at a deeper\n",
+    "    level below a mismatch is not counted.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    x : mapping (e.g. a DataFrame row)\n",
+    "        Must provide `x[slot1]` and `x[slot2]`, each a comma-separated\n",
+    "        string of EC numbers.\n",
+    "    slot1, slot2 : str\n",
+    "        Keys of the two EC annotations to compare.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    int\n",
+    "        The largest number of shared leading levels over all pairs of\n",
+    "        EC numbers from the two annotations.\n",
+    "    \"\"\"\n",
+    "    def shared_levels(ec_a, ec_b):\n",
+    "        # length of the common prefix of two dot-separated EC numbers\n",
+    "        n = 0\n",
+    "        for a, b in zip(ec_a.split(\".\"), ec_b.split(\".\")):\n",
+    "            if a != b or a == \"-\":\n",
+    "                break\n",
+    "            n += 1\n",
+    "        return n\n",
+    "\n",
+    "    return max(\n",
+    "        (shared_levels(q, t) for q in x[slot1].split(\",\") for t in x[slot2].split(\",\")),\n",
+    "        default=0,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "7fa6da37-7faf-404a-87cd-0d690a9a4ea6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def same_structure_same_EC(query_annot, query_foldseek, target_annot, lowest_relevant_OG=\"Opisthokonta\"):\n",
+    "    \"\"\"\n",
+    "    Plot structural similarity against EC agreement for unrelated best hits.\n",
+    "\n",
+    "    For every query, the lowest-e-value Foldseek hit whose target is listed in\n",
+    "    `target_annot` is kept. Pairs sharing the EggNOG root orthogroup or the\n",
+    "    `lowest_relevant_OG` orthogroup are dropped, as are pairs where either\n",
+    "    side has no EC number (missing or \"-\"). The remaining pairs are shown as\n",
+    "    a boxplot of bit score per aligned position, grouped by the EC overlap\n",
+    "    returned by `compare_EC`; the group sizes are written into the plot.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    query_annot, target_annot : pandas.DataFrame\n",
+    "        Annotation tables; the columns \"id\", \"eggNOG_OGs\", \"Preferred_name\",\n",
+    "        \"GOs\" and \"EC\" are read.\n",
+    "    query_foldseek : pandas.DataFrame\n",
+    "        Foldseek hits; the columns \"query\", \"target\", \"e value\", \"bit score\"\n",
+    "        and \"alignment length\" are read.\n",
+    "    lowest_relevant_OG : str\n",
+    "        Term that identifies the orthogroup level used for the ortholog test.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    None\n",
+    "        The function only draws a figure.\n",
+    "    \"\"\"\n",
+    "    informative_columns = [\"eggNOG_OGs\", \"Preferred_name\", \"GOs\", \"EC\"]\n",
+    "\n",
+    "    # best (lowest e-value) hit per query among the annotated targets\n",
+    "    in_target = query_foldseek[\"target\"].isin(target_annot[\"id\"])\n",
+    "    best = query_foldseek[in_target].sort_values(\"e value\").drop_duplicates(\"query\")\n",
+    "\n",
+    "    # normalise by alignment length so long alignments do not dominate\n",
+    "    best[\"corrected bit score\"] = best[\"bit score\"] / best[\"alignment length\"]\n",
+    "    best = best[[\"query\", \"target\", \"corrected bit score\"]]\n",
+    "\n",
+    "    query = query_annot.set_index(\"id\").loc[best[\"query\"]][informative_columns]\n",
+    "    query.columns = \"query_\" + query.columns\n",
+    "\n",
+    "    target = target_annot.set_index(\"id\").loc[best[\"target\"]][informative_columns]\n",
+    "    target.columns = \"target_\" + target.columns\n",
+    "\n",
+    "    # target rows are in the same order as `best`, so they are joined by position;\n",
+    "    # query rows are joined by ID\n",
+    "    best = best.reset_index(drop=True).join(target.reset_index(drop=True))\n",
+    "    best = best.join(query, on=\"query\")\n",
+    "\n",
+    "    best['query_root'] = best['query_eggNOG_OGs'].apply(keep_term, term=\"root\")\n",
+    "    best['target_root'] = best['target_eggNOG_OGs'].apply(keep_term, term=\"root\")\n",
+    "    best['query_op'] = best['query_eggNOG_OGs'].apply(keep_term, term=lowest_relevant_OG)\n",
+    "    best['target_op'] = best['target_eggNOG_OGs'].apply(keep_term, term=lowest_relevant_OG)\n",
+    "\n",
+    "    best[\"homolog\"] = best[\"query_root\"] == best[\"target_root\"]\n",
+    "    best[\"ortholog\"] = best[\"query_op\"] == best[\"target_op\"]\n",
+    "\n",
+    "    no_homology_whatsoever = ~best[\"homolog\"] & ~best[\"ortholog\"]\n",
+    "    unrelated = best[no_homology_whatsoever].reset_index(drop=True)\n",
+    "\n",
+    "    # \"-\" marks an absent EC number. Filter with explicit masks: a chained\n",
+    "    # in-place replace(\"-\", None) forward-fills on old pandas versions and\n",
+    "    # is not written back to the frame under Copy-on-Write.\n",
+    "    target_has_EC = unrelated[\"target_EC\"].notna() & (unrelated[\"target_EC\"] != \"-\")\n",
+    "    query_has_EC = unrelated[\"query_EC\"].notna() & (unrelated[\"query_EC\"] != \"-\")\n",
+    "\n",
+    "    unrelated_with_EC = unrelated[target_has_EC & query_has_EC].reset_index(drop=True).copy()\n",
+    "\n",
+    "    unrelated_with_EC[\"ec overlap\"] = unrelated_with_EC.apply(compare_EC, axis=1)\n",
+    "\n",
+    "    fig, ax = plt.subplots()\n",
+    "    sns.boxplot(data=unrelated_with_EC, y='corrected bit score', x='ec overlap', ax=ax, fliersize=0, whis=[5, 95])\n",
+    "    ax.set_ylim(0, 5)\n",
+    "\n",
+    "    # annotate every box with the number of pairs it summarises\n",
+    "    counts = unrelated_with_EC.groupby('ec overlap').size()\n",
+    "    for i, c in enumerate(counts.values):\n",
+    "        ax.text(i-0.1, 4., str(c), fontsize='x-large')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "22f00eeb-722e-4820-a797-18e76e50de82",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/md/d6lwwbv97xb6g6ddypntnprh0000gp/T/ipykernel_488/2997960917.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.\n",
+      "  yeast_annot = pd.read_csv(\"../data/eggnog/S_cerevisiae_annotations.tsv\", sep=\"\\t\", skiprows=4, skipfooter=3)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# skipfooter is only implemented in the python parser; request it explicitly\n",
+    "# rather than relying on the fallback that emits a ParserWarning.\n",
+    "yeast_annot = pd.read_csv(\"../data/eggnog/S_cerevisiae_annotations.tsv\", sep=\"\\t\", skiprows=4, skipfooter=3, engine=\"python\")\n",
+    "# the ID used for matching is the second \"|\"-separated field of the query name\n",
+    "yeast_annot[\"id\"] = yeast_annot[\"#query\"].str.split(\"|\").str[1]\n",
+    "\n",
+    "# NOTE(review): absolute, machine-specific path - adjust before re-running elsewhere.\n",
+    "yeast_foldseek = pd.read_csv(\"/Users/npapadop/Documents/data/foldseek/yeast.m8\", sep=\"\\t\", header=None)\n",
+    "yeast_foldseek.columns = [\"query\", \"target\", \"seq. id.\", \"alignment length\", \"no. mismatches\",\n",
+    "                       \"no. gap open\", \"query start\", \"query end\", \"target start\", \"target end\",\n",
+    "                       \"e value\", \"bit score\"]\n",
+    "# keep the second \"-\"-separated field of the structure names as protein ID\n",
+    "yeast_foldseek[\"query\"] = yeast_foldseek[\"query\"].str.split(\"-\").str[1]\n",
+    "yeast_foldseek[\"target\"] = yeast_foldseek[\"target\"].str.split(\"-\").str[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "d8ad897d-3c5e-4ab5-b79d-c0ca22af20fd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(5994, 5537, 5520)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "foldseek_ids = yeast_foldseek[\"query\"].unique()\n",
+    "annotated_ids = yeast_annot[\"id\"].unique()\n",
+    "# IDs present in both tables; `c` is read again by the filtering cell further down\n",
+    "c = np.intersect1d(foldseek_ids, annotated_ids)\n",
+    "\n",
+    "len(foldseek_ids), len(annotated_ids), len(c)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4437ebdf-5878-43ce-8e7a-f21c77bc6277",
+   "metadata": {},
+   "source": [
+    "We don't want to get into quibbly territory, so we will proceed with the IDs that are present in both lists and will not investigate why the two lists do not match 100%."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f57728f2-f505-4195-9df1-a606ec65f40d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# restrict both tables to the shared IDs in `c`\n",
+    "yeast_annot = yeast_annot.loc[yeast_annot[\"id\"].isin(c)]\n",
+    "yeast_foldseek = yeast_foldseek.loc[yeast_foldseek[\"query\"].isin(c)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "d4a421da-3d5d-47f3-90eb-f77441d929d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/md/d6lwwbv97xb6g6ddypntnprh0000gp/T/ipykernel_488/2273340934.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.\n",
+      "  human_annot = pd.read_csv(\"../data/eggnog/H_sapiens_annotations.tsv\", sep=\"\\t\", skiprows=4, skipfooter=3)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# skipfooter requires the python parser; ask for it explicitly to avoid the ParserWarning.\n",
+    "human_annot = pd.read_csv(\"../data/eggnog/H_sapiens_annotations.tsv\", sep=\"\\t\", skiprows=4, skipfooter=3, engine=\"python\")\n",
+    "# the ID used for matching is the second \"|\"-separated field of the query name\n",
+    "human_annot[\"id\"] = human_annot[\"#query\"].str.split(\"|\").str[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "f7f4835c-03b8-4117-b494-387c570e5c12",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# yeast vs. human; orthogroups are compared at the function's default level (Opisthokonta)\n",
+    "same_structure_same_EC(yeast_annot, yeast_foldseek, human_annot)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ff056da6-2246-41c0-85cd-01eda6589dda",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/md/d6lwwbv97xb6g6ddypntnprh0000gp/T/ipykernel_488/2842444695.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.\n",
+      "  arabidopsis_annot = pd.read_csv(\"../data/eggnog/A_thaliana_annotations.tsv\", sep=\"\\t\", skiprows=4, skipfooter=3)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# skipfooter is only implemented in the python parser; request it explicitly\n",
+    "# rather than relying on the fallback that emits a ParserWarning.\n",
+    "arabidopsis_annot = pd.read_csv(\"../data/eggnog/A_thaliana_annotations.tsv\", sep=\"\\t\", skiprows=4, skipfooter=3, engine=\"python\")\n",
+    "# the ID used for matching is the second \"|\"-separated field of the query name\n",
+    "arabidopsis_annot[\"id\"] = arabidopsis_annot[\"#query\"].str.split(\"|\").str[1]\n",
+    "\n",
+    "# NOTE(review): absolute, machine-specific path - adjust before re-running elsewhere.\n",
+    "arabidopsis_foldseek = pd.read_csv(\"/Users/npapadop/Documents/data/foldseek/arabidopsis.m8\", sep=\"\\t\", header=None)\n",
+    "arabidopsis_foldseek.columns = [\"query\", \"target\", \"seq. id.\", \"alignment length\", \"no. mismatches\",\n",
+    "                       \"no. gap open\", \"query start\", \"query end\", \"target start\", \"target end\",\n",
+    "                       \"e value\", \"bit score\"]\n",
+    "# keep the second \"-\"-separated field of the structure names as protein ID\n",
+    "arabidopsis_foldseek[\"query\"] = arabidopsis_foldseek[\"query\"].str.split(\"-\").str[1]\n",
+    "arabidopsis_foldseek[\"target\"] = arabidopsis_foldseek[\"target\"].str.split(\"-\").str[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "cfd46099-627e-4b12-aee3-5979914eaff7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(27298, 25145, 25071)"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "foldseek_ids = arabidopsis_foldseek[\"query\"].unique()\n",
+    "annotated_ids = arabidopsis_annot[\"id\"].unique()\n",
+    "# IDs present in both tables; `c` is read again by the filtering cell further down\n",
+    "c = np.intersect1d(foldseek_ids, annotated_ids)\n",
+    "\n",
+    "len(foldseek_ids), len(annotated_ids), len(c)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "21080634-fbcd-4e68-9dbf-e1589e3dfe41",
+   "metadata": {},
+   "source": [
+    "We don't want to get into quibbly territory, so we will proceed with the IDs that are present in both lists and will not investigate why the two lists do not match 100%."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "007206af-b083-45d1-9132-5d36b6d3f8aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# restrict both tables to the shared IDs in `c`\n",
+    "arabidopsis_annot = arabidopsis_annot.loc[arabidopsis_annot[\"id\"].isin(c)]\n",
+    "arabidopsis_foldseek = arabidopsis_foldseek.loc[arabidopsis_foldseek[\"query\"].isin(c)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "a5fbe0b2-1545-410a-9416-053cab1514cc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Arabidopsis vs. human; orthogroups are compared at the Eukaryota level\n",
+    "same_structure_same_EC(arabidopsis_annot, arabidopsis_foldseek, human_annot, lowest_relevant_OG=\"Eukaryota\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48d39bb0-ed6d-4aea-9235-374a37592fbb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}