From 97404076e5e5201aaa4bc554509cb658213b3c56 Mon Sep 17 00:00:00 2001
From: Fabian Ruperti <fabian.ruperti@embl.de>
Date: Thu, 15 Dec 2022 14:07:06 +0100
Subject: [PATCH] revision GO term comparison

---
 analysis/revision-GO_term_comparison.ipynb | 1185 ++++++++++++++++++++
 1 file changed, 1185 insertions(+)
 create mode 100644 analysis/revision-GO_term_comparison.ipynb

diff --git a/analysis/revision-GO_term_comparison.ipynb b/analysis/revision-GO_term_comparison.ipynb
new file mode 100644
index 0000000..2ca4cc7
--- /dev/null
+++ b/analysis/revision-GO_term_comparison.ipynb
@@ -0,0 +1,1185 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 237,
+   "id": "99e6a23c-f2a3-4369-977e-4d2c74707d0c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2022-12-14 17:26\n"
+     ]
+    }
+   ],
+   "source": [
+    "from datetime import datetime, timezone\n",
+    "import pytz\n",
+    "\n",
+    "utc_dt = datetime.now(timezone.utc) # UTC time\n",
+    "dt = utc_dt.astimezone()\n",
+    "tz = pytz.timezone('Europe/Berlin')\n",
+    "berlin_now = datetime.now(tz)\n",
+    "print(f'{berlin_now:%Y-%m-%d %H:%M}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 238,
+   "id": "a6e66eaf-c7a8-4b26-a99b-5032d49c25a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import glob\n",
+    "import os\n",
+    "\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "from matplotlib_venn import venn2, venn3\n",
+    "from matplotlib import cm\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 239,
+   "id": "0cd74ade-73c8-44f6-9d6e-f0f37bb760d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "structural_annotation = pd.read_parquet('/g/arendt/npapadop/repos/coffe/data/structure_annotation.parquet')\n",
+    "sequence_annotation = pd.read_csv('/g/arendt/npapadop/repos/coffe/data/Slacustris_eggnog.tsv', sep='\\t')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 240,
+   "id": "c3d289af-b99b-4403-af5b-a8fc15f8269f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "annotations_complete = structural_annotation[['protein_id', 'eggNOG_OGs', 'MSA size','alignment length', 'query length', 'seq. id.', 'bit score','plddt', 'complete_protein', \"Preferred_name\", \"Description\", \"GOs\"]].merge(sequence_annotation[['protein_id', 'eggNOG_OGs', 'score', \"Preferred_name\", \"Description\", \"GOs\"]], \n",
+    "                                                                           on='protein_id', suffixes=['_struct', '_seq'], how = 'outer')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d3b9a268-6698-415f-a0c4-4f7e0fd515d8",
+   "metadata": {},
+   "source": [
+    "## Comparison of GO-terms between structural and sequence annotation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 241,
+   "id": "63dd7ec5-8841-4bfe-9e38-db55f67c298d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GO_struct_missing = annotations_complete['GOs_struct'] == '-'\n",
+    "GO_struct_isnan = annotations_complete['GOs_struct'].isnull()\n",
+    "\n",
+    "GO_seq_missing = annotations_complete['GOs_seq'] == '-'\n",
+    "GO_seq_isnan = annotations_complete['GOs_seq'].isnull()\n",
+    "\n",
+    "GO_struct_avail = ~(GO_struct_missing | GO_struct_isnan)\n",
+    "GO_seq_avail = ~(GO_seq_missing | GO_seq_isnan)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 242,
+   "id": "469c03ab-31cf-474c-b6eb-fc9f613015ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "annotations_GOs = annotations_complete[GO_struct_avail & GO_seq_avail]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "34f832e3-b005-464b-9788-07f758239276",
+   "metadata": {},
+   "source": [
+    "Now that I have all cases in which structure and sequence annotations actually produce GO-terms (via EggNOG-mapper), I can compare the overlap between those annotations on the level of GO-terms."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a7f160be-f3e5-4d8a-bf46-833458196906",
+   "metadata": {},
+   "source": [
+    "There will be a couple of different levels:\n",
+    "\n",
+    "    - Complete overlap\n",
+    "    - Partial overlap:\n",
+    "        - Unique GO-terms on both sides\n",
+    "        - All structure GO-terms in sequence GO-terms\n",
+    "        - All sequence GO-terms in structure GO-terms\n",
+    "    - NO overlap at all"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 243,
+   "id": "fe53ea5e-26de-46cf-ab69-44288f1a2b26",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOs = annotations_GOs[['GOs_struct', 'GOs_seq']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 244,
+   "id": "3761bb31-a522-40a0-8e86-c262b2b9e150",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_278/2049958699.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  GOs['GOs_struct'] = GOs['GOs_struct'].str.split(',')\n",
+      "/tmp/ipykernel_278/2049958699.py:2: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  GOs['GOs_seq'] = GOs['GOs_seq'].str.split(',')\n"
+     ]
+    }
+   ],
+   "source": [
+    "GOs['GOs_struct'] = GOs['GOs_struct'].str.split(',')\n",
+    "GOs['GOs_seq'] = GOs['GOs_seq'].str.split(',')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 245,
+   "id": "1b45906f-bd6c-4424-93de-ffbef8fad67e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_overlap(row):\n",
+    "    list1 = row['GOs_struct']\n",
+    "    list2 = row['GOs_seq']\n",
+    "    overlap = set(list1).intersection(list2)\n",
+    "    if len(overlap) == 0:\n",
+    "        return 'no overlap'\n",
+    "    elif len(overlap) == len(list1) and len(overlap) == len(list2):\n",
+    "        return 'complete overlap'\n",
+    "    else:\n",
+    "        # Check if there are unique elements in both lists\n",
+    "        unique1 = set(list1) - overlap\n",
+    "        unique2 = set(list2) - overlap\n",
+    "        if len(unique1) > 0 and len(unique2) > 0:\n",
+    "            return 'partial overlap - unique GOs'\n",
+    "        # Check if list1 contains all elements of list2 and more\n",
+    "        elif len(overlap) == len(list2) and len(overlap) < len(list1):\n",
+    "            return 'partial overlap - structure GOs expanded'\n",
+    "        # Check if list2 contains all elements of list1 and more\n",
+    "        elif len(overlap) == len(list1) and len(overlap) < len(list2):\n",
+    "            return 'partial overlap - sequence GOs expanded'\n",
+    "        # If none of the above, then overlap is partial but not unique or expanded\n",
+    "        else:\n",
+    "            return 'partial'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 246,
+   "id": "90a4a44d-2931-4b87-a7a8-1fe677e8912b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def report_coverage_struct(row):\n",
+    "    list1 = row['GOs_struct']\n",
+    "    list2 = row['GOs_seq']\n",
+    "    overlap = set(list1).intersection(list2)\n",
+    "    return len(overlap)/len(list1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 247,
+   "id": "9c799814-04f9-49d4-9427-873c50e3a2cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def report_coverage_seq(row):\n",
+    "    list1 = row['GOs_struct']\n",
+    "    list2 = row['GOs_seq']\n",
+    "    overlap = set(list1).intersection(list2)\n",
+    "    return len(overlap)/len(list2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 248,
+   "id": "bcc70854-7a7b-4bdc-b59f-c9244f9ffae8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_278/1677403612.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  GOs['overlap'] = GOs.apply(find_overlap, axis=1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "GOs['overlap'] = GOs.apply(find_overlap, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 249,
+   "id": "948c120b-f680-4a9d-ba9e-11ddc15be017",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_278/2972605403.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  GOs['coverage_struct'] = GOs.apply(report_coverage_struct, axis=1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "GOs['coverage_struct'] = GOs.apply(report_coverage_struct, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 250,
+   "id": "ca29c96d-a8da-4c7c-bdf2-5ad1f380c4da",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_278/1789197005.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  GOs['coverage_seq'] = GOs.apply(report_coverage_seq, axis=1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "GOs['coverage_seq'] = GOs.apply(report_coverage_seq, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 251,
+   "id": "9195a8b5-f5e4-4e10-a8e6-fcf190fdd42f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "complete overlap                            7052\n",
+       "partial overlap - unique GOs                2360\n",
+       "partial overlap - sequence GOs expanded     1236\n",
+       "partial overlap - structure GOs expanded     967\n",
+       "no overlap                                    17\n",
+       "Name: overlap, dtype: int64"
+      ]
+     },
+     "execution_count": 251,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "GOs['overlap'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 252,
+   "id": "6a4352ef-1326-4102-9ca6-799217ba6be4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "color_reference = {\n",
+    "    'complete overlap': cm.tab20.colors[0],\n",
+    "    'partial overlap - unique GOs': cm.tab20.colors[18],\n",
+    "    'partial overlap - sequence GOs expanded': cm.tab20.colors[19],\n",
+    "    'partial overlap - structure GOs expanded': cm.tab20.colors[19],\n",
+    "    'no overlap': cm.tab20.colors[16]\n",
+    "}\n",
+    "\n",
+    "patterns = ['...', '', '...', '', '...', '', '', '', '']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 253,
+   "id": "ae8bac54-f10d-4020-8019-6221c13b56b7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 432x288 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "order = ['complete overlap', 'partial overlap - unique GOs',\n",
+    "         'partial overlap - sequence GOs expanded', 'partial overlap - structure GOs expanded',\n",
+    "         'no overlap']\n",
+    "vc = GOs['overlap'].value_counts()[order]\n",
+    "labels = vc.index\n",
+    "sizes = vc.values\n",
+    "colors = [color_reference[i] for i in labels]\n",
+    "# explode = (0.1, 0, 0, 0, 0)\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "piechart = ax.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=False, colors=colors)\n",
+    "ax.axis('equal');\n",
+    "\n",
+    "#plt.savefig('./figures/analysis-sequence_structure_agreement.pdf')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 254,
+   "id": "ab2a9480-aaee-43d3-af23-ca05886b3695",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<AxesSubplot:xlabel='coverage_struct', ylabel='Count'>"
+      ]
+     },
+     "execution_count": 254,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 432x288 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "sns.histplot(GOs[GOs['overlap'] != 'complete overlap']['coverage_struct'], bins=100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 255,
+   "id": "f628c3eb-422c-403d-b267-ec10489171df",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.6454013592969468"
+      ]
+     },
+     "execution_count": 255,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(GOs[GOs['overlap'] != 'complete overlap']['coverage_struct'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1bc8b46c-37e3-46b3-8ca8-e85ab8e47038",
+   "metadata": {},
+   "source": [
+    "Of all proteins whose GO terms do not overlap completely, on average about 65 % of the GO terms in the *structural* annotation are shared with the sequence annotation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 256,
+   "id": "582eaae9-25f8-4be3-99f0-aef7a0c438a0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<AxesSubplot:xlabel='coverage_seq', ylabel='Count'>"
+      ]
+     },
+     "execution_count": 256,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 432x288 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "sns.histplot(GOs[GOs['overlap'] != 'complete overlap']['coverage_seq'], bins=100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 257,
+   "id": "cc844026-2562-4ee2-a107-95a68c627a40",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.6510767765619442"
+      ]
+     },
+     "execution_count": 257,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(GOs[GOs['overlap'] != 'complete overlap']['coverage_seq'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b6659ef1-6b7c-4a5b-b50e-0ff4ab593783",
+   "metadata": {},
+   "source": [
+    "Of all proteins whose GO terms do not overlap completely, on average about 65 % of the GO terms in the *sequence* annotation are shared with the structural annotation."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2531a431-6d03-4c4b-bb22-86ef4755c6ce",
+   "metadata": {},
+   "source": [
+    "## Check semantic similarity of GO terms of sequence-structure protein pairs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4081cb1c-443d-4dad-a32d-a4c28a4c2d94",
+   "metadata": {},
+   "source": [
+    "Create a table that can be used as an input for GOGO (https://www.nature.com/articles/s41598-018-33219-y):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 258,
+   "id": "3d0dbc57-a86a-4898-9bc0-61970b379817",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOs_GOGO = GOs[['GOs_struct', 'GOs_seq']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 259,
+   "id": "c13c5605-d293-4558-a9d1-5551ef57fb1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# iterate over each row and each column\n",
+    "for i, row in GOs_GOGO.iterrows():\n",
+    "    for col in GOs_GOGO.columns:\n",
+    "        # separate the elements in the list with a space\n",
+    "        GOs_GOGO.at[i, col] = \" \".join(str(x) for x in row[col])\n",
+    "        # add the index of the row to the beginning of the list\n",
+    "        GOs_GOGO.at[i, col] = str(i) + \" \" + GOs_GOGO.at[i, col]\n",
+    "\n",
+    "# save the dataframe to a txt file without the header and index\n",
+    "GOs_GOGO.to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/GOGO_input.txt\", sep=\";\", index=False, header=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5958c38f-58b8-4c2e-9fe3-f5ffceba9da0",
+   "metadata": {},
+   "source": [
+    "I ran GOGO locally: `perl gene_pair_comb.pl ~/Desktop/GOGO_input.txt ~/Desktop/GOGO_input_result.txt`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 260,
+   "id": "101e4b9a-e315-406c-8cee-ba1115e55208",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOGO_result = pd.read_csv('/g/arendt/Fabian/PhD/Computational/Spongefold/GOGO_input_result.txt', sep=';', header=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 261,
+   "id": "23985e19-becf-40ed-8047-cbcd3b1258f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOGO_result.rename(columns={0 :'GOs_struct', 1 :'GOs_seq'}, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 262,
+   "id": "854e6e6d-5834-418e-8fcc-24c33846a5ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOGO_result['GOs_struct'] = GOGO_result['GOs_struct'].str.split(' ').str[1:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 263,
+   "id": "60e1b05f-761a-4153-a38f-586143883f34",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOGO_result['GOs_seq'] = GOGO_result['GOs_seq'].str.split(' ').str[1:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 264,
+   "id": "86603707-9eab-4fee-ade7-42077ed24068",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create one new column per GOGO score (BPO, CCO, MFO)\n",
+    "GOGO_result['BPO'] = None\n",
+    "GOGO_result['CCO'] = None\n",
+    "GOGO_result['MFO'] = None\n",
+    "\n",
+    "# iterate over each row in the dataframe\n",
+    "for i, row in GOGO_result.iterrows():\n",
+    "    # read the BPO, CCO and MFO values from positions -5, -3 and -1\n",
+    "    # of the row's 'GOs_seq' list and store them in the new columns\n",
+    "    GOGO_result.at[i, 'BPO'] = row['GOs_seq'][-5]\n",
+    "    GOGO_result.at[i, 'CCO'] = row['GOs_seq'][-3]\n",
+    "    GOGO_result.at[i, 'MFO'] = row['GOs_seq'][-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 265,
+   "id": "fe401867-0ab5-46df-88ba-4b560b00cf7d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GOs_struct</th>\n",
+       "      <th>GOs_seq</th>\n",
+       "      <th>BPO</th>\n",
+       "      <th>CCO</th>\n",
+       "      <th>MFO</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>[GO:0000902, GO:0000904, GO:0001654, GO:000174...</td>\n",
+       "      <td>[GO:0000902, GO:0000904, GO:0001654, GO:000174...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[GO:0003674, GO:0005215]</td>\n",
+       "      <td>[GO:0000166, GO:0003674, GO:0003676, GO:000372...</td>\n",
+       "      <td>NA</td>\n",
+       "      <td>NA</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[GO:0001539, GO:0003674, GO:0003774, GO:000377...</td>\n",
+       "      <td>[GO:0001539, GO:0003674, GO:0003774, GO:000377...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>[GO:0000323, GO:0001959, GO:0002682, GO:000367...</td>\n",
+       "      <td>[GO:0000323, GO:0001959, GO:0002682, GO:000367...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>[GO:0000166, GO:0000323, GO:0001882, GO:000188...</td>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11627</th>\n",
+       "      <td>[GO:0000003, GO:0000165, GO:0000166, GO:000156...</td>\n",
+       "      <td>[GO:0000003, GO:0000165, GO:0001654, GO:000170...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11628</th>\n",
+       "      <td>[GO:0003674, GO:0003676, GO:0003723, GO:000548...</td>\n",
+       "      <td>[GO:0003674, GO:0003676, GO:0003723, GO:000548...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11629</th>\n",
+       "      <td>[GO:0000012, GO:0000166, GO:0000228, GO:000072...</td>\n",
+       "      <td>[GO:0000012, GO:0000166, GO:0000228, GO:000072...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11630</th>\n",
+       "      <td>[GO:0000003, GO:0000578, GO:0001700, GO:000300...</td>\n",
+       "      <td>[GO:0000003, GO:0000578, GO:0001700, GO:000300...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11631</th>\n",
+       "      <td>[GO:0000151, GO:0000209, GO:0000226, GO:000367...</td>\n",
+       "      <td>[GO:0000151, GO:0000209, GO:0003674, GO:000382...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>11632 rows × 5 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              GOs_struct  \\\n",
+       "0      [GO:0000902, GO:0000904, GO:0001654, GO:000174...   \n",
+       "1                               [GO:0003674, GO:0005215]   \n",
+       "2      [GO:0001539, GO:0003674, GO:0003774, GO:000377...   \n",
+       "3      [GO:0000323, GO:0001959, GO:0002682, GO:000367...   \n",
+       "4      [GO:0000166, GO:0000323, GO:0001882, GO:000188...   \n",
+       "...                                                  ...   \n",
+       "11627  [GO:0000003, GO:0000165, GO:0000166, GO:000156...   \n",
+       "11628  [GO:0003674, GO:0003676, GO:0003723, GO:000548...   \n",
+       "11629  [GO:0000012, GO:0000166, GO:0000228, GO:000072...   \n",
+       "11630  [GO:0000003, GO:0000578, GO:0001700, GO:000300...   \n",
+       "11631  [GO:0000151, GO:0000209, GO:0000226, GO:000367...   \n",
+       "\n",
+       "                                                 GOs_seq    BPO    CCO    MFO  \n",
+       "0      [GO:0000902, GO:0000904, GO:0001654, GO:000174...  1.000  1.000  1.000  \n",
+       "1      [GO:0000166, GO:0003674, GO:0003676, GO:000372...     NA     NA  1.000  \n",
+       "2      [GO:0001539, GO:0003674, GO:0003774, GO:000377...  1.000  1.000  1.000  \n",
+       "3      [GO:0000323, GO:0001959, GO:0002682, GO:000367...  1.000  1.000  1.000  \n",
+       "4      [GO:0005575, GO:0005622, GO:0005623, GO:000573...  1.000  1.000  1.000  \n",
+       "...                                                  ...    ...    ...    ...  \n",
+       "11627  [GO:0000003, GO:0000165, GO:0001654, GO:000170...  1.000  1.000  1.000  \n",
+       "11628  [GO:0003674, GO:0003676, GO:0003723, GO:000548...  1.000  1.000  1.000  \n",
+       "11629  [GO:0000012, GO:0000166, GO:0000228, GO:000072...  1.000  1.000  1.000  \n",
+       "11630  [GO:0000003, GO:0000578, GO:0001700, GO:000300...  1.000  1.000  1.000  \n",
+       "11631  [GO:0000151, GO:0000209, GO:0003674, GO:000382...  1.000  1.000  1.000  \n",
+       "\n",
+       "[11632 rows x 5 columns]"
+      ]
+     },
+     "execution_count": 265,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "GOGO_result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 266,
+   "id": "05de5391-becd-4500-bac2-450632bbf482",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(1.000    11161\n",
+       " NA         471\n",
+       " Name: BPO, dtype: int64,\n",
+       " 1.000    11159\n",
+       " NA         473\n",
+       " Name: CCO, dtype: int64,\n",
+       " 1.000    10090\n",
+       " NA        1542\n",
+       " Name: MFO, dtype: int64)"
+      ]
+     },
+     "execution_count": 266,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "GOGO_result['BPO'].value_counts(),GOGO_result['CCO'].value_counts(),GOGO_result['MFO'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3582746f-d5cb-4bd4-9163-7da1b4889d70",
+   "metadata": {},
+   "source": [
+    "This output is a little weird and concerning at first. In all cases where a semantic similarity (Biological Process, Cellular Component, Molecular Function) could be calculated, the score is 1.000. How is this possible when we have a few cases where there is no overlap of GO terms whatsoever?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 267,
+   "id": "a60e24f5-45dd-4938-acfa-901fcfda11d2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GOs_struct</th>\n",
+       "      <th>GOs_seq</th>\n",
+       "      <th>overlap</th>\n",
+       "      <th>coverage_struct</th>\n",
+       "      <th>coverage_seq</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>737</th>\n",
+       "      <td>[GO:0000322, GO:0000323, GO:0000324, GO:000032...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1341</th>\n",
+       "      <td>[GO:0003197, GO:0003205, GO:0003279, GO:000727...</td>\n",
+       "      <td>[GO:0005575, GO:0005576, GO:0005615, GO:000562...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1715</th>\n",
+       "      <td>[GO:0000166, GO:0003674, GO:0003824, GO:000548...</td>\n",
+       "      <td>[GO:0005575, GO:0005618, GO:0005622, GO:000562...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2818</th>\n",
+       "      <td>[GO:0005575, GO:0005623, GO:0005886, GO:001602...</td>\n",
+       "      <td>[GO:0008150, GO:0009966, GO:0009967, GO:001064...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4012</th>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515, GO:001989...</td>\n",
+       "      <td>[GO:0000139, GO:0005575, GO:0005622, GO:000562...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4937</th>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515, GO:001990...</td>\n",
+       "      <td>[GO:0000228, GO:0000785, GO:0000790, GO:000557...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14652</th>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:0044464]</td>\n",
+       "      <td>[GO:0007275, GO:0007399, GO:0008150, GO:000998...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20744</th>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n",
+       "      <td>[GO:0003674, GO:0003824, GO:0003964, GO:000613...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21097</th>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n",
+       "      <td>[GO:0003674, GO:0003824, GO:0003924, GO:000548...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27083</th>\n",
+       "      <td>[GO:0003674, GO:0003824, GO:0006629, GO:000815...</td>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28309</th>\n",
+       "      <td>[GO:0000075, GO:0000077, GO:0000278, GO:000028...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35725</th>\n",
+       "      <td>[GO:0003008, GO:0005575, GO:0005623, GO:000727...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515, GO:0005516]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>36403</th>\n",
+       "      <td>[GO:0003674, GO:0003779, GO:0005488, GO:000551...</td>\n",
+       "      <td>[GO:0006355, GO:0008150, GO:0009889, GO:000989...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38385</th>\n",
+       "      <td>[GO:0007154, GO:0007267, GO:0008150, GO:000960...</td>\n",
+       "      <td>[GO:0005575, GO:0005576]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38735</th>\n",
+       "      <td>[GO:0000322, GO:0000323, GO:0000324, GO:000032...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39987</th>\n",
+       "      <td>[GO:0000322, GO:0000323, GO:0000324, GO:000032...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>41192</th>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:000563...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515, GO:0042802]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              GOs_struct  \\\n",
+       "737    [GO:0000322, GO:0000323, GO:0000324, GO:000032...   \n",
+       "1341   [GO:0003197, GO:0003205, GO:0003279, GO:000727...   \n",
+       "1715   [GO:0000166, GO:0003674, GO:0003824, GO:000548...   \n",
+       "2818   [GO:0005575, GO:0005623, GO:0005886, GO:001602...   \n",
+       "4012   [GO:0003674, GO:0005488, GO:0005515, GO:001989...   \n",
+       "4937   [GO:0003674, GO:0005488, GO:0005515, GO:001990...   \n",
+       "14652   [GO:0005575, GO:0005622, GO:0005623, GO:0044464]   \n",
+       "20744  [GO:0005575, GO:0005622, GO:0005623, GO:000573...   \n",
+       "21097  [GO:0005575, GO:0005622, GO:0005623, GO:000573...   \n",
+       "27083  [GO:0003674, GO:0003824, GO:0006629, GO:000815...   \n",
+       "28309  [GO:0000075, GO:0000077, GO:0000278, GO:000028...   \n",
+       "35725  [GO:0003008, GO:0005575, GO:0005623, GO:000727...   \n",
+       "36403  [GO:0003674, GO:0003779, GO:0005488, GO:000551...   \n",
+       "38385  [GO:0007154, GO:0007267, GO:0008150, GO:000960...   \n",
+       "38735  [GO:0000322, GO:0000323, GO:0000324, GO:000032...   \n",
+       "39987  [GO:0000322, GO:0000323, GO:0000324, GO:000032...   \n",
+       "41192  [GO:0005575, GO:0005622, GO:0005623, GO:000563...   \n",
+       "\n",
+       "                                                 GOs_seq     overlap  \\\n",
+       "737                 [GO:0003674, GO:0005488, GO:0005515]  no overlap   \n",
+       "1341   [GO:0005575, GO:0005576, GO:0005615, GO:000562...  no overlap   \n",
+       "1715   [GO:0005575, GO:0005618, GO:0005622, GO:000562...  no overlap   \n",
+       "2818   [GO:0008150, GO:0009966, GO:0009967, GO:001064...  no overlap   \n",
+       "4012   [GO:0000139, GO:0005575, GO:0005622, GO:000562...  no overlap   \n",
+       "4937   [GO:0000228, GO:0000785, GO:0000790, GO:000557...  no overlap   \n",
+       "14652  [GO:0007275, GO:0007399, GO:0008150, GO:000998...  no overlap   \n",
+       "20744  [GO:0003674, GO:0003824, GO:0003964, GO:000613...  no overlap   \n",
+       "21097  [GO:0003674, GO:0003824, GO:0003924, GO:000548...  no overlap   \n",
+       "27083  [GO:0005575, GO:0005622, GO:0005623, GO:000573...  no overlap   \n",
+       "28309               [GO:0003674, GO:0005488, GO:0005515]  no overlap   \n",
+       "35725   [GO:0003674, GO:0005488, GO:0005515, GO:0005516]  no overlap   \n",
+       "36403  [GO:0006355, GO:0008150, GO:0009889, GO:000989...  no overlap   \n",
+       "38385                           [GO:0005575, GO:0005576]  no overlap   \n",
+       "38735               [GO:0003674, GO:0005488, GO:0005515]  no overlap   \n",
+       "39987               [GO:0003674, GO:0005488, GO:0005515]  no overlap   \n",
+       "41192   [GO:0003674, GO:0005488, GO:0005515, GO:0042802]  no overlap   \n",
+       "\n",
+       "       coverage_struct  coverage_seq  \n",
+       "737                0.0           0.0  \n",
+       "1341               0.0           0.0  \n",
+       "1715               0.0           0.0  \n",
+       "2818               0.0           0.0  \n",
+       "4012               0.0           0.0  \n",
+       "4937               0.0           0.0  \n",
+       "14652              0.0           0.0  \n",
+       "20744              0.0           0.0  \n",
+       "21097              0.0           0.0  \n",
+       "27083              0.0           0.0  \n",
+       "28309              0.0           0.0  \n",
+       "35725              0.0           0.0  \n",
+       "36403              0.0           0.0  \n",
+       "38385              0.0           0.0  \n",
+       "38735              0.0           0.0  \n",
+       "39987              0.0           0.0  \n",
+       "41192              0.0           0.0  "
+      ]
+     },
+     "execution_count": 267,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "GOs[GOs['overlap'] == 'no overlap']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d38d20e1-6644-4215-998d-6ba2213fcd2c",
+   "metadata": {},
+   "source": [
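+    "Looking more closely at these cases, one notices that the same generic GO terms recur throughout, such as GO:0005575 (cellular_component root) or GO:0003674 (molecular_function root). This happens on both sides, i.e. in the structure-based (GOs_struct) and the sequence-based (GOs_seq) annotations. There is something weird about it. Niko, please have a look!"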
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 268,
+   "id": "1ff24373-d621-4e76-8c2b-3318a6f17577",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "protein_id                                                           63159\n",
+       "eggNOG_OGs_struct        COG5096@1|root,KOG1061@2759|Eukaryota,37I5T@33...\n",
+       "MSA size                                                            4182.0\n",
+       "alignment length                                                     841.0\n",
+       "query length                                                         790.0\n",
+       "seq. id.                                                             0.361\n",
+       "bit score                                                           2903.0\n",
+       "plddt                                                            82.705063\n",
+       "complete_protein                                                     False\n",
+       "Preferred_name_struct                                                    -\n",
+       "Description_struct       Subunit of clathrin-associated adaptor protein...\n",
+       "GOs_struct               GO:0003674,GO:0005488,GO:0005515,GO:0019899,GO...\n",
+       "eggNOG_OGs_seq           COG5096@1|root,KOG1061@2759|Eukaryota,38E7X@33...\n",
+       "score                                                                733.0\n",
+       "Preferred_name_seq                                                   AP4B1\n",
+       "Description_seq                                           clathrin binding\n",
+       "GOs_seq                  GO:0000139,GO:0005575,GO:0005622,GO:0005623,GO...\n",
+       "Name: 4012, dtype: object"
+      ]
+     },
+     "execution_count": 268,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "annotations_complete.iloc[4012]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "035867d4-3369-4e99-aaed-a99c2994e468",
+   "metadata": {},
+   "source": [
+    "This is an example in which the GO terms do not overlap. However, the descriptions are very similar and the OGs are identical up to and including the Eukaryota level."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 270,
+   "id": "34504af3-0021-4b2f-a856-e642ece7c88a",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "TypeError",
+     "evalue": "__init__() missing 2 required positional arguments: 'go2obj' and 'annots'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m/tmp/ipykernel_278/2809043968.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;31m# Create a TermCounts object to store information about the GO terms\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mterm_counts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTermCounts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0;31m# Calculate the information content of each GO term in the lists\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mTypeError\u001b[0m: __init__() missing 2 required positional arguments: 'go2obj' and 'annots'"
+     ]
+    }
+   ],
+   "source": [
+    "from goatools.semantic import TermCounts, get_info_content\n",
+    "\n",
+    "# Define your two lists of GO terms\n",
+    "list1 = ['GO:0000001', 'GO:0000002', 'GO:0000003']\n",
+    "list2 = ['GO:0000004', 'GO:0000005', 'GO:0000006']\n",
+    "\n",
+    "# Create a TermCounts object to store information about the GO terms\n",
+    "term_counts = TermCounts()\n",
+    "\n",
+    "# Calculate the information content of each GO term in the lists\n",
+    "ic1 = [get_info_content(go, term_counts) for go in list1]\n",
+    "ic2 = [get_info_content(go, term_counts) for go in list2]\n",
+    "\n",
+    "# Calculate the semantic similarity between the two lists of GO terms using the Resnik measure\n",
+    "similarity = sum([ic1[i] * ic2[i] for i in range(len(list1))])\n",
+    "\n",
+    "# Print the semantic similarity score\n",
+    "print(similarity)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f388fa0c-5b4d-49ce-9b48-386d3f33acfe",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab