From 97404076e5e5201aaa4bc554509cb658213b3c56 Mon Sep 17 00:00:00 2001
From: Fabian Ruperti <fabian.ruperti@embl.de>
Date: Thu, 15 Dec 2022 14:07:06 +0100
Subject: [PATCH] revision GO term comparison

---
 analysis/revision-GO_term_comparison.ipynb | 1185 ++++++++++++++++++++
 1 file changed, 1185 insertions(+)
 create mode 100644 analysis/revision-GO_term_comparison.ipynb

diff --git a/analysis/revision-GO_term_comparison.ipynb b/analysis/revision-GO_term_comparison.ipynb
new file mode 100644
index 0000000..2ca4cc7
--- /dev/null
+++ b/analysis/revision-GO_term_comparison.ipynb
@@ -0,0 +1,1185 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 237,
+   "id": "99e6a23c-f2a3-4369-977e-4d2c74707d0c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2022-12-14 17:26\n"
+     ]
+    }
+   ],
+   "source": [
+    "from datetime import datetime, timezone\n",
+    "import pytz\n",
+    "\n",
+    "utc_dt = datetime.now(timezone.utc) # UTC time\n",
+    "dt = utc_dt.astimezone()\n",
+    "tz = pytz.timezone('Europe/Berlin')\n",
+    "berlin_now = datetime.now(tz)\n",
+    "print(f'{berlin_now:%Y-%m-%d %H:%M}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 238,
+   "id": "a6e66eaf-c7a8-4b26-a99b-5032d49c25a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import glob\n",
+    "import os\n",
+    "\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "from matplotlib_venn import venn2, venn3\n",
+    "from matplotlib import cm\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 239,
+   "id": "0cd74ade-73c8-44f6-9d6e-f0f37bb760d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "structural_annotation = pd.read_parquet('/g/arendt/npapadop/repos/coffe/data/structure_annotation.parquet')\n",
+    "sequence_annotation = pd.read_csv('/g/arendt/npapadop/repos/coffe/data/Slacustris_eggnog.tsv', sep='\\t')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 240,
+   "id": "c3d289af-b99b-4403-af5b-a8fc15f8269f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "annotations_complete = structural_annotation[['protein_id', 'eggNOG_OGs', 'MSA size','alignment length', 'query length', 'seq. id.', 'bit score','plddt', 'complete_protein', \"Preferred_name\", \"Description\", \"GOs\"]].merge(sequence_annotation[['protein_id', 'eggNOG_OGs', 'score', \"Preferred_name\", \"Description\", \"GOs\"]], \n",
+    "                                                                           on='protein_id', suffixes=['_struct', '_seq'], how = 'outer')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d3b9a268-6698-415f-a0c4-4f7e0fd515d8",
+   "metadata": {},
+   "source": [
+    "## Comparison of GO-terms between structural and sequence annotation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 241,
+   "id": "63dd7ec5-8841-4bfe-9e38-db55f67c298d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GO_struct_missing = annotations_complete['GOs_struct'] == '-'\n",
+    "GO_struct_isnan = annotations_complete['GOs_struct'].isnull()\n",
+    "\n",
+    "GO_seq_missing = annotations_complete['GOs_seq'] == '-'\n",
+    "GO_seq_isnan = annotations_complete['GOs_seq'].isnull()\n",
+    "\n",
+    "GO_struct_avail = ~(GO_struct_missing | GO_struct_isnan)\n",
+    "GO_seq_avail = ~(GO_seq_missing | GO_seq_isnan)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 242,
+   "id": "469c03ab-31cf-474c-b6eb-fc9f613015ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "annotations_GOs = annotations_complete[GO_struct_avail & GO_seq_avail]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "34f832e3-b005-464b-9788-07f758239276",
+   "metadata": {},
+   "source": [
+    "Now that I have all cases in which both the structure-based and the sequence-based annotation actually produce GO-terms (via EggNOG-mapper), I can compare the two annotations at the level of GO-terms."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a7f160be-f3e5-4d8a-bf46-833458196906",
+   "metadata": {},
+   "source": [
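+    "Each protein falls into one of the following overlap categories:\n",
+    "\n",
+    "- Complete overlap: both annotations yield exactly the same GO-terms\n",
+    "- Partial overlap:\n",
+    "    - Unique GO-terms on both sides\n",
+    "    - All structure GO-terms are contained in the sequence GO-terms (the sequence annotation is a superset)\n",
+    "    - All sequence GO-terms are contained in the structure GO-terms (the structure annotation is a superset)\n",
+    "- No overlap at all"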
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 243,
+   "id": "fe53ea5e-26de-46cf-ab69-44288f1a2b26",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOs = annotations_GOs[['GOs_struct', 'GOs_seq']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 244,
+   "id": "3761bb31-a522-40a0-8e86-c262b2b9e150",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_278/2049958699.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  GOs['GOs_struct'] = GOs['GOs_struct'].str.split(',')\n",
+      "/tmp/ipykernel_278/2049958699.py:2: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  GOs['GOs_seq'] = GOs['GOs_seq'].str.split(',')\n"
+     ]
+    }
+   ],
+   "source": [
+    "GOs['GOs_struct'] = GOs['GOs_struct'].str.split(',')\n",
+    "GOs['GOs_seq'] = GOs['GOs_seq'].str.split(',')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 245,
+   "id": "1b45906f-bd6c-4424-93de-ffbef8fad67e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_overlap(row):\n",
+    "    list1 = row['GOs_struct']\n",
+    "    list2 = row['GOs_seq']\n",
+    "    overlap = set(list1).intersection(list2)\n",
+    "    if len(overlap) == 0:\n",
+    "        return 'no overlap'\n",
+    "    elif len(overlap) == len(list1) and len(overlap) == len(list2):\n",
+    "        return 'complete overlap'\n",
+    "    else:\n",
+    "        # Check if there are unique elements in both lists\n",
+    "        unique1 = set(list1) - overlap\n",
+    "        unique2 = set(list2) - overlap\n",
+    "        if len(unique1) > 0 and len(unique2) > 0:\n",
+    "            return 'partial overlap - unique GOs'\n",
+    "        # Check if list1 contains all elements of list2 and more\n",
+    "        elif len(overlap) == len(list2) and len(overlap) < len(list1):\n",
+    "            return 'partial overlap - structure GOs expanded'\n",
+    "        # Check if list2 contains all elements of list1 and more\n",
+    "        elif len(overlap) == len(list1) and len(overlap) < len(list2):\n",
+    "            return 'partial overlap - sequence GOs expanded'\n",
+    "        # Fallback: only reachable when a list contains duplicate GO-terms, because list lengths (not set sizes) are compared above\n",
+    "        else:\n",
+    "            return 'partial'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 246,
+   "id": "90a4a44d-2931-4b87-a7a8-1fe677e8912b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def report_coverage_struct(row):\n",
+    "    list1 = row['GOs_struct']\n",
+    "    list2 = row['GOs_seq']\n",
+    "    overlap = set(list1).intersection(list2)\n",
+    "    return len(overlap)/len(list1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 247,
+   "id": "9c799814-04f9-49d4-9427-873c50e3a2cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def report_coverage_seq(row):\n",
+    "    list1 = row['GOs_struct']\n",
+    "    list2 = row['GOs_seq']\n",
+    "    overlap = set(list1).intersection(list2)\n",
+    "    return len(overlap)/len(list2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 248,
+   "id": "bcc70854-7a7b-4bdc-b59f-c9244f9ffae8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_278/1677403612.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  GOs['overlap'] = GOs.apply(find_overlap, axis=1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "GOs['overlap'] = GOs.apply(find_overlap, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 249,
+   "id": "948c120b-f680-4a9d-ba9e-11ddc15be017",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_278/2972605403.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  GOs['coverage_struct'] = GOs.apply(report_coverage_struct, axis=1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "GOs['coverage_struct'] = GOs.apply(report_coverage_struct, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 250,
+   "id": "ca29c96d-a8da-4c7c-bdf2-5ad1f380c4da",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_278/1789197005.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  GOs['coverage_seq'] = GOs.apply(report_coverage_seq, axis=1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "GOs['coverage_seq'] = GOs.apply(report_coverage_seq, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 251,
+   "id": "9195a8b5-f5e4-4e10-a8e6-fcf190fdd42f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "complete overlap                            7052\n",
+       "partial overlap - unique GOs                2360\n",
+       "partial overlap - sequence GOs expanded     1236\n",
+       "partial overlap - structure GOs expanded     967\n",
+       "no overlap                                    17\n",
+       "Name: overlap, dtype: int64"
+      ]
+     },
+     "execution_count": 251,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "GOs['overlap'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 252,
+   "id": "6a4352ef-1326-4102-9ca6-799217ba6be4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "color_reference = {\n",
+    "    'complete overlap': cm.tab20.colors[0],\n",
+    "    'partial overlap - unique GOs': cm.tab20.colors[18],\n",
+    "    'partial overlap - sequence GOs expanded': cm.tab20.colors[19],\n",
+    "    'partial overlap - structure GOs expanded': cm.tab20.colors[19],\n",
+    "    'no overlap': cm.tab20.colors[16]\n",
+    "}\n",
+    "\n",
+    "patterns = ['...', '', '...', '', '...', '', '', '', '']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 253,
+   "id": "ae8bac54-f10d-4020-8019-6221c13b56b7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAesAAADnCAYAAAA+Rn5uAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAA/GUlEQVR4nO3deXhU1fnA8e+bhSUk7EuIioMYEVGMgrigQnCrpipuVatVY2ur1tYuWtOfu9Y21qVWrVptFZfa4r6NiqiIiGyyi6iIBteobIEsZJl5f3+cGxhClgmZmTtJ3s/z5GFy595z33snzHvPueeeI6qKMcYYY5JXit8BGGOMMaZ5lqyNMcaYJGfJ2hhjjElylqyNMcaYJGfJ2hhjjElylqyNMcaYJGfJ2hhjjElylqyNMcaYJGfJOkZEZLKInNrCOueJSE6iYmqOF8vdfsdhjDGmZZasE+s8wPdkLSJpfsdgjDEmeh06WYvIOSKyVESWiMij3rJdReQNb/kbIjLEWz5ZRO4Vkeki8qmIjBeRB0VkhYhMjiizXERuE5GF3vYDGtnvaBGZISILRGSqiAz2at1jgP+IyGIR6d7Yeo2UtV28ItJLREpEJMVbJ0NEvhCRdBEZJiKvemXOFJE9I47vdhGZDtzcYB/Hi8hcEVkkIq+LyCBv+XUi8qiIvCkiK0Xkglh9NsYYY6LXYZO1iIwErgQmquq+wKXeW3cDj6jqKOA/wJ0Rm/UBJgK/BV4E/gaMBPYRkTxvnR7AQlXdH5gBXNtgv+nAXcCpqjoaeBC4SVWfAt4DzlLVPKCusfUaOZTt4lXVMmAJMN5b53hgqqrWAvcDv/LKvAy4J6KsPYAjVfX3DfbxDnCQqu4H/A/4Q8R7o4AC4GDgmmRpxjfGmM6kIzeHTgSeUtU1AKq6zlt+MHCy9/pR4K8R27yoqioiy4BvVXUZgIgsBwLAYiAMTPHWfwx4psF+hwN7A9NEBCAV+KaR+KJdr6l4pwCnA9OBM4B7RCQTOAR40isToGtEWU+qaqiRfewMTPFq9l2AzyLee15Vq4Aqr1Y+FniukTKMMcbESUdO1gJEM6VY5DrV3r/hiNf1vzd1rhruQ4DlqnpwFPFFs15T+3sB+IuI9AVGA2/iav0bvJp7YyqaWH4XcLuqviAiE4DrGtlfU78bY4yJsw7bDA68AfxIRPoBeEkN4F1cTRTgLFwTcGukAPW9vn/cyPYfAQNE5GBvv+lekzzAJiArivUiNRqvqpYD84C/Ay+pakhVNwKfichpXpkiIvtGcUy9gK+81+c2eO9EEenmnccJwPwoyjPGGBNDHbZmrarLReQmYIaIhIBFuN7YvwYeFJHLge+BwlYWXQGMFJEFQBmuKTpyvzVeZ7I7RaQX7hzfASwHJgP3iUgVrnm7qfUiNRfvFOBJXBKtdxZwr4hcBaTj7kEvaeGYrsM1nX8FzAGGRrw3DwgCQ4AbVfXrFsoyxhgTY6JqrZqtISLlqprpdxyJICLXAeWqeqsf+w8UBbsBg4Fs79/BwCDc/f2aKH/Kgc+BL0uKCxq7X2+MMUmvw9asTfILFAV74jqs7Yt7/jwyMWcDvWO4u7pAUfArYDVQAnwCrAA+AFaWFBfUxnBfxhgTU1azNgkRKAqm4Hq/Hwgc5P3sSXL0m6jFJe/luD4Br5cUFzS8HWGMMb6xZG3iIlAUHIRLyPXJeQxbO9e1B18Dr9f/lBQXNPZYnTHGJIQlaxMzgaLg3sAp3s8+PocTa8uBad7PjJLigqYegzPGmJizZG3aJFAU3A84DZeg9/A5nESpBWYDTwCPlRQXlPkcjzGmg7NkbVotUBTMAc4GzsENx9qZVeIeobu/pLhgjt/BGGM6JkvWJiqBomBX3HPh5wBHkhwdw5LNUtzY7FbbNsbElCVr06xAUTAD+AVwOe6RKtMyq20bY2LKkrVpVKAomAX8EvgdsN00oCZqS4H7gAdLiguqW1rZGGMaY8n
abCNQFOyNG+L0UqBv82ubVigBrgIeLykusP90xphWsWRtAAgUBfvj5vG+BOjpczgd2ULgDyXFBW/4HYgxpv2wZN3JeYOXXA5ciJti0yTGVFzSXup3IMaY5GfJupMKFAUF+DlwM26KTJN4YeBR4OqS4oIv/A7GGJO8LFl3QoGi4B7AA8DhfsdiANgM3An8paS4YIPPsRhjkpAl604kUBRMwzV5XwN08zkcs721wKUlxQX/8TsQY0xysWTdSQSKgqOBfwF5PodiWvY0cGFJccEavwMxxiQHS9YdXKAo2B24Hve8dKrP4ZjofQf8vKS44Hm/AzHG+M+SdQcWKApOxA1/OczvWMwOexi4pKS4oNzvQIwx/rFk3QEFioIpQDHu/rRp/z4GflRSXLDE70CMMf6wyRg6GG+Y0BewRN2R7AHMCRQFL/I7EGOMP6xm3YEEioLDcIl6L79jMXHzFPAzm9XLmM7FknUHESgK5gNPAv38jsXE3fvAMSXFBV/7HYgxJjGsGbwDCBQFLwRewxJ1Z7E38G6gKDjc70CMMYlhNet2zBvk5O/AxX7HYnyxBigoKS6Y53cgxpj4smTdTgWKgn1xzd4T/Y7F+KoCOKWkuGCq34EYY+LHmsHboUBRcDdgLpaojZsp7cVAUfDHfgdijIkfS9btTKAoOBSYDuzudywmaaQDjwWKgr/xOxBjTHxYsm5HAkXBXXGJeojfsZikI8DfAkXBYr8DMcbEnt2zbicCRcEhwAwg4HMoJvk9iHsW2/5zG9NBWM26HQgUBXcG3sIStYnO+cAtfgdhjIkdq1knOa/X9zvACL9jMe3OL0uKC+7xOwhjTNtZsk5igaJgD+B14CC/YzHtUgg4saS4IOh3IMaYtrFknaQCRcF03DjfP/A7FtOuVQCHlxQXLPQ7EGPMjrN71kkoUBQUYDKWqE3b9QBeChQFd/E7EGPMjrNknZyuAGyQCxMrg4GXA0XBnn4HYozZMdYMnmQCRcFxuJ7faT6HYjqe14HjSooLav0OxBjTOlazTiKBomA/4H9YojbxcSRwn99BGGNaz2rWScK7T/0iUOB3LNEKby5n7St3UrPmcwD6H3cpaX13Zs3zN1O38VvSeg6i/6QiUrtlRrVt151GsP6th6j6dAFdBg6l/w9/D0D5+28S3ryJnmNOTNzBdWyXlxQX3Op3EMaY6FnNOnlcRjtK1ADr3rifbruNZqcL7iPn/LtI77cLG+c8SbfAvuz08wfoFtiXjXOejHrbcHUF1V+tIOf8u1ENU/N9CeHaairef52s/drVqUl2NwWKgnv7HYQxJnqWrJNAoCh4MPBnv+NojXB1JZu/WE7mqKMBkNR0UrplUvnJXHrsfQQAPfY+gsqVc6LeFgQN1aGqaF0NkpLKxnnPkDX6BCTV7gzEUBfgYe/xQGNMO2DfgD7zRihrd/ep6zaUkprRk7Uv30HNd5/RNXt3+hzxc0IVG0jL7AtAWmZfwhUbot42pWsGGcMP4ZvJv6bbrvsiXXtQ883H9B53ZoKPrlPYH7gSuM7nOIwxUbCatf8eoh3OoqXhEDWlq8ja7zhyCu9E0rs22eTdmm17HXgqOYV30Xfizyib+Ri9DzubTUum8v1zxWx493/xPKTO6MpAUXB/v4MwxrTMkrWPAkXB3wEn+B3HjkjL6k9qVn+65gwHIGP4OGq+XUVqj97Ula8DoK58HSk9eke9baT639P67ETF+28yYFIRtd+vpnbdV3E8qk4nDXgkUBTs6ncgxpjmWbL2SaAouCfQbuceTs3sQ1rP/tSu/RKAzauXkN5/CBm7H0jF+28AUPH+G2TsfmDU20baMPMxeh16FoTrQMNuoaSgddVxPKpOaSRwvd9BGGOaZ49u+SRQFHyFdj6caM23n7L21TvRUB1pvbPpd9xvQMOseb6Yuo3fk9ZzAP1P/COp3bOo27SWta/eyaDTrm9y2/pHvCo/nk3Nd5/R+1A3iNv6N/9N1WcLSR8YYMDxl/t1uB1
ZCDispLhgtt+BGGMaZ8naB4Gi4HGAzYRkksnHQF5JcUGV34EYY7ZnzeAJ5j0uc7vfcRjTwB6049syxnR0lqwT7xJguN9BGNOISwJFwX39DsIYsz1L1gkUKAoOAK7xOw5jmpCC1a6NSUqWrBPrRqC330EY04wfBIqC+X4HYYzZlnUwS5BAUXAUsBBI9TsWY1rwHjC2pLjAvhyMSRJWs06cO7BEbdqHMcBpfgdhjNnKatYJECgKngw87XccxrTCx8CIkuKCsN+BGGOsZh133jzV7WpGLWNwj3Kd7ncQxhjHknX8HY09qmXap6u8i01jjM8sWcffr/0OwJgdtBdwqt9BGGPsnnVcBYqCu+Pu/VntxLRXS3HDkNoXhTE+spp1fF2CJWrTvo2inU84Y0xHYMk6TgJFwUyg0O84jIkB+zs2xmeWrOPnXKCn30EYEwMnBIqCffwOwpjOzJJ1HHg9aC/xOw5jYqQrcIbfQRjTmVmyjo+jgD39DsKYGDrP7wCM6cwsWcfHr/wOwJgYGxsoCtoFqDE+sWQdY4Gi4G7AcX7HYUwcnOd3AMZ0VpasY+9M7LyajunsQFHQ/raN8YH9x4u9SX4HYEyc7ITrj2GMSTBL1jEUKArujJte0JiO6ly/AzCmM7JkHVuT/A7AmDibFCgK9vI7CGM6G0vWsXWS3wEYE2fdsQ6UxiScJesY8Wobh/sdhzEJMN7vAIzpbCxZx85EIM3vIIxJALsoNSbBLFnHztF+B2BMgowIFAUH+B2EMZ2JJevYsUdaTGdymN8BGNOZWLKOgUBRcCgwzO84jEkgawo3JoEsWceG1apNZ2PJ2pgEsmQdGwf5HYAxCbZvoCho87UbkyCWrGNjpN8BGJNgKcChfgdhTGdhyTo2RvgdgDE+sKZwYxLEknUbBYqCQ4Asv+MwxgeWrI1JEEvWbWdN4KazGhMoCtpAQMYkgCXrttvL7wCM8Uk6btpMY0ycWbJuO6tZm85sV78DMKYzsGTddpasTWc2xO8AjOkMLFm3nTWDm87MatbGJIAl6zbweoJn+h2HMT6yZG1MAliybhtrAjednSVrYxLAknXb7O53AMb4zO5ZG5MAlqzbppffARjjM0vWxiSAJeu2sfvVprPLCBQFB/gdhDEdnSXrtrFkbYzVro2JO0vWbWPJ2hjrZGZM3FmybhtL1saAzWttTJxZsm4bS9bGuDHCjTFxZMm6bSxZmzap+nQBXz3wC7765wWUzXlyu/dr137BN4/+ntW3TqJs7jNblocqyyh97A98/e+Lqfx49pbl3z19I3Wb1iYk9gjtOlmLyHkicrffcRjTHEvWbWPJ2uwwDYdYN+1eBp52PTk/u4eKD2ZQs+bzbdZJ6ZZF3yN/Qc+xJ2+zvOKDGfTYeyLZZ9/KxnkuiVd+Mpcug4aRltUvYcfgabfJWkRsik/TLliybhtL1maH1XzzMWm9B5PeOxtJTafHiMOpWjlnm3VSe/Sm6+A9kJTUbZZLahpaV4OGakEEDYfY9N7z9Dxw26SeIFElaxEJiMgKEXlARJaLyGsi0t17L09E5ojIUhF5VkT6NLL9riLyhrfOGyIyRER6iUiJiKR462SIyBciki4iw0TkVRFZICIzRWRPb53JInK7iEwHbm6wj+NFZK6ILBKR10VkkLf8OhF5VETeFJGVInJBG8+ZMa1iV5VtY8naRwrhoYGeswdlpIlqCAiDhkFDCmEkHEIljGgINKwQAlWBkAph8LYRQoiGUXWvIVy/XCCsaJgUr2xvGUL9uirgvU/IW64STfwffbms35d9KnofMeStVQDLP/1ywPcl32ROGDLws4brzkn/aJe0rl1CY4b0/Rpgc19JfeOBl/coX/50+qEnHLp6/Qc3d885oF9o5LA538fq/Earqq5bFRREu3oucKaqXiAiTwCnAI8BjwC/UtUZInIDcC3wmwbb3g08oqoPi8j5wJ2qOklElgDjgenA8cBUVa0VkfuBC1V1pYgcCNwDTPTK2gM
4UlVDInJexD7eAQ5SVRWRnwF/AH7vvTcKOAjoASwSkaCqfh3tgRvTFpas26aH3wF0Vpqesr76kIGrdh7YM3xEzx4CDAeSZ3AODSuEQxAOi4a9K4NQGFRFwyEIhdMqX0rXTenph+efORTCYV3TpdsqfT99/IRJg8VdcISFcBgNaen8ku5du3cNTTxsfC8IqxAOF/wg/3MhrBvXrZfbfntb/6K7frfqgRsfzK3cVJH6w3OOKh114J4bhRBoSIWwbrmIIaRoCCEU3nrBEvaWhWHL+2EgrKLuAsWVU/86rEJYIEwopY+24sx8pqqLvdcLgICI9AJ6q+oMb/nDwPY38OFgoL7p4FHgr97rKcDpuGR9BnCPiGQChwBPimy5duoaUdaT6q7wGtoZmCIig4EuQOSF0/OqWgVUebXyscBzLR6xMTFgybptavwOoDMK90r/qGbsgAxSZMzbmyrLJ2ZlICJhYAbuC9r/e6iSIpCSBmxXz67PbP2G7MWGda8SSsvuDlBWFqb/kL2oSx+a1bC41MwhpPXoQXW3/Qc2fO+ph6/lpMv+zIy3Ps3bfdxJHHT8Sdx1UWGfYUdcHvPDasJLrVi3OuJ1COjehv3Wn8oXgL+ISF9gNPAm7kJ6g6rmNbFtRRPL7wJuV9UXRGQCcF0j+2vqd2Pixu5Zt806vwPobOqGZs6qOXDAEFJkF4Aa1cyva+sW4571HQ98jquxJb2h++TxbclnfP/F59TV1DA3+Dx5RxzdqjK+LfmUDd9+y/CxB1OzuQp361aora5ucdsYqm3LxqpaBqwXkcO8RT/BXXg19C6u5gxwFq7JGlUtB+YBfwdeUtWQqm4EPhOR0wDE2TeKcHoBX3mvz23w3oki0k1E+gETgPnRHJ8xsWA167ZZi828lRAq1NaM6T9b+3Y9vOF7r5aV9/7pgC39kYZ5/84FcoBdEhRiq6WmpXH2NX/i9p/+mHAozKGnns5OucOZ/t9HAMg/8xzKvv+OG04+lqryciQlhWmTH+BPr7xF90xX+X76bzdzym+vAODAH07irovP5/VH/s2kSy9L5KG0KVl7zgXuE5EM4FOgsJF1fg08KCKXA983WGcKrul8QsSys4B7ReQqXGvL/4AlLcRxHa7p/CtgDjA04r15QBA3vOqNdr/aJJKoWkvOjgoUBYPAcX7H0dFp15TvqscNKiU9ZVRT69yw04BV6SLDGizejEvaY2lbc6tp3lmFuTmP+x1EPInIdUC5qt7qdyymc7Jm8LaxZvA4C/Xruqx6fLY2l6gBFlRs/rKRxd1wTePrcLUkEx8JH4XFmM7GmsHbxr6k4qh2eM+3Q7tmHoxIix3GXt9YvveBPbrViEiXRt7eyftZBGRhty5ircP/P1DV6/yOwXRuVrNuG6tZx4GmUFV9yMB3QoGsw6NJ1AAVYe23ti7UUsey/YAArvNSWRvDNFt1+GRtjN8sWbeNfUnFWDgj9cvq/MGrNSv90NZuO21jRWO16obScE3jtcBM3Agopm3s/4ExcWbJum2sZh1DoezuC2oOHdSDtJQ9d2T7ZVXV+4VUG7t33Zj+wGHAh8D7O7I/A0BtYW7ORr+DMKajs2TdNlajiAEFrdmnz4zaUX32o5ExoVtRTsryqupVrdxsL2AkMAv4bkf33YnZ40vGJIAl67axmnUbaapsqjls0NxwTsZ4b0SPNnm1rDxXVVvbtC3AOFzv8RnE5rnhJr320P1cdVw+VxdM5L7fXkxt9eZt3l/0+lSuOf5Irj3hKK4/+Vg+fm8eABvXreXPZ0zi6oKJLJz26pb177yokPXflsYz5OZ84teOjelMLFm3zbd+B9CehbPSV1VPHLxGM9IOilWZ60PhnE3h8MId3Lx+FLQviNMoaOtLv+H1Rx/kmmde5sbgm4TDIeYGn99mnREHH8r1L0zj+hemcf6fb2PylW6Ak7kvPce4k07j/6a8wKv/vheAxW++xq577UOfQdnxCDcarW3JMMbsAEvWbfMlsMnvINq
jul16zKk5eMAgUmRoy2u3zvSNlXVtLGI33BjTc3GJO6ZCdXXUbN7s/q2qovfAbRNttx49qJ98orqqcsvrtLQ0aqo3U1dTg0gKobo6pk3+Fz/42UWxDrE1rGZtTALYCGZtFCgKzsZNm2eioBCq3b/fO+EB3cbHax8pUHvjTgPKUkT6x6C4+lHQDgAyYlAe0x7+F8/87WbSu3Zj70PH8/Pb7t5unQWvvcLTt/2FTevWcun9D7P7fmOo3LSR+3/3S8rWruG0y/6Prz75mIzMLMad/KNYhLWjTi7MzXnWzwCM6QysZt12y/wOoL3QLilrqydkL4lnogYIQ/qq6prlMSqufhS09cDsthZWUbaBRW9M5eY353D7Owuprqxk9vNPb7fe6KOP5c9T3+aSe/7Ns3fcAkBGVk9+88CjXPvMK+w6ch+WTn+d0ccUMPnKy/nHry7gk0XvtTW8HWE1a2MSwJJ129ljP1EI9+6yonp89ma6pu6fiP29XFYe6wk8dsLNp7wIWLmjhXzw7kz67zyEnn37kZaezv5HH9tskh1+wEF8/8VqNq3bti/jC//4Gz+86NfMfek5dt17H87/y+08c3vxjoa1o+qwZG1MQliybjtL1i2oG5b1Ts3Y/kNJkZ0Stc/S2tBuVeHw0jgUvR/unvbbwIbWbtw3Zyc+XbyQ6qoqVJUVs99h8G6526zz7erPqL89tXr5Mupqasnss/WJtiSZFhNgeWFuTlWid2pMZ2Rjg7fdYr8DSFYq1NSMHTBXe3c5rOW1Y++dTZUbj+qVGY+iU4HDcc/Zz8Q99hXVhe+wffdnzDEFXD/pGFLT0hgyYiTjzzhrm2kxF0x9mXefe4rUtDS6dOvGhXfcu6WTGSTNtJhg8zkbkzDWwSwGAkXBj4HcFlfsRLRb6jfVhwxcS3rK3n7F0EWouC5nQEhEesZ5VytwTcL7xHk/yeYXhbk59/sdhDGdgTWDx8ZcvwNIJqEB3ZZUHz4ozc9EDVCj9Piytm5JAnY1AtibzjcK2jy/AzCms7BkHRuWrD21I3rNqN2v70hEBvgdC8ArG8r7JWhX9aOgdScBo6AlgSqsv4YxCWPJOjY6fbLWFKmsHjfw3dCQzPGIJE1fiM9qaveqCevHCdxlFu5Rry8BX56lSpCFhbk5bR18xhgTJUvWsbEEN3hGpxTukba6emL2V5qZfojfsTRmfkWVHwNnDwXG4JqKP/dh//E23e8AjOlMLFnHQElxQQ2d9MsrNLj7/JpxA3uTmpK0Heze2FgxSlX9upgaCwwC3gIqfYohHl5PxE5EZJKI7BXx+w0icmQL20wWkVPjH92W/V0nIgnvii8iARH5cQzLyxOR42JVXiPlZ4rIvSKySkQWicgCEbkg4v2RIvKmiHwsIitF5GqJfAyinRCREmnF6Ikicp6IbD+MYQOWrGPnGb8DSCQFrRnVZ0btPn3GINLL73iaU6Xa+/u60I5O7hELXYEJuOey3/UxjlipIAajubVE3O2USbhpTAFQ1WtUNSEXCtEQf2/5BIBGk/UOxpUHtCpZixNtHvkXbiTAXFXdD/gB0NcrpzvwAlCsqnsA+wKHABe3Jp6OzJJ17DwHhPwOIhE0TcqqDx80Pzw4Yzzt5Mr3tbKK7n7HAOTgvoAW04ZR0JLAG4W5OTUtreTV/D4UkYdFZKmIPCUiGd5714jIfBF5X0Tur69BichbIvJnEZkBXAGcANwiIotFZFhkrbmpMpqJJ09E5nixPCsifURkhIjMi1gnICJLvdejRWSGVwOcKiKDG4nx0gb7uMCLaYmIPB1xvJNF5D4RmenVHH8Y/ekGERnvnYPFXq00CygGDvOW/daroT0pIi8Cr4nIBBF5KaKMu0XkPO/1ASLyrhfnPHEX3DcAp3vlnd6wxcA7zwHvZ4WI3AMsBHYRkcu9414qItc3Ev8wXCvTVfVT2Krq96p6s7fKj4FZqvqa914lcAlQ1MzxN9zH2d6xLBaRf4pIqnecS0Wkm4j0EJH
lIrK3d27e9v4OPvA+mxSvnHtF5D1v3esjyi8RketFZKGILBORPb3l/UTkNS+uf+I6mzYZk7e80Ps7mIHrmNoiS9YxUlJcsAY3QEaHFu6ZvrI6f/AGuqeN9TuW1li+uTovpJos947zaMMoaEnglVasOxy4X1VHARvZWlO6W1UPUNW9cT3oI5NXb1Udr6o34Wpbl6tqnqo2nI6zuTIa8whwhRfLMuBaVV0BdBGR3bx1TgeeEJF04C7gVFUdDTwI3NRIjLc12MczXkz74p6//2nEewFc58MC4D4R6dZCvJEuA36pqnnAYbje+EXATO/c/M1b72DgXFWd2FRBItIFmAJc6sV5JK615BpgilfelBbiGQ484tWQh+PGmRiL+9seLSKHN1h/JLCkmbnmR9JgWlrv884UN05CY8cfeUwjcJ/dOG+dEHCWqs7H/Q39Cfgr8Jiq1j/FMBb4PW58hGHAyd7yK1V1DDAKGC8ioyJ2tUZV9wfu9WICuBZ4xzsXLwBDmovJu+i7HpekjyKi5ag5lqxjq0M3hdftmvluzUEDckiRXf2OZQfI0srqz/wOIkL9KGgh3EVeU19iyejlVqz7harO8l4/Bhzqvc4XkbkisgyYiPuyrtdSoqjXXBnb8GqOvVV1hrfoYdz5B3gCqJ+67HRv/8Nxz85PE5HFwFXAzlHEuLdXe14GnNUgpidUNayqK4FPgT2jO0zAPcN/u4j82juOpnriT1PVdU28V2848I2XyFDVjc2U15TVqjrHe32097MIV9PekxYGiRKRK73a5tf1i4CmRuhSWj7+I3DT2s73Pq8jcBfE4FoMjsJ1+PxrxDbzVPVTVQ0B/2Xr3+aPRGShdzwj2TaZ1n/HL8BdfIH7O3oMQFWDuKb+5mI6EHjLa1moIcq/96R5xKaDeBb4OxHNIB2BCnW1o/vNCveL72xZ8TZ1Y/nwvIyuofqmqCTRD1dTaC+joM0pzM1pTQtFwy9g9WqU9wBjVPULEbkON7tZvYqWCo2ijNaYAjwpIs8AqqorRWQfYLmqHtzENk3FOBmYpKpLvCbnCRHvbXcuIn8RkZtwtW68mtjWFVWLRSSIu6c8R5ruZBcZVx3bVsjqz09ziTFSU9s33I8Af1HVfzZT1gfAviKS4l2w3ATcJCLl3vvL2Xrx5Ap1rR3lqroJ2O74VfXDBjE8rKp/bGTffYFMIN07hvrYG/vbHIqrMR+gqutFZHKD464fgD/EtvmzsfPZaEwiMqmJ9ZtlNesYKiku+JIONl6ydkn5vnpC9vvtPVEDlIXC2WWh8IKW1/RF5Cho3/ocS3P+28r1h4hIfcI7E3iHrV9+a0QkE2iu5/Ym3LPrDbWmDFS1DFgvIvXj1P8EN3hNfXNrCLiarbWcj4AB9bGLSLqINFlzj5AFfOM1o5/V4L3TRCTFu3+7m7ePyBiv9Jqg8xoWKiLDVHWZd4/3PVzttalzU281sJeIdPVaFo7wln8I5IjIAV7ZWeI6pDUsrwTY31tnf9zjiI2ZCpzvfQ6IyE4iMrDBsX3ixf2niPu23dhasfkPcGj9RYi4Dmd34tWEmzj+SG8Ap9bvV0T6imxpAbwf99n+B7g5YpuxIjLUu1d9Ou5vsycumZeJyCDg2CaOOdLbeJ+1iBwL1M+601RMc4EJ3r3udOC0KPZhNes4eBp3L6TdC/Xturx2TL++iOT5HUusvLmpgpP7xHuo8B1WPwpaOe5Rr0OALn4G1EAI12TcGiuAc72ONyuBe1W1UkQewN03LqH5C9z/AQ94zZ9bErKqbmhFGfXOxd0rzsA1QxdGvDcFuAUvIalqjbiObHd6iS4NuANXA2zO1bgv49VebJHJ7yPcBcIg4MJWPk74GxHJx30GH+D6DYSBOhFZgqvRr4/cwGtxeAJYijv3iyKO7XTgLi8pVuHuW08Hirwm27/gvsvO8X6fDzQ6uJCqvubdn50tro9fOXA22w+9+zPcOf5ERNZ5+73CK6NKRE70YvoH7jbRo0D9I02
NHX9kDB+IyFW4jnUpuBEEfyki44E6VX3cu0h4V0QmeuduNq6T3j64hPusqoZFZBHuc/4Ud/HckuuB/3pN5zPwxlVoKiZVneO1BM0GvsHdOmixtc8m8oixQFFwd9p3T18AanN7zgwNzTwQ1xmlw0iBuht3GrAupcGVf5L6DDez1xi/A/G8Xpibc1S0K4tIAHjJ6wDWqXnNqS+p6lN+x2JARCYAl6lqq3rl+8mawWOspLjgE9wVdbukKVRXHzRgZmi3rMM6WqIGCEPax5trVvgdR5QiR0Fb7XMs0PomcGNMjFgzeHw8TfJ3FNpOuFvq1zXjBm4gLcWX+acT5eWy8sCe3bv6HUZrjMV1bHkLOADo4UMMm3F/11FT1RLcffhOT1XP8zsGs5WqvoX7/9RuWM06Pv5NO5t1KTSw26Kawwd1IS0lqmf+2rPv60K7VoTCi/2Oo5XqR0HbiD+joP2vMDenzIf9GmOwZB0XXq/wdtNkWDuy94zavL6jaMV4tu3dzPLKFh8PSlKDcR3PltBEh584uSuB+zLGNGDJOn5u8TuAlmiqlFcfNmh2aOce40muZ4/jblZ55f7e4zzt1b64UZcSMQra7MLcHD/HVjem07NkHSclxQXv07phGRMqnJn2WXX+4G81I62pQR86tDql++qa2iV+x9FG9aOghYnvKGhWqzbGZ5as4yspa9d1O2XMrTlkYD9SZZjfsfjplbKK9vD4VjT64kZB+xj3TG0slQL2uJExPrNkHUclxQXTcaPtJAWFcE1e37fqRvYeixscv1P7vKZ2z+rwNkMWbuPBP/6OSw8axdUFW+dEKN+wnlvPO4Oio8Zx63lnUFG2odFtKzeW8Y9fXcD/HXM4V/5gPJ8scn8GT95yE9ccfyQPXP7rLeu++9xTTHv4X7E4pD1xkw+8i0uysXBvYW5Ou+osaUxHZMk6/pKidq3pKeurx2cvDA/qPqG9TGuZCHMrqhqOsrTFuJN/xO/+/Z9tlr18/z8YcfChFE+bxYiDD+Xl+//R6LaP/+ka9jksnz9PfZvrX5hGzrBcKjdt5JOF73HDi6+j4TBffrSCms1VzHrmCfJ/fG4sD+sQ3FjIbwEtTmXZjA24se6NMT6zZB1/T+NGovJNuFf6R9UTssvplposI2EljekbK/ZV1arG3ht+wEH06NV7m2WL3pjKuJPcUL7jTjqNha+/ut12VeWb+Pi9uRx22pkApHXpQkbPXoikUFdbi6pSs3kzqWnpvPqv+zjynJ+Slp4e4yMjE/eo11fseOvOHfa4ljHJwZJ1nJUUF4SA2/3af93QzFk1Bw4YQors4lcMyWyzaq9v60JR93TeuGYNvQcOAqD3wEFsWrt2u3W+/3w1WX368WDRb7nuxKN56P8uo7qyku6ZmYw55jiuO/Fo+u+8C92zsvhs2WL2O/KY2B3Q9nZ0FLQNuLGwjTFJwJJ1YjyIG+M5YVSorT6g/9t1e/Qahxus3zRhall5ZizLC4VCrP5gGRN+fA7XPf8aXTMyCN7v5iM49oKLuf6FaZzxx2t59o5bmHTp5bz9xOPcc+kvePGeO2IZRkNjgWxc03g0z5jfbrVqY5KHJesEKCkuqARuTdT+tGvKt9X5g1do366Ht7y2+XBzzb51bmjMFvXs358N37kZLDd89y1Z/fptt07f7MH0yR7MsH33B2DMMQV8vnzb4eJXf/A+ANmB3Xj3uae4+O//5KuPP+Lbkk/bcigtiXYUtPXYvWpjkool68S5nQbz18ZDqF/XpdXjsyE9ZVS899WRLK7cHFUT8X4Tj2bWs08CMOvZJ9nviO2bsHsNGEjf7By++fQTAD6Y/Q45u++xzTrP3vFXJl16GaG6WsLhEACSkkJ1VaO3z2MtchS0xv4m/1KYm7MxEYEYY6JjyTpBSooLaoBfxnMftcN7vl07ut8I3KTpphVeK6vYS1XrIpfd99uLuen0Eyj9bBW/P2w0bz/5X477+S/5YNbbFB01jg9mvc1xP3cf6fp
vS/nbz36yZduzrr6R+y/7FdccfyRfrFhOwYW/2vLewmmvMnSfPPoMyiajZy+G5Y3m6h8egYgwZMTIBB0x4EZB2x03Clr9XMifYLVqY5KOzWedYIGi4OPAmbEsU1Ooqjlo4ALNSj80luV2Npdn95vbNy31QL/j8Mk6YDlwa2Fuzgt+B2OM2ZbVrBPvd7h7hjERzkj9ojp/8OeWqNvujY0Vnfn/Q1+g0hK1McmpM385+aKkuKAUuCoWZYWyu79Xc+igTNJShseivM5uUeXm/cOq3/gdh0+qgUv8DsIY0zhL1v64B9jhWYwUtGafPm/VjuqzPyJ9YhhXp6aQumJzTSKnnUwmfy3MzfnE7yCMMY2ze9Y+CRQFxwKzaeUFk6bJxppDBq7Q7mlJdW819F0pZcVXE163FkTI+OEpZJzyY8Ibyyi78QpCpV+Tmp1Dr2v+SkrWtsOSa0016y79KdTWoKEQ3cYfSeZ5FwGw6f6/UzNvFmnD9qDXH/8EQNVrL6GbNpJxyo9jfhz90lK/+P2gvjtL5xqS9WNg38LcnM1+B2KMaZzVrH1SUlwwD3igNduEs9JXVecPXptsiRqA1FSyLvwd/Sc/Q99/PELl81OoK1lFxX8fost+Y+n/6At02W8sFf99aPtt07vQ5/b76fevJ+j3wP+onvcuNR8sJVy+idrlS+j3rycgHKb205Vo9WY2T32R7ieeFpfDWFsX2qUirIviUnhyqgXOskRtTHKzZO2vPwLfR7Ni3S495tQcPGAQKTI0zjHtkNR+A0jfYwQAKRk9SBsylNCa76me9RbdjjkegG7HHE/1O9O321ZESOme4X6pq4O6OjfXSEoK6o2lrdXVSFoaFVMepvvJZyBpMR9Le4sZmyo6U+K6rjA3J2lmhjPGNM6StY9KigvWAxc3t45CqGb/fm/V7dX7IERiOixmvIRKv6b2k49IH7E34fVrSe03AHAJPbxhXaPbaCjE2gtO5/uTj6DLmINIH7EPKRk96Hb4Eaz7+RmkDs5BemRS++EHdBuXH9f4Z5dXjVbV9S2v2e7NBIr9DsIY0zJL1j4rKS54Cri7sfe0S8ra6gnZS8IDuk1IbFQ7LlxVyYZrLyPr4stI6RH9tYWkptLvgSn0f2IqtR++T91nrq9TjzPOo98DU8i66PeUP3QPmYUXURl8hg3X/4HyR1t1FyFqIej6WU3tspbXbNfKgJ8U5uaE/Q7EGNMyS9bJ4ffA/MgF4d5dVlSPz95M19T9fYqp1bSulrJrL6PbkcfS7fAjAEjp04/QWtfSH1r7PSm9+zZbRkpmFl32HUP1vG2Hrq5d+SEAaTvvyubXXqL3tX+lruQT6r5szURS0Xt5Q/nguBScPC4pzM2Jz8kzxsScJesk4A1F+iPctITUDct6p2Zs/6GkyE6+BtYKqsrGW64nbchQepy2ddjNroeMZ/PUFwHYPPVFuo6bsN224Q3rCJdvcuVUb6Zm4VzShgS2Waf8oXvIPO8iNFQHYVcZFElBq+Nze/mr2rrczeHwB3Ep3H//K8zNeczvIIwx0bNknSRKigtKgHOrD+z/dt3uPQ9FpJvfMbVG7fuL2TwtSM3i+ay94HTWXnA61XNm0uPMQmoWzGXNT06gZsFcepxZCEBozXesL3JjcITWrmH97y5g7c9+xNqLzqbL6APpevDWCcM2vzOd9OEjSe0/kJTMLNL3GsXan54GIqQPi994MLPLqxI6rWmCvA/8wu8gjDGtY89ZJ5ns6YtvJEYjnJm26Sqy6dqc/iki0sPvWGLke2BsYW5Oid+BGGNax2rWyecawMZnTgLVqlnf1NZ1lGeuq4FJlqiNaZ8sWSeZ0vw8Bc4GOur90nbl1bKKXn7HECM/K8zNebfl1YwxyciSdRIqzc/bBBwLfOl3LJ3dyuqafWpVV/kdRxv92TqUGdO+WbJOUqX5eZ8DRwNr/I6ls1tYsbk9XzQ9jfWBMKbds2SdxErz81bgatib/I6lM3t9Y/lIVa3xO44
d8DZwTmFujvUiNaads2Sd5Erz894DTsR1EDI+KA9r/3Wh0AK/42ilmcBxhbk5lX4HYoxpO0vW7UBpft504HQg5HcsndW0sooufsfQCu/gEnWF34EYY2LDknU7UZqf9zxwPpawfbG0qnq/kOpXfscRhVnAsYW5OeV+B2KMiR1L1u1IaX7eI7hhSa1JPMEUUj6oqv7E7zhaMBtL1MZ0SJas25nS/LxngOOwTmcJ90pZ+e6qmqyzVM0BflCYm2N/F8Z0QJas26HS/Lw3gXzc8JEmQdaHwjttCocX+h1HI14FjirMzdnodyDGmPiwZN1OlebnLQAOBWyawwR6a2Nlnd8xNPAv4Hhr+jamY7OJPNq57OmLd8LVrPb2O5bOIBVqbthpwMYUkf4+h6LANYW5OX/yOQ5jTAJYzbqdK83P+wo4GHjK71g6gxB0WVVdu9znMCqB0yxRG9N5WLLuAErz88pL8/NOAy7HHu2Ku1fKynf2cfdfAIcW5uY87WMMxpgEs2TdgZTm590KHAl853csHdk3tXXDqsLh933Y9TTggMLcnI4ybacxJkqWrDuY0vy8t4D9cc/cmjiZVV61IYG7qwEuA44pzM35NlE7FZFJIrJXxO83iMiRLWwzWUROjX90W/Z3nYhclqj9dQQiMkhEHheRT0VkgYjMFpGTIt4/VETmiciH3s/P/Yx3R4lIqzpdJvvfkiXrDsi7jz0B+DuuI5KJsZmbKvZT1UQ80/whcFBhbs5tiZyQQ0TSgEnAlmStqteo6uuJiqElXoymFUREgOeAt1V1N1UdDZwB7Oy9nw08Dlyoqnvinjj5hYgU+BSy8Viy7qBK8/NqSvPzfoN7Hru9z8ecdGqUHl/W1i2O827uB0bvSLO3iAS8WtHDIrJURJ4SkQzvvWtEZL6IvC8i93tf4IjIWyLyZxGZAVwBnADcIiKLRWRYZK25qTKaiSdPROZ4sTwrIn1EZISIzGsQ81Lv9WgRmeHV/KaKyOBGYry0wT4u8GJaIiJPRxzvZBG5T0RmisjHIvLDVp7L8d45WCwii0Qky1t+ube/pSJyfcT6V4rIRyLyuoj8t7625sU+xnvdX0RKvNepInJLRFm/8JZP8LZ5yvss/xPxWR0gIu96xzpPRLKaKqeBiUCNqt5Xv0BVV6vqXd6vvwQmq+pC7701wB+AIm+/p3mf+RIRebuJ87XdeRGRk7zzISIy2PscskXkPBF5XkRe9c7ZtRHlPOd9/sslonYvIuUicpMXwxwRGeQtHyqulWC+iNzYUkwNPytgeFN/A8nAknUHV5qfNwMYBdwBJOvoW+3Sq2XlfeNU9Drg5MLcnF+0cdas4cD9qjoK2Ahc7C2/W1UPUNW9ge5AZPLqrarjVfUm4AXgclXNU9WGF3zNldGYR4ArvFiWAdeq6gqgi4js5q1zOvCEiKQDdwGnejW/B4GbGonxtgb7eMaLaV9gBfDTiPcCwHigALhPRLq1EG+ky4BfqmoecBhQJSJHA7nAWCAPGC0ih4tIfU11P+Bk4IAoyv8pUKaqB3jrXyAiQ7339gN+g2vh2A0YJyJdgCnApd6xHglUtVBOvZFAcwP7jAQazjD3nrcc4BrgGG+/JzTcuKnzoqrPAqW4i4EHcJ9/qbfZWOAsb/3T6i9ogPO9z38M8GsR6ect7wHM8WJ4G7jAW/534F7v+OvLbjKmHfysfGPJuhMozc+rLM3P+y1wOPCx3/F0FJ9W146sCevKGBapwEPAiMLcnGdjUN4XqjrLe/0YrkkTIF9E5orIMlxNa2TENlOiLLu5MrYhIr1wCXaGt+hh3N8iwBO48e7BJespuIuMvYFpIrIYuAqvmbaFGPf2as/LcF/+kTE9oaphVV0JfArsGd1hAm5ylNtF5NfecdQBR3s/i3DJb09cQjgMeFZVK1V1I+6CpyVHA+d4xzoX6OeVBTBPVb/0hrldjLvoGA58o6rzAVR1Y0RMTZXTKBH5h1dDnV+/iMZvndUvmwVMFpELgNQ
mjqWx8wLwK+CPQLWq/jdim2mqulZVq4Bn2Pp3+msRWYIbSneXiHJqgJe81wu8cwIwDqgv99EoYtqRz8o3ds+nEynNz5uVPX1xHnAD7mrdPv82eq+y6ptDMjOa/UKM0kLgksLcnFh2DGz4patejfIeYIyqfiEi1wGRtcwWp9WMoozWmAI8KSLPAKqqK0VkH2C5qh7cxDZNxTgZmKSqS0TkPFy/jXrbnYvIX0TkJlytG68GvXVF1WIRCeLG5J8jrpOdAH9R1X82KOc3jeyrXh1bK0iR50uAX6nq1AZlTWDbSXtCuP+zTSXURstpYDlwSsSx/VLcAD/vRbw/hm0T12jgA2/9C0XkQNy5Wiwieaq6tkEM250Xz0641r1BIpISMc5+Y3+nE3AtBgeraqWIvMXWc1arW0fzqj8nW7ZtZL878lklHatZdzKl+XlVpfl5lwP7kORXku3BGxsr9lHVtsyCth7XPH1AjBM1wBARqU94Z+Lmua7/wlsjIplAcz23NwFZjSxvTRmoahmwXkQO8xb9BJjhvbcK94V7NVtrzB8BA+pjF5F0EWmy5h4hC/jGa0Y/q8F7p4lIiogMwzUnf9Qgxiu95v68hoWKyDBVXaaqN+OS2p7AVOB87/gRkZ1EZCCuWfYkEeku7t728RFFleASH2x7zqYCF3lxIyJ7iEiPZo7zQyBHRA7w1s8S19kumnLeBLqJyEURyzIiXv8DOE9E8rwy+gE3A3+NOBdzVfUaYA2uxhup0fPixfcQ8GPcLYrfRWxzlIj0FZHuuE6Ns4BewHovUe8JHNTM+ag3C9esDdt+/jvyWSUdq1l1UqX5eR8CJ2ZPXzweuBV3NW1aqTKsfdbUhd4dkJ52SCs3VeDfwB8Lc3PWxCE0cF+K54rIP4GVuPt5lSLyAO6+cQkwv5nt/wc84DX/bkkuqrqhFWXUOxd3rzgD1wxdGPHeFOAWYKhXfo24jmx3ek3oabg+Fy2NHHc1rvl3tRdb5IXGR7gLhEG4ns6bo4i53m9EJB93UfEB8IqqVovICGC2uD5f5cDZqrpQRKbgmqxXAzMjyrkVd0/+J7ikWe9fuKbcheIK+x6XtBrlnZ/Tgbu8BFeFq4W2WI6qqohMAv4mIn/w1qnAdShEVb8RkbNxn3sWrlZ6h6q+6BVxi4jkesvfAJY0KP+1xs4LcCEwU1Vnes30873WCnAXkY8CuwOPq+p73q2MC8V1OPwI1xTekkuBx0XkUmDLoEFNxdTCZ5V0bGxwQ/b0xYK74r0J2NXncNqdvbt3XXRWv177Rbm6As8CNxbm5iyOV0wiEgBe8jqAdWoiMhl3LhI+JK93i6BcVW9N9L7bA+92xRhVvcTvWJKdNYMbSvPztDQ/7z+4jiuXE9GT0rTs/arqvJDqFy2sFsbVIEcV5uacEs9EbYzpeKxmbbaTPX1xF1zT1e+JGBTDNO30vj1n5GV0G9/IWyFcD9WbCnNzPkxwWMaYDsKStWmS1zx+LO4503yfw0lqvVJTvrkiu99AEal/nGUz8B+guDA35xMfQzPGdACWrE1Usqcv3h/Xg/MUdvwxnQ6tKLvf/F5pqT2BfwKTC3Nz1vsdkzGmY7BkbVole/riXriewWfjRoRqdpjJTqICeGqPrl0eePuQvWa1uLYxxrSSJWuzw7KnL94F14v8JzQzglUHVQ1Mxz3e9FRpfl6Lg4kYY8yOsmRtYsIbGe0U3D3u/emYNe5vgSDwIjDNErQxJlEsWZuYy56+eBBwDHAUbtzoHH8j2mEh3OAa9Ql6Xml+nv2HMcYknCVrE3fZ0xcPx43TPBo36039TE3J5ivcCFj1PwtK8/NaNYG9McbEgyVrk3DZ0xen4gZgycNNT5eHG295MI3P5BNrpbghL+t/lgBzS/PzvkrAvo0xptUsWZukkT19cQqQjZudp+FPT6BrxE+XiNfpQCVuzuZNjfy7FvgMWAV8Vpqf15Y5oo0xJuEsWRtjjDFJzsYGN8YYY5Jc3JO1iEwSkb0
ifr/Bm7y9uW0me1PkJYSIXCcilyVqf9GK5lzFYB9pIvJnEVkpIou9nysj3t9ZRJ733l8lIn8XkS7xjMkYY8y24pqsvQnHJxExGYSqXqOqr8dzv63hxZiUEnSu/oR7tGofVc0DDsPdA8abE/cZ4DlVzQX2ADJxU2kaY4xJkGaTtYgERORDEXlYRJaKyFPe5PGIyDUiMl9E3heR+70vdkTkLa+mNgM3ofkJuAnLF4vIsMhac1NlNBNPnojM8WJ5VkT6iMgIEZnXIOal3uvRIjJDRBaIyFQRGdxIjJc22McFXkxLROTpiOOdLCL3ichMEflYRH7YmhPdsPbuHXPA+1khIg+IyHIRec2bUH6bFgYR+YH3WbwjIneKyEvNleu9PltE5nnn/p8Rk0zUr5sBXAD8SlU3A6jqJlW9zltlIrBZVR/y3gsBvwXOF5EMERkZUf5ScZPSG2OMibFoatbDgftVdRSud+3F3vK7VfUAb3L77kBk8uqtquNV9SbgBeByVc1T1VUNym6ujMY8AlzhxbIMuFZVVwBdRGQ3b53TgSdEJB24CzhVVUcDD7JtjbA+xtsa7OMZL6Z9gRXATyPeC+DGwy4A7hORWE1okQv8Q1VHAhtwI4Ft4e3nAeB4XM03u6UCRWQE7lyM82rMIeCsBqvtDnyuqpuaKGYksCBygapuBD73tr0Q+LtX/hjgy5biMsYY03rRJOsvVLV+coLHgEO91/kiMldEluFqYJFjQ0+Jcv/NlbENEemFS7AzvEUPA4d7r58AfuS9Pt3b/3Dc4BvTRGQxcBWwcxQx7u3VnpfhkltkTE+oalhVV+Kez90zusNs0Wequth7vQB3URBpT2+dleq67z8WRZlH4AYhme8d/xHAbs1tICKFXi35CxHZBTdkaGOPC9Qvnw38n4hcAeyqqlVRxGWMMaaVoknWDb+s1avp3YOrte6Dq/VF1jJbHDM5ijJaYwrwIxHZA1AvmQqw3KvR56nqPqp6dBQxTgYu8WK6vkFM252LBsd0k5fsFjdSbh3bnu/IcqsjXoeAxu6jN/WMXVPlCvBwxPEPj2jervcJMEREsgBU9SGvllyGG5xkOa7GvIWI9AR2AVap6uO42xxVwFQRmdhEjMYYY9ogmmQ9REQO9l6fCbzD1oSwRkQycVMmNmUTkNXI8taUgaqWAetF5DBv0U+AGd57q3BJ7mq21pg/AgbUxy4i6SISzcxQWcA3XjN6w2bj00QkRUSG4WqpHzWI8cr65NhIuSW4CS4Qkf2BoVHEUu9DYKi3X3CfQ0vlvgGcKiIDvff6isiuDeKtBP4N3F3fpO/d1+4SUUaGiJwT8d5twGRVrfRuPXyqqnfibneMasUxGWOMiVI0yXoFcK7XaasvcK+qbsDVhJcBzwHzm9n+f8DlIrIoItnQyjLqnYvrrLYUN0TlDRHvTcHNsfyEV34N7gLgZhFZAiwGDoliH1fjxoWehkuSkT7CXSC8AlxY3ykrSk8Dfb1a90XAx9Fu6O3n50BQRN4BVrdUrqp+gGv6f807X9Nww3k2dCXwDfC+iCwCZuJuMXztNbmfhLtIWemVvRn4P2/b073tFuOa6h+J9piMMcZEr9kRzLxexS95HcA6NRGZjDsXTyVBLBOAy1S1VT3SjTHGtE82gpkxxhiT5GxscGOMMSbJWc3aGGOMSXKWrI0xxpgkZ8naGGOMSXKWrI0xxpgkZ8naGGOMSXKWrI0xxpgk9/9hXCt25xCoeAAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<Figure size 432x288 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "order = ['complete overlap', 'partial overlap - unique GOs',\n",
+    "         'partial overlap - sequence GOs expanded', 'partial overlap - structure GOs expanded',\n",
+    "         'no overlap']\n",
+    "vc = GOs['overlap'].value_counts()[order]\n",
+    "labels = vc.index\n",
+    "sizes = vc.values\n",
+    "colors = [color_reference[i] for i in labels]\n",
+    "# explode = (0.1, 0, 0, 0, 0)\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "piechart = ax.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=False, colors=colors)\n",
+    "ax.axis('equal');\n",
+    "\n",
+    "#plt.savefig('./figures/analysis-sequence_structure_agreement.pdf')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 254,
+   "id": "ab2a9480-aaee-43d3-af23-ca05886b3695",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<AxesSubplot:xlabel='coverage_struct', ylabel='Count'>"
+      ]
+     },
+     "execution_count": 254,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEHCAYAAABfkmooAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZMUlEQVR4nO3dfbBkdX3n8ffHGUEeBIEZEGbQgTCiwMaoI8GHTVTcBTUlmBV3XBVEKlMxxKhJDJDsxmylpmR3Uyk1K7oTNWDWEhEfGJ+lRtQ1ijgoIg8S7g4RJhBm0AQFDe4M3/3jnMHm2n1Pc+d2971z36+qW/f07zx9f/1wPn3O6T6dqkKSpJk8atIFSJLmP8NCktTJsJAkdTIsJEmdDAtJUifDQpLUaemkCxiVZcuW1apVqyZdhiQtKNdee+09VbV8evseGxarVq1i8+bNky5DkhaUJN/v1+5hKElSJ8NCktTJsJAkdTIsJEmdDAtJUifDQpLUybCQJHXaY79nIUmLwc6dO5mamnro9jHHHMOSJUvmfD2GhSQtYFNTU6x716fZb9kR3H/PnWw49yUce+yxc74ew0KSFrj9lh3BAY9/4kjX4TkLSVInw0KS1MmwkCR1MiwkSZ0MC0lSp5GFRZL3J9mW5Iaetv+R5HtJrk/y8SSP6xl3QZKpJLckOaWn/RlJvtuOe2eSjKpmSVJ/o9yzuBg4dVrblcAJVfXLwN8DFwAkOQ5YCxzfznNRkl3fKnk3sA5Y3f5NX6YkacRGFhZV9RXgh9PavlBVO9qbVwMr2+HTgEur6oGqug2YAk5McjhwQFV9vaoK+ABw+qhqliT1N8lzFq8DPtsOrwDu6Bm3tW1b0Q5Pb5ckjdFEwiLJnwA7gA/uauozWc3QPmi565JsTrJ5+/btu1+oJAmYQFgkOQv4DeBV7aElaPYYjuyZbCVwZ9u+sk97X1W1oarWVNWa5cuXz23hkrSIjTUskpwKnAe8tKp+0jNqI7A2yd5JjqI5kX1NVd0F/DjJSe2noM4ErhhnzZKkEV5IMMmHgOcBy5JsBd5K8+mnvYEr20/AXl1Vv11VNya5DLiJ5vDUuVW1s13U62k+WbUPzTmOzyJJGquRhUVVvbJP8/tmmH49sL5P+2bghDksTZL0CPkNbklSJ8NCktTJsJAkdTIsJEmdDAtJUifDQpLUybCQJHUyLCRJnQwLSVInw0KS1MmwkCR1MiwkSZ0MC0lSJ8NCktTJsJAkdTIsJEmdDAtJUifDQpLUybCQJHUyLCRJnQwLSVInw0KS1MmwkCR1MiwkSZ1GFhZJ3p9kW5IbetoOTnJlklvb/wf1jLsgyVSSW5Kc0tP+jCTfbce9M0lGVbMkqb9R7llcDJw6re18YFNVrQY2tbdJchywFji+neeiJEvaed4NrANWt3/TlylJGrGRhUVVfQX44bTm04BL2uFLgNN72i+tqgeq6jZgCjgxyeHAAVX19aoq4AM980iSxmTc5ywOq6q7ANr/h7btK4A7eqbb2rataIent0uSxmi+nODudx6iZmjvv5BkXZLNSTZv3759zoqTpMVu3GFxd3toifb/trZ9K3Bkz3QrgTvb9pV92vuqqg1Vtaaq1ixfvnxOC5ekxWzcYbEROKsdPgu4oqd9bZK9kxxFcyL7mvZQ1Y+TnNR+CurMnnkkSWOydFQLTvIh4HnAsiRbgbcCFwKXJTkHuB04A6CqbkxyGXATsAM4t6p2tot6Pc0nq/YBPtv+SZLGaGRhUVWvHDDq5AHTrwfW92nfDJwwh6VJkh6h+XKCW5I0jxkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSeo0kbBI8uYkNya5IcmHkjwmycFJrkxya/v/oJ7pL0g
yleSWJKdMomZJWszGHhZJVgC/B6ypqhOAJcBa4HxgU1WtBja1t0lyXDv+eOBU4KIkS8ZdtyQtZpM6DLUU2CfJUmBf4E7gNOCSdvwlwOnt8GnApVX1QFXdBkwBJ463XEla3MYeFlX1j8BfALcDdwH3VtUXgMOq6q52mruAQ9tZVgB39Cxia9v2C5KsS7I5yebt27ePqguStOhM4jDUQTR7C0cBRwD7JXn1TLP0aat+E1bVhqpaU1Vrli9fvvvFSpKAyRyGeiFwW1Vtr6r/B3wMeDZwd5LDAdr/29rptwJH9sy/kuawlSRpTCYRFrcDJyXZN0mAk4GbgY3AWe00ZwFXtMMbgbVJ9k5yFLAauGbMNUvSorZ03Cusqm8kuRz4FrAD+DawAdgfuCzJOTSBckY7/Y1JLgNuaqc/t6p2jrtuSVrMxh4WAFX1VuCt05ofoNnL6Df9emD9qOuSJPXnN7glSZ0MC0lSJ8NCktTJsJAkdRoqLJI8Z5g2SdKeadg9i78ask2StAea8aOzSZ5F8+3q5Ul+v2fUATRXi5UkLQJd37PYi+bLckuBx/a0/wh4+aiKkiTNLzOGRVV9Gfhykour6vtjqkmSNM8M+w3uvZNsAFb1zlNVLxhFUZKk+WXYsPgI8B7gvYDXZZKkRWbYsNhRVe8eaSWSpHlr2I/OfjLJ7yQ5PMnBu/5GWpkkad4Yds9i1+9MvKWnrYCj57YcSdJ8NFRYVNVRoy5EkjR/DRUWSc7s115VH5jbciRJ89Gwh6Ge2TP8GJofKfoWYFhI0iIw7GGoN/TeTnIg8LcjqUiSNO/M9hLlPwFWz2UhkqT5a9hzFp+k+fQTNBcQfApw2aiKkiTNL8Oes/iLnuEdwPerausI6pEkzUNDHYZqLyj4PZorzx4E/GyURUmS5pdhfynvFcA1wBnAK4BvJPES5ZK0SAx7gvtPgGdW1VlVdSZwIvBfZrvSJI9LcnmS7yW5Ocmz2kuIXJnk1vb/QT3TX5BkKsktSU6Z7XolSbMzbFg8qqq29dz+wSOYt593AJ+rqicDTwVuBs4HNlXVamBTe5skxwFrgeOBU4GLkvgrfZI0RsNu8D+X5PNJXpvktcCngc/MZoVJDgB+DXgfQFX9rKr+BTgNuKSd7BLg9Hb4NODSqnqgqm4Dpmj2bCRJY9L1G9zHAIdV1VuS/CbwXCDA14EPznKdRwPbgb9J8lTgWuCN7XruAqiqu5Ic2k6/Ari6Z/6tbZskaUy69izeDvwYoKo+VlW/X1VvptmrePss17kUeDrw7qp6GnA/7SGnAdKnrfq0kWRdks1JNm/fvn2W5UmSpusKi1VVdf30xqraTPMTq7OxFdhaVd9ob19OEx53JzkcoP2/rWf6I3vmXwnc2W/BVbWhqtZU1Zrly5fPsjxJ0nRdYfGYGcbtM5sVVtU/AXckObZtOhm4CdjIz3834yzginZ4I7A2yd5JjqK5zMg1s1m3JGl2ur7B/c0kv1VVf93bmOQcmnMNs/UG4INJ9gK2AGfTBNdl7bJvp/lOB1V1Y5LLaAJlB3BuVfk74JI0Rl1h8Sbg40lexc/DYQ2wF/Cy2a60qq5rlzPdyQOmXw+sn+36JEm7Z8awqKq7gWcneT5wQtv86ar64sgrkyTNG8P+nsVVwFUjrkWSNE/tzrewJUmLhGEhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6jSxsEiyJMm3k3yqvX1wkiuT3Nr+P6hn2guSTCW5Jckpk6pZkharSe5ZvBG4uef2+cCmqloNbGpvk+Q4YC1wPHAqcFGSJWOuVZIWtYmERZKVwEuA9/Y0nwZc0g5fApze035pVT1QVbcBU8CJYypVksTk9izeDvwR8GBP22FVdRdA+//Qtn0FcEfPdFvbNknSmIw9LJL8BrC
tqq4ddpY+bTVg2euSbE6yefv27bOuUZL0cJPYs3gO8NIk/wBcCrwgyf8G7k5yOED7f1s7/VbgyJ75VwJ39ltwVW2oqjVVtWb58uWjql+SFp2xh0VVXVBVK6tqFc2J6y9W1auBjcBZ7WRnAVe0wxuBtUn2TnIUsBq4ZsxlS9KitnTSBfS4ELgsyTnA7cAZAFV1Y5LLgJuAHcC5VbVzcmVK0uIz0bCoqi8BX2qHfwCcPGC69cD6sRUmSXoYv8EtSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSp09jDIsmRSa5KcnOSG5O8sW0/OMmVSW5t/x/UM88FSaaS3JLklHHXLEmL3ST2LHYAf1BVTwFOAs5NchxwPrCpqlYDm9rbtOPWAscDpwIXJVkygboladEae1hU1V1V9a12+MfAzcAK4DTgknayS4DT2+HTgEur6oGqug2YAk4ca9GStMhN9JxFklXA04BvAIdV1V3QBApwaDvZCuCOntm2tm2SpDGZWFgk2R/4KPCmqvrRTJP2aasBy1yXZHOSzdu3b5+LMiVJTCgskjyaJig+WFUfa5vvTnJ4O/5wYFvbvhU4smf2lcCd/ZZbVRuqak1VrVm+fPloipekRWgSn4YK8D7g5qr6y55RG4Gz2uGzgCt62tcm2TvJUcBq4Jpx1StJgqUTWOdzgNcA301yXdv2x8CFwGVJzgFuB84AqKobk1wG3ETzSapzq2rn2KuWpEVs7GFRVV+l/3kIgJMHzLMeWD+yoiRJM/Ib3JKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSp0yR+g1vzzM6dO5mamnro9jHHHMOSJUsmWFF/o6qzd7k7dzY/775rufP1vtDsTX8ewWQe54XyutvFsJjnxvGEmpqaYt27Ps1+y47g/nvuZMO5L+HYY4+dF0/m3hq2bNnC2z5zM/svf3idc7nc7bdex6P2PZBDVhy12+uYa+N4PObDYz5qvc93YGKP86DX3TCmP3+rRllpw7CY52bzhJrNC36/ZUdwwOOfOHDd923bygUvOZ6jjz566GXOtraZNuSPPfIpHPD4J1IPPsiWLVsemh542HK66uvtW+9y77vnTpbuf0jnOkbZ/2Fq3t1QHzTPMI/5I30MB913c7WcYfrYO++WLVvY95CfP9+HeZwH1fpI+zB9ml2vu94ahllOv9fFqBkWYzbMk27QE3v6E6p3/l6DXvC9y52+jl3vTHrX0bvu++65kz/7xHUcsuLegRuqQcufvq5dT/JBtQ0KiPvuufOhPt7/w3/izz7xfQ5Zce/D9gaAoTZy0/vWz6B1DHOfDtro7O4GvyvUZ3pDMexe2q51DHrMhwmU6WHcb29tmLoHLWfQYzBMDdM3roMe52H6POh+HHRfT5931+uut4ZhljPodTFKhsVueqTv6oZ50g16Yvc+oWDwRhH6v+CnL7drHdNfVPv1Ca2Z6p5pXV21DfNC2K9nY79rbwB42HJnuo+HeTfWbx3D3qeDDmf1ezc5bIgOOtzQb5nTN6LD7KVNX8egx7zfm4jp9/W+fe67QcuZ8TFYNvxjMEwN/Z5T/R7nYfrcez/2mmnPdfq8Xff1JAOil2Exze4cJhnmHfcwT7qZntj79by4Znqh9nvBT19u1zqGfcc9qO5Huq6ZXsyP1H5D3MdztfyuPg/aGM90P84UovCLe4CDltm7xzXsXlq/EB3mTcRMG8Ku5QwKuRkDcogg353DM8P2eZDekBtU97DrnVRA9FowYZHkVOAdwBLgvVV14SjWM8whnGHfoXXtKcBwT7qZPJIX6lza3brHadK1zma
j0xWiwy5z+h7XIENt/EY4zaCQm81zeS4f70eyrJn20Ea53nFZEGGRZAnwLuDfAVuBbybZWFU3jWJ9wxzCGfYd2ijeNQ+sex4+wdQYxWOzpz3eo9rLHJdh9tAWsgURFsCJwFRVbQFIcilwGjCSsLi/fYL+9J+38ah9D+w/zQ/u5EeP2buZ5oEHfj7cM/3AaeZg+KH65ni5C33Y+8X7ZaL3y5he/4OGm23X0+Z8mwgLJyxWAHf03N4K/Or0iZKsA9a1N+9Lcsss17cMuGeW8y5U9nlxWGx9Xmz95cl/9Qe72+e+nzhYKGGRPm2/cESwqjYAG3Z7Zcnmqlqzu8tZSOzz4rDY+rzY+guj6/NCuTbUVuDIntsrgYVzMFOSFriFEhbfBFYnOSrJXsBaYOOEa5KkRWNBHIaqqh1Jfhf4PM1HZ99fVTeOcJW7fShrAbLPi8Ni6/Ni6y+MqM+pcVyBSpK0oC2Uw1CSpAkyLCRJnRZ1WCQ5NcktSaaSnN9nfJK8sx1/fZKnT6LOuTJEf1/V9vP6JF9L8tRJ1DmXuvrcM90zk+xM8vJx1jcKw/Q5yfOSXJfkxiRfHneNc22I5/aBST6Z5Dttn8+eRJ1zJcn7k2xLcsOA8XO/7aqqRflHc6L8/wJHA3sB3wGOmzbNi4HP0nzP4yTgG5Oue8T9fTZwUDv8ooXc32H73DPdF4HPAC+fdN1jeJwfR3P1gye0tw+ddN1j6PMfA/+tHV4O/BDYa9K170affw14OnDDgPFzvu1azHsWD11CpKp+Buy6hEiv04APVONq4HFJDh93oXOks79V9bWq+uf25tU032dZyIZ5jAHeAHwU2DbO4kZkmD7/J+BjVXU7QFUt9H4P0+cCHpskwP40YbFjvGXOnar6Ck0fBpnzbddiDot+lxBZMYtpFopH2pdzaN6ZLGSdfU6yAngZ8J4x1jVKwzzOTwIOSvKlJNcmOXNs1Y3GMH3+n8BTaL7M+13gjVX14HjKm4g533YtiO9ZjMgwlxAZ6jIjC8TQfUnyfJqweO5IKxq9Yfr8duC8qtrZvOlc8Ibp81LgGcDJwD7A15NcXVV/P+riRmSYPp8CXAe8APgl4Mok/6eqfjTi2iZlzrddizkshrmEyJ50mZGh+pLkl4H3Ai+qqh+MqbZRGabPa4BL26BYBrw4yY6q+sRYKpx7wz6v76mq+4H7k3wFeCqwUMNimD6fDVxYzQH9qSS3AU8GrhlPiWM359uuxXwYaphLiGwEzmw/WXAScG9V3TXuQudIZ3+TPAH4GPCaBfwus1dnn6vqqKpaVVWrgMuB31nAQQHDPa+vAP5tkqVJ9qW5gvPNY65zLg3T59tp9qRIchhwLLCFPdecb7sW7Z5FDbiESJLfbse/h+bTMS8GpoCf0Lw7WZCG7O+fAocAF7XvtHfUAr5i55B93qMM0+equjnJ54DrgQdpfnmy70cwF4IhH+c/By5O8l2aQzTnVdWCvXR5kg8BzwOWJdkKvBV4NIxu2+XlPiRJnRbzYShJ0pAMC0lSJ8NCktTJsJAkdTIspAUmyelJjpvD5b02yRFztTztmQwLaZaSTOqj56cDfcNiljW9FjAsNCM/Oqs9Unu9oz+kucTB9cB/Bt5Pc8XR7TSfO7+X5gqlR1fVg+0X1G6huXrpE4B3tdP/BPitqvpekotpLuD2NOBbwIdpLhmyD/BT4OyquqVd1sU03xK+GVgFnFtVm5P8e+C/AnvTXC317Kq6b0A/LgReSnPRuy/QfGnyU23t9wL/AXgf8DXgOTRfxvo3wKeq6vJ2GfdV1f7t8B8Br6H5fsVngc1tnf/Y1v+sqvrpI7u3tShM+lK7/vk313/A8TQb/WXt7YOBTwJntbdfB3yiHb4CeH47/B9pvqAGsAlY3Q7/KvDFdvhimo31kvb2AcDSdviFwEfb4T8E/lc7fALNxn4NzSVFvgLs1447D/jTAf04uO3Hrjd1j+up4eU9030JuKjn9vTx97X/X0QTKvvuWn7
P/Gsm/bj5N7//Fu03uLVHewFwebXf0K2qHyZ5FvCb7fi/Bf57O/xhmpC4iuYyERcl2Z/mtz0+0nNxwb17lv+RqtrZDh8IXJJkNc1ezKPb9ucC72jXf0OS69v2k2gOIf1du+y9gK8P6MePgH8F3pvk0zQhNciHZxi3ywuBv6mqn7R1zXSJa+lhDAvtiUL3FTZ3jd8IvC3JwTRXYv0isB/wL1X1KwPmvb9n+M+Bq6rqZUlW0bxL31XDoNqurKpXdtRHNZexOJHmmkZrgd+lCcKumnbQno9sf79hr551e9xZs+IJbu2JNgGvSHIIQBsEX6PZ4AK8CvgqQDXnCq6h2Qv4VFXtrOay1bclOaOdPxn8E7MH0hzvh+ZE8S5fBV7Rzn8czXkEaH5U6jlJjmnH7ZvkSf0W3O7hHFhVnwHeBPxKO+rHwGNn6P8/0AQfND+Cs2tv5wvA69rzKbvul2GWJxkW2vNU1Y3AeuDLSb4D/CXwe8DZ7eGg1wBv7Jnlw8CrefihnFcB57Tz30j/X9iD5nDW25L8Hc1F7Ha5CFjeru88mpPs91bVdppQ+VA77mqak+D9PBb4VDvdl4E3t+2XAm9J8u0kv9Rnvr8Gfj3JNTTnW+5v75fP0exJbU5yHc15FWjOcbwnzW9y7zOgFi1yfhpKGoEkS4BHV9W/thv0TcCTqvnZT2nB8ZyFNBr7AlcleTTNuYLXGxRayNyzkOaBJB8HjprWfF5VfX4S9UjTGRaSpE6e4JYkdTIsJEmdDAtJUifDQpLUybCQJHUyLCRJnf4/VGT/VIxV70gAAAAASUVORK5CYII=\n",
+      "text/plain": [
+       "<Figure size 432x288 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "sns.histplot(GOs[GOs['overlap'] != 'complete overlap']['coverage_struct'], bins=100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 255,
+   "id": "f628c3eb-422c-403d-b267-ec10489171df",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.6454013592969468"
+      ]
+     },
+     "execution_count": 255,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(GOs[GOs['overlap'] != 'complete overlap']['coverage_struct'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1bc8b46c-37e3-46b3-8ca8-e85ab8e47038",
+   "metadata": {},
+   "source": [
+    "Of all proteins whose annotations do not overlap completely, the mean coverage of overlapping GO terms within the GO terms of the *structural* annotations is around 65 %. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 256,
+   "id": "582eaae9-25f8-4be3-99f0-aef7a0c438a0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<AxesSubplot:xlabel='coverage_seq', ylabel='Count'>"
+      ]
+     },
+     "execution_count": 256,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAXwUlEQVR4nO3de5BcZ33m8e9jyTbYxka2ZMfINhIgDDK7BCIcc1kKcCo2sLUGChOxXLSsa1VJHG7JZrGzVSFbKRXOFkUBWQxxOSyCZTHiEiwwAbzCQAjYRoC5yIrWs3KwFSuWuISLk5hI/PaPPgPtcfecntF094z6+6ma6tNvn+7zvqen36ffc7rfTlUhSdJsjhl3BSRJi59hIUlqZVhIkloZFpKkVoaFJKnV8nFXYFhWrlxZa9asGXc1JGlJ+epXv/rdqlo1s/yoDYs1a9awc+fOcVdDkpaUJN/pVe5hKElSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUaWlgkeXeSA0m+3VV2apIbk9zRXK7ouu3KJFNJ9iS5qKv8V5J8q7nt7UkyrDpLknob5sjiPcDFM8quAHZU1TpgR3OdJOuBjcB5zX2uTrKsuc87gc3AuuZv5mNK0sQ6fPgwe/bs+fnf4cOHh7KdoYVFVX0B+P6M4kuArc3yVuAFXeXXVdX9VXUnMAWcn+RM4OSq+nJ1fqXpvV33kaSJNzU1xeZ33MDrP/h1Nr/jBqampoaynVFP93FGVe0HqKr9SU5vylcDN3ett68p+5dmeWZ5T0k20xmFcM455yxgtSVp8Tpx5SM4+ZceOdRtLJYT3L3OQ9Qs5T1V1TVVtaGqNqxa9aB5sCRJ8zTqsLi3ObREc3mgKd8HnN213lnAPU35WT3KJUkjNOqw2A5sapY3Add3lW9McnyStXROZN/aHLL6cZILmk9BvbLrPpKkERnaOYskHwCeBaxMsg94I3AVsC3JZcBdwKUAVbUryTbgduAQcHlVTZ/S/y06n6x6KPCXzZ8kaYSGFhZV9dI+N13YZ/0twJYe5TuBJyxg1SRJc7RYTnBLkhYxw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtxhIWSV6fZFeSbyf5QJKHJDk1yY1J7mguV3Stf2WSqSR7klw0jjpL0iQbeVgkWQ28BthQVU8AlgEbgSuAHVW1DtjRXCfJ+ub284CLgauTLBt1vSVpko3rMNRy4KFJlgMnAPcAlwBbm9u3Ai9oli8Brquq+6vqTmAKOH+01ZWkyTbysKiqvwPeDNwF7Ad+WFWfAc6oqv3NOvuB05u7rAbu7nqIfU3ZgyTZnGRnkp0HDx4cVhMkaeKM4zDUCjqjhbXAI4ATk7x8trv0KKteK1bVNVW1oao2rFq16sgrK0kCxnMY6teAO6vqYFX9C/BR4GnAvUnOBGguDzTr7wPO7rr/WXQOW0mSRmQcYXEXcEGSE5IEuBDYDWwHNjXrbAKub5a3AxuTHJ9kLbAOuHXEdZakibZ81BusqluSfBj4GnAI+DpwDXASsC3JZXQC5dJm/V1JtgG3N+tfXlWHR11vSZpkIw8LgKp6I/DGGcX30xll9Fp/C7Bl2PWSJPXmN7glSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUaS1gkeXi
SDyf5myS7kzw1yalJbkxyR3O5omv9K5NMJdmT5KJx1FmSJtm4RhZvAz5VVY8DngjsBq4AdlTVOmBHc50k64GNwHnAxcDVSZaNpdaSNKFGHhZJTgaeCfw5QFX9tKr+AbgE2NqsthV4QbN8CXBdVd1fVXcCU8D5o6yzJE26cYwsHgUcBP5nkq8nuTbJicAZVbUfoLk8vVl/NXB31/33NWUPkmRzkp1Jdh48eHB4LZCkCTOOsFgOPBl4Z1U9CbiP5pBTH+lRVr1WrKprqmpDVW1YtWrVkddUkgSMJyz2Afuq6pbm+ofphMe9Sc4EaC4PdK1/dtf9zwLuGVFdJUkMGBZJnj5I2SCq6u+Bu5Oc2xRdCNwObAc2NWWbgOub5e3AxiTHJ1kLrANunc+2JUnzs3zA9f6Uzrv/trJBvRp4f5LjgL3Aq+gE17YklwF3AZcCVNWuJNvoBMoh4PKqOjzP7UqS5mHWsEjyVOBpwKokv9t108nAvD++WlW3ARt63HRhn/W3AFvmuz1J0pFpG1kcB5zUrPewrvIfAS8eVqUkSYvLrGFRVZ8HPp/kPVX1nRHVSZK0yAx6zuL4JNcAa7rvU1XPGUalJEmLy6Bh8SHgXcC1gCeXJWnCDBoWh6rqnUOtiSRp0Rr0S3kfT/LbSc5sZoc9NcmpQ62ZJGnRGHRkMf1lud/vKis68zxJko5yA4VFVa0ddkUkSYvXQGGR5JW9yqvqvQtbHUnSYjToYaindC0/hM43rb8GGBaSNAEGPQz16u7rSU4B3jeUGkmSFp35TlH+j3Rmf5UkTYBBz1l8nF/84NAy4PHAtmFVSpK0uAx6zuLNXcuHgO9U1b4h1EeStAgNdBiqmVDwb+jMPLsC+OkwKyVJWlwG/aW8l9D5dbpLgZcAtyRxinJJmhCDHob6r8BTquoAQJJVwP+h8/vZkqSj3KCfhjpmOiga35vDfSVJS9ygI4tPJfk08IHm+m8AnxxOlSRJi03bb3A/Bjijqn4/yYuAZwABvgy8fwT1kyQtAm2Hkt4K/Bigqj5aVb9bVa+nM6p463CrJklaLNrCYk1VfXNmYVXtpPMTq5KkCdAWFg+Z5baHLmRFJEmLV1tYfCXJf5pZmOQy4KvDqZIkabFp+zTU64C/SPIyfhEOG4DjgBcOsV6SpEVk1rCoqnuBpyV5NvCEpviGqvrs0GsmSVo0Bv09i5uAm4ZcF0nSIuW3sCVJrQwLSVIrw0KS1MqwkCS1MiwkSa3GFhZJliX5epJPNNdPTXJjkjuayxVd616ZZCrJniQXjavOkjSpxjmyeC2wu+v6FcCOqloH7Giuk2Q9sBE4D7gYuDrJshHXVZIm2ljCIslZwPOBa7uKLwG2NstbgRd0lV9XVfdX1Z3AFHD+iKoqSWJ8I4u3Av8F+FlX2RlVtR+guTy9KV8N3N213r6m7EGSbE6yM8nOgwcPLnilJWlSjTwskvxb4EBVDToRYXqUVa8Vq+qaqtpQVRtWrVo17zpKkh5o0J9VXUhPB/5dkufRmQL95CT/C7g3yZlVtT/JmcD0b37vA87uuv9ZwD0jrbEkTbiRjyyq6sqqOquq1tA5cf3Zqno5sB3Y1Ky2Cbi+Wd4ObExyfJK1wDrg1hFXW5Im2jhGFv1cBWxrfivjLuBSgKralWQbcDtwCLi8qg6Pr5qSNHnGGhZV9Tngc83y94AL+6y3BdgysopJkh7Ab3BLkloZFpKkVoaFJKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlqZVhIkloZFpKkVoaFJKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlqZVhIkloZFpKkVoaFJKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWo18rBIcnaSm5LsTrIryWub8lOT3JjkjuZyRdd9rkwylWRPkotGXWdJmnTjGFkcAn6vqh4PXABcnmQ9cAWwo6rWATua6zS
3bQTOAy4Grk6ybAz1lqSJNfKwqKr9VfW1ZvnHwG5gNXAJsLVZbSvwgmb5EuC6qrq/qu4EpoDzR1ppSZpwYz1nkWQN8CTgFuCMqtoPnUABTm9WWw3c3XW3fU2ZJGlExhYWSU4CPgK8rqp+NNuqPcqqz2NuTrIzyc6DBw8uRDUlSYwpLJIcSyco3l9VH22K701yZnP7mcCBpnwfcHbX3c8C7un1uFV1TVVtqKoNq1atGk7lJWkCjePTUAH+HNhdVW/pumk7sKlZ3gRc31W+McnxSdYC64BbR1VfSRIsH8M2nw68AvhWktuasj8ArgK2JbkMuAu4FKCqdiXZBtxO55NUl1fV4ZHXWpIm2MjDoqq+SO/zEAAX9rnPFmDL0ColSZqV3+CWJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa3G8RvckqQjcPjwYaampgDYu3cvVcPfpmEhSUvM1NQUm99xAyeufAQH77iNh539+KFv07CQpDHrHikAPOYxj2HZsmWz3ufElY/g5F96JD/57j3Drh5gWEjS2HWPFH5yYB9XPv88HvWoRwG/CI5xHHrqZlhI0iLQPVL4o4/dxmmrf/iA4Ni7dy9v+uRuTlo1ukNP3QwLSWM3n8Mw4zKKup542oODYzogRnnoqZthIQ1gKXVmS1H3YZj7vnsP11z+fM4999xxV6unharroIeVuoNjnAwLzVn3P/nhw4cBft5xzqUTXUodcL8Ool8b5lo+KWZr//RhmFFvdz7a6jrI9sbxiaYjYVhMkIV6wcz8Jz/mhFM4bfXaOb/Lmus7tHF3tNMdRP3sZ+zduxfgAceRu9vQr23jfAc97v0H43vOR7HfZ44Upv8v+p2whtF/oulIGBaLxCheyAv5gun+J19+0mnzfpfVqwOeuc5C1X+hRkT3ff/v+aOPfedBx5FnhsgJp/V+9zmud9BzHR0Nso3u/dhvn87sRPvtl2n9wri70+233dme10H2+yAjwn6HjHqNFNpOWI/6E01HwrCYh2F07AvZkfd7MXe/ULtfkEfScXbr1+EP0rbuDrhfR9avo5n5fPRrT78RUb9OaLZ90es4cq8Qmc/+6meQsGvb122jo0H2Rb/92G+U2e9wy8w6THec/cJ45sneXtvt97x2P/7M/T7zNdI2Upztue03UpjthPVSYVjMwyCd35F8yWbQjny2UOj+iF33C2n6n3PmC7LXi3y2x+/1jqhfh9+vbTMf58QeQTBoRzPdXmDWDuOE0x48IurXCc0ntOdyMnKQgBzkeR1kX8/c33PtkAfdj9PL/UZZg4Zrv/14Yst2Z3tee/3vz/x/GWSk2F2n2f6f+1ksJ6znyrCYp7Yhbb8v2fQbMs/2Qu71ggVaQ6HXi+cBbejxwuvXCfcLnQftlx4jl7ZOaqb5djTTz8cgHUa/es+2L+Z7yGCQgJzLfh+0nr06xSPpkOfyjnjQUdawO862NsKD/1/m0oa5jiaXMsNihrmOCGY7lNDrSzazDZ/7vZB7vWCBgUNhLmZ7xzmXx5/Pu8ZB7ttt0I5mvh3SQnUE8+10Bt3vg+xr4Ig75Lnux6X6DrrbIG04Gto5CMNihkFGBP3evc128mqQ4fMgFvLF37aNUXcuw6jDkRrlvjga9peOXksmLJJcDLwNWAZcW1VXDWtbg4wI+r1TXqonryRpNkvix4+SLAPeATwXWA+8NMn6UWx7OggeuuL0BywPsr4kHS2WysjifGCqqvYCJLkOuAS4fRgbu68Zyv/TDw5wzP3386OHHL/olhd7/dwvi2vZ/TIZ+6XTdz1pGN3ikgmL1cDdXdf3Ab86c6Ukm4HNzdWfJNkzz+2tBL47z/suVbZ5MkxamyetvTzuT3/vSNvc82OeSyUs0qPsQR9krKprgGuOeGPJzqracKSPs5TY5skwaW2etPbC8Nq8JM5
Z0BlJnN11/SzAj31I0ogslbD4CrAuydokxwEbge1jrpMkTYwlcRiqqg4l+R3g03Q+Ovvuqto1xE0e8aGsJcg2T4ZJa/OktReG1ObUUpr2UJI0FkvlMJQkaYwMC0lSq4kOiyQXJ9mTZCrJFT1uT5K3N7d/M8mTx1HPhTJAe1/WtPObSb6U5InjqOdCamtz13pPSXI4yYtHWb9hGKTNSZ6V5LYku5J8ftR1XGgD/G+fkuTjSb7RtPlV46jnQkny7iQHkny7z+0L33dV1UT+0TlR/v+ARwHHAd8A1s9Y53nAX9L5nscFwC3jrveQ2/s0YEWz/Nyl3N5B29y13meBTwIvHne9R/A8P5zO7AfnNNdPH3e9R9DmPwD+pFleBXwfOG7cdT+CNj8TeDLw7T63L3jfNckji59PIVJVPwWmpxDpdgnw3uq4GXh4kjNHXdEF0treqvpSVf2guXozne+zLGWDPMcArwY+AhwYZeWGZJA2/3vgo1V1F0BVLfV2D9LmAh6WJMBJdMLi0GiruXCq6gt02tDPgvddkxwWvaYQWT2PdZaKubblMjrvTJay1jYnWQ28EHjXCOs1TIM8z48FViT5XJKvJnnlyGo3HIO0+X8Aj6fzZd5vAa+tqp+NpnpjseB915L4nsWQDDKFyEDTjCwRA7clybPphMUzhlqj4RukzW8F3lBVhztvOpe8Qdq8HPgV4ELgocCXk9xcVf932JUbkkHafBFwG/Ac4NHAjUn+qqp+NOS6jcuC912THBaDTCFyNE0zMlBbkvxr4FrguVX1vRHVbVgGafMG4LomKFYCz0tyqKo+NpIaLrxB/6+/W1X3Afcl+QLwRGCphsUgbX4VcFV1DuhPJbkTeBxw62iqOHIL3ndN8mGoQaYQ2Q68svlkwQXAD6tq/6grukBa25vkHOCjwCuW8LvMbq1trqq1VbWmqtYAHwZ+ewkHBQz2f3098G+SLE9yAp0ZnHePuJ4LaZA230VnJEWSM4Bzgb0cvRa875rYkUX1mUIkyW82t7+LzqdjngdMAf9I593JkjRge/8QOA24unmnfaiW8IydA7b5qDJIm6tqd5JPAd8Efkbnlyd7fgRzKRjwef5j4D1JvkXnEM0bqmrJTl2e5APAs4CVSfYBbwSOheH1XU73IUlqNcmHoSRJAzIsJEmtDAtJUivDQpLUyrCQJLUyLKQFlmRiP5Kuo5dhoYmS5JXNlM3fSPK+JI9MsqMp25HknGY6679NckxznxOS3J3k2CSPTvKpZk6lv0ryuGad9yR5S5KbgD9Jcn4zzfvXm8tzux5rW7O9Dya5JcmG5rZfT/LlJF9L8qEkJ83SjquS3N48zpubslVJPpLkK83f05vy05J8pqnLnyX5TpKVQ97VOtqMe6pd//wb1R9wHrAHWNlcPxX4OLCpuf4fgY81y9cDz26Wf4POF9cAdgDrmuVfBT7bLL8H+ASwrLl+MrC8Wf414CPN8n8G/qxZfgKdmU830Jlq5AvAic1tbwD+sE87Tm3aMf09qYc3l/8beEazfA6wu1l++/RjAc+nM0fQynE/H/4trT+Hy5okzwE+XM03d6vq+0meCryouf19wH9vlj9IJyRuojN9xNXNO/2nAR/qmnTw+K7H/1BVHW6WTwG2JllHp3M+til/BvC2ZvvfTvLNpvwCYD3w181jHwd8uU87fgT8M3BtkhvohBR0Qml9V91OTvIwOr998KJmmzck+QHSHBkWmiShfebN6du3A29KciqdGVo/C5wI/ENV/XKf+97XtfzHwE1V9cIka4DPddWhX91urKqXttSP6kxvcT6duY42Ar9DJwiPAZ5aVf/0gAfuhIdTNeiIeM5Ck2QH8JIkpwE0QfAlOh0uwMuALwJU1U/ozEj6NuATVXW4OtNZ35nk0ub+Sf+fnj0F+Ltm+T90lX8ReElz//XAv2rKbwaenuQxzW0nJHlsrwduRjinVNUngdcBv9zc9Bk6wTG93nT5F5q2keS5wIo+dZb6Miw
0MapqF7AF+HySbwBvAV4DvKo5HPQK4LVdd/kg8PLmctrLgMua+++i9y/vQedw1puS/DWdye2mXQ2sarb3BjqT+f2wqg7SCZUPNLfdTGcK7V4eBnyiWe/zwOub8tcAG5qT3rcDv9mU/zfgmUm+Bvw6nRlYpTlxIkFphJIsA46tqn9O8mg6o53HVufnQEdVh78FNtQSnnVVo+c5C2m0TgBuSnIsnfMUvzXKoJDmy5GFtIgl+Qtg7YziN1TVp8dRH00uw0KS1MoT3JKkVoaFJKmVYSFJamVYSJJaGRaSpFb/H+UZtnjiMMfNAAAAAElFTkSuQmCC\n",
+      "text/plain": [
+       "<Figure size 432x288 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "sns.histplot(GOs[GOs['overlap'] != 'complete overlap']['coverage_seq'], bins=100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 257,
+   "id": "cc844026-2562-4ee2-a107-95a68c627a40",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.6510767765619442"
+      ]
+     },
+     "execution_count": 257,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(GOs[GOs['overlap'] != 'complete overlap']['coverage_seq'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b6659ef1-6b7c-4a5b-b50e-0ff4ab593783",
+   "metadata": {},
+   "source": [
+    "Of all proteins whose annotations do not overlap completely, the mean coverage of overlapping GO terms within the GO terms of the *sequence* annotations is again around 65 %. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2531a431-6d03-4c4b-bb22-86ef4755c6ce",
+   "metadata": {},
+   "source": [
+    "## Check semantic similarity of GO terms of sequence-structure protein pairs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4081cb1c-443d-4dad-a32d-a4c28a4c2d94",
+   "metadata": {},
+   "source": [
+    "Create a table that can be used as an input for GOGO (https://www.nature.com/articles/s41598-018-33219-y):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 258,
+   "id": "3d0dbc57-a86a-4898-9bc0-61970b379817",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOs_GOGO = GOs[['GOs_struct', 'GOs_seq']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 259,
+   "id": "c13c5605-d293-4558-a9d1-5551ef57fb1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# iterate over each row and each column\n",
+    "for i, row in GOs_GOGO.iterrows():\n",
+    "    for col in GOs_GOGO.columns:\n",
+    "        # separate the elements in the list with a space\n",
+    "        GOs_GOGO.at[i, col] = \" \".join(str(x) for x in row[col])\n",
+    "        # add the index of the row to the beginning of the list\n",
+    "        GOs_GOGO.at[i, col] = str(i) + \" \" + GOs_GOGO.at[i, col]\n",
+    "\n",
+    "# save the dataframe to a txt file without the header and index\n",
+    "GOs_GOGO.to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/GOGO_input.txt\", sep=\";\", index=False, header=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5958c38f-58b8-4c2e-9fe3-f5ffceba9da0",
+   "metadata": {},
+   "source": [
+    "I ran GOGO locally: `perl gene_pair_comb.pl ~/Desktop/GOGO_input.txt ~/Desktop/GOGO_input_result.txt`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 260,
+   "id": "101e4b9a-e315-406c-8cee-ba1115e55208",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOGO_result = pd.read_csv('/g/arendt/Fabian/PhD/Computational/Spongefold/GOGO_input_result.txt', sep=';', header=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 261,
+   "id": "23985e19-becf-40ed-8047-cbcd3b1258f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOGO_result.rename(columns={0 :'GOs_struct', 1 :'GOs_seq'}, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 262,
+   "id": "854e6e6d-5834-418e-8fcc-24c33846a5ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOGO_result['GOs_struct'] = GOGO_result['GOs_struct'].str.split(' ').str[1:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 263,
+   "id": "60e1b05f-761a-4153-a38f-586143883f34",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOGO_result['GOs_seq'] = GOGO_result['GOs_seq'].str.split(' ').str[1:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 264,
+   "id": "86603707-9eab-4fee-ade7-42077ed24068",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create the new columns (BPO, CCO, MFO) in the dataframe\n",
+    "GOGO_result['BPO'] = None\n",
+    "GOGO_result['CCO'] = None\n",
+    "GOGO_result['MFO'] = None\n",
+    "\n",
+    "# iterate over each row in the dataframe\n",
+    "for i, row in GOGO_result.iterrows():\n",
+    "    # take the 5th-last, 3rd-last and last elements of the 'GOs_seq' list\n",
+    "    # and store them in the BPO, CCO and MFO columns, respectively\n",
+    "    GOGO_result.at[i, 'BPO'] = row['GOs_seq'][-5]\n",
+    "    GOGO_result.at[i, 'CCO'] = row['GOs_seq'][-3]\n",
+    "    GOGO_result.at[i, 'MFO'] = row['GOs_seq'][-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 265,
+   "id": "fe401867-0ab5-46df-88ba-4b560b00cf7d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GOs_struct</th>\n",
+       "      <th>GOs_seq</th>\n",
+       "      <th>BPO</th>\n",
+       "      <th>CCO</th>\n",
+       "      <th>MFO</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>[GO:0000902, GO:0000904, GO:0001654, GO:000174...</td>\n",
+       "      <td>[GO:0000902, GO:0000904, GO:0001654, GO:000174...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[GO:0003674, GO:0005215]</td>\n",
+       "      <td>[GO:0000166, GO:0003674, GO:0003676, GO:000372...</td>\n",
+       "      <td>NA</td>\n",
+       "      <td>NA</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[GO:0001539, GO:0003674, GO:0003774, GO:000377...</td>\n",
+       "      <td>[GO:0001539, GO:0003674, GO:0003774, GO:000377...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>[GO:0000323, GO:0001959, GO:0002682, GO:000367...</td>\n",
+       "      <td>[GO:0000323, GO:0001959, GO:0002682, GO:000367...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>[GO:0000166, GO:0000323, GO:0001882, GO:000188...</td>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11627</th>\n",
+       "      <td>[GO:0000003, GO:0000165, GO:0000166, GO:000156...</td>\n",
+       "      <td>[GO:0000003, GO:0000165, GO:0001654, GO:000170...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11628</th>\n",
+       "      <td>[GO:0003674, GO:0003676, GO:0003723, GO:000548...</td>\n",
+       "      <td>[GO:0003674, GO:0003676, GO:0003723, GO:000548...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11629</th>\n",
+       "      <td>[GO:0000012, GO:0000166, GO:0000228, GO:000072...</td>\n",
+       "      <td>[GO:0000012, GO:0000166, GO:0000228, GO:000072...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11630</th>\n",
+       "      <td>[GO:0000003, GO:0000578, GO:0001700, GO:000300...</td>\n",
+       "      <td>[GO:0000003, GO:0000578, GO:0001700, GO:000300...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11631</th>\n",
+       "      <td>[GO:0000151, GO:0000209, GO:0000226, GO:000367...</td>\n",
+       "      <td>[GO:0000151, GO:0000209, GO:0003674, GO:000382...</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "      <td>1.000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>11632 rows × 5 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              GOs_struct  \\\n",
+       "0      [GO:0000902, GO:0000904, GO:0001654, GO:000174...   \n",
+       "1                               [GO:0003674, GO:0005215]   \n",
+       "2      [GO:0001539, GO:0003674, GO:0003774, GO:000377...   \n",
+       "3      [GO:0000323, GO:0001959, GO:0002682, GO:000367...   \n",
+       "4      [GO:0000166, GO:0000323, GO:0001882, GO:000188...   \n",
+       "...                                                  ...   \n",
+       "11627  [GO:0000003, GO:0000165, GO:0000166, GO:000156...   \n",
+       "11628  [GO:0003674, GO:0003676, GO:0003723, GO:000548...   \n",
+       "11629  [GO:0000012, GO:0000166, GO:0000228, GO:000072...   \n",
+       "11630  [GO:0000003, GO:0000578, GO:0001700, GO:000300...   \n",
+       "11631  [GO:0000151, GO:0000209, GO:0000226, GO:000367...   \n",
+       "\n",
+       "                                                 GOs_seq    BPO    CCO    MFO  \n",
+       "0      [GO:0000902, GO:0000904, GO:0001654, GO:000174...  1.000  1.000  1.000  \n",
+       "1      [GO:0000166, GO:0003674, GO:0003676, GO:000372...     NA     NA  1.000  \n",
+       "2      [GO:0001539, GO:0003674, GO:0003774, GO:000377...  1.000  1.000  1.000  \n",
+       "3      [GO:0000323, GO:0001959, GO:0002682, GO:000367...  1.000  1.000  1.000  \n",
+       "4      [GO:0005575, GO:0005622, GO:0005623, GO:000573...  1.000  1.000  1.000  \n",
+       "...                                                  ...    ...    ...    ...  \n",
+       "11627  [GO:0000003, GO:0000165, GO:0001654, GO:000170...  1.000  1.000  1.000  \n",
+       "11628  [GO:0003674, GO:0003676, GO:0003723, GO:000548...  1.000  1.000  1.000  \n",
+       "11629  [GO:0000012, GO:0000166, GO:0000228, GO:000072...  1.000  1.000  1.000  \n",
+       "11630  [GO:0000003, GO:0000578, GO:0001700, GO:000300...  1.000  1.000  1.000  \n",
+       "11631  [GO:0000151, GO:0000209, GO:0003674, GO:000382...  1.000  1.000  1.000  \n",
+       "\n",
+       "[11632 rows x 5 columns]"
+      ]
+     },
+     "execution_count": 265,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "GOGO_result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 266,
+   "id": "05de5391-becd-4500-bac2-450632bbf482",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(1.000    11161\n",
+       " NA         471\n",
+       " Name: BPO, dtype: int64,\n",
+       " 1.000    11159\n",
+       " NA         473\n",
+       " Name: CCO, dtype: int64,\n",
+       " 1.000    10090\n",
+       " NA        1542\n",
+       " Name: MFO, dtype: int64)"
+      ]
+     },
+     "execution_count": 266,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "GOGO_result['BPO'].value_counts(),GOGO_result['CCO'].value_counts(),GOGO_result['MFO'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3582746f-d5cb-4bd4-9163-7da1b4889d70",
+   "metadata": {},
+   "source": [
+    "This output is a little weird and concerning at first. In all cases where a semantic similarity (Biological Process, Cellular Component, Molecular Function) could be calculated at all (i.e. is not NA), the score is 1.000. How is this possible when we have a few cases where there is no overlap of GO terms whatsoever?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 267,
+   "id": "a60e24f5-45dd-4938-acfa-901fcfda11d2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GOs_struct</th>\n",
+       "      <th>GOs_seq</th>\n",
+       "      <th>overlap</th>\n",
+       "      <th>coverage_struct</th>\n",
+       "      <th>coverage_seq</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>737</th>\n",
+       "      <td>[GO:0000322, GO:0000323, GO:0000324, GO:000032...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1341</th>\n",
+       "      <td>[GO:0003197, GO:0003205, GO:0003279, GO:000727...</td>\n",
+       "      <td>[GO:0005575, GO:0005576, GO:0005615, GO:000562...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1715</th>\n",
+       "      <td>[GO:0000166, GO:0003674, GO:0003824, GO:000548...</td>\n",
+       "      <td>[GO:0005575, GO:0005618, GO:0005622, GO:000562...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2818</th>\n",
+       "      <td>[GO:0005575, GO:0005623, GO:0005886, GO:001602...</td>\n",
+       "      <td>[GO:0008150, GO:0009966, GO:0009967, GO:001064...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4012</th>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515, GO:001989...</td>\n",
+       "      <td>[GO:0000139, GO:0005575, GO:0005622, GO:000562...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4937</th>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515, GO:001990...</td>\n",
+       "      <td>[GO:0000228, GO:0000785, GO:0000790, GO:000557...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14652</th>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:0044464]</td>\n",
+       "      <td>[GO:0007275, GO:0007399, GO:0008150, GO:000998...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20744</th>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n",
+       "      <td>[GO:0003674, GO:0003824, GO:0003964, GO:000613...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21097</th>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n",
+       "      <td>[GO:0003674, GO:0003824, GO:0003924, GO:000548...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27083</th>\n",
+       "      <td>[GO:0003674, GO:0003824, GO:0006629, GO:000815...</td>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28309</th>\n",
+       "      <td>[GO:0000075, GO:0000077, GO:0000278, GO:000028...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35725</th>\n",
+       "      <td>[GO:0003008, GO:0005575, GO:0005623, GO:000727...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515, GO:0005516]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>36403</th>\n",
+       "      <td>[GO:0003674, GO:0003779, GO:0005488, GO:000551...</td>\n",
+       "      <td>[GO:0006355, GO:0008150, GO:0009889, GO:000989...</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38385</th>\n",
+       "      <td>[GO:0007154, GO:0007267, GO:0008150, GO:000960...</td>\n",
+       "      <td>[GO:0005575, GO:0005576]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38735</th>\n",
+       "      <td>[GO:0000322, GO:0000323, GO:0000324, GO:000032...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39987</th>\n",
+       "      <td>[GO:0000322, GO:0000323, GO:0000324, GO:000032...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>41192</th>\n",
+       "      <td>[GO:0005575, GO:0005622, GO:0005623, GO:000563...</td>\n",
+       "      <td>[GO:0003674, GO:0005488, GO:0005515, GO:0042802]</td>\n",
+       "      <td>no overlap</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              GOs_struct  \\\n",
+       "737    [GO:0000322, GO:0000323, GO:0000324, GO:000032...   \n",
+       "1341   [GO:0003197, GO:0003205, GO:0003279, GO:000727...   \n",
+       "1715   [GO:0000166, GO:0003674, GO:0003824, GO:000548...   \n",
+       "2818   [GO:0005575, GO:0005623, GO:0005886, GO:001602...   \n",
+       "4012   [GO:0003674, GO:0005488, GO:0005515, GO:001989...   \n",
+       "4937   [GO:0003674, GO:0005488, GO:0005515, GO:001990...   \n",
+       "14652   [GO:0005575, GO:0005622, GO:0005623, GO:0044464]   \n",
+       "20744  [GO:0005575, GO:0005622, GO:0005623, GO:000573...   \n",
+       "21097  [GO:0005575, GO:0005622, GO:0005623, GO:000573...   \n",
+       "27083  [GO:0003674, GO:0003824, GO:0006629, GO:000815...   \n",
+       "28309  [GO:0000075, GO:0000077, GO:0000278, GO:000028...   \n",
+       "35725  [GO:0003008, GO:0005575, GO:0005623, GO:000727...   \n",
+       "36403  [GO:0003674, GO:0003779, GO:0005488, GO:000551...   \n",
+       "38385  [GO:0007154, GO:0007267, GO:0008150, GO:000960...   \n",
+       "38735  [GO:0000322, GO:0000323, GO:0000324, GO:000032...   \n",
+       "39987  [GO:0000322, GO:0000323, GO:0000324, GO:000032...   \n",
+       "41192  [GO:0005575, GO:0005622, GO:0005623, GO:000563...   \n",
+       "\n",
+       "                                                 GOs_seq     overlap  \\\n",
+       "737                 [GO:0003674, GO:0005488, GO:0005515]  no overlap   \n",
+       "1341   [GO:0005575, GO:0005576, GO:0005615, GO:000562...  no overlap   \n",
+       "1715   [GO:0005575, GO:0005618, GO:0005622, GO:000562...  no overlap   \n",
+       "2818   [GO:0008150, GO:0009966, GO:0009967, GO:001064...  no overlap   \n",
+       "4012   [GO:0000139, GO:0005575, GO:0005622, GO:000562...  no overlap   \n",
+       "4937   [GO:0000228, GO:0000785, GO:0000790, GO:000557...  no overlap   \n",
+       "14652  [GO:0007275, GO:0007399, GO:0008150, GO:000998...  no overlap   \n",
+       "20744  [GO:0003674, GO:0003824, GO:0003964, GO:000613...  no overlap   \n",
+       "21097  [GO:0003674, GO:0003824, GO:0003924, GO:000548...  no overlap   \n",
+       "27083  [GO:0005575, GO:0005622, GO:0005623, GO:000573...  no overlap   \n",
+       "28309               [GO:0003674, GO:0005488, GO:0005515]  no overlap   \n",
+       "35725   [GO:0003674, GO:0005488, GO:0005515, GO:0005516]  no overlap   \n",
+       "36403  [GO:0006355, GO:0008150, GO:0009889, GO:000989...  no overlap   \n",
+       "38385                           [GO:0005575, GO:0005576]  no overlap   \n",
+       "38735               [GO:0003674, GO:0005488, GO:0005515]  no overlap   \n",
+       "39987               [GO:0003674, GO:0005488, GO:0005515]  no overlap   \n",
+       "41192   [GO:0003674, GO:0005488, GO:0005515, GO:0042802]  no overlap   \n",
+       "\n",
+       "       coverage_struct  coverage_seq  \n",
+       "737                0.0           0.0  \n",
+       "1341               0.0           0.0  \n",
+       "1715               0.0           0.0  \n",
+       "2818               0.0           0.0  \n",
+       "4012               0.0           0.0  \n",
+       "4937               0.0           0.0  \n",
+       "14652              0.0           0.0  \n",
+       "20744              0.0           0.0  \n",
+       "21097              0.0           0.0  \n",
+       "27083              0.0           0.0  \n",
+       "28309              0.0           0.0  \n",
+       "35725              0.0           0.0  \n",
+       "36403              0.0           0.0  \n",
+       "38385              0.0           0.0  \n",
+       "38735              0.0           0.0  \n",
+       "39987              0.0           0.0  \n",
+       "41192              0.0           0.0  "
+      ]
+     },
+     "execution_count": 267,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "GOs[GOs['overlap'] == 'no overlap']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d38d20e1-6644-4215-998d-6ba2213fcd2c",
+   "metadata": {},
+   "source": [
+    "Looking more closely at these cases, one realizes that the same few GO terms recur throughout, such as GO:0005575 or GO:0003674. This happens on both sides (GOs_struct and GOs_seq). There is something weird about it. Niko, please have a look!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 268,
+   "id": "1ff24373-d621-4e76-8c2b-3318a6f17577",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "protein_id                                                           63159\n",
+       "eggNOG_OGs_struct        COG5096@1|root,KOG1061@2759|Eukaryota,37I5T@33...\n",
+       "MSA size                                                            4182.0\n",
+       "alignment length                                                     841.0\n",
+       "query length                                                         790.0\n",
+       "seq. id.                                                             0.361\n",
+       "bit score                                                           2903.0\n",
+       "plddt                                                            82.705063\n",
+       "complete_protein                                                     False\n",
+       "Preferred_name_struct                                                    -\n",
+       "Description_struct       Subunit of clathrin-associated adaptor protein...\n",
+       "GOs_struct               GO:0003674,GO:0005488,GO:0005515,GO:0019899,GO...\n",
+       "eggNOG_OGs_seq           COG5096@1|root,KOG1061@2759|Eukaryota,38E7X@33...\n",
+       "score                                                                733.0\n",
+       "Preferred_name_seq                                                   AP4B1\n",
+       "Description_seq                                           clathrin binding\n",
+       "GOs_seq                  GO:0000139,GO:0005575,GO:0005622,GO:0005623,GO...\n",
+       "Name: 4012, dtype: object"
+      ]
+     },
+     "execution_count": 268,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "annotations_complete.iloc[4012]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "035867d4-3369-4e99-aaed-a99c2994e468",
+   "metadata": {},
+   "source": [
+    "This is an example in which the GO terms do not overlap. However, the descriptions are very similar and the OGs are identical up to the Eukaryota level."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 270,
+   "id": "34504af3-0021-4b2f-a856-e642ece7c88a",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "TypeError",
+     "evalue": "__init__() missing 2 required positional arguments: 'go2obj' and 'annots'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m/tmp/ipykernel_278/2809043968.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;31m# Create a TermCounts object to store information about the GO terms\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mterm_counts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTermCounts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0;31m# Calculate the information content of each GO term in the lists\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mTypeError\u001b[0m: __init__() missing 2 required positional arguments: 'go2obj' and 'annots'"
+     ]
+    }
+   ],
+   "source": [
+    "from goatools.semantic import TermCounts, get_info_content\n",
+    "\n",
+    "# Define your two lists of GO terms\n",
+    "list1 = ['GO:0000001', 'GO:0000002', 'GO:0000003']\n",
+    "list2 = ['GO:0000004', 'GO:0000005', 'GO:0000006']\n",
+    "\n",
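+    "# Create a TermCounts object to store information about the GO terms (FIXME: TermCounts requires the arguments `go2obj` and `annots`; calling it without them raises a TypeError)\n",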
+    "term_counts = TermCounts()\n",
+    "\n",
+    "# Calculate the information content of each GO term in the lists\n",
+    "ic1 = [get_info_content(go, term_counts) for go in list1]\n",
+    "ic2 = [get_info_content(go, term_counts) for go in list2]\n",
+    "\n",
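+    "# Combine the information content of the two lists as a sum of element-wise products (NOTE: this is not the Resnik measure, which is the information content of the most informative common ancestor of two terms)\n",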
+    "similarity = sum([ic1[i] * ic2[i] for i in range(len(list1))])\n",
+    "\n",
+    "# Print the semantic similarity score\n",
+    "print(similarity)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f388fa0c-5b4d-49ce-9b48-386d3f33acfe",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab