diff --git a/analysis/revision-GO_term_comparison.ipynb b/analysis/revision-GO_term_comparison.ipynb index 2ca4cc73c9106ff3d13f9959aa2af8780cf5a8be..50a4a24b5dacb903c6c535c8e1ea4db67ee9b2f6 100644 --- a/analysis/revision-GO_term_comparison.ipynb +++ b/analysis/revision-GO_term_comparison.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 237, + "execution_count": 109, "id": "99e6a23c-f2a3-4369-977e-4d2c74707d0c", "metadata": {}, "outputs": [ @@ -10,7 +10,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2022-12-14 17:26\n" + "2022-12-20 10:26\n" ] } ], @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 238, + "execution_count": 110, "id": "a6e66eaf-c7a8-4b26-a99b-5032d49c25a3", "metadata": {}, "outputs": [], @@ -40,6 +40,7 @@ "\n", "import numpy as np\n", "import pandas as pd\n", + "import itertools\n", "\n", "import matplotlib.pyplot as plt\n", "from matplotlib_venn import venn2, venn3\n", @@ -47,9 +48,19 @@ "import seaborn as sns" ] }, + { + "cell_type": "markdown", + "id": "36443851-596d-4f0e-913e-7fcdd312afef", + "metadata": {}, + "source": [ + "The reviews made it obvious that comparison of functional equivalence between pairs of proteins often are carried out on the level of GO terms. 
Therefore, we decided to conduct different analyses on GO terms, comparing the annotations of proteins given by structural or sequence similarity.\n", + "\n", + "First, we compare only the GO-term assignments between structural and sequence-based annotations:" + ] + }, { "cell_type": "code", - "execution_count": 239, + "execution_count": 111, "id": "0cd74ade-73c8-44f6-9d6e-f0f37bb760d3", "metadata": {}, "outputs": [], @@ -60,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 240, + "execution_count": 112, "id": "c3d289af-b99b-4403-af5b-a8fc15f8269f", "metadata": {}, "outputs": [], @@ -74,12 +85,12 @@ "id": "d3b9a268-6698-415f-a0c4-4f7e0fd515d8", "metadata": {}, "source": [ - "## Comparison of GO-terms between structural and sequence annotation" + "## Comparison of GO-terms between structural (MorF) and sequence based (EggNOG) annotation" ] }, { "cell_type": "code", - "execution_count": 241, + "execution_count": 113, "id": "63dd7ec5-8841-4bfe-9e38-db55f67c298d", "metadata": {}, "outputs": [], @@ -96,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 242, + "execution_count": 114, "id": "469c03ab-31cf-474c-b6eb-fc9f613015ba", "metadata": {}, "outputs": [], @@ -109,7 +120,7 @@ "id": "34f832e3-b005-464b-9788-07f758239276", "metadata": {}, "source": [ - "Now that I have all cases in which structure and sequence annotations actually produce GO-terms (via EggNOG-mapper), I can compare the overlap between those annotations on the level of GO-terms." + "Now that we have all cases in which structure and sequence annotations actually produce GO-terms, we can compare the overlap between those annotations on the level of GO-terms." 
] }, { @@ -119,17 +130,17 @@ "source": [ "There will be a couple of different levels:\n", "\n", - " - Complete overlap\n", + " - Complete overlap (GO terms are identical)\n", " - Partial overlap:\n", - " - Unique GO-terms on both sides\n", - " - All structure GO-terms in sequence GO-terms\n", - " - All sequence GO-terms in structure GO-terms\n", - " - NO overlap at all" + " - Unique GO-terms on both sides ('unique GOs')\n", + " - All structure GO-terms in sequence GO-terms ('Sequence GOs expanded')\n", + " - All sequence GO-terms in structure GO-terms ('Structure GOs expanded')\n", + " - No overlap" ] }, { "cell_type": "code", - "execution_count": 243, + "execution_count": 115, "id": "fe53ea5e-26de-46cf-ab69-44288f1a2b26", "metadata": {}, "outputs": [], @@ -139,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": 244, + "execution_count": 116, "id": "3761bb31-a522-40a0-8e86-c262b2b9e150", "metadata": {}, "outputs": [ @@ -147,13 +158,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_278/2049958699.py:1: SettingWithCopyWarning: \n", + "/tmp/ipykernel_125/2049958699.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " GOs['GOs_struct'] = GOs['GOs_struct'].str.split(',')\n", - "/tmp/ipykernel_278/2049958699.py:2: SettingWithCopyWarning: \n", + "/tmp/ipykernel_125/2049958699.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -169,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 245, + "execution_count": 117, "id": "1b45906f-bd6c-4424-93de-ffbef8fad67e", "metadata": {}, "outputs": [], @@ -194,14 +205,14 @@ " # Check if list2 contains all 
elements of list1 and more\n", " elif len(overlap) == len(list1) and len(overlap) < len(list2):\n", " return 'partial overlap - sequence GOs expanded'\n", - " # If none of the above, then overlap is partial but not unique or expanded\n", + " # If none of the above, then something is weird\n", " else:\n", - " return 'partial'" + " return 'something is weird'" ] }, { "cell_type": "code", - "execution_count": 246, + "execution_count": 118, "id": "90a4a44d-2931-4b87-a7a8-1fe677e8912b", "metadata": {}, "outputs": [], @@ -215,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 247, + "execution_count": 119, "id": "9c799814-04f9-49d4-9427-873c50e3a2cb", "metadata": {}, "outputs": [], @@ -229,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 248, + "execution_count": 120, "id": "bcc70854-7a7b-4bdc-b59f-c9244f9ffae8", "metadata": {}, "outputs": [ @@ -237,7 +248,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_278/1677403612.py:1: SettingWithCopyWarning: \n", + "/tmp/ipykernel_125/1677403612.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -252,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 249, + "execution_count": 121, "id": "948c120b-f680-4a9d-ba9e-11ddc15be017", "metadata": {}, "outputs": [ @@ -260,7 +271,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_278/2972605403.py:1: SettingWithCopyWarning: \n", + "/tmp/ipykernel_125/2972605403.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -275,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 250, + "execution_count": 122, "id": "ca29c96d-a8da-4c7c-bdf2-5ad1f380c4da", "metadata": {}, "outputs": [ @@ -283,7 +294,7 @@ "name": "stderr", "output_type": "stream", "text": [ - 
"/tmp/ipykernel_278/1789197005.py:1: SettingWithCopyWarning: \n", + "/tmp/ipykernel_125/1789197005.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -298,7 +309,7 @@ }, { "cell_type": "code", - "execution_count": 251, + "execution_count": 123, "id": "9195a8b5-f5e4-4e10-a8e6-fcf190fdd42f", "metadata": {}, "outputs": [ @@ -313,7 +324,7 @@ "Name: overlap, dtype: int64" ] }, - "execution_count": 251, + "execution_count": 123, "metadata": {}, "output_type": "execute_result" } @@ -324,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 252, + "execution_count": 124, "id": "6a4352ef-1326-4102-9ca6-799217ba6be4", "metadata": {}, "outputs": [], @@ -332,8 +343,8 @@ "color_reference = {\n", " 'complete overlap': cm.tab20.colors[0],\n", " 'partial overlap - unique GOs': cm.tab20.colors[18],\n", - " 'partial overlap - sequence GOs expanded': cm.tab20.colors[19],\n", - " 'partial overlap - structure GOs expanded': cm.tab20.colors[19],\n", + " 'partial overlap - sequence GOs expanded': cm.tab20.colors[7],\n", + " 'partial overlap - structure GOs expanded': cm.tab20.colors[7],\n", " 'no overlap': cm.tab20.colors[16]\n", "}\n", "\n", @@ -342,13 +353,13 @@ }, { "cell_type": "code", - "execution_count": 253, + "execution_count": 125, "id": "ae8bac54-f10d-4020-8019-6221c13b56b7", "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] @@ -368,15 +379,26 @@ "# explode = (0.1, 0, 0, 0, 0)\n", "\n", "fig, ax = plt.subplots()\n", - "piechart = ax.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=False, colors=colors)\n", + "piechart = ax.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=False, colors=colors, textprops={'fontsize': 14})\n", "ax.axis('equal');\n", "\n", - "#plt.savefig('./figures/analysis-sequence_structure_agreement.pdf')" + 
"plt.savefig('/g/arendt/Fabian/PhD/Computational/Spongefold/coffe-paper/figures/GO_count.svg', bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "markdown", + "id": "706a31e9-2a25-4fd0-82b4-be394b94453b", + "metadata": {}, + "source": [ + "In 60.6 % of cases, all proteins that have GO-annotations from both structural and sequence similarity based annotations are overlapping 100 %. This is close to the number of proteins that share the same *most specific* orthogroup, which is not surprising. \n", + "Another 39.2 % of all annotation pairs show at least partial overlap with 20.3 % of cases having unique GO terms on each side, 10.6 % of cases have additional GO terms on the sequence annotations side and 8.3 % of cases have additional GO terms in their strucure based annotation. Only in 0.1 % of cases (= 17 annotation pairs), the GO terms do not overlap whatsoever.\n", + "\n", + "We then wondered how much the annotation pairs in the 'partial overlap' category actually differ from each other." 
] }, { "cell_type": "code", - "execution_count": 254, + "execution_count": 126, "id": "ab2a9480-aaee-43d3-af23-ca05886b3695", "metadata": {}, "outputs": [ @@ -386,13 +408,13 @@ "<AxesSubplot:xlabel='coverage_struct', ylabel='Count'>" ] }, - "execution_count": 254, + "execution_count": 126, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEHCAYAAABfkmooAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZMUlEQVR4nO3dfbBkdX3n8ffHGUEeBIEZEGbQgTCiwMaoI8GHTVTcBTUlmBV3XBVEKlMxxKhJDJDsxmylpmR3Uyk1K7oTNWDWEhEfGJ+lRtQ1ijgoIg8S7g4RJhBm0AQFDe4M3/3jnMHm2n1Pc+d2971z36+qW/f07zx9f/1wPn3O6T6dqkKSpJk8atIFSJLmP8NCktTJsJAkdTIsJEmdDAtJUifDQpLUaemkCxiVZcuW1apVqyZdhiQtKNdee+09VbV8evseGxarVq1i8+bNky5DkhaUJN/v1+5hKElSJ8NCktTJsJAkdTIsJEmdDAtJUifDQpLUybCQJHXaY79nIUmLwc6dO5mamnro9jHHHMOSJUvmfD2GhSQtYFNTU6x716fZb9kR3H/PnWw49yUce+yxc74ew0KSFrj9lh3BAY9/4kjX4TkLSVInw0KS1MmwkCR1MiwkSZ0MC0lSp5GFRZL3J9mW5Iaetv+R5HtJrk/y8SSP6xl3QZKpJLckOaWn/RlJvtuOe2eSjKpmSVJ/o9yzuBg4dVrblcAJVfXLwN8DFwAkOQ5YCxzfznNRkl3fKnk3sA5Y3f5NX6YkacRGFhZV9RXgh9PavlBVO9qbVwMr2+HTgEur6oGqug2YAk5McjhwQFV9vaoK+ABw+qhqliT1N8lzFq8DPtsOrwDu6Bm3tW1b0Q5Pb5ckjdFEwiLJnwA7gA/uauozWc3QPmi565JsTrJ5+/btu1+oJAmYQFgkOQv4DeBV7aElaPYYjuyZbCVwZ9u+sk97X1W1oarWVNWa5cuXz23hkrSIjTUskpwKnAe8tKp+0jNqI7A2yd5JjqI5kX1NVd0F/DjJSe2noM4ErhhnzZKkEV5IMMmHgOcBy5JsBd5K8+mnvYEr20/AXl1Vv11VNya5DLiJ5vDUuVW1s13U62k+WbUPzTmOzyJJGquRhUVVvbJP8/tmmH49sL5P+2bghDksTZL0CPkNbklSJ8NCktTJsJAkdTIsJEmdDAtJUifDQpLUybCQJHUyLCRJnQwLSVInw0KS1MmwkCR1MiwkSZ0MC0lSJ8NCktTJsJAkdTIsJEmdDAtJUifDQpLUybCQJHUyLCRJnQwLSVInw0KS1MmwkCR1MiwkSZ1GFhZJ3p9kW5IbetoOTnJlklvb/wf1jLsgyVSSW5Kc0tP+jCTfbce9M0lGVbMkqb9R7llcDJw6re18YFNVrQY2tbdJchywFji+neeiJEvaed4NrANWt3/TlylJGrGRhUVVfQX44bTm04BL2uFLgNN72i+tqgeq6jZgCjgxyeHAAVX19aoq4AM980iSxmTc5ywOq6q7ANr/h7btK4A7eqbb2rataIent0uSxmi+nODudx6iZmjvv5BkXZLNSTZv3759zoqTpMVu3GFxd3toifb/trZ9K3Bkz3QrgTvb9pV92vuqqg1Vtaaq1ixfvnxOC5ekxWzcYbEROK
sdPgu4oqd9bZK9kxxFcyL7mvZQ1Y+TnNR+CurMnnkkSWOydFQLTvIh4HnAsiRbgbcCFwKXJTkHuB04A6CqbkxyGXATsAM4t6p2tot6Pc0nq/YBPtv+SZLGaGRhUVWvHDDq5AHTrwfW92nfDJwwh6VJkh6h+XKCW5I0jxkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSeo0kbBI8uYkNya5IcmHkjwmycFJrkxya/v/oJ7pL0gyleSWJKdMomZJWszGHhZJVgC/B6ypqhOAJcBa4HxgU1WtBja1t0lyXDv+eOBU4KIkS8ZdtyQtZpM6DLUU2CfJUmBf4E7gNOCSdvwlwOnt8GnApVX1QFXdBkwBJ463XEla3MYeFlX1j8BfALcDdwH3VtUXgMOq6q52mruAQ9tZVgB39Cxia9v2C5KsS7I5yebt27ePqguStOhM4jDUQTR7C0cBRwD7JXn1TLP0aat+E1bVhqpaU1Vrli9fvvvFSpKAyRyGeiFwW1Vtr6r/B3wMeDZwd5LDAdr/29rptwJH9sy/kuawlSRpTCYRFrcDJyXZN0mAk4GbgY3AWe00ZwFXtMMbgbVJ9k5yFLAauGbMNUvSorZ03Cusqm8kuRz4FrAD+DawAdgfuCzJOTSBckY7/Y1JLgNuaqc/t6p2jrtuSVrMxh4WAFX1VuCt05ofoNnL6Df9emD9qOuSJPXnN7glSZ0MC0lSJ8NCktTJsJAkdRoqLJI8Z5g2SdKeadg9i78ask2StAea8aOzSZ5F8+3q5Ul+v2fUATRXi5UkLQJd37PYi+bLckuBx/a0/wh4+aiKkiTNLzOGRVV9Gfhykour6vtjqkmSNM8M+w3uvZNsAFb1zlNVLxhFUZKk+WXYsPgI8B7gvYDXZZKkRWbYsNhRVe8eaSWSpHlr2I/OfjLJ7yQ5PMnBu/5GWpkkad4Yds9i1+9MvKWnrYCj57YcSdJ8NFRYVNVRoy5EkjR/DRUWSc7s115VH5jbciRJ89Gwh6Ge2TP8GJofKfoWYFhI0iIw7GGoN/TeTnIg8LcjqUiSNO/M9hLlPwFWz2UhkqT5a9hzFp+k+fQTNBcQfApw2aiKkiTNL8Oes/iLnuEdwPerausI6pEkzUNDHYZqLyj4PZorzx4E/GyURUmS5pdhfynvFcA1wBnAK4BvJPES5ZK0SAx7gvtPgGdW1VlVdSZwIvBfZrvSJI9LcnmS7yW5Ocmz2kuIXJnk1vb/QT3TX5BkKsktSU6Z7XolSbMzbFg8qqq29dz+wSOYt593AJ+rqicDTwVuBs4HNlXVamBTe5skxwFrgeOBU4GLkvgrfZI0RsNu8D+X5PNJXpvktcCngc/MZoVJDgB+DXgfQFX9rKr+BTgNuKSd7BLg9Hb4NODSqnqgqm4Dpmj2bCRJY9L1G9zHAIdV1VuS/CbwXCDA14EPznKdRwPbgb9J8lTgWuCN7XruAqiqu5Ic2k6/Ari6Z/6tbZskaUy69izeDvwYoKo+VlW/X1VvptmrePss17kUeDrw7qp6GnA/7SGnAdKnrfq0kWRdks1JNm/fvn2W5UmSpusKi1VVdf30xqraTPMTq7OxFdhaVd9ob19OEx53JzkcoP2/rWf6I3vmXwnc2W/BVbWhqtZU1Zrly5fPsjxJ0nRdYfGYGcbtM5sVVtU/AXckObZtOhm4CdjIz3834yzginZ4I7A2yd5JjqK5zMg1s1m3JGl2ur7B/c0kv1VVf93bmOQcmnMNs/UG4INJ9gK2AGfTBNdl7bJvp/lOB1V1Y5LLaAJlB3BuVfk74JI0Rl1h8Sbg40lexc/DYQ2wF/Cy2a60qq5rlzPdyQOmXw+sn+36JEm7Z8awqKq7gWcneT5wQtv86ar64sgrkyTNG8P+nsVVwFUjrk
WSNE/tzrewJUmLhGEhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6jSxsEiyJMm3k3yqvX1wkiuT3Nr+P6hn2guSTCW5Jckpk6pZkharSe5ZvBG4uef2+cCmqloNbGpvk+Q4YC1wPHAqcFGSJWOuVZIWtYmERZKVwEuA9/Y0nwZc0g5fApze035pVT1QVbcBU8CJYypVksTk9izeDvwR8GBP22FVdRdA+//Qtn0FcEfPdFvbNknSmIw9LJL8BrCtqq4ddpY+bTVg2euSbE6yefv27bOuUZL0cJPYs3gO8NIk/wBcCrwgyf8G7k5yOED7f1s7/VbgyJ75VwJ39ltwVW2oqjVVtWb58uWjql+SFp2xh0VVXVBVK6tqFc2J6y9W1auBjcBZ7WRnAVe0wxuBtUn2TnIUsBq4ZsxlS9KitnTSBfS4ELgsyTnA7cAZAFV1Y5LLgJuAHcC5VbVzcmVK0uIz0bCoqi8BX2qHfwCcPGC69cD6sRUmSXoYv8EtSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSp09jDIsmRSa5KcnOSG5O8sW0/OMmVSW5t/x/UM88FSaaS3JLklHHXLEmL3ST2LHYAf1BVTwFOAs5NchxwPrCpqlYDm9rbtOPWAscDpwIXJVkygboladEae1hU1V1V9a12+MfAzcAK4DTgknayS4DT2+HTgEur6oGqug2YAk4ca9GStMhN9JxFklXA04BvAIdV1V3QBApwaDvZCuCOntm2tm2SpDGZWFgk2R/4KPCmqvrRTJP2aasBy1yXZHOSzdu3b5+LMiVJTCgskjyaJig+WFUfa5vvTnJ4O/5wYFvbvhU4smf2lcCd/ZZbVRuqak1VrVm+fPloipekRWgSn4YK8D7g5qr6y55RG4Gz2uGzgCt62tcm2TvJUcBq4Jpx1StJgqUTWOdzgNcA301yXdv2x8CFwGVJzgFuB84AqKobk1wG3ETzSapzq2rn2KuWpEVs7GFRVV+l/3kIgJMHzLMeWD+yoiRJM/Ib3JKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSp0yR+g1vzzM6dO5mamnro9jHHHMOSJUsmWFF/o6qzd7k7dzY/775rufP1vtDsTX8ewWQe54XyutvFsJjnxvGEmpqaYt27Ps1+y47g/nvuZMO5L+HYY4+dF0/m3hq2bNnC2z5zM/svf3idc7nc7bdex6P2PZBDVhy12+uYa+N4PObDYz5qvc93YGKP86DX3TCmP3+rRllpw7CY52bzhJrNC36/ZUdwwOOfOHDd923bygUvOZ6jjz566GXOtraZNuSPPfIpHPD4J1IPPsiWLVsemh542HK66uvtW+9y77vnTpbuf0jnOkbZ/2Fq3t1QHzTPMI/5I30MB913c7WcYfrYO++WLVvY95CfP9+HeZwH1fpI+zB9ml2vu94ahllOv9fFqBkWYzbMk27QE3v6E6p3/l6DXvC9y52+jl3vTHrX0bvu++65kz/7xHUcsuLegRuqQcufvq5dT/JBtQ0KiPvuufOhPt7/w3/izz7xfQ5Zce/D9gaAoTZy0/vWz6B1DHOfDtro7O4GvyvUZ3pDMexe2q51DHrMhwmU6WHcb29tmLoHLWfQYzBMDdM3roMe52H6POh+HHRfT5931+uut4ZhljPodTFKhsVueqTv6oZ50g16Yvc+oWDwRhH6v+CnL7drHdNfVPv1Ca2Z6p5pXV
21DfNC2K9nY79rbwB42HJnuo+HeTfWbx3D3qeDDmf1ezc5bIgOOtzQb5nTN6LD7KVNX8egx7zfm4jp9/W+fe67QcuZ8TFYNvxjMEwN/Z5T/R7nYfrcez/2mmnPdfq8Xff1JAOil2Exze4cJhnmHfcwT7qZntj79by4Znqh9nvBT19u1zqGfcc9qO5Huq6ZXsyP1H5D3MdztfyuPg/aGM90P84UovCLe4CDltm7xzXsXlq/EB3mTcRMG8Ku5QwKuRkDcogg353DM8P2eZDekBtU97DrnVRA9FowYZHkVOAdwBLgvVV14SjWM8whnGHfoXXtKcBwT7qZPJIX6lza3brHadK1zmaj0xWiwy5z+h7XIENt/EY4zaCQm81zeS4f70eyrJn20Ea53nFZEGGRZAnwLuDfAVuBbybZWFU3jWJ9wxzCGfYd2ijeNQ+sex4+wdQYxWOzpz3eo9rLHJdh9tAWsgURFsCJwFRVbQFIcilwGjCSsLi/fYL+9J+38ah9D+w/zQ/u5EeP2buZ5oEHfj7cM/3AaeZg+KH65ni5C33Y+8X7ZaL3y5he/4OGm23X0+Z8mwgLJyxWAHf03N4K/Or0iZKsA9a1N+9Lcsss17cMuGeW8y5U9nlxWGx9Xmz95cl/9Qe72+e+nzhYKGGRPm2/cESwqjYAG3Z7Zcnmqlqzu8tZSOzz4rDY+rzY+guj6/NCuTbUVuDIntsrgYVzMFOSFriFEhbfBFYnOSrJXsBaYOOEa5KkRWNBHIaqqh1Jfhf4PM1HZ99fVTeOcJW7fShrAbLPi8Ni6/Ni6y+MqM+pcVyBSpK0oC2Uw1CSpAkyLCRJnRZ1WCQ5NcktSaaSnN9nfJK8sx1/fZKnT6LOuTJEf1/V9vP6JF9L8tRJ1DmXuvrcM90zk+xM8vJx1jcKw/Q5yfOSXJfkxiRfHneNc22I5/aBST6Z5Dttn8+eRJ1zJcn7k2xLcsOA8XO/7aqqRflHc6L8/wJHA3sB3wGOmzbNi4HP0nzP4yTgG5Oue8T9fTZwUDv8ooXc32H73DPdF4HPAC+fdN1jeJwfR3P1gye0tw+ddN1j6PMfA/+tHV4O/BDYa9K170affw14OnDDgPFzvu1azHsWD11CpKp+Buy6hEiv04APVONq4HFJDh93oXOks79V9bWq+uf25tU032dZyIZ5jAHeAHwU2DbO4kZkmD7/J+BjVXU7QFUt9H4P0+cCHpskwP40YbFjvGXOnar6Ck0fBpnzbddiDot+lxBZMYtpFopH2pdzaN6ZLGSdfU6yAngZ8J4x1jVKwzzOTwIOSvKlJNcmOXNs1Y3GMH3+n8BTaL7M+13gjVX14HjKm4g533YtiO9ZjMgwlxAZ6jIjC8TQfUnyfJqweO5IKxq9Yfr8duC8qtrZvOlc8Ibp81LgGcDJwD7A15NcXVV/P+riRmSYPp8CXAe8APgl4Mok/6eqfjTi2iZlzrddizkshrmEyJ50mZGh+pLkl4H3Ai+qqh+MqbZRGabPa4BL26BYBrw4yY6q+sRYKpx7wz6v76mq+4H7k3wFeCqwUMNimD6fDVxYzQH9qSS3AU8GrhlPiWM359uuxXwYaphLiGwEzmw/WXAScG9V3TXuQudIZ3+TPAH4GPCaBfwus1dnn6vqqKpaVVWrgMuB31nAQQHDPa+vAP5tkqVJ9qW5gvPNY65zLg3T59tp9qRIchhwLLCFPdecb7sW7Z5FDbiESJLfbse/h+bTMS8GpoCf0Lw7WZCG7O+fAocAF7XvtHfUAr5i55B93qMM0+equjnJ54DrgQdpfnmy70cwF4IhH+c/By5O8l2aQzTnVdWCvXR5kg8BzwOWJdkKvBV4NIxu2+XlPiRJnRbzYShJ0pAMC0lSJ8NCktTJsJAkdTIspAUmyelJjpvD5b02yRFztTztmQwLaZaSTOqj56cDfcNiljW9FjAsNCM/Oqs9Unu9oz+kucTB9cB/Bt5Pc8XR7TSfO7+X5gqlR1fVg+0X1G6huX
rpE4B3tdP/BPitqvpekotpLuD2NOBbwIdpLhmyD/BT4OyquqVd1sU03xK+GVgFnFtVm5P8e+C/AnvTXC317Kq6b0A/LgReSnPRuy/QfGnyU23t9wL/AXgf8DXgOTRfxvo3wKeq6vJ2GfdV1f7t8B8Br6H5fsVngc1tnf/Y1v+sqvrpI7u3tShM+lK7/vk313/A8TQb/WXt7YOBTwJntbdfB3yiHb4CeH47/B9pvqAGsAlY3Q7/KvDFdvhimo31kvb2AcDSdviFwEfb4T8E/lc7fALNxn4NzSVFvgLs1447D/jTAf04uO3Hrjd1j+up4eU9030JuKjn9vTx97X/X0QTKvvuWn7P/Gsm/bj5N7//Fu03uLVHewFwebXf0K2qHyZ5FvCb7fi/Bf57O/xhmpC4iuYyERcl2Z/mtz0+0nNxwb17lv+RqtrZDh8IXJJkNc1ezKPb9ucC72jXf0OS69v2k2gOIf1du+y9gK8P6MePgH8F3pvk0zQhNciHZxi3ywuBv6mqn7R1zXSJa+lhDAvtiUL3FTZ3jd8IvC3JwTRXYv0isB/wL1X1KwPmvb9n+M+Bq6rqZUlW0bxL31XDoNqurKpXdtRHNZexOJHmmkZrgd+lCcKumnbQno9sf79hr551e9xZs+IJbu2JNgGvSHIIQBsEX6PZ4AK8CvgqQDXnCq6h2Qv4VFXtrOay1bclOaOdPxn8E7MH0hzvh+ZE8S5fBV7Rzn8czXkEaH5U6jlJjmnH7ZvkSf0W3O7hHFhVnwHeBPxKO+rHwGNn6P8/0AQfND+Cs2tv5wvA69rzKbvul2GWJxkW2vNU1Y3AeuDLSb4D/CXwe8DZ7eGg1wBv7Jnlw8CrefihnFcB57Tz30j/X9iD5nDW25L8Hc1F7Ha5CFjeru88mpPs91bVdppQ+VA77mqak+D9PBb4VDvdl4E3t+2XAm9J8u0kv9Rnvr8Gfj3JNTTnW+5v75fP0exJbU5yHc15FWjOcbwnzW9y7zOgFi1yfhpKGoEkS4BHV9W/thv0TcCTqvnZT2nB8ZyFNBr7AlcleTTNuYLXGxRayNyzkOaBJB8HjprWfF5VfX4S9UjTGRaSpE6e4JYkdTIsJEmdDAtJUifDQpLUybCQJHUyLCRJnf4/VGT/VIxV70gAAAAASUVORK5CYII=\n", + "image/png": "\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] @@ -404,43 +426,107 @@ } ], "source": [ - "sns.histplot(GOs[GOs['overlap'] != 'complete overlap']['coverage_struct'], bins=100)" + "sns.histplot(GOs[GOs['overlap'] == 'partial overlap - unique GOs']['coverage_struct'], bins=100)" ] }, { "cell_type": "code", - "execution_count": 255, + "execution_count": 127, "id": "f628c3eb-422c-403d-b267-ec10489171df", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6454013592969468" + "0.4821899253270338" ] }, - "execution_count": 255, + "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "np.mean(GOs[GOs['overlap'] != 'complete overlap']['coverage_struct'])" + "np.mean(GOs[GOs['overlap'] == 'partial overlap - unique GOs']['coverage_struct'])" ] }, { 
"cell_type": "markdown", - "id": "1bc8b46c-37e3-46b3-8ca8-e85ab8e47038", + "id": "1052628d-b179-431e-8207-4c8065783aaf", + "metadata": {}, + "source": [ + "Within the 'partial overlap - unique GOs' category, stuctural GO terms on average contain around 50 % of the overlapping GO terms with a fairly uniform distribution between 0 and 100 %." + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "93eca691-0659-4516-8945-32a9b854c7e9", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<AxesSubplot:xlabel='coverage_seq', ylabel='Count'>" + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAEGCAYAAABiq/5QAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWCElEQVR4nO3de5BkZXnH8e/DAnKHvQy4gMuii6vERDQrIhgLQQyXJERLUYOyGipbxhsmxiwxVSaWlQqmLMtLMLpFLNAYFATlZlSy3LwAyiIgiIQNChI2LOCFgLdiefJHn4FmmN7pmelzuk+/30/V1Jw+fTnP2937m9737fc9kZlIksqxzbALkCQ1y+CXpMIY/JJUGINfkgpj8EtSYbYddgH9WLJkSS5fvnzYZUhSq2zYsOH+zJyYur8Vwb98+XKuu+66YZchSa0SEXdOt9+uHkkqjMEvSYUx+CWpMAa/JBXG4Jekwhj8klQYg1+SCmPwS1JhWjGBS/XZsmULGzdufOzyihUrWLBgwRArklQ3g79wGzduZM3pl7Dzkr15+P57WPfW41i5cuWwy5JUI4Nf7Lxkb3Z76n7DLkNSQ+zjl6TCGPySVBiDX5IKY/BLUmEMfkkqjMEvSYXx65zSNJzYpnFm8EvTcGKbxpnBL/XgxDaNK/v4JakwBr8kFcbgl6TCGPySVBiDX5IKY/BLUmEMfkkqjMEvSYUx+CWpMLUHf0QsiIjvRsTF1eVFEXFpRNxe/V5Ydw2SpMc18Yn/FODWrsunAusz8wBgfXVZktSQWoM/IvYFjgPO6Np9PHBWtX0W8Md11iBJeqK6P/F/GPhr4NGufXtl5iaA6vee090xItZExHURcd19991Xc5mSVI7agj8i/gDYnJkb5nL/zFyXmasyc9XExMSAq5OkctW5LPNhwB9FxLHADsBuEfFvwL0RsTQzN0XEUmBzjTVIkqao7RN/Zv5NZu6bmcuB1wKXZebrgQuB1dXNVgMX1FWDJOnJhvE9/tOAoyLiduCo6rIkqSGNnIErM68Arqi2HwCObOK4kqQnc+auJBXG4Jekwhj8klQYg1+SCmPwS1JhDH5JKozBL0mFaeR7/Jq7LVu2sHHjxse2ARYsWADAihUrHtuWpH4Z/CNu48aNrDn9EnZesjf33X4D2+y0O4v32Z+H77+HdW89jpUrVw67REktY/C3wM5L9ma3p+7HQ/ffw7a7LGa3p+437JIktZh9/JJUGD/xN6C7nx6G3zffXc8dd9xB5mAfc9Kw2ylpegZ/A7r76Uehb37quMGuT3v2QB8TGIl2Sp
qewd+QyX76UdE9bjDox5Q02uzjl6TCGPySVBi7ejSjURucljQ/Br9mNGqD05Lmx+BXXxy4lcaHffySVBg/8as16liwzvELlcjgV2vUsWCd4xcqkcGvVqljwTrHL1Qa+/glqTAGvyQVxq6eQtS9IuegHlNS/Qz+QtS9IuegHlNS/ezqKcjkIOaOC/cc6ceUVC+DX5IKY/BLUmHs42+pfPRR7rjjjscuO+O0LM441nwY/C318E/+l7//0p0s3ufnzjgtkDOONR8Gf4vtvNgZpyVzxrHmyj5+SSqMn/ilSve4Sa8Jafatj5dSX0+DX6p0j5v0mpBm3/p4KfX1NPilLpPjJg/df0/v29i3PlZKfD1r6+OPiB0i4tsRcWNE3BIR76v2L4qISyPi9ur3wrpqkCQ9WZ2Du78GjsjM5wIHAUdHxCHAqcD6zDwAWF9dliQ1pLaunsxM4KHq4nbVTwLHA4dX+88CrgDW1lWH2qHuQbZep210VVGVqNY+/ohYAGwAVgCnZ+a1EbFXZm4CyMxNETHt6l4RsQZYA7Bs2bI6y9QIqHuQrddpG11VVCWq9Xv8mbklMw8C9gUOjojnzOK+6zJzVWaumpiYqK1GjY7JQbadl+xd6+PvuHDPxwZxXVVUJWpkAldm/oxOl87RwL0RsRSg+r25iRokSR11fqtnIiL2qLZ3BF4G/AC4EFhd3Ww1cEFdNWj0bNmyhdtuu+2xn8n+dknNqbOPfylwVtXPvw1wTmZeHBFXA+dExMnAXcCra6xBI6bUCTPSKKnzWz03Ac+bZv8DwJF1HVejr8QJM9IocZE2SSqMwS9JhSlyrZ5SV+SbSVvP6tXWujV3/huenyKD3wHG6bX1rF5trVtz57/h+Sky+MEBxl7aelavttatufPf8NzZxy9JhTH4JakwxXb11K178MnT+DWnyYHe+R7L118wnPeBwV+TqatBehq/ZjQ50DvfY/n6C4bzPjD4azQ5+ORp/JrV5EDvfI/l6y9o/n1gH78kFaavT/wRcVhmfnOmfRq+ces37mesZBQMcmyh7a9h2+svQb9dPR8Dnt/HPg3ZuPUb9zNWMgoGObbQ9tew7fWXYKvBHxEvAg4FJiLiL7uu2g3wT/iIGrd+437GSkbBIMcW2v4atr3+cTfTJ/7tgV2q2+3atf9B4FV1FSVJqs9Wgz8zrwSujIgzM/POhmqSJNWo3z7+p0TEOmB5930y84g6imqrtgxEzkf3IOa4tlGq27Czot/gPxf4BHAG4ElSe2jLQOR8dA9ijmsbpboNOyv6Df5HMvNfaq1kTLRlIHI+Jgcxx7mNUt2GmRX9TuC6KCLeEhFLI2LR5E+tlUmSatHvJ/7V1e93d+1L4OmDLUcaPY5raDbaMIGtr+DPzP3rLkQaVY5raDbaMIGt3yUbTppuf2Z+erDlSKPJcQ3NxqhPYOu3q+cFXds7AEcC1wMGvyS1TL9dPW/vvhwRuwOfqaUiSVKt5roe/y+AAwZZyChrerBmtpM76hh8bGJAs+5jlDgoO9v3aq/b9/M4df27aMPgaNv128d/EZ1v8UBncbZnA+fUVdSoaXqwZraTO+oYfGxiQLPuY5Q4KDvb92qv2/fzOHX9u2jD4Gjb9fuJ/4Nd248Ad2bm3TXUM7KaHqyZ7eSOOgYfmxjQrPsYJQ7Kzva92uv2/TxOXf8uRn1wtO36msBVLdb2AzordC4EflNnUZKk+vQV/BFxAvBt4NXACcC1EeGyzJLUQv129fwt8ILM3AwQERPAfwJfqKswtdsgT0Wo9up+H2zZ0lnfcXLweHJ70ji+R0b130G/wb/NZOhXHsATtWsrBnkqQrXX1AH2bXbancX77P+EbWBs3yOj+u+g3+D/SkR8FTi7uvwa4Mv1lKRxMchTEaq9ugfYt91l8ZO2x90o/juY6Zy7K4C9MvPdEfFK4MVAAFcDn22gPknSgM30if/DwHsAMvN84HyAiFhVXfeHNdbWCsM+k04blDiRqpf5vF8GdV
8YfF/zKL7Gg2rzOE4omyn4l2fmTVN3ZuZ1EbG8npLaZdhn0mmDEidS9TKf98ug7ltHX/MovsaDavM4TiibKfh32Mp1Ow6ykDYr4axb81XiRKpe5vN+GcR96zKKr/Gg2jxuE8pm+mbOdyLiz6bujIiTgQ1bu2NEPC0iLo+IWyPilog4pdq/KCIujYjbq98L516+JGm2ZvrE/07gixFxIo8H/Spge+AVM9z3EeBdmXl9ROwKbIiIS4E3Ausz87SIOBU4FVg7x/olSbO01eDPzHuBQyPipcBzqt2XZOZlMz1wZm4CNlXb/xcRtwL7AMcDh1c3Owu4AoN/JAfHhmWcnoupA4O92tNkm8fp+W3aqE7Imq1+1+O/HLh8rgepBoKfB1xL5+uhk38QNkXEnj3uswZYA7Bs2bK5Hro1RnFwbFjG6bnoHhgEeranyTaP0/PbtFGdkDVbtc++jYhdgPOAd2bmg/3eLzPXZeaqzFw1MTFRX4EjZHJwbMeF0/4tLMo4PReTA4MztafJNo/T89u0yedu8o95G9Ua/BGxHZ3Q/2w1DwDg3ohYWl2/FNjc6/6SpMGb6xm4ZhQRAfwrcGtmfqjrqguB1cBp1e8L6qphFNmXq7YZZr/2bI89rAmVbZvIWVvwA4cBbwC+FxE3VPveQyfwz6m+EnoXnaWei2FfrtpmmP3asz32sCZUtm0iZ23Bn5nfoLOuz3SOrOu4bdDkRJdRnFSj9hnmQmOzPfawJlS2aSKnSytLUmEMfkkqTJ19/JLGjF8YeFybnwuDX1Lf/MLA49r8XNjVI2lWnPz1uLY+Fwa/JBXG4JekwtjHLw3YIAf92jyAOBvzaWcpz9EgGfzSgA1y0K/NA4izMZ92lvIcDZJdPVINBjno19YBxNmaTztLeY4GxeCXpMIU39Uz29X/pt7ePkUNm33c7TBKr1PxwT/b1f+6bw+9z6gkNcU+7nYYpdep+OCHOaz+13X7NqzEp/HnKqztMCqvk338klQYg1+SCmNXj6Si9HOaxLadSnG2DH5JRennNIltO5XibNnVI6k4k6dJ3NqEr35u01YGvyQVxq4eaUyN0oShQShhIbepE0Rh5kmlc2HwS2NqlCYMDUIJC7lNnSDaz6TSuTD4pTE2KhOGBmU+7WnLczHbCaVzYR+/JBXG4JekwtjV08O4T+CQpmrLAKjmz+DvYdwncEhTtWUAVPNnV89WjPMEDmk6nsmqDAa/JBXG4JekwtjH38XBLUklMPi7OLglqQR29Uzh4JakcWfwS1JhDH5JKozBL0mFMfglqTC1BX9EfCoiNkfEzV37FkXEpRFxe/V7YV3HlyRNr85P/GcCR0/ZdyqwPjMPANZXlyVJDarte/yZeVVELJ+y+3jg8Gr7LOAKYG1dNXRztU1JW1PSBM6mJ3DtlZmbADJzU0T0/LJ8RKwB1gAsW7Zs3gd2tU1JW1PSBM6RHdzNzHWZuSozV01MTAzkMV1tU9LWlDKBs+ngvzcilgJUvzc3fHxJKl7TwX8hsLraXg1c0PDxJal4dX6d82zgamBlRNwdEScDpwFHRcTtwFHVZUlSg+r8Vs/relx1ZF3HlCTNbGQHdyVJ9TD4JakwBr8kFcbgl6TCGPySVBiDX5IKY/BLUmGaXqStUa7IKUlPNtbB74qckvRkY9/V44qckvREYx/8kqQnMvglqTAGvyQVxuCXpMIY/JJUGINfkgpj8EtSYQx+SSqMwS9JhTH4JakwBr8kFcbgl6TCGPySVBiDX5IKY/BLUmEMfkkqjMEvSYUx+CWpMAa/JBXG4Jekwhj8klQYg1+SCmPwS1JhDH5JKozBL0mFMfglqTAGvyQVZijBHxFHR8RtEbExIk4dRg2SVKrGgz8iFgCnA8cABwKvi4gDm65Dkkq17RCOeTCwMTPvAIiIzwHHA9+v42AP338PAL/86Wa2+fWveXCHpwxsu67Hbfu2z4vPi8/LYJ6XTn49b+C5OIzg3wf4cdflu4EXTr1RRKwB1l
QXH4qI22Z5nCXA/XOqsL1KbDOU2W7bXIhnfexd82n3ftPtHEbwxzT78kk7MtcB6+Z8kIjrMnPVXO/fRiW2Gcpst20uRx3tHsbg7t3A07ou7wvcM4Q6JKlIwwj+7wAHRMT+EbE98FrgwiHUIUlFaryrJzMfiYi3AV8FFgCfysxbajjUnLuJWqzENkOZ7bbN5Rh4uyPzSd3rkqQx5sxdSSqMwS9JhWl18M+09EN0fLS6/qaIeP4w6hy0Ptp9YtXemyLiWxHx3GHUOUj9LvMRES+IiC0R8aom66tLP+2OiMMj4oaIuCUirmy6xkHr4/29e0RcFBE3Vm1+0zDqHKSI+FREbI6Im3tcP9gsy8xW/tAZGP5v4OnA9sCNwIFTbnMs8B905g4cAlw77LobavehwMJq+5i2t7ufNnfd7jLgy8Crhl13Q6/1HnRmvS+rLu857LobaPN7gA9U2xPAT4Dth137PNv9EuD5wM09rh9olrX5E/9jSz9k5m+AyaUfuh0PfDo7rgH2iIilTRc6YDO2OzO/lZk/rS5eQ2euRJv181oDvB04D9jcZHE16qfdfwKcn5l3AWRm29veT5sT2DUiAtiFTvA/0myZg5WZV9FpRy8DzbI2B/90Sz/sM4fbtM1s23QynU8KbTZjmyNiH+AVwCcarKtu/bzWzwQWRsQVEbEhIk5qrLp69NPmfwaeTWfi5/eAUzLz0WbKG5qBZtkwlmwYlH6WfuhreYiW6btNEfFSOsH/4lorql8/bf4wsDYzt3Q+CI6Fftq9LfC7wJHAjsDVEXFNZv5X3cXVpJ82/z5wA3AE8Azg0oj4emY+WHNtwzTQLGtz8Pez9MM4Lg/RV5si4neAM4BjMvOBhmqrSz9tXgV8rgr9JcCxEfFIZn6pkQrr0e97/P7MfBh4OCKuAp4LtDX4+2nzm4DTstP5vTEifgg8C/h2MyUOxUCzrM1dPf0s/XAhcFI1In4I8PPM3NR0oQM2Y7sjYhlwPvCGFn/y6zZjmzNz/8xcnpnLgS8Ab2l56EN/7/ELgN+LiG0jYic6K93e2nCdg9RPm++i8z8cImIvYCVwR6NVNm+gWdbaT/zZY+mHiHhzdf0n6Hy741hgI/ALOp8UWq3Pdr8XWAx8vPoE/Ei2eFXDPts8dvppd2beGhFfAW4CHgXOyMxpvxLYBn2+1u8HzoyI79HpAlmbma1erjkizgYOB5ZExN3A3wHbQT1Z5pINklSYNnf1SJLmwOCXpMIY/JJUGINfkgpj8EtSYQx+aSsiorVfeZZ6MfjVWhFxUrVE7Y0R8ZmI2C8i1lf71kfEsmoJ3x9FxDbVfXaKiB9HxHYR8YyI+Eq1xs3XI+JZ1W3OjIgPRcTlwAci4uBqeevvVr9Xdj3WOdXxPh8R10bEquq6l0fE1RFxfUScGxG7bKUdp0XE96vH+WC1byIizouI71Q/h1X7F0fE16paPhkRd0bEkpqfao2bYS9H6o8/c/kBfgu4DVhSXV4EXASsri7/KfClavsC4KXV9mvoTHICWA8cUG2/ELis2j4TuBhYUF3eDdi22n4ZcF61/VfAJ6vt59BZIXIVnSUjrgJ2rq5bC7y3RzsWVe2YnFOzR/X734EXV9vLgFur7Y9OPhZwHJ31WpYM+/Xwp10//jdWbXUE8IWsZmxm5k8i4kXAK6vrPwP8U7X9eTqBfzmdJQA+Xn0CPxQ4t2tRt6d0Pf65mbml2t4dOCsiDqATtNtV+18MfKQ6/s0RcVO1/xDgQOCb1WNvD1zdox0PAr8CzoiIS+j8wYHOH5gDu2rbLSJ2pbNu+yurY14SET9FmiWDX20VzLw64eT1FwL/GBGL6KxkeRmwM/CzzDyox30f7tp+P3B5Zr4iIpYDV3TV0Ku2SzPzdTPUR3aWKDiYztozrwXeRueP2jbAizLzl0944M4fAqfba17s41dbrQdOiIjFAFWof4tOeAKcCHwDIDMforNy40eAizNzS3aW8P1hRLy6un9E71NU7g78T7
X9xq793wBOqO5/IPDb1f5rgMMiYkV13U4R8czpHrj6n8fumfll4J3AQdVVX6PzR2DydpP7r6raRkQcAyzsUbPUk8GvVsrMW4B/AK6MiBuBDwHvAN5Udbm8ATil6y6fB15f/Z50InBydf9bmP6sXtDpMvrHiPgmnYXDJn0cmKiOt5bOQmk/z8z76PyBOLu67ho6ywZPZ1fg4up2VwJ/Ue1/B7CqGvD9PvDmav/7gJdExPXAy+msVCnNiou0SXMUEQuA7TLzVxHxDDr/C3lmdk4Z2FQNPwJWZctXp1Sz7OOX5m4n4PKI2I5Ov/6fNxn60lz5iV9qSER8Edh/yu61mfnVYdSjchn8klQYB3clqTAGvyQVxuCXpMIY/JJUGINfkgrz/zFDS3B2j8IhAAAAAElFTkSuQmCC\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ - "Of all proteins that are not completely overlapping, the coverage of GO terms overlapping within the GO terms of *strucural* annotations is around 65 %. " + "sns.histplot(GOs[GOs['overlap'] == 'partial overlap - unique GOs']['coverage_seq'], bins=100)" ] }, { "cell_type": "code", - "execution_count": 256, - "id": "582eaae9-25f8-4be3-99f0-aef7a0c438a0", + "execution_count": 129, + "id": "7036de66-bc32-4445-a88c-53ba9d37dae3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.48056632457400206" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(GOs[GOs['overlap'] == 'partial overlap - unique GOs']['coverage_seq'])" + ] + }, + { + "cell_type": "markdown", + "id": "5cd43819-c561-40dd-bbf8-226e2830ba4f", "metadata": {}, + "source": [ + "Within the 'partial overlap - unique GOs' category, sequence GO terms on average contain around 50 % of the overlapping GO terms. This is very similar to the strucutral GO terms." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "ae564ff4-2ed4-48bf-83c3-ca9f27cf7acb", + "metadata": { + "tags": [] + }, "outputs": [ { "data": { @@ -448,13 +534,13 @@ "<AxesSubplot:xlabel='coverage_seq', ylabel='Count'>" ] }, - "execution_count": 256, + "execution_count": 130, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] @@ -466,36 +552,144 @@ } ], "source": [ - "sns.histplot(GOs[GOs['overlap'] != 'complete overlap']['coverage_seq'], bins=100)" + "sns.histplot(GOs[GOs['overlap'] == 'partial overlap - sequence GOs expanded']['coverage_seq'], bins=200)" ] }, { "cell_type": "code", - "execution_count": 257, - "id": "cc844026-2562-4ee2-a107-95a68c627a40", + "execution_count": 131, + "id": "93c9e292-8a15-4fcb-9695-bfc77e13048c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6510767765619442" + "0.7126174034458369" ] }, - "execution_count": 257, + "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "np.mean(GOs[GOs['overlap'] != 'complete overlap']['coverage_seq'])" + "np.mean(GOs[GOs['overlap'] == 'partial overlap - sequence GOs expanded']['coverage_seq'])" ] }, { "cell_type": "markdown", - "id": "b6659ef1-6b7c-4a5b-b50e-0ff4ab593783", + "id": "a1b232a6-c2e4-494e-ae1a-b4a84173efa4", + "metadata": {}, + "source": [ + "In the cases in which sequence annotations produce more GO terms (but have all structure GO terms included), the distribution shows that on average, structure GO terms make up around 70 % of the sequence GO terms." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "75c5c9ba-9bfa-4934-8656-a43ac2bcb0e7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<AxesSubplot:xlabel='coverage_struct', ylabel='Count'>" + ] + }, + "execution_count": 132, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.histplot(GOs[GOs['overlap'] == 'partial overlap - structure GOs expanded']['coverage_struct'], bins=200)" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "143b5d56-fad4-449e-ae57-4c35afa6fefd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6018304051791242" + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(GOs[GOs['overlap'] == 'partial overlap - structure GOs expanded']['coverage_struct'])" + ] + }, + { + "cell_type": "markdown", + "id": "b4fec2e2-2ce6-44cd-be93-3a223d930475", + "metadata": {}, + "source": [ + "In the cases in which structure annotations produce more GO terms (but have all sequence GO terms included), the distribution shows that on average, sequence GO terms make up around 60 % of the structure GO terms. In these cases, around 40 % of the structure GO terms are additional. " + ] + }, + { + "cell_type": "markdown", + "id": "cec19425-3e48-4df5-bda0-b8a93b4fb3bd", + "metadata": {}, + "source": [ + "In this analysis, we compare only the identifiers and number of GO terms, but not their similarity. 
To compare the similarity of GO term annotations, we focus on semantic similarity of GO terms from annotation pairs in the 'partial overlap' category:" + ] + }, + { + "cell_type": "markdown", + "id": "0b9f8227-8625-4e53-88f0-84f7564cbb21", + "metadata": {}, + "source": [ + "## GO term semantic similarity for partial GO term overlaps between structure and sequence based protein annotations" + ] + }, + { + "cell_type": "code", + "execution_count": 258, + "id": "119a9f49-871c-445c-a729-db5fe80d0cf4", + "metadata": {}, + "outputs": [], + "source": [ + "partial = (GOs['overlap'] == 'partial overlap - unique GOs') |( GOs['overlap'] == 'partial overlap - structure GOs expanded') | (GOs['overlap'] == 'partial overlap - sequence GOs expanded')" + ] + }, + { + "cell_type": "code", + "execution_count": 259, + "id": "2123153f-fec7-41d7-a6ec-5f8e50f3b51e", + "metadata": {}, + "outputs": [], + "source": [ + "GOs_partial = GOs[partial]" + ] + }, + { + "cell_type": "code", + "execution_count": 213, + "id": "2a78b465-9630-4b42-976a-d94ecf6f6ff5", "metadata": {}, + "outputs": [], "source": [ - "Of all proteins that are not completely overlapping, the coverage of GO terms overlapping within the GO terms of sequence annotations is again around 65 %. 
" + "GOs_partial_unique = GOs_partial[GOs_partial['overlap'] == 'partial overlap - unique GOs']" ] }, { @@ -503,7 +697,7 @@ "id": "2531a431-6d03-4c4b-bb22-86ef4755c6ce", "metadata": {}, "source": [ - "## Check semantic similarity of GO terms of sequence-structure protein pairs" + "### Check semantic similarity of GO terms of partial overlapping sequence-structure protein pairs with unique GO terms on both sides" ] }, { @@ -516,31 +710,32 @@ }, { "cell_type": "code", - "execution_count": 258, - "id": "3d0dbc57-a86a-4898-9bc0-61970b379817", + "execution_count": 137, + "id": "bb190be1-e0c8-440d-92bf-fea4b44c7374", "metadata": {}, "outputs": [], "source": [ - "GOs_GOGO = GOs[['GOs_struct', 'GOs_seq']]" + "GOs_partial_unique_GOGO = GOs_partial_unique[['GOs_struct', 'GOs_seq']]" ] }, { "cell_type": "code", - "execution_count": 259, - "id": "c13c5605-d293-4558-a9d1-5551ef57fb1b", + "execution_count": 138, + "id": "9dc03262-9257-4447-80d6-304743793b4a", "metadata": {}, "outputs": [], "source": [ "# iterate over each row and each column\n", - "for i, row in GOs_GOGO.iterrows():\n", - " for col in GOs_GOGO.columns:\n", + "for i, row in GOs_partial_unique_GOGO.iterrows():\n", + " for col in GOs_partial_unique_GOGO.columns:\n", " # separate the elements in the list with a space\n", - " GOs_GOGO.at[i, col] = \" \".join(str(x) for x in row[col])\n", - " # add the index of the row to the beginning of the list\n", - " GOs_GOGO.at[i, col] = str(i) + \" \" + GOs_GOGO.at[i, col]\n", + " GOs_partial_unique_GOGO.at[i, col] = \" \".join(str(x) for x in row[col])\n", + " # add the index of the row to the beginning of the list\n", + " GOs_partial_unique_GOGO.at[i, 'GOs_struct'] = str(i) + \"_struct\" + \" \" + GOs_partial_unique_GOGO.at[i, 'GOs_struct']\n", + " GOs_partial_unique_GOGO.at[i, 'GOs_seq'] = str(i) + \"_seq\" + \" \" + GOs_partial_unique_GOGO.at[i, 'GOs_seq']\n", "\n", "# save the dataframe to a txt file without the header and index\n", - 
"GOs_GOGO.to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/GOGO_input.txt\", sep=\";\", index=False, header=False)" + "GOs_partial_unique_GOGO.to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/GOGO_partial_unique_input.txt\", sep=\";\", index=False, header=False)" ] }, { @@ -548,73 +743,73 @@ "id": "5958c38f-58b8-4c2e-9fe3-f5ffceba9da0", "metadata": {}, "source": [ - "I ran GOGO locally: `perl gene_pair_comb.pl ~/Desktop/GOGO_input.txt ~/Desktop/GOGO_input_result.txt`" + "I ran GOGO locally: `perl gene_pair_comb.pl ~/Desktop/GOGO_partial_unique_input.txt ~/Desktop/GOGO_partial_unique_input_result.txt`" ] }, { "cell_type": "code", - "execution_count": 260, + "execution_count": 139, "id": "101e4b9a-e315-406c-8cee-ba1115e55208", "metadata": {}, "outputs": [], "source": [ - "GOGO_result = pd.read_csv('/g/arendt/Fabian/PhD/Computational/Spongefold/GOGO_input_result.txt', sep=';', header=None)" + "GOGO_result_unique = pd.read_csv('/g/arendt/Fabian/PhD/Computational/Spongefold/GOGO_partial_unique_input_result.txt', sep=';', header=None)" ] }, { "cell_type": "code", - "execution_count": 261, + "execution_count": 140, "id": "23985e19-becf-40ed-8047-cbcd3b1258f4", "metadata": {}, "outputs": [], "source": [ - "GOGO_result.rename(columns={0 :'GOs_struct', 1 :'GOs_seq'}, inplace=True)" + "GOGO_result_unique.rename(columns={0 :'GOs_struct', 1 :'GOs_seq'}, inplace=True)" ] }, { "cell_type": "code", - "execution_count": 262, + "execution_count": 141, "id": "854e6e6d-5834-418e-8fcc-24c33846a5ef", "metadata": {}, "outputs": [], "source": [ - "GOGO_result['GOs_struct'] = GOGO_result['GOs_struct'].str.split(' ').str[1:]" + "GOGO_result_unique['GOs_struct'] = GOGO_result_unique['GOs_struct'].str.split(' ').str[1:]" ] }, { "cell_type": "code", - "execution_count": 263, + "execution_count": 142, "id": "60e1b05f-761a-4153-a38f-586143883f34", "metadata": {}, "outputs": [], "source": [ - "GOGO_result['GOs_seq'] = GOGO_result['GOs_seq'].str.split(' ').str[1:]" + 
"GOGO_result_unique['GOs_seq'] = GOGO_result_unique['GOs_seq'].str.split(' ').str[1:]" ] }, { "cell_type": "code", - "execution_count": 264, + "execution_count": 143, "id": "86603707-9eab-4fee-ade7-42077ed24068", "metadata": {}, "outputs": [], "source": [ "# create a new column in the dataframe\n", - "GOGO_result['BPO'] = None\n", - "GOGO_result['CCO'] = None\n", - "GOGO_result['MFO'] = None\n", + "GOGO_result_unique['BPO'] = None\n", + "GOGO_result_unique['CCO'] = None\n", + "GOGO_result_unique['MFO'] = None\n", "\n", "# iterate over each row in the dataframe\n", - "for i, row in GOGO_result.iterrows():\n", + "for i, row in GOGO_result_unique.iterrows():\n", " # get the fifth last element of the list in col1\n", " # and store it in the new column\n", - " GOGO_result.at[i, 'BPO'] = row['GOs_seq'][-5]\n", - " GOGO_result.at[i, 'CCO'] = row['GOs_seq'][-3]\n", - " GOGO_result.at[i, 'MFO'] = row['GOs_seq'][-1]" + " GOGO_result_unique.at[i, 'BPO'] = row['GOs_seq'][-5]\n", + " GOGO_result_unique.at[i, 'CCO'] = row['GOs_seq'][-3]\n", + " GOGO_result_unique.at[i, 'MFO'] = row['GOs_seq'][-1]" ] }, { "cell_type": "code", - "execution_count": 265, + "execution_count": 144, "id": "fe401867-0ab5-46df-88ba-4b560b00cf7d", "metadata": {}, "outputs": [ @@ -649,43 +844,43 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>[GO:0000902, GO:0000904, GO:0001654, GO:000174...</td>\n", - " <td>[GO:0000902, GO:0000904, GO:0001654, GO:000174...</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", " <td>[GO:0003674, GO:0005215]</td>\n", " <td>[GO:0000166, GO:0003674, GO:0003676, GO:000372...</td>\n", " <td>NA</td>\n", " <td>NA</td>\n", - " <td>1.000</td>\n", + " <td>0.414</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>[GO:0001101, GO:0005575, GO:0005623, GO:000588...</td>\n", + " <td>[GO:0003674, GO:0003824, GO:0004721, GO:000472...</td>\n", + " <td>0.410</td>\n", + " <td>0.635</td>\n", + " 
<td>NA</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>[GO:0001539, GO:0003674, GO:0003774, GO:000377...</td>\n", - " <td>[GO:0001539, GO:0003674, GO:0003774, GO:000377...</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", + " <td>[GO:0003674, GO:0003824, GO:0004721, GO:000557...</td>\n", + " <td>[GO:0000003, GO:0000079, GO:0000082, GO:000008...</td>\n", + " <td>0.598</td>\n", + " <td>0.704</td>\n", + " <td>0.820</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>[GO:0000323, GO:0001959, GO:0002682, GO:000367...</td>\n", - " <td>[GO:0000323, GO:0001959, GO:0002682, GO:000367...</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", + " <td>[GO:0001731, GO:0002181, GO:0002183, GO:000367...</td>\n", + " <td>[GO:0001731, GO:0002181, GO:0002183, GO:000367...</td>\n", + " <td>0.872</td>\n", + " <td>0.822</td>\n", + " <td>0.897</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>[GO:0000166, GO:0000323, GO:0001882, GO:000188...</td>\n", - " <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", + " <td>[GO:0003674, GO:0005215, GO:0005216, GO:000522...</td>\n", + " <td>[GO:0003674, GO:0005215, GO:0005216, GO:000522...</td>\n", + " <td>0.759</td>\n", + " <td>0.869</td>\n", + " <td>0.961</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", @@ -696,466 +891,1285 @@ " <td>...</td>\n", " </tr>\n", " <tr>\n", - " <th>11627</th>\n", - " <td>[GO:0000003, GO:0000165, GO:0000166, GO:000156...</td>\n", - " <td>[GO:0000003, GO:0000165, GO:0001654, GO:000170...</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", + " <th>2355</th>\n", + " <td>[GO:0000003, GO:0000302, GO:0001501, GO:000155...</td>\n", + " <td>[GO:0003674, GO:0005488, GO:0005515, GO:000554...</td>\n", + " <td>0.575</td>\n", + " <td>0.785</td>\n", + " <td>0.992</td>\n", " </tr>\n", " <tr>\n", - " <th>11628</th>\n", - " <td>[GO:0003674, GO:0003676, 
GO:0003723, GO:000548...</td>\n", - " <td>[GO:0003674, GO:0003676, GO:0003723, GO:000548...</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", + " <th>2356</th>\n", + " <td>[GO:0000228, GO:0000785, GO:0000790, GO:000557...</td>\n", + " <td>[GO:0000428, GO:0003674, GO:0003824, GO:000389...</td>\n", + " <td>0.425</td>\n", + " <td>0.889</td>\n", + " <td>NA</td>\n", " </tr>\n", " <tr>\n", - " <th>11629</th>\n", - " <td>[GO:0000012, GO:0000166, GO:0000228, GO:000072...</td>\n", - " <td>[GO:0000012, GO:0000166, GO:0000228, GO:000072...</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", + " <th>2357</th>\n", + " <td>[GO:0000138, GO:0000139, GO:0000226, GO:000367...</td>\n", + " <td>[GO:0000138, GO:0003674, GO:0005488, GO:000551...</td>\n", + " <td>0.552</td>\n", + " <td>0.923</td>\n", " <td>1.000</td>\n", " </tr>\n", " <tr>\n", - " <th>11630</th>\n", - " <td>[GO:0000003, GO:0000578, GO:0001700, GO:000300...</td>\n", - " <td>[GO:0000003, GO:0000578, GO:0001700, GO:000300...</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", + " <th>2358</th>\n", + " <td>[GO:0000323, GO:0000902, GO:0000904, GO:000367...</td>\n", + " <td>[GO:0001501, GO:0001894, GO:0001932, GO:000193...</td>\n", + " <td>0.644</td>\n", + " <td>0.701</td>\n", + " <td>0.891</td>\n", " </tr>\n", " <tr>\n", - " <th>11631</th>\n", - " <td>[GO:0000151, GO:0000209, GO:0000226, GO:000367...</td>\n", - " <td>[GO:0000151, GO:0000209, GO:0003674, GO:000382...</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", - " <td>1.000</td>\n", + " <th>2359</th>\n", + " <td>[GO:0000122, GO:0000149, GO:0000165, GO:000018...</td>\n", + " <td>[GO:0001944, GO:0001945, GO:0001946, GO:000367...</td>\n", + " <td>0.665</td>\n", + " <td>0.851</td>\n", + " <td>0.825</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", - "<p>11632 rows × 5 columns</p>\n", + "<p>2360 rows × 5 columns</p>\n", "</div>" ], "text/plain": [ - " GOs_struct \\\n", - "0 [GO:0000902, GO:0000904, GO:0001654, 
GO:000174... \n", - "1 [GO:0003674, GO:0005215] \n", - "2 [GO:0001539, GO:0003674, GO:0003774, GO:000377... \n", - "3 [GO:0000323, GO:0001959, GO:0002682, GO:000367... \n", - "4 [GO:0000166, GO:0000323, GO:0001882, GO:000188... \n", - "... ... \n", - "11627 [GO:0000003, GO:0000165, GO:0000166, GO:000156... \n", - "11628 [GO:0003674, GO:0003676, GO:0003723, GO:000548... \n", - "11629 [GO:0000012, GO:0000166, GO:0000228, GO:000072... \n", - "11630 [GO:0000003, GO:0000578, GO:0001700, GO:000300... \n", - "11631 [GO:0000151, GO:0000209, GO:0000226, GO:000367... \n", + " GOs_struct \\\n", + "0 [GO:0003674, GO:0005215] \n", + "1 [GO:0001101, GO:0005575, GO:0005623, GO:000588... \n", + "2 [GO:0003674, GO:0003824, GO:0004721, GO:000557... \n", + "3 [GO:0001731, GO:0002181, GO:0002183, GO:000367... \n", + "4 [GO:0003674, GO:0005215, GO:0005216, GO:000522... \n", + "... ... \n", + "2355 [GO:0000003, GO:0000302, GO:0001501, GO:000155... \n", + "2356 [GO:0000228, GO:0000785, GO:0000790, GO:000557... \n", + "2357 [GO:0000138, GO:0000139, GO:0000226, GO:000367... \n", + "2358 [GO:0000323, GO:0000902, GO:0000904, GO:000367... \n", + "2359 [GO:0000122, GO:0000149, GO:0000165, GO:000018... \n", "\n", - " GOs_seq BPO CCO MFO \n", - "0 [GO:0000902, GO:0000904, GO:0001654, GO:000174... 1.000 1.000 1.000 \n", - "1 [GO:0000166, GO:0003674, GO:0003676, GO:000372... NA NA 1.000 \n", - "2 [GO:0001539, GO:0003674, GO:0003774, GO:000377... 1.000 1.000 1.000 \n", - "3 [GO:0000323, GO:0001959, GO:0002682, GO:000367... 1.000 1.000 1.000 \n", - "4 [GO:0005575, GO:0005622, GO:0005623, GO:000573... 1.000 1.000 1.000 \n", - "... ... ... ... ... \n", - "11627 [GO:0000003, GO:0000165, GO:0001654, GO:000170... 1.000 1.000 1.000 \n", - "11628 [GO:0003674, GO:0003676, GO:0003723, GO:000548... 1.000 1.000 1.000 \n", - "11629 [GO:0000012, GO:0000166, GO:0000228, GO:000072... 1.000 1.000 1.000 \n", - "11630 [GO:0000003, GO:0000578, GO:0001700, GO:000300... 
1.000 1.000 1.000 \n", - "11631 [GO:0000151, GO:0000209, GO:0003674, GO:000382... 1.000 1.000 1.000 \n", + " GOs_seq BPO CCO MFO \n", + "0 [GO:0000166, GO:0003674, GO:0003676, GO:000372... NA NA 0.414 \n", + "1 [GO:0003674, GO:0003824, GO:0004721, GO:000472... 0.410 0.635 NA \n", + "2 [GO:0000003, GO:0000079, GO:0000082, GO:000008... 0.598 0.704 0.820 \n", + "3 [GO:0001731, GO:0002181, GO:0002183, GO:000367... 0.872 0.822 0.897 \n", + "4 [GO:0003674, GO:0005215, GO:0005216, GO:000522... 0.759 0.869 0.961 \n", + "... ... ... ... ... \n", + "2355 [GO:0003674, GO:0005488, GO:0005515, GO:000554... 0.575 0.785 0.992 \n", + "2356 [GO:0000428, GO:0003674, GO:0003824, GO:000389... 0.425 0.889 NA \n", + "2357 [GO:0000138, GO:0003674, GO:0005488, GO:000551... 0.552 0.923 1.000 \n", + "2358 [GO:0001501, GO:0001894, GO:0001932, GO:000193... 0.644 0.701 0.891 \n", + "2359 [GO:0001944, GO:0001945, GO:0001946, GO:000367... 0.665 0.851 0.825 \n", "\n", - "[11632 rows x 5 columns]" + "[2360 rows x 5 columns]" ] }, - "execution_count": 265, + "execution_count": 144, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "GOGO_result" + "GOGO_result_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "id": "c7dfe425-7158-4f16-97a1-eb6ea0542d03", + "metadata": {}, + "outputs": [], + "source": [ + "GOGO_result_unique_BPO = GOGO_result_unique['BPO'][GOGO_result_unique['BPO'] != 'NA'].astype('float')" ] }, { "cell_type": "code", - "execution_count": 266, + "execution_count": 146, "id": "05de5391-becd-4500-bac2-450632bbf482", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(1.000 11161\n", - " NA 471\n", - " Name: BPO, dtype: int64,\n", - " 1.000 11159\n", - " NA 473\n", - " Name: CCO, dtype: int64,\n", - " 1.000 10090\n", - " NA 1542\n", - " Name: MFO, dtype: int64)" + "<AxesSubplot:xlabel='BPO', ylabel='Count'>" ] }, - "execution_count": 266, + "execution_count": 146, "metadata": {}, "output_type": "execute_result" + }, + { + "data": { + 
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAEGCAYAAABiq/5QAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWZ0lEQVR4nO3de7BddXXA8e8iiLwRSMAIxosFBXQq2iu16nSo2Jb6aNQK1T7MWKaZPqcdHQvaae1Mp1P6mI4VrU7aOkJrVaoi+LYNRaatVRMbFRU0pkgpKQHEB8hoc1n94+wbTm7uY597z36c8/t+ZjL3PO9ed+dkZd/1+/3WLzITSVI5Dus6AElSu0z8klQYE78kFcbEL0mFMfFLUmEO7zqAOtavX58zMzNdhyFJE2Xnzp33ZOaGhY9PROKfmZlhx44dXYchSRMlIr6+2OOWeiSpMCZ+SSqMiV+SCmPil6TCmPglqTAmfkkqjIlfkgpj4pekwkzEAi5JMDc3x+7duw/cP/PMM1m3bl2HEWlSmfilCbF79262vvlDHLP+MTxwz51s+/Xn88QnPrHrsDSBGk38EXEb8B1gDtifmbMRcRLwbmAGuA24JDPvazIOaVocs/4xHP/ox3UdhiZcGzX+H8vM8zJztrp/ObA9M88Ctlf3JUkt6WJwdzNwVXX7KuBFHcQgScVqOvEn8PGI2BkRW6vHTs3MvQDV11MWe2NEbI2IHRGx4+677244TEkqR9ODu8/KzDsj4hTgnyLilrpvzMxtwDaA2dnZbCpASSpNo1f8mXln9XUfcC1wPnBXRGwEqL7uazIGSdLBGkv8EXFMRBw3fxv4CeBm4HpgS/WyLcB1TcUgSTpUk6WeU4FrI2L+OP+QmR+NiM8A10TEpcDtwMUNxiBJWqCxxJ+Ze4CnLPL4vcCFTR1XkrQ8e/VIUmFM/JJUGBO/JBXGxC9JhTHxS1JhbMusqWX/emlxJn5NLfvXS4sz8Wuq2b9eOpQ1fkkqjIlfkgpj4pekwpj4JakwJn5JKoyzeqQJlA89xJ49ew7cd42CRmHilybQA9/4X/7g/V/n5NO+5RoFjczEL02oY052jYJWxxq/JBXGxC9JhTHxS1JhTPySVBgTvyQVxsQvSYUx8UtSYUz8klQYE78kFcaVu9II3MdX08DEL43AfXw1DUz80ojcx1eTzhq/JBXGK35pEdNQy5+Gn0HNMPFLi5iGWv40/AxqRuOJPyLWATuA/8nMF0TEScC7gRngNuCSzLyv6TikUU1DLX8afgaNXxs1/t8Cvjx0/3Jge2aeBWyv7kuSWtLoFX9EnA48H/gj4FXVw5uBC6rbVwE3Apc1GYfUhOF9b+fm5gAO1NCtp6vPmi71vAH4HeC4ocdOzcy9AJm5NyJOWeyNEbEV2AqwadOmhsOURje87+3dX93FYUefwMmnnWE9Xb3XWKknIl4A7MvMnat5f2Zuy8zZzJzdsGHDmKOTxmN+39ujTjzlwO1j1j+m67CkZTV5xf8s4Kcj4nnAkcDxEfH3wF0RsbG62t8I7GswBknSAo1d8WfmazPz9MycAV4G3JCZvwBcD2ypXrYFuK6pGCQNzI9H3Hrrrdx6660HxiRUpi7m8V8BXBMRlwK3Axd3EINUlOHxCMcg1Eriz8wbGczeITPvBS5s47iSHjY/BiHZq0eSCmPil6TC2KtH6rHhRmt79uwhc+3fc3jhGbjYrEQmfqnHhhut3f3VXRz32HPW/D0d6JWJX+q5+UZr999z5/i+pwO9RbPGL0mF8YpfE6nPm4xYQ1ffmfg1kfq8yYg1dPWdiV8Tq8+bjFhDV59Z45ekwnjFr870uU4/bLhmv9a59H37mR2PKJOJX53pc51+2MINV9Yyl75vP7PjEWUy8atTfa7TD5uv2Y9jLn3ffmbHI8pjjV+SCuMVv4rTtzq71DYTv4rTtzq71DYTv4rUtzq71CZr/JJUGK/4pQYtnCc/rp760lqY+KUGDc
+TB8bWU19aCxO/1LDhefLj7KkvrZY1fkkqjFf8Ug9M4tqCSYxZAyZ+qQcmcW3BJMasARO/1BOTuLZgEmOWNX5JKo5X/FqRtVyNws9L/5n4tSJruRqFn5f+M/GrFmu5GoWfl36zxi9JhfGKX4sartO20V+mxL1fx7mX77y2/95GYe2/P0z8WtRwnbaN/jIl7v06zr1857X99zYKa//90Vjij4gjgZuAR1bHeU9mvj4iTgLeDcwAtwGXZOZ9TcWh1Zuv07bVX6bEvV/HuZfvge/Z8t/bKKz990OTNf7vAc/JzKcA5wEXRcQzgMuB7Zl5FrC9ui9JakljV/yZmcD91d1HVH8S2AxcUD1+FXAjcFlTcagsa6lx97k+vpwmxgo03Rqt8UfEOmAncCbw5sz8VEScmpl7ATJzb0ScssR7twJbATZt2tRkmJoia6lx97k+vpwmxgo03WqVeiLiWXUeWygz5zLzPOB04PyIeHLdwDJzW2bOZubshg0b6r5NOlBHPurERa8pGntvl+bHCiYtbnWjbo3/ypqPLSozv8mgpHMRcFdEbASovu6r+30kSWu3bKknIn4EeCawISJeNfTU8cCyE3AjYgPwf5n5zYg4Cngu8CfA9cAW4Irq63WrD19SE5xzP91WqvEfARxbve64oce/Dbx0hfduBK6q6vyHAddk5gcj4pPANRFxKXA7cPGqIpfUGOfcT7dlE39mfgL4RES8PTO/Pso3zszPA09d5PF7gQtHilJS65xzP73qzup5ZERsY7Do6sB7MvM5TQQlSWpO3cT/j8Bbgb8B5poLRxqd89il0dRN/Psz8y2NRiKtkvPYpdHUnc75gYj4tYjYGBEnzf9pNDJpBM5jl+qre8W/pfr6mqHHEnj8eMORJDWtVuLPzDOaDkSTbeG8b3Dut9bO9QTNqJX4I+IViz2emVePNxxNquF534BzvzUWridoRt1Sz9OHbh/JYB7+ZwETvw5w3rea4Odq/OqWen5z+H5EnAD8XSMRSZIatdq2zN8FzhpnIBoPa6Kj6eMagK5iauK4fTy/ql/j/wCDWTwwaM52DnBNU0Fp9ayJjqaPawC6iqmJ4/bx/Kr+Ff+fD93eD3w9M+9oIB6NgTXR0TSx7+1adRVTI3sA9/D8lq7WAq6qWdstDDp0ngh8v8mgJEnNqVvquQT4MwabqQRwZUS8JjPf02BsqqmPe8UO13ZhtLGGtbxX0srqlnp+F3h6Zu6DA5us/DNg4u+BPu4VO1zbHXWsYS3vlbSyuon/sPmkX7mX+n1+1IL5un6f6qjztd223ytpeXUT/0cj4mPAO6v7Pwt8uJmQJElNWmnP3TOBUzPzNRHxEuDZDGr8nwTe0UJ8kjRWrnVZ+Yr/DcDrADLzfcD7ACJitnruhQ3GJklj51qXlRP/TLV37kEyc0dEzDQTkiQ1q/S1LisN0B65zHNHjTMQSVI7Vrri/0xE/HJm/vXwgxFxKbCzubDUhabWAwzPy5+bG2zZvG7dut6sOdDyXFcxfVZK/L8NXBsRP8/DiX4WOAJ4cYNxqQNNrQdY2K/lsKNP4OTTzujNmgMtz3UV02fZxJ+ZdwHPjIgfA55cPfyhzLyh8cjUiabWAwz3azn82JN7t+ZAy3NdxXSp24//X4B/aTgWSVILVtuPX1KBSuyv3+a8/7b2rjbxS6qtxP76bc77b2vvahO/pJGU2F+/zXn/bRzLRmuSVBgTvyQVxlKPRuJiHmnyNZb4I+KxwNXAo4GHgG2Z+ZcRcRLwbmAGuA24JDPvayoOjZeLeaTJ12SpZz/w6sw8B3gG8OsRcS5wObA9M88Ctlf3NUHmB/fmZx5ImiyNJf7M3JuZn61ufwf4MnAasBm4qnrZVcCLmopBknSoVmr8VQvnpwKfYrCxy14Y/OcQEae0EYMmk2MKmjfqZ8ENV5bWeOKPiGOB9wK/nZnfjoi679sKbAXYtGlTcwGq1xxT0LxRPwtuuLK0RhN/RDyCQdJ/R7WDF8BdEb
GxutrfCOxb7L2ZuQ3YBjA7O1vAwnAtxQZhmjfqZ6H0DVeW0liNPwaX9n8LfDkz/2LoqeuBLdXtLcB1TcUgSTpUk1f8zwJ+EfhCROyqHnsdcAVwTbWZy+3AxQ3GoClSYoMwqQmNJf7M/FdgqYL+hU0dV9OrxAZhUhNs2aCJMl/jPepEJ4NJq2Xil6TC2KtngqxlXrJzmjVNXN+xNib+CbKWecnOadY0cX3H2pj4J8xa5iU7p1nTxPUdq2eNX5IK4xV/4YZr/86N12pN6hqLUscKTPyFG679OzdeqzWpayxKHSsw8etA7b+kzbM1fpO6CXuJYwXW+CWpMF7xSxJl9fs38UsSZfX7N/FLUqWUfv/W+CWpMF7xT6g+zJvuQwxSE6b9s23in1B9mDfdhxikJkz7Z9tSzwTrQ2/6PsQgNWGaP9smfkkqjKWelo0693ctvXSWqlNOe/1yMSX+zNJSTPwtG3Xu71p66SxVp5z2+uViSvyZpaWY+Dsw6tzftfTSWap/yqT2VVmLEn9maTHW+CWpMF7x98RwLX9ubg6AdevWWY+WRtDV/hKT1tffxN8TC2v5hx19Aiefdob1aGkEXe0vMWl9/U38PTJcyz/82JOtR0ur0NX+EpPU198avyQVxiv+NaozL999baWDLayJD49rQf9r5HX1tWe/iX+N6szLd19b6WDDNXHgoHGtSaiR19XXnv0m/jGoMy/ffW2lgw3XxIfHtaZNH3v2W+OXpMKY+CWpMCZ+SSpMY4k/It4WEfsi4uahx06KiH+KiK9WX09s6viSpMU1ecX/duCiBY9dDmzPzLOA7dV9SVKLGpvVk5k3RcTMgoc3AxdUt68CbgQuayqGJiycl7tUn3voz5xdaZI0sXfCuL5nE7F1sc6n7emcp2bmXoDM3BsRS+5pFhFbga0AmzZtaim8lQ3PywWW7HPfpzm70iRpYu+EcX3PJmLrYp1Pbwd3M3NbZs5m5uyGDRu6Ducg8/NyF9uPc35u8vx/DJJG18R+t+P6no3Etr7d/X3bTvx3RcRGgOrrvpaPL0nFa7vUcz2wBbii+npdy8df1jj7arjHq9SOvv1bq7PXNXQ7BthY4o+IdzIYyF0fEXcAr2eQ8K+JiEuB24GLmzr+aoyzr4Z7vErt6Nu/tTp7XXc9BtjkrJ6XL/HUhU0dcxzG2VfDPV6ldvTt39pKe113rbeDu5KkZhTfndNe+ZJKU3zit1e+pNJY6qH9ObSS1CUTvyQVpvhSz1LamHPbt/nHkspg4l9CG3Nu+zb/WFIZTPzLaGPObd/mH0uaftb4JakwXvHXYC1e0jh1nVNM/DVYi5c0Tl3nFEs9NTXRg1tSubrMKSZ+SSqMiV+SCmPil6TCmPglqTAmfkkqjIlfkgpj4pekwpj4JakwJn5JKoyJX5IKU2SvHjdYl1SyIhO/G6xLKlmxpR43WJdUqmITvySVaqpLPcO1fGhmw3RJmjRTnfiHa/lNbZguSZNmqhM/PFzLlyQNWOOXpMJM/RX/vK43N5akvigm8Xe9ubEk9UUnpZ6IuCgibo2I3RFxeVvHdcN0Seog8UfEOuDNwE8B5wIvj4hz245DkkrVRannfGB3Zu4BiIh3AZuBLzVxsAfuuROAB+/bx2Hf+x7fPvKRY7vd1Ped9NueF8+L52U852WQv5469rzYReI/Dfjvoft3AD+88EURsRXYWt29PyJuXcMx1wP3rOH9bTLW5kxSvJMUK0xWvJMUK2df+eq1xLvoXPYuEn8s8tghc2wycxuwbSwHjNiRmbPj+F5NM9bmTFK8kxQrTFa8kxQrNBNvF4O7dwCPHbp/OnBnB3FIUpG6SPyfAc6KiDMi4gjgZcD1HcQhSUVqvdSTmfsj4jeAjwHrgLdl5hcbPuxYSkYtMdbmTFK8kxQrTFa8kxQrNBBvpEtYJako9uqRpMKY+CWpMFOT+FdqAxERmyPi8xGxKyJ2RMSzu4hzKJ5abSsi4ukRMRcRL20zvgUxrHRuL4
iIb1XndldE/H4XcVaxrHheq3h3RcQXI+ITbce4IJaVzu1rhs7rzdVn4aSexnpCRHwgIj5XndtXdhHnUDwrxXtiRFxb5YVPR8STu4iziuVtEbEvIm5e4vmIiDdWP8vnI+JpazpgZk78HwaDxF8DHg8cAXwOOHfBa47l4TGNHwRu6XO8Q6+7Afgw8NK+xgpcAHxwQj4Hj2KwSnxTdf+UPse74PUvBG7oa6zA64A/qW5vAL4BHNHjeP8MeH11+2xge4efhR8FngbcvMTzzwM+wmAd1DOAT63leNNyxX+gDURmfh+YbwNxQGben9UZBI5hkUVjLVox3spvAu8F9rUZ3AJ1Y+2DOrH+HPC+zLwdIDMn6dy+HHhnK5Edqk6sCRwXEcHgQusbwP52wzygTrznAtsBMvMWYCYiTm03zIHMvInB+VrKZuDqHPgP4FERsXG1x5uWxL9YG4jTFr4oIl4cEbcAHwJ+qaXYFrNivBFxGvBi4K0txrWYWucW+JHqV/yPRMST2gntEHVifQJwYkTcGBE7I+IVrUV3qLrnlog4GriIwYVAF+rE+ibgHAYLMr8A/FZmPtROeIeoE+/ngJcARMT5DNobnN5KdKOr/VmpY1oSf902ENdm5tnAi4A/bDqoZdSJ9w3AZZk513w4y6oT62eBx2XmU4Argfc3HdQS6sR6OPBDwPOBnwR+LyKe0HRgS6j1ua28EPi3zFzuqrBJdWL9SWAX8BjgPOBNEXF8s2EtqU68VzC4CNjF4Lfr/6S731BWMspnZUXTshHLSG0gMvOmiPiBiFifmV00a6oT7yzwrsFvzawHnhcR+zPz/a1E+LAVY83Mbw/d/nBE/FVH57bOeb0DuCczHwAeiIibgKcAX2knxENiqfu5fRndlXmgXqyvBK6oSqq7I+K/GNTOP91OiAep+7l9JQwGT4H/qv700Xhb3XQ1mDHmgZHDgT3AGTw8kPOkBa85k4cHd58G/M/8/T7Gu+D1b6e7wd065/bRQ+f2fOD2Ls5tzVjPYVDXPRw4GrgZeHJfz231uhMY1H+P6SLOEc7tW4A/qG6fWv0bW9/jeB9FNfgM/DKDGnon57eKYYalB3efz8GDu59ey7Gm4oo/l2gDERG/Uj3/VuBngFdExP8BDwI/m9UZ7Wm8vVAz1pcCvxoR+xmc25d1cW7rxJqZX46IjwKfBx4C/iYzF51C14d4q5e+GPh4Dn5L6UTNWP8QeHtEfIFBgrosu/mNum685wBXR8Qcg5lel3YRK0BEvJPB7Lj1EXEH8HrgEXAg1g8zmNmzG/gu1W8qqz5eR7lPktSRaRnclSTVZOKXpMKY+CWpMCZ+SSqMiV+SCmPil5ZQdcLcVbWi+GxEPLN6fCYiHqye+1JEvDUiDquee1JE3BARX4mIr0bE71WLg6TeMPFLS3swM8/LQSuK1wJ/PPTc1zLzPAadXs8FXhQRRzHYP/qKzHwCgxXBzwR+rd2wpeWZ+KV6jgfuW/hgZu4H/p3ByvCfY9BP5+PVc98FfgNYcr8FqQtTsXJXashRVQOvI4GNwHMWvqDqmnkh8PvAjwM7h5/PzK9FxLERcXwO9TSSuuQVv7S0+VLP2QxaIl89VK//geo/hX8DPpSZ831UlloK7xJ59YZX/FINmfnJiFjPYGcpeLjGP+yLDHZSOiAiHg/cn5nfaT5KqR6v+KUaIuJsBs2+7l3mZe8Anh0Rz63ecxTwRuBPm49Qqs8rfmlp8zV+GJRxtmTm3FKzMzPzwYjYDFwZEW9m8B/F3zHYmUrqDbtzSlJhLPVIUmFM/JJUGBO/JBXGxC9JhTHxS1JhTPySVBgTvyQV5v8BFD0myOV4G3sAAAAASUVORK5CYII=\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": 
"display_data" } ], "source": [ - "GOGO_result['BPO'].value_counts(),GOGO_result['CCO'].value_counts(),GOGO_result['MFO'].value_counts()" + "sns.histplot(GOGO_result_unique_BPO, bins=100)" ] }, { - "cell_type": "markdown", - "id": "3582746f-d5cb-4bd4-9163-7da1b4889d70", + "cell_type": "code", + "execution_count": 147, + "id": "d717eedf-d4db-48c2-a2ed-baa0529c40a1", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6656481398476022" + ] + }, + "execution_count": 147, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "This output is a little weird and concerning at first. IN all cases where semantic similarity (Biological Process, Cellular component, Molecular Function) is also to be calculated, we have scores of 1.000. How is this possible when we have a few cases where there is no overlap of GO terms whatsoever?" + "np.mean(GOGO_result_unique_BPO)" ] }, { "cell_type": "code", - "execution_count": 267, - "id": "a60e24f5-45dd-4938-acfa-901fcfda11d2", + "execution_count": 148, + "id": "f388fa0c-5b4d-49ce-9b48-386d3f33acfe", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>GOs_struct</th>\n", - " <th>GOs_seq</th>\n", - " <th>overlap</th>\n", - " <th>coverage_struct</th>\n", - " <th>coverage_seq</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>737</th>\n", - " <td>[GO:0000322, GO:0000323, GO:0000324, GO:000032...</td>\n", - " <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " </tr>\n", - " <tr>\n", - 
" <th>1341</th>\n", - " <td>[GO:0003197, GO:0003205, GO:0003279, GO:000727...</td>\n", - " <td>[GO:0005575, GO:0005576, GO:0005615, GO:000562...</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1715</th>\n", - " <td>[GO:0000166, GO:0003674, GO:0003824, GO:000548...</td>\n", - " <td>[GO:0005575, GO:0005618, GO:0005622, GO:000562...</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2818</th>\n", - " <td>[GO:0005575, GO:0005623, GO:0005886, GO:001602...</td>\n", - " <td>[GO:0008150, GO:0009966, GO:0009967, GO:001064...</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4012</th>\n", - " <td>[GO:0003674, GO:0005488, GO:0005515, GO:001989...</td>\n", - " <td>[GO:0000139, GO:0005575, GO:0005622, GO:000562...</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4937</th>\n", - " <td>[GO:0003674, GO:0005488, GO:0005515, GO:001990...</td>\n", - " <td>[GO:0000228, GO:0000785, GO:0000790, GO:000557...</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + "text/plain": [ + "<AxesSubplot:xlabel='CCO', ylabel='Count'>" + ] + }, + "execution_count": 148, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX4AAAEGCAYAAABiq/5QAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAVhUlEQVR4nO3de5BkZXnH8e/DIMIuF2FnuCwqi1lcURKjLsZLylJJKpRowBQakqiEIm4lQeMlUdFURavyD8YkhUkIZqOGNTEoQUAMeMuCmqQUXS4iuBrGVRB3ZXfFqKwGZXjyR5/ZbWanZ07P9Olzus/3UzU13ae75zzdO/ub97zve94TmYkkqT0OqLsASdJwGfyS1DIGvyS1jMEvSS1j8EtSyxxYdwFlTE5O5po1a+ouQ5JGys0337w7M6fmbh+J4F+zZg1btmypuwxJGikRcfd82+3qkaSWMfglqWUMfklqGYNfklrG4JekljH4JallDH5JahmDX5JaZiRO4JKkNpiZmWF6evoR29auXcvExMRA92PwS1JDTE9Ps+GS61g5uRqAPbu3s/GCM1i3bt1A92PwS1KDrJxczeHHnlDpPuzjl6SWMfglqWUMfklqGYNfklrG4JekljH4JallDH5JahmDX5JaxuCXpJapNPgj4g0RcWdE3BERl0fEwRFxVER8OiLuKr4fWWUNkqRHqiz4I+J44I+A9Zl5CjABnANcCGzOzJOAzcV9SdKQVN3VcyBwSEQcCKwAtgNnApuKxzcBZ1VcgySpS2XBn5nfAf4SuAfYAfwgMz8FHJOZO4rn7ACOnu/1EbEhIrZExJZdu3ZVVaYktU6VXT1H0mndnwisBlZGxCvKvj4zN2bm+sxcPzU1VVWZktQ6VXb1/ArwzczclZk/A64CngPcFxHHARTfd1ZYgyRpjiqD/x7gWRGxIiICOA3YClwLnFs851zgoxXWIEmao7ILsWTmTRFxJXAL8BBwK7AROBS4IiLOp/PH4WVV1SBJ2l+lV+DKzLcDb5+z+UE6rX9JUg08c1eSWsbgl6SWMfglqWUMfklqGYNfklrG4JekljH4JallDH5JahmDX5JaxuCXpJYx+CWpZQx+SWoZg1+SWsbgl6SWMfglqWUMfklqGYNfklrG4JekljH4JallDH5JaplKL7YuafTNzMwwPT299/7atWuZmJiosSItl8EvaUHT09NsuOQ6Vk6uZs/u7Wy84AzWrVtXd1laBoNf0qJWTq7m8GNPqLsMDYh9/JLUMrb4JTWW4wvVMPglNZbjC9Uw+CU1muMLg2cfvyS1jMEvSS1j8EtSyxj8ktQyBr8ktYzBL0ktY/BLUssY/JLUMga/JLWMwS9JLWPwS1LLGPyS1DKVBn9EPCYiroyIr0XE1oh4dkQcFRGfjoi7iu9HVlmDJOmRqm7xvxv4RGY+CXgqsBW4ENicmScBm4v7kqQhqSz4I+Jw4HnA+wAy86eZ+b/AmcCm4mmbgLOqqkGStL8qW/xPAHYB/xQRt0bEeyNiJXBMZu4AKL4fPd+LI2JDRGyJiC27du2qsExJapcqg/9A4OnApZn5NGAPfXTrZObGzFyfmeunpqaqqlGSWqfK4L8XuDczbyruX0nnD8F9EXEcQPF9Z4U1SJLmqCz4M/O7wLcjYvYCmacBXwWuBc4ttp0LfLSqGiRJ+6v6mruvBT4YEQcB24Dz6PyxuSIizgfuAV5WcQ2SxtjMzAzT09N7769du5aJiYkaK2q+SoM/M28D1s/z0GlV7ldSe0xPT7PhkutYObmaPbu3s/GCM1i3bt3iL2yxqlv8klS5lZOrOfzYE+ouY2S4ZIMktYzBL0ktY1ePpKFwELY5DH5JQ+EgbHMY/JKGxkHYZrCPX5JaxuCXpJaxq0dSafnww2zbtm3vfQdoR5PBL6m0Pfd/l3dcczerjv+BA7QjzOCXBJSfbrlylQO0o87glwQ43bJNDH5Jezndsh0MfkkaoFE4Q9ngl6QBGoUuM4Nfkga
s6V1mnsAlSS1j8EtSy5QK/oh4bpltkqTmK9vi/9uS2yRJDbfg4G5EPBt4DjAVEW/seuhwoFnzkyRJpSw2q+cg4NDieYd1bf8hcHZVRUmSqrNg8GfmZ4HPRsRlmXn3kGqSNObmrvIJzTzRaVyVncf/6IjYCKzpfk1mvrCKoiSNt+5VPoHGnug0rsoG/78B7wHeC8xUV46ktnCVz/qUDf6HMvPSSiuRNHa6163Ztm0bmc3+uW1RNvg/FhF/CFwNPDi7MTPvr6QqSWOhe92aXXfdxmGPO7nRP7ctygb/ucX3N3VtS+AJgy1H0riZXbfmgd3bR+LntkGp4M/ME6suRJI0HKWCPyJeNd/2zPzAYMuRVMYorPmu5irb1XNq1+2DgdOAWwCDX6rBKKz5ruYq29Xz2u77EXEE8M+VVCSplKav+a7mWuqFWH4MnDTIQiSNh3GYajnuXWll+/g/RmcWD3QWZzsZuKKqoiSNrnGYajnuXWllW/x/2XX7IeDuzLy3gnokjYEmT7Us25of5660sn38n42IY9g3yHtXdSVJUnXGvTVfRtkrcL0c+CLwMuDlwE0R4bLMkkbSbGt+5eTqukupRdmunj8FTs3MnQARMQX8B3BlVYVJGpxxH6ycTxXveVw+x7LBf8Bs6Be+hxdql0ZGG7s3qnjP4/I5lg3+T0TEJ4HLi/u/CVxf5oURMQFsAb6TmS+OiKOAD9NZ2/9bwMsz8/v9FC2pf+M8WNlLFe95ED+z7iOHBVvtEbE2Ip6bmW8C/gH4BeCpwOeBjSX38Tpga9f9C4HNmXkSsLm4L0mtMXvk8IYP38qGS657xB+BYVisu+Zi4EcAmXlVZr4xM99Ap7V/8WI/PCIeC5xB5wIus84ENhW3NwFn9VWxJI2BOgeYFwv+NZl5+9yNmbmFTlfNYi4G3gw83LXtmMzcUfycHcDR870wIjZExJaI2LJr164Su5IklbFY8B+8wGOHLPTCiHgxsDMzb+67KiAzN2bm+sxcPzU1tZQfIUmax2KDu1+KiFdn5j92b4yI84HFAv25wK9HxIvo/AE5PCL+BbgvIo7LzB0RcRywc8GfImmg8uGH2bZt29778w0sdj9nVNfbqVqZz7GpFgv+1wNXR8TvsC/o1wMHAS9d6IWZ+VbgrQAR8XzgTzLzFRHxLjpX9Lqo+P7RJdYuaQn23P9d3nHN3aw6/gc9pyR2P2dU19upWpnPsakWDP7MvA94TkS8ADil2HxdZt6wjH1eBFxRHDXcQ+dsYElDtHJVZ2BxoZb97HOGsd7OqB6FzH5Gy1HH+yq7Vs+NwI1L3Ulmfgb4THH7e3Qu5CKpZk1p2bf5KKSO97XU9fgljYlhtuzL1FHmOXXXOmjDfl8GvzSmxuGCKKqGwS+NqXG4IIqqYfBLY2RuK3/FkLoQBnl00cRB3KVq6pRPg18aI3W18ge533EaxG3qlE+DXxozdV32cJD7HadB3EFM+Rw0g1/SkoxTl8wwNOnzMvglLck4dckMQ5M+L4NfaoC6L8yxVE3rkmlSq3o+Tfm8DH6pAcblkn51a1KruskMfqkhFruknydkldOUVnWTGfzSiPCELA3KYhdikdQgs0cFhxw574XrpFJs8UsN0z1AOTMzA8DExITdOxVo6pm1VTP4pYaZO0B5wIojWHX8iXbvVKCpZ9ZWzeCXGqh7gPLAQ1c5WFmhxc6sbfoU0aUw+CVpAeM4RdTBXUlaxOxRwbgMqtvil2rivPzhqbq7ZtS6gwx+qSbOyx+eqrtrRq07yK4eqUbOyx+eqrtrRqk7yOCXpJaxq0cacaPWv6z6GfzSiBu1/mXVz64eaQyMUv+y6meLX6rQ3AusQHvWg1FzGfxShbqnbAKtWg9GzWXwSxVb7AIr0rDZxy9JLWPwS1LLGPyS1DIGvyS1jMEvSS1j8EtSyxj8ktQyBr8ktYzBL0kt45m7kkS7lrc2+CWJdi1vXVlXT0Q8LiJujIitEXFnRLyu2H5URHw6Iu4qvh9
ZVQ2S1I+2LG9dZR//Q8AfZ+bJwLOACyLiycCFwObMPAnYXNyXJA1JZcGfmTsy85bi9o+ArcDxwJnApuJpm4CzqqpBkrS/ofTxR8Qa4GnATcAxmbkDOn8cImK8j6mkLm0aQFRzVR78EXEo8BHg9Zn5w4go+7oNwAaAxz/+8dUVKA1RmwYQ1VyVzuOPiEfRCf0PZuZVxeb7IuK44vHjgJ3zvTYzN2bm+sxcPzU1VWWZ0lC1ZQBRzVXlrJ4A3gdszcy/7nroWuDc4va5wEerqkGStL8qu3qeC7wS+EpE3FZsextwEXBFRJwP3AO8rMIaJElzVBb8mflfQK8O/dOq2q8kaWGu1SNJLWPwS1LLGPyS1DIu0iYNwMzMDNPT03vvr127lomJiRorknoz+KUBmJ6eZsMl17FycjV7dm9n4wVnsG7durrLkuZl8EsDsnKyc2KW1HQGv1qtTBdNv904rsejpjP41Wplumj67cZxPR41ncGv1ivTRTP7nO7WPPRu/c+ux/PA7u0Dr1daLoNf6kN3a95BXI0qg1/q02xrXhpVnsAlSS1j8EtSy9jVo5HUhDNlnbapUWXwayQ14UxZp21qVBn8GllVninbfUSxUGveaZsaRQa/NI/uIwpb8xo3Du5KPcweUXhRdI0bW/waK91dNDMzMwB7B31dKlnqMPg1VuZ20Ryw4ghWHX+iZ9lKXQx+jZ3ZLpoHdm/nwENX7TcA3Gvg1umZaguDX63Ta+DW6ZlqCwd31Uq9Bm5np2c6oKtxZotfI88uGqk/Br9Gnl00Un8Mfi3ZctfLGeR6O4udQetRgbSPwa8lW+56OcNcb8ejAmkfg3+ZmrBKZC/DqG256+Us9vphHhVIbWHwL1MTVonspcm1lTUO70FqGoN/AKpcJXK5mlxbWePwHqQmMfhHVL9dIN2Dm2WeX2a/ZQdJ+611KfuQVJ7BP6L67QLpHtxcTpfJUpYr7rdWl0SWqmXwj5C5LeEVq/rrAlnZ5/N7/pzJxQdJ+6117nTLFQ7ESpUx+EfIKLWE+63V6ZbS8LhWz4gZpYuD9Fur6+RIw2GLv4dhz89vwvkAy6lh7uBxr0HZ5ZxB69m30mAY/D0Me/54E+arL6eG7q4aoGd3zXK6dOwOkgZjrIO/Vwu2bMt2OfPHl9J6nm9/VU9t7DUI29267r6E4UI1dA8eLzQou5wzaD37Vlq+sQ7+Xi3YYbSuB7WPqgd0y16UZPYShra0pdE31sEPvVvts9vLnNi01JOfFmvBl9lfmamN/bbOy/787tb17CUMbWlLo6+W4I+I04F3AxPAezPzojrqgHInNg3q5CcodyTQb192v61z+8qldht68EfEBHAJ8KvAvcCXIuLazPzqsGuZVebEpkGd/ATlxg767cvut3VuX7nUXnW0+J8JTGfmNoCI+BBwJlBJ8O8pgm3P7u1s23YE0OnemN3+k+/v5IAHH+SHBz+67+fMfV6v1/e7vXt/o3h7HN6Dn4ufS9239+XX0wYXiIXIIU+GjoizgdMz8/eK+68EfikzXzPneRuADcXddcDXh1ro/iaB3TXXsByjXP8o1w7WX7c2139CZk7N3VhHiz/m2bbfX5/M3AhsrL6cciJiS2aur7uOpRrl+ke5drD+uln//upYsuFe4HFd9x8L2NEsSUNSR/B/CTgpIk6MiIOAc4Bra6hDklpp6F09mflQRLwG+CSd6Zzvz8w7h13HEjSm22mJRrn+Ua4drL9u1j/H0Ad3JUn1cllmSWoZg1+SWsbg7xIRp0fE1yNiOiIuXOB5p0bETHFOQmMsVn9EPD8ifhARtxVff1ZHnb2U+fyL93BbRNwZEZ8ddo0LKfH5v6nrs7+j+B06qo5a51Oi/iMi4mMR8eXi8z+vjjrnU6L2IyPi6oi4PSK+GBGn1FFnLxHx/ojYGRF39Hg8IuJvivd3e0Q8fVk7zEy/OuMcE8A3gCcABwFfBp7c43k3ANcDZ9dddz/1A88H/r3uWpdR/2PonOH
9+OL+0XXX3e/vT9fzXwLcUHfdfX7+bwPeWdyeAu4HDhqR2t8FvL24/SRgc911z6nvecDTgTt6PP4i4ON0zoN6FnDTcvZni3+fvUtJZOZPgdmlJOZ6LfARYOcwiyuhbP1NVab+3wauysx7ADKzSf8G/X7+vwVcPpTKyilTfwKHRUQAh9IJ/oeGW+a8ytT+ZGAzQGZ+DVgTEccMt8zeMvNzdD7PXs4EPpAdXwAeExHHLXV/Bv8+xwPf7rp/b7Ftr4g4Hngp8J4h1lXWovUXnl0cqn88Ip4ynNJKKVP/E4EjI+IzEXFzRLxqaNUtruznT0SsAE6n04BoijL1/x1wMp0TLr8CvC4zHx5OeQsqU/uXgd8AiIhnAifQOXl0VJT+/Spj7Nfj70OZpSQuBt6SmTOdRk+jlKn/FjprdzwQES8CrgFOqrqwksrUfyDwDOA04BDg8xHxhcz8n6qLK6HUUiSFlwD/nZkLtfCGrUz9vwbcBrwQ+Dng0xHxn5n5w4prW0yZ2i8C3h0Rt9H5o3UrzThaKauf369FGfz7lFlKYj3woSL0J4EXRcRDmXnNUCpc2KL1d/8HzczrI+LvI2IyM5uwgFWZz/9eYHdm7gH2RMTngKcCTQj+fpYiOYdmdfNAufrPAy7KTqfzdER8k05/+ReHU2JPZX/3z4POQCnwzeJrVAx2qZu6BzWa8kXnj+A24ET2DRA9ZYHnX0azBncXrR84ln0n7T0TuGf2ft1fJes/mU4/7YHACuAO4JS6a+/n9wc4gk5f7sq6a17C538p8I7i9jHAd4DJEan9MRQD0cCr6fSX1/65z6lxDb0Hd8/gkYO7X1zOvmzxF7LHUhIR8fvF403s19+rZP1nA38QEQ8BPwHOyeK3qm5l6s/MrRHxCeB24GE6V2+bd/rbsPXx+/NS4FPZOWppjJL1/zlwWUR8hU4AvSUbcLRYsvaTgQ9ExAydmWHn11bwPCLicjqz7iYj4l7g7cCjYG/919OZ2TMN/Jji6GXJ+2vI/3tJ0pA4q0eSWsbgl6SWMfglqWUMfklqGYNfklrG4Jd6iIhjI+JDEfGNiPhqRFwfEU8svq4vVkrcGhFXzK77EhG/XKz++LXia0Pd70Oay3n80jyKszuvBjZl5jnFtl+kc+LS+4E3ZubHiu0vAKaK1/wrcFZm3hIRk8AnI+I7mXldHe9Dmo8tfml+LwB+1n3iXmbeRmdto8/Phn6x/cbiRLILgMsy85Zi+27gzUDPaztIdTD4pfmdAtzcx3aAp8zz2JZiu9QYBr80OMH8KyZ6erwaxeCX5ncnnSWgy26ffWz9nG3PoLM2jNQYBr80vxuAR0fEq2c3RMSpdBbJek5EnNG1/fSI+HngEuB3i0FgImIV8E7gL4ZZuLQYF2mTeoiI1XQuvvMM4P+AbwGvp7MC5MV0LkbyMzqrhb4uM++LiOcBfwUcRqfr5+LMvHTIpUsLMvglqWXs6pGkljH4JallDH5JahmDX5JaxuCXpJYx+CWpZQx+SWqZ/wf4n2qK+lgVeAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "GOGO_result_unique_CCO = GOGO_result_unique['CCO'][GOGO_result_unique['CCO'] != 'NA'].astype('float')\n", + "sns.histplot(GOGO_result_unique_CCO, bins=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "13d7959f-32ce-4688-b387-22e1f9db6d09", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8254611161939618" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(GOGO_result_unique_CCO)" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "id": "d85ef2a1-e21d-4a1a-b5e9-926544becf5b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<AxesSubplot:xlabel='MFO', ylabel='Count'>" + ] + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "GOGO_result_unique_MFO = GOGO_result_unique['MFO'][GOGO_result_unique['MFO'] != 'NA'].astype('float')\n", + "sns.histplot(GOGO_result_unique_MFO, bins=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "id": "7a07389d-92e8-42c4-9331-412b5fd29079", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7859392819429791" + ] + }, + "execution_count": 151, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(GOGO_result_unique_MFO)" + ] + }, + { + "cell_type": "markdown", + "id": "7a66d7b6-84fd-43d9-ac17-d810ed63c11c", + "metadata": {}, + "source": [ + "### Check semantic similarity of GO terms of partial overlapping sequence-structure protein pairs with additional GO terms on either side" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "id": "84d8e039-e16d-4482-8552-9099501bf47e", + "metadata": {}, + "outputs": [], + "source": [ + "GOs_partial_expanded = GOs_partial[GOs_partial['overlap'] != 'partial overlap - unique GOs']" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "id": "b49002d7-ffb8-4e2c-a3bd-b0c16835318c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " 
.dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>GOs_struct</th>\n", + " <th>GOs_seq</th>\n", + " <th>overlap</th>\n", + " <th>coverage_struct</th>\n", + " <th>coverage_seq</th>\n", " </tr>\n", + " </thead>\n", + " <tbody>\n", " <tr>\n", - " <th>14652</th>\n", - " <td>[GO:0005575, GO:0005622, GO:0005623, GO:0044464]</td>\n", - " <td>[GO:0007275, GO:0007399, GO:0008150, GO:000998...</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + " <th>11</th>\n", + " <td>[GO:0000166, GO:0000323, GO:0001882, GO:000188...</td>\n", + " <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n", + " <td>partial overlap - structure GOs expanded</td>\n", + " <td>0.183246</td>\n", + " <td>1.000000</td>\n", " </tr>\n", " <tr>\n", - " <th>20744</th>\n", - " <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n", - " <td>[GO:0003674, GO:0003824, GO:0003964, GO:000613...</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + " <th>12</th>\n", + " <td>[GO:0003674, GO:0004857, GO:0005095, GO:000548...</td>\n", + " <td>[GO:0002376, GO:0002520, GO:0003674, GO:000485...</td>\n", + " <td>partial overlap - sequence GOs expanded</td>\n", + " <td>1.000000</td>\n", + " <td>0.742424</td>\n", " </tr>\n", " <tr>\n", - " <th>21097</th>\n", - " <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n", - " <td>[GO:0003674, GO:0003824, GO:0003924, GO:000548...</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + " <th>21</th>\n", + " <td>[GO:0000166, GO:0001501, GO:0001503, GO:000150...</td>\n", + " <td>[GO:0000041, GO:0000166, GO:0001501, GO:000150...</td>\n", + " <td>partial 
overlap - sequence GOs expanded</td>\n", + " <td>1.000000</td>\n", + " <td>0.896000</td>\n", " </tr>\n", " <tr>\n", - " <th>27083</th>\n", - " <td>[GO:0003674, GO:0003824, GO:0006629, GO:000815...</td>\n", - " <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + " <th>35</th>\n", + " <td>[GO:0000122, GO:0000166, GO:0000228, GO:000078...</td>\n", + " <td>[GO:0000122, GO:0000166, GO:0000228, GO:000078...</td>\n", + " <td>partial overlap - sequence GOs expanded</td>\n", + " <td>1.000000</td>\n", + " <td>0.969112</td>\n", " </tr>\n", " <tr>\n", - " <th>28309</th>\n", - " <td>[GO:0000075, GO:0000077, GO:0000278, GO:000028...</td>\n", - " <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + " <th>66</th>\n", + " <td>[GO:0003674, GO:0003824, GO:0004364, GO:000548...</td>\n", + " <td>[GO:0001101, GO:0001885, GO:0002064, GO:000315...</td>\n", + " <td>partial overlap - sequence GOs expanded</td>\n", + " <td>1.000000</td>\n", + " <td>0.675159</td>\n", " </tr>\n", " <tr>\n", - " <th>35725</th>\n", - " <td>[GO:0003008, GO:0005575, GO:0005623, GO:000727...</td>\n", - " <td>[GO:0003674, GO:0005488, GO:0005515, GO:0005516]</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", " </tr>\n", " <tr>\n", - " <th>36403</th>\n", - " <td>[GO:0003674, GO:0003779, GO:0005488, GO:000551...</td>\n", - " <td>[GO:0006355, GO:0008150, GO:0009889, GO:000989...</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + " <th>41924</th>\n", + " <td>[GO:0000278, GO:0001726, GO:0002090, GO:000209...</td>\n", + " <td>[GO:0000278, GO:0001726, GO:0002090, GO:000209...</td>\n", + " <td>partial overlap - sequence GOs expanded</td>\n", + " <td>1.000000</td>\n", + " <td>0.825397</td>\n", " 
</tr>\n", " <tr>\n", - " <th>38385</th>\n", - " <td>[GO:0007154, GO:0007267, GO:0008150, GO:000960...</td>\n", - " <td>[GO:0005575, GO:0005576]</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + " <th>41925</th>\n", + " <td>[GO:0000049, GO:0000166, GO:0001514, GO:000188...</td>\n", + " <td>[GO:0001514, GO:0003674, GO:0003676, GO:000372...</td>\n", + " <td>partial overlap - structure GOs expanded</td>\n", + " <td>0.729167</td>\n", + " <td>1.000000</td>\n", " </tr>\n", " <tr>\n", - " <th>38735</th>\n", - " <td>[GO:0000322, GO:0000323, GO:0000324, GO:000032...</td>\n", - " <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + " <th>41931</th>\n", + " <td>[GO:0000775, GO:0003674, GO:0005488, GO:000551...</td>\n", + " <td>[GO:0000775, GO:0003674, GO:0005488, GO:000551...</td>\n", + " <td>partial overlap - structure GOs expanded</td>\n", + " <td>0.944785</td>\n", + " <td>1.000000</td>\n", " </tr>\n", " <tr>\n", - " <th>39987</th>\n", - " <td>[GO:0000322, GO:0000323, GO:0000324, GO:000032...</td>\n", - " <td>[GO:0003674, GO:0005488, GO:0005515]</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + " <th>41936</th>\n", + " <td>[GO:0000003, GO:0000165, GO:0000166, GO:000156...</td>\n", + " <td>[GO:0000003, GO:0000165, GO:0001654, GO:000170...</td>\n", + " <td>partial overlap - structure GOs expanded</td>\n", + " <td>0.386813</td>\n", + " <td>1.000000</td>\n", " </tr>\n", " <tr>\n", - " <th>41192</th>\n", - " <td>[GO:0005575, GO:0005622, GO:0005623, GO:000563...</td>\n", - " <td>[GO:0003674, GO:0005488, GO:0005515, GO:0042802]</td>\n", - " <td>no overlap</td>\n", - " <td>0.0</td>\n", - " <td>0.0</td>\n", + " <th>41942</th>\n", + " <td>[GO:0000151, GO:0000209, GO:0000226, GO:000367...</td>\n", + " <td>[GO:0000151, GO:0000209, GO:0003674, GO:000382...</td>\n", + " <td>partial overlap - structure GOs expanded</td>\n", + " <td>0.795918</td>\n", + " 
<td>1.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", + "<p>2203 rows × 5 columns</p>\n", "</div>" ], "text/plain": [ " GOs_struct \\\n", - "737 [GO:0000322, GO:0000323, GO:0000324, GO:000032... \n", - "1341 [GO:0003197, GO:0003205, GO:0003279, GO:000727... \n", - "1715 [GO:0000166, GO:0003674, GO:0003824, GO:000548... \n", - "2818 [GO:0005575, GO:0005623, GO:0005886, GO:001602... \n", - "4012 [GO:0003674, GO:0005488, GO:0005515, GO:001989... \n", - "4937 [GO:0003674, GO:0005488, GO:0005515, GO:001990... \n", - "14652 [GO:0005575, GO:0005622, GO:0005623, GO:0044464] \n", - "20744 [GO:0005575, GO:0005622, GO:0005623, GO:000573... \n", - "21097 [GO:0005575, GO:0005622, GO:0005623, GO:000573... \n", - "27083 [GO:0003674, GO:0003824, GO:0006629, GO:000815... \n", - "28309 [GO:0000075, GO:0000077, GO:0000278, GO:000028... \n", - "35725 [GO:0003008, GO:0005575, GO:0005623, GO:000727... \n", - "36403 [GO:0003674, GO:0003779, GO:0005488, GO:000551... \n", - "38385 [GO:0007154, GO:0007267, GO:0008150, GO:000960... \n", - "38735 [GO:0000322, GO:0000323, GO:0000324, GO:000032... \n", - "39987 [GO:0000322, GO:0000323, GO:0000324, GO:000032... \n", - "41192 [GO:0005575, GO:0005622, GO:0005623, GO:000563... \n", + "11 [GO:0000166, GO:0000323, GO:0001882, GO:000188... \n", + "12 [GO:0003674, GO:0004857, GO:0005095, GO:000548... \n", + "21 [GO:0000166, GO:0001501, GO:0001503, GO:000150... \n", + "35 [GO:0000122, GO:0000166, GO:0000228, GO:000078... \n", + "66 [GO:0003674, GO:0003824, GO:0004364, GO:000548... \n", + "... ... \n", + "41924 [GO:0000278, GO:0001726, GO:0002090, GO:000209... \n", + "41925 [GO:0000049, GO:0000166, GO:0001514, GO:000188... \n", + "41931 [GO:0000775, GO:0003674, GO:0005488, GO:000551... \n", + "41936 [GO:0000003, GO:0000165, GO:0000166, GO:000156... \n", + "41942 [GO:0000151, GO:0000209, GO:0000226, GO:000367... \n", + "\n", + " GOs_seq \\\n", + "11 [GO:0005575, GO:0005622, GO:0005623, GO:000573... 
\n", + "12 [GO:0002376, GO:0002520, GO:0003674, GO:000485... \n", + "21 [GO:0000041, GO:0000166, GO:0001501, GO:000150... \n", + "35 [GO:0000122, GO:0000166, GO:0000228, GO:000078... \n", + "66 [GO:0001101, GO:0001885, GO:0002064, GO:000315... \n", + "... ... \n", + "41924 [GO:0000278, GO:0001726, GO:0002090, GO:000209... \n", + "41925 [GO:0001514, GO:0003674, GO:0003676, GO:000372... \n", + "41931 [GO:0000775, GO:0003674, GO:0005488, GO:000551... \n", + "41936 [GO:0000003, GO:0000165, GO:0001654, GO:000170... \n", + "41942 [GO:0000151, GO:0000209, GO:0003674, GO:000382... \n", "\n", - " GOs_seq overlap \\\n", - "737 [GO:0003674, GO:0005488, GO:0005515] no overlap \n", - "1341 [GO:0005575, GO:0005576, GO:0005615, GO:000562... no overlap \n", - "1715 [GO:0005575, GO:0005618, GO:0005622, GO:000562... no overlap \n", - "2818 [GO:0008150, GO:0009966, GO:0009967, GO:001064... no overlap \n", - "4012 [GO:0000139, GO:0005575, GO:0005622, GO:000562... no overlap \n", - "4937 [GO:0000228, GO:0000785, GO:0000790, GO:000557... no overlap \n", - "14652 [GO:0007275, GO:0007399, GO:0008150, GO:000998... no overlap \n", - "20744 [GO:0003674, GO:0003824, GO:0003964, GO:000613... no overlap \n", - "21097 [GO:0003674, GO:0003824, GO:0003924, GO:000548... no overlap \n", - "27083 [GO:0005575, GO:0005622, GO:0005623, GO:000573... no overlap \n", - "28309 [GO:0003674, GO:0005488, GO:0005515] no overlap \n", - "35725 [GO:0003674, GO:0005488, GO:0005515, GO:0005516] no overlap \n", - "36403 [GO:0006355, GO:0008150, GO:0009889, GO:000989... 
no overlap \n", - "38385 [GO:0005575, GO:0005576] no overlap \n", - "38735 [GO:0003674, GO:0005488, GO:0005515] no overlap \n", - "39987 [GO:0003674, GO:0005488, GO:0005515] no overlap \n", - "41192 [GO:0003674, GO:0005488, GO:0005515, GO:0042802] no overlap \n", + " overlap coverage_struct coverage_seq \n", + "11 partial overlap - structure GOs expanded 0.183246 1.000000 \n", + "12 partial overlap - sequence GOs expanded 1.000000 0.742424 \n", + "21 partial overlap - sequence GOs expanded 1.000000 0.896000 \n", + "35 partial overlap - sequence GOs expanded 1.000000 0.969112 \n", + "66 partial overlap - sequence GOs expanded 1.000000 0.675159 \n", + "... ... ... ... \n", + "41924 partial overlap - sequence GOs expanded 1.000000 0.825397 \n", + "41925 partial overlap - structure GOs expanded 0.729167 1.000000 \n", + "41931 partial overlap - structure GOs expanded 0.944785 1.000000 \n", + "41936 partial overlap - structure GOs expanded 0.386813 1.000000 \n", + "41942 partial overlap - structure GOs expanded 0.795918 1.000000 \n", "\n", - " coverage_struct coverage_seq \n", - "737 0.0 0.0 \n", - "1341 0.0 0.0 \n", - "1715 0.0 0.0 \n", - "2818 0.0 0.0 \n", - "4012 0.0 0.0 \n", - "4937 0.0 0.0 \n", - "14652 0.0 0.0 \n", - "20744 0.0 0.0 \n", - "21097 0.0 0.0 \n", - "27083 0.0 0.0 \n", - "28309 0.0 0.0 \n", - "35725 0.0 0.0 \n", - "36403 0.0 0.0 \n", - "38385 0.0 0.0 \n", - "38735 0.0 0.0 \n", - "39987 0.0 0.0 \n", - "41192 0.0 0.0 " + "[2203 rows x 5 columns]" ] }, - "execution_count": 267, + "execution_count": 153, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "GOs[GOs['overlap'] == 'no overlap']" + "GOs_partial_expanded" ] }, { "cell_type": "markdown", - "id": "d38d20e1-6644-4215-998d-6ba2213fcd2c", + "id": "c4bdd8f6-0fb7-4a46-bf87-c332aa5e255f", "metadata": {}, "source": [ - "Looking more closely at these cases, one realizes that these are similar GO terms all over, such as GO:0005575 or GO:0003674. This happenes on both sides. 
There is something weird about it. Niko, please have a look!" + "Create a table that can be used as an input for GOGO (https://www.nature.com/articles/s41598-018-33219-y):" ] }, { "cell_type": "code", - "execution_count": 268, - "id": "1ff24373-d621-4e76-8c2b-3318a6f17577", + "execution_count": 154, + "id": "e1d45be9-ee8c-4b86-9163-ba054ba9ed5c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "protein_id 63159\n", - "eggNOG_OGs_struct COG5096@1|root,KOG1061@2759|Eukaryota,37I5T@33...\n", - "MSA size 4182.0\n", - "alignment length 841.0\n", - "query length 790.0\n", - "seq. id. 0.361\n", - "bit score 2903.0\n", - "plddt 82.705063\n", - "complete_protein False\n", - "Preferred_name_struct -\n", - "Description_struct Subunit of clathrin-associated adaptor protein...\n", - "GOs_struct GO:0003674,GO:0005488,GO:0005515,GO:0019899,GO...\n", - "eggNOG_OGs_seq COG5096@1|root,KOG1061@2759|Eukaryota,38E7X@33...\n", - "score 733.0\n", - "Preferred_name_seq AP4B1\n", - "Description_seq clathrin binding\n", - "GOs_seq GO:0000139,GO:0005575,GO:0005622,GO:0005623,GO...\n", - "Name: 4012, dtype: object" - ] - }, - "execution_count": 268, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], + "source": [ + "GOs_partial_expanded_GOGO = GOs_partial_expanded[['GOs_struct', 'GOs_seq']]" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "id": "5c758b4e-9833-40eb-910c-51b8e2ee1216", + "metadata": {}, + "outputs": [], "source": [ - "annotations_complete.iloc[4012]" + "# iterate over each row and each column\n", + "for i, row in GOs_partial_expanded_GOGO.iterrows():\n", + " for col in GOs_partial_expanded_GOGO.columns:\n", + " # separate the elements in the list with a space\n", + " GOs_partial_expanded_GOGO.at[i, col] = \" \".join(str(x) for x in row[col])\n", + " # add the index of the row to the beginning of the list\n", + " GOs_partial_expanded_GOGO.at[i, 'GOs_struct'] = str(i) + \"_struct\" + \" \" + 
GOs_partial_expanded_GOGO.at[i, 'GOs_struct']\n", + " GOs_partial_expanded_GOGO.at[i, 'GOs_seq'] = str(i) + \"_seq\" + \" \" + GOs_partial_expanded_GOGO.at[i, 'GOs_seq']\n", + "\n", + "# save the dataframe to a txt file without the header and index\n", + "GOs_partial_expanded_GOGO.to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/GOGO_partial_expanded_input.txt\", sep=\";\", index=False, header=False)" ] }, { "cell_type": "markdown", - "id": "035867d4-3369-4e99-aaed-a99c2994e468", + "id": "74702fd4-be39-4062-915f-0f367c2e5afc", "metadata": {}, "source": [ - "This is an example in which the GO terms do not overlap. However, the description is very similar and the OGs are the same to the Eukaryotic level. " + "I ran GOGO locally: `perl gene_pair_comb.pl ~/Desktop/GOGO_partial_expanded_input.txt ~/Desktop/GOGO_partial_expanded_input_result.txt`" ] }, { "cell_type": "code", - "execution_count": 270, - "id": "34504af3-0021-4b2f-a856-e642ece7c88a", + "execution_count": 156, + "id": "b2670268-3990-430d-bb82-e7bba51bf81b", "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "__init__() missing 2 required positional arguments: 'go2obj' and 'annots'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_278/2809043968.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;31m# Create a TermCounts object to store information about the GO terms\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mterm_counts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTermCounts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;31m# Calculate the information content of each GO term in the lists\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: __init__() missing 2 required positional arguments: 'go2obj' and 'annots'" - ] - } - ], + "outputs": [], "source": [ - "from goatools.semantic import TermCounts, get_info_content\n", - "\n", - "# Define your two lists of GO terms\n", - "list1 = ['GO:0000001', 'GO:0000002', 'GO:0000003']\n", - "list2 = ['GO:0000004', 'GO:0000005', 'GO:0000006']\n", - "\n", - "# Create a TermCounts object to store information about the GO terms\n", - "term_counts = TermCounts()\n", - "\n", - "# Calculate the information content of each GO term in the lists\n", - "ic1 = [get_info_content(go, term_counts) for go in list1]\n", - "ic2 = [get_info_content(go, term_counts) for go in list2]\n", - "\n", - "# Calculate the semantic similarity between the two lists of GO terms using the Resnik measure\n", - "similarity = sum([ic1[i] * ic2[i] for i in range(len(list1))])\n", + "GOGO_result_expanded = pd.read_csv('/g/arendt/Fabian/PhD/Computational/Spongefold/GOGO_partial_expanded_input_result.txt', sep=';', header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "id": "55340e34-3b2f-48b3-b4cf-f96e50b56504", + "metadata": {}, + "outputs": [], + "source": [ + "GOGO_result_expanded.rename(columns={0 :'GOs_struct', 1 :'GOs_seq'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "4aa34798-6754-4b95-a61a-653bf5fc235f", + "metadata": {}, + "outputs": [], + "source": [ + "GOGO_result_expanded['GOs_struct'] = GOGO_result_expanded['GOs_struct'].str.split(' ').str[1:]" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "id": "8553a1e2-868e-4f19-96bf-c840ca7a6db9", + "metadata": {}, + "outputs": [], + "source": [ + "GOGO_result_expanded['GOs_seq'] = 
GOGO_result_expanded['GOs_seq'].str.split(' ').str[1:]" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "id": "81a05e12-6558-4867-9796-79bef9b3e3de", + "metadata": {}, + "outputs": [], + "source": [ + "# create a new column in the dataframe\n", + "GOGO_result_expanded['BPO'] = None\n", + "GOGO_result_expanded['CCO'] = None\n", + "GOGO_result_expanded['MFO'] = None\n", "\n", - "# Print the semantic similarity score\n", - "print(similarity)" + "# iterate over each row in the dataframe\n", + "for i, row in GOGO_result_expanded.iterrows():\n", + " # get the fifth last element of the list in col1\n", + " # and store it in the new column\n", + " GOGO_result_expanded.at[i, 'BPO'] = row['GOs_seq'][-5]\n", + " GOGO_result_expanded.at[i, 'CCO'] = row['GOs_seq'][-3]\n", + " GOGO_result_expanded.at[i, 'MFO'] = row['GOs_seq'][-1]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "f388fa0c-5b4d-49ce-9b48-386d3f33acfe", + "execution_count": 161, + "id": "ea64dbe5-1c66-46e9-95df-7ed602c6eb47", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>GOs_struct</th>\n", + " <th>GOs_seq</th>\n", + " <th>BPO</th>\n", + " <th>CCO</th>\n", + " <th>MFO</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>[GO:0000166, GO:0000323, GO:0001882, GO:000188...</td>\n", + " <td>[GO:0005575, GO:0005622, GO:0005623, GO:000573...</td>\n", + " <td>0.594</td>\n", + " <td>0.829</td>\n", + " <td>NA</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>[GO:0003674, GO:0004857, GO:0005095, 
GO:000548...</td>\n", + " <td>[GO:0002376, GO:0002520, GO:0003674, GO:000485...</td>\n", + " <td>0.888</td>\n", + " <td>0.942</td>\n", + " <td>1.000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>[GO:0000166, GO:0001501, GO:0001503, GO:000150...</td>\n", + " <td>[GO:0000041, GO:0000166, GO:0001501, GO:000150...</td>\n", + " <td>0.981</td>\n", + " <td>0.966</td>\n", + " <td>0.993</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>[GO:0000122, GO:0000166, GO:0000228, GO:000078...</td>\n", + " <td>[GO:0000122, GO:0000166, GO:0000228, GO:000078...</td>\n", + " <td>0.990</td>\n", + " <td>1.000</td>\n", + " <td>1.000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>[GO:0003674, GO:0003824, GO:0004364, GO:000548...</td>\n", + " <td>[GO:0001101, GO:0001885, GO:0002064, GO:000315...</td>\n", + " <td>0.920</td>\n", + " <td>0.887</td>\n", + " <td>0.930</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2198</th>\n", + " <td>[GO:0000278, GO:0001726, GO:0002090, GO:000209...</td>\n", + " <td>[GO:0000278, GO:0001726, GO:0002090, GO:000209...</td>\n", + " <td>0.951</td>\n", + " <td>0.968</td>\n", + " <td>1.000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2199</th>\n", + " <td>[GO:0000049, GO:0000166, GO:0001514, GO:000188...</td>\n", + " <td>[GO:0001514, GO:0003674, GO:0003676, GO:000372...</td>\n", + " <td>1.000</td>\n", + " <td>0.903</td>\n", + " <td>0.808</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2200</th>\n", + " <td>[GO:0000775, GO:0003674, GO:0005488, GO:000551...</td>\n", + " <td>[GO:0000775, GO:0003674, GO:0005488, GO:000551...</td>\n", + " <td>0.999</td>\n", + " <td>0.968</td>\n", + " <td>1.000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2201</th>\n", + " <td>[GO:0000003, GO:0000165, GO:0000166, GO:000156...</td>\n", + " <td>[GO:0000003, GO:0000165, GO:0001654, GO:000170...</td>\n", + 
" <td>0.823</td>\n", + " <td>0.721</td>\n", + " <td>0.704</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2202</th>\n", + " <td>[GO:0000151, GO:0000209, GO:0000226, GO:000367...</td>\n", + " <td>[GO:0000151, GO:0000209, GO:0003674, GO:000382...</td>\n", + " <td>0.974</td>\n", + " <td>0.827</td>\n", + " <td>0.988</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>2203 rows × 5 columns</p>\n", + "</div>" + ], + "text/plain": [ + " GOs_struct \\\n", + "0 [GO:0000166, GO:0000323, GO:0001882, GO:000188... \n", + "1 [GO:0003674, GO:0004857, GO:0005095, GO:000548... \n", + "2 [GO:0000166, GO:0001501, GO:0001503, GO:000150... \n", + "3 [GO:0000122, GO:0000166, GO:0000228, GO:000078... \n", + "4 [GO:0003674, GO:0003824, GO:0004364, GO:000548... \n", + "... ... \n", + "2198 [GO:0000278, GO:0001726, GO:0002090, GO:000209... \n", + "2199 [GO:0000049, GO:0000166, GO:0001514, GO:000188... \n", + "2200 [GO:0000775, GO:0003674, GO:0005488, GO:000551... \n", + "2201 [GO:0000003, GO:0000165, GO:0000166, GO:000156... \n", + "2202 [GO:0000151, GO:0000209, GO:0000226, GO:000367... \n", + "\n", + " GOs_seq BPO CCO MFO \n", + "0 [GO:0005575, GO:0005622, GO:0005623, GO:000573... 0.594 0.829 NA \n", + "1 [GO:0002376, GO:0002520, GO:0003674, GO:000485... 0.888 0.942 1.000 \n", + "2 [GO:0000041, GO:0000166, GO:0001501, GO:000150... 0.981 0.966 0.993 \n", + "3 [GO:0000122, GO:0000166, GO:0000228, GO:000078... 0.990 1.000 1.000 \n", + "4 [GO:0001101, GO:0001885, GO:0002064, GO:000315... 0.920 0.887 0.930 \n", + "... ... ... ... ... \n", + "2198 [GO:0000278, GO:0001726, GO:0002090, GO:000209... 0.951 0.968 1.000 \n", + "2199 [GO:0001514, GO:0003674, GO:0003676, GO:000372... 1.000 0.903 0.808 \n", + "2200 [GO:0000775, GO:0003674, GO:0005488, GO:000551... 0.999 0.968 1.000 \n", + "2201 [GO:0000003, GO:0000165, GO:0001654, GO:000170... 0.823 0.721 0.704 \n", + "2202 [GO:0000151, GO:0000209, GO:0003674, GO:000382... 
0.974 0.827 0.988 \n", + "\n", + "[2203 rows x 5 columns]" + ] + }, + "execution_count": 161, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "GOGO_result_expanded" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "id": "0be39243-e149-430c-8dd9-6dc46c3bac81", + "metadata": {}, + "outputs": [], + "source": [ + "GOGO_result_expanded_BPO = GOGO_result_expanded['BPO'][GOGO_result_expanded['BPO'] != 'NA'].astype('float')" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "id": "22a18199-4907-4423-a321-2d2c98517bad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<AxesSubplot:xlabel='BPO', ylabel='Count'>" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.histplot(GOGO_result_expanded_BPO, bins=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "id": "a0cd16ae-8fb2-49fa-ae00-d4f0c651b781", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8768518696069056" + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(GOGO_result_expanded_BPO)" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "id": "1125e2fe-6176-4a50-a144-7567bb5a366b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<AxesSubplot:xlabel='CCO', ylabel='Count'>" + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAEGCAYAAACKB4k+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAW0UlEQVR4nO3de7SldX3f8feHQUCugnPAYaAMlAkRbL1kJF5ao9KWWZoU0iWuSVpFF8mstiTV2mWEtKsmq51V7CUL24rpVK1jGqVTozIqwdDx1nYZcVC8DJdyBIHpEGbAW0SLMn77x37Ok82Zc8485/Lsfc7M+7XWWXvv334u3985M89nP7/nslNVSJIEcNS4C5AkLR+GgiSpZShIklqGgiSpZShIklpHj7uAxVi9enWtW7du3GVI0opy++23P1pVEzO9t6JDYd26dezatWvcZUjSipLkgdnec/hIktQyFCRJLUNBktQyFCRJLUNBktQyFCRJLUNBktQyFCRJrRV98ZokHSkOHDjA5ORk+/r8889n1apVS74eQ0GSVoDJyUk2v+uTnLD6TB5/dC9br341F1xwwZKvx1CQpBXihNVncvKzzul1HR5TkCS1DAVJUstQkCS1DAVJUstQkCS1DAVJUstQkCS1eg2FJM9I8uEkdye5K8mLk5yW5NYk9zaPpw5Nf22SyST3JLm0z9okSQfre0/hncAtVfWzwHOBu4BrgJ1VtR7Y2bwmyYXAJuAiYCNwQ5Klv4ZbkjSr3kIhycnAy4D3AlTVj6vqu8BlwLZmsm3A5c3zy4Abq+qJqrofmAQu7qs+SdLB+txTOA/YD/yXJF9J8p4kJwBnVNXDAM3j6c30a4GHhubf07RJkkakz1A4GngB8O6qej7wOM1Q0SwyQ1sdNFGyOcmuJLv279+/NJVKkoB+Q2EPsKeqvti8/jCDkHgkyRqA5nHf0PRnD81/FrB3+kKramtVbaiqDRMTE70VL0lHot5Coar+DHgoydS9XS8B7gR2AFc2bVcCNzXPdwCbkhyb5FxgPXBbX/VJkg7W962zfxP4wyTHAPcBb2QQRNuTXAU8CFwBUFW7k2xnEBxPAldX1YGe65MkDek1FKrqDmDDDG9dMsv0W4AtfdYkSZqdVzRLklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSp1WsoJPlWkq8nuSPJrqbttCS3Jrm3eTx1aPprk0wmuSfJpX3WJkk62Cj2FF5RVc+rqg3N62uAnVW1HtjZvCbJhcAm4CJgI3BDklUjqE+S1BjH8NFlwLbm+Tbg8qH2G6vqiaq6H5gELh59eZJ05Oo7FAr4kyS3J9nctJ1RVQ8DNI+nN+1rgYeG5t3TtD1Fks1JdiXZtX///h5Ll6Qjz9E9L/+lVbU3yenArUnunmPazNBWBzVUbQW2AmzYsOGg9yVJC9frnkJV7W0e9wEfZTAc9EiSNQDN475m8j3A2UOznwXs7bM+SdJT9RYKSU5IctLUc+BvAd8AdgBXNpNdCdzUPN8BbEpybJJzgfXAbX3VJ0k6WJ/DR2cAH00ytZ4PVtUtSb4EbE9yFfAgcAVAVe1Osh24E3gSuLqqDvRYnyRpmt5CoaruA547Q/tjwCWzzLMF2NJXTZKkuXlFsySpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSpZShIklqGgiSp1XsoJFmV5CtJPtG8Pi3JrUnubR5PHZr22iSTSe5JcmnftUmSnmoUewpvAu4aen0NsLOq1gM7m9ckuRDYBFwEbARuSLJqBPVJkhq9hkKSs4BXA+8Zar4M2NY83wZcPtR
+Y1U9UVX3A5PAxX3WJ0l6qr73FK4Hfgv46VDbGVX1MEDzeHrTvhZ4aGi6PU3bUyTZnGRXkl379+/vpWhJOlL1FgpJfhHYV1W3d51lhrY6qKFqa1VtqKoNExMTi6pRkvRUR/e47JcCfzvJq4DjgJOT/FfgkSRrqurhJGuAfc30e4Czh+Y/C9jbY32SpGl621Ooqmur6qyqWsfgAPKnq+rvATuAK5vJrgRuap7vADYlOTbJucB64La+6pMkHazPPYXZXAdsT3IV8CBwBUBV7U6yHbgTeBK4uqoOjKE+STpijSQUquqzwGeb548Bl8wy3RZgyyhqkiQdzCuaJUktQ0GS1DIUJEmtTqGQ5KVd2iRJK1vXPYX/0LFNkrSCzXn2UZIXAy8BJpK8ZeitkwFvVidJh5lDnZJ6DHBiM91JQ+3fB17TV1GSpPGYMxSq6nPA55K8v6oeGFFNkqQx6Xrx2rFJtgLrhuepqlf2UZQkaTy6hsJ/B36fwfcieOsJSTpMdQ2FJ6vq3b1WIkkau66npH48yT9Msqb5juXTkpzWa2WSpJHruqcwdavrtw61FXDe0pYjSRqnTqFQVef2XYgkafw6hUKS18/UXlUfWNpyJEnj1HX46IVDz49j8H0IXwYMBUk6jHQdPvrN4ddJTgH+oJeKJEljs9BbZ/+QwXcoS5IOI12PKXycwdlGMLgR3rOB7X0VJUkaj67HFP7t0PMngQeqak8P9UiSxqjT8FFzY7y7Gdwp9VTgx30WJUkaj67fvPZa4DbgCuC1wBeTeOtsSTrMdB0++qfAC6tqH0CSCeB/AB/uqzBJ0uh1PfvoqKlAaDw2j3klSStE1w37LUk+leQNSd4AfBK4ea4ZkhyX5LYkX02yO8nvNu2nJbk1yb3N46lD81ybZDLJPUkuXWinJEkLM2coJDk/yUur6q3AfwL+KvBc4AvA1kMs+wnglVX1XOB5wMYkLwKuAXZW1XpgZ/OaJBcCm4CLgI3ADUn8HmhJGqFD7SlcD/w5QFV9pKreUlX/mMFewvVzzVgDP2hePq35KeAyYFvTvg24vHl+GXBjVT1RVfcDk8DF8+mMJGlxDhUK66rqa9Mbq2oXg6/mnFOSVUnuAPYBt1bVF4EzqurhZjkPA6c3k68FHhqafU/TNn2Zm5PsSrJr//79hypBkjQPhwqF4+Z47+mHWnhVHaiq5wFnARcnec4ck2emRcywzK1VtaGqNkxMTByqBEnSPBwqFL6U5NenNya5Cri960qq6rvAZxkcK3gkyZpmOWsY7EXAYM/g7KHZzgL2dl2HJGnxDnWdwpuBjyb5u/xFCGwAjgF+ea4Zm2sZflJV303ydOBvAO8AdjD4Jrfrmsebmll2AB9M8nvAmQxuuHfbfDskSVq4OUOhqh4BXpLkFcDU0M8nq+rTHZa9BtjWnEF0FLC9qj6R5AvA9mZv40EGV0lTVbuTbAfuZHB/paur6sCCeiVJWpCu36fwGeAz81lwc4D6+TO0P8bgS3pmmmcLsGU+65EkLR2vSpYktQwFSVLLUJAktQwFSVLLUJAktQwFSVLLUJAktQwFSVLLUJAktQwFSVLLUJAktQwFSVLLUJAktQwFSVLLUJAktQwFSVLLUJAktQwFSVLLUJAktQwFSVLLUJAktQwFSVLLUJAktXoLhSRnJ/lMkruS7E7ypqb9tCS3Jrm3eTx1aJ5rk0wmuSfJpX3VJkmaWZ97Ck8C/6Sqng28CLg6yYXANcDOqloP7Gxe07y3CbgI2AjckGRVj/VJkqbpLRSq6uGq+nLz/M+Bu4C1wGXAtmaybcDlzfPLgBur6omquh+YBC7uqz5J0sFGckwhyTrg+cAXgTOq6mEYBAdwejPZWuChodn2NG3Tl7U5ya4ku/bv399r3ZJ0pOk9FJKcCPwR8Oaq+v5ck87QVgc1VG2tqg1VtWFiYmKpypQk0XMoJHkag0D4w6r6SNP8SJI1zftrgH1N+x7g7KHZzwL29lmfJOmp+jz7KMB
7gbuq6veG3toBXNk8vxK4aah9U5Jjk5wLrAdu66s+SdLBju5x2S8FXgd8PckdTdtvA9cB25NcBTwIXAFQVbuTbAfuZHDm0tVVdaDH+iRJ0/QWClX1v5j5OAHAJbPMswXY0ldNkqS5eUWzJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWoaCJKllKEiSWkePuwBJ0swOHDjA5OQkAPfddx9V/a+zt1BI8j7gF4F9VfWcpu004L8B64BvAa+tqu80710LXAUcAP5RVX2qr9okaSWYnJxk87s+yQmrz2T/vXdw0tnP7n2dfQ4fvR/YOK3tGmBnVa0HdjavSXIhsAm4qJnnhiSreqxNklaEE1afycnPOoenn3r6SNbXWyhU1eeBb09rvgzY1jzfBlw+1H5jVT1RVfcDk8DFfdUmSZrZqA80n1FVDwM0j1PRtxZ4aGi6PU3bQZJsTrIrya79+/f3WqwkHWmWy9lHmaFtxkMqVbW1qjZU1YaJiYmey5KkI8uoQ+GRJGsAmsd9Tfse4Oyh6c4C9o64Nkk64o06FHYAVzbPrwRuGmrflOTYJOcC64HbRlybJB3x+jwl9UPAy4HVSfYAbweuA7YnuQp4ELgCoKp2J9kO3Ak8CVxdVQf6qk2SNLPeQqGqfmWWty6ZZfotwJa+6pGk5Wr4IjWA888/n1WrxnNWvlc0S9IYTL9a+V/dfBcnTpzJ44/uZevVr+aCCy4YS12GgiSNwUxXK5/8rHPGXZahIEl9mmtoaOpq5R88unxOtjQUJKlHw3sEXYaG6qc/5b777gNGdxO8YYaCJPVsao+gi8e//Wf8zsce4Jlrvzeym+ANWy5XNEuSGic8c7Q3wRtmKEiSWg4fSdISG8eX4ywVQ0GSFmD6WUXwF2cWjePLcZaKoSBJCzC84QcOOrNoptNNx31mUReGgiQt0HzOKoLxn1nUhaEg6bDR9z2EluJYwdSZRcvpgrVhhoKkw8Z8LxRbzPKX6yf9xTIUJB1W5juks9DlT/+kvxKOF3RhKEha0ZbL6Z8r4XhBF4aCpBWtjyGd4aA5cGDwfV+rVq06ZOgs9+MFXRgKkla8pb7b6PSgOer4U3jm2nNX9B5AV4aCpGWlyxlEixkymm0vYPoewfFDn/qPPvGZK34PoCtDQdKSmG1jPt/TRLucQdRlyGj4wO/0Df7Ut5xN3ws4kvYIZmMoSIeRcX7X72wb84WcJjrTGUTT9w6OP8T4/fQDv9M3+DPtBRxJewSzMRSkw0jf5+lPN9uGerpDnSbaZThoIQeUTzhCh4AWw1CQDjMLPU+/617GbF8432VDPTykM7yOrhv85fj1lYcbQ0EaoT6Gd5bqoOvwBn54L2N6zTMFQdcN9fCQzg/27eHaV1/EeeedN+tw0OFyQdhKYijoiLaQjfRiDqgOfyIe3ih2XfdMZvuUPduB1rkOuk5t4KdvjKemAeYdBNMND+n8zsfumPNir8PlgrCVZNmFQpKNwDuBVcB7quq6MZekZW6+pzDOtlHsOgY/24Z9eFnD7XOd6ji1UZztk/nwvMN963LQda4DrbMddJ1t3qlpgCUduulysdfhcEHYSrKsQiHJKuBdwN8E9gBfSrKjqu5c6nXN9z/ebJ+0ukzfxzK7TN+1tvmub7Z1z/b7ne80862h68Z4ttMQZ/p0PNe6Z9uwT9/ADrfPdqrj1AZvtk/mw/POFkBzjsHPcqC1y0HXxWyMHfZZuZZVKAAXA5NVdR9AkhuBy4AlD4XJyUl+9Xf/M8efejrffuBuVh13EqecsZYffmcf//J1l7T/8f7ZH+w8aJr5Tt/HMrtM32U5wLzXN9u6h81WR5dp5lvDtx+4mxPXrudE4Effe5S3vfeWOaeZ7vHH9vL9447l0W9+jbd940ft7+VQ60vgR9/Zx1HHn3LQsqa3z7S
+H31nH0c98cRB656t1ul9G55mpmX28RzoNN1M/Wl/Xz3WN67nXX8vS/X88Uf3As8/5HZuIVLLKMKTvAbYWFW/1rx+HfDzVfUbQ9NsBjY3Ly8A7hl5of1YDTw67iKW2OHYJ7BfK439Otg5VTUx0xvLbU8hM7Q9JbWqaiuwdTTljE6SXVW1Ydx1LKXDsU9gv1Ya+zU/Ry31AhdpD3D20OuzAI8uSdKILLdQ+BKwPsm5SY4BNgE7xlyTJB0xltXwUVU9meQ3gE8xOCX1fVW1e8xljcphNyTG4dknsF8rjf2ah2V1oFmSNF7LbfhIkjRGhoIkqWUojFCSjUnuSTKZ5JoZ3n95ku8luaP5+efjqHO+DtWvZpqXN33aneRzo65xITr8vd469Lf6RpIDSU4bR63z0aFfpyT5eJKvNn+vN46jzvnq0K9Tk3w0ydeS3JbkOeOocz6SvC/JviTfmOX9JPn3TZ+/luQFi15pVfkzgh8GB86/CZwHHAN8Fbhw2jQvBz4x7lp76NczGFyV/pea16ePu+6l6Ne06X8J+PS4616iv9dvA+9onk8A3waOGXftS9CvfwO8vXn+s8DOcdfdoV8vA14AfGOW918F/DGDa7xeBHxxset0T2F02lt4VNWPgalbeKx0Xfr1q8BHqupBgKraN+IaF2K+f69fAT40ksoWp0u/CjgpSYATGYTCk6Mtc9669OtCYCdAVd0NrEtyxmjLnJ+q+jyD3/9sLgM+UAN/CjwjyZrFrNNQGJ21wENDr/c0bdO9uNlt/+MkF42mtEXp0q+fAU5N8tkktyd5/ciqW7iufy+SHA9sBP5oBHUtVpd+/Ufg2QwuHP068Kaq+uloyluwLv36KvB3AJJcDJzD4ALZlazzv9OultV1Coe5Q97CA/gyg3uS/CDJq4CPAev7LmyRuvTraODngEuApwNfSPKnVfV/+i5uEbr0a8ovAf+7qub6RLdcdOnXpcAdwCuBvwzcmuR/VtX3e65tMbr06zrgnUnuYBB2X2H57wEdynz+nXbinsLoHPIWHlX1/ar6QfP8ZuBpSVaPrsQF6XJrkj3ALVX1eFU9CnweeO6I6luo+dxyZRMrY+gIuvXrjQyG+6qqJoH7GYzBL2dd/3+9saqeB7yewfGS+0dWYT+W/NZAhsLoHPIWHkme1YzjTu3eHgU8NvJK56fLrUluAv56kqOboZafB+4acZ3z1emWK0lOAX6BQR9Xgi79epDBXh3NmPsFwH0sb13+fz2jeQ/g14DPL/O9ny52AK9vzkJ6EfC9qnp4MQt0+GhEapZbeCT5+837vw+8BvgHSZ4EfgRsquYUg+WqS7+q6q4ktwBfA37K4Bv1ZjzFbrno+PcC+GXgT6rq8TGVOi8d+/UvgPcn+TqD4Ym3NXt4y1bHfj0b+ECSAwzOhrtqbAV3lORDDM5KXJ1kD/B24GnQ9ulmBmcgTQI/ZLCXt7h1LvNtjiRphBw+kiS1DAVJUstQkCS1DAVJUstQkCS1DAVpnprrSW5M8s0kdya5OcnPND83N3esvCvJ9ql76yT5a82dOe9ufjaPux/STLxOQZqH5uLCjwLbqmpT0/Y84AzgfcBbqurjTfsrgIlmng8Cl1fVl5ur1D+V5P9W1SfH0Q9pNu4pSPPzCuAnQxevUVV3MLhH1RemAqFp/0xzkd7VwPur6stN+6PAbwEzfveENE6GgjQ/zwFun0c7wEUzvLeraZeWFUNB6l+Y+c6V3k5Ay46hIM3Pbga3Ae/aPvXehmltP8fg/jvSsmIoSPPzaeDYJL8+1ZDkhQxuSPaSJK8eat+Y5K8A7wLe0ByQJskzgXcA/3qUhUtdeEM8aZ6SnAlcz+DT/v8DvgW8mcHdOa9n8MU0P2FwV9g3VdUjSV4G/DvgJAbDSddX1btHXLp0SIaCJKnl8JEkqWUoSJJahoIkqWUoSJJahoIkqWUoSJJahoIkqfX/AWvg/P7EiWWSAAAAAElFTkSuQmCC\n", + 
"text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "GOGO_result_expanded_CCO = GOGO_result_expanded['CCO'][GOGO_result_expanded['CCO'] != 'NA'].astype('float')\n", + "sns.histplot(GOGO_result_expanded_CCO, bins=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "id": "c5d6aaf5-dbd8-4a7f-8f35-f1d6ed438100", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9267089588377744" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(GOGO_result_expanded_CCO)" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "id": "e92d5b43-f861-4fc5-809e-6f91a4744310", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<AxesSubplot:xlabel='MFO', ylabel='Count'>" + ] + }, + "execution_count": 167, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "GOGO_result_expanded_MFO = GOGO_result_expanded['MFO'][GOGO_result_expanded['MFO'] != 'NA'].astype('float')\n", + "sns.histplot(GOGO_result_expanded_MFO, bins=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "id": "a815cfe3-6610-4436-8eb0-cfbd90efe671", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9309306777030519" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(GOGO_result_expanded_MFO)" + ] + }, + { + "cell_type": "markdown", + "id": "b5caa65a-c2e1-4660-8c6a-b0455347e37f", + "metadata": {}, + "source": [ + "### Create a summary of the GO term semantic similarity results:" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "id": 
"24226d52-2db6-4c9e-af7a-e0d8921ff8d1", + "metadata": {}, + "outputs": [], + "source": [ + "GOGO_result_unique_results = GOGO_result_unique[['BPO', 'CCO', 'MFO']]" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "id": "effa91b7-eefc-4f44-89ca-91e4d35c1358", + "metadata": {}, + "outputs": [], + "source": [ + "GOGO_result_unique_results_long = GOGO_result_unique_results.melt(value_vars=['BPO', 'CCO', 'MFO'], var_name = 'GO ontology')\n", + "GOGO_result_unique_results_long = GOGO_result_unique_results_long[GOGO_result_unique_results_long['value'] != 'NA']\n", + "GOGO_result_unique_results_long['value'] = GOGO_result_unique_results_long['value'].astype('float')\n", + "GOGO_result_unique_results_long['category'] = 'unique GOs'" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "id": "deab143b-1d8b-435e-88cc-2f381cd3ca0e", + "metadata": {}, + "outputs": [], + "source": [ + "GOGO_result_expanded_results = GOGO_result_expanded[['BPO', 'CCO', 'MFO']]" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "id": "faea203a-1579-43b9-9116-80a0d22c2c6c", + "metadata": {}, + "outputs": [], + "source": [ + "GOGO_result_expanded_results_long = GOGO_result_expanded_results.melt(value_vars=['BPO', 'CCO', 'MFO'], var_name = 'GO ontology')\n", + "GOGO_result_expanded_results_long = GOGO_result_expanded_results_long[GOGO_result_expanded_results_long['value'] != 'NA']\n", + "GOGO_result_expanded_results_long['value'] = GOGO_result_expanded_results_long['value'].astype('float')\n", + "GOGO_result_expanded_results_long['category'] = 'GOs expanded'" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "id": "9895083b-e84c-4055-a7ba-19a872efd149", + "metadata": {}, + "outputs": [], + "source": [ + "GOGO_results_long_combined = GOGO_result_expanded_results_long.append(GOGO_result_unique_results_long)" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "id": "affe2f61-4040-4163-9ebc-2c8c82a99f41", + "metadata": {}, + "outputs": 
[], + "source": [ + "color_reference = {\n", + " 'unique GOs': cm.tab20.colors[18],\n", + " 'GOs expanded': cm.tab20.colors[7]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "id": "ac7bba95-aec3-49c0-a6ed-c990a0c44db2", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 504x432 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "f, ax = plt.subplots(figsize=(7, 6))\n", + "\n", + "sns.boxplot(data=GOGO_results_long_combined, y='value', x='GO ontology', hue='category', order=['MFO', 'CCO', 'BPO'], whis=[5, 95], palette=color_reference)\n", + "ax.set(ylim=(0, 1.05))\n", + "ax.tick_params(axis='both', which='major', labelsize=20)\n", + "\n", + "ax.set_xlabel('GO term ontologies', size=20)\n", + "ax.set_ylabel('Semantic Similarity',size=20)\n", + "\n", + "plt.legend(title = '', fontsize=15)\n", + "\n", + "plt.savefig('/g/arendt/Fabian/PhD/Computational/Spongefold/coffe-paper/figures/GO_semantic_similarities.svg', bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "markdown", + "id": "4af2e08f-5945-436d-b6fc-ff6062667207", + "metadata": {}, + "source": [ + "## GO depth analysis and comparison" + ] + }, + { + "cell_type": "markdown", + "id": "9f0e0092-714b-494b-8ea9-e07d5bb32397", + "metadata": {}, + "source": [ + "As a last measure, we would love to compare GO term depths in the partially overlapping GO term categories. This is the plan:\n", + "\n", + " 1. Make new column with overlap between GO term assignments from sequence- and structure-based annotations from the partial overlap category\n", + " 2. Assign GO term depths for each GO term within each GO ontology using GOATOOLS (https://www.nature.com/articles/s41598-018-28948-z)\n", + " 3. Return maximum GO depths within each GO ontology\n", + " 4. 
Plot\n", + " \n", + " \n", + "This will answer the question: \"How deep did the common annotation between sequence- and structure-based annotation go in each of the GO ontologies?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 260, + "id": "db844512-5486-42eb-846d-b972959fb02d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_125/2522461405.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " GOs_partial['overlap_GOs'] = ''\n" + ] + } + ], + "source": [ + "GOs_partial['overlap_GOs'] = ''\n", + "\n", + "for index, row in GOs_partial.iterrows():\n", + " lst1 = row['GOs_struct']\n", + " lst2 = row['GOs_seq']\n", + " GOs_partial.at[index, 'overlap_GOs'] = list(set(lst1) & set(lst2))" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "93490009-b7e5-4d48-be71-93a869774f25", + "metadata": {}, + "outputs": [], + "source": [ + "fin_obo = '/g/arendt/Fabian/PhD/Computational/spongeprot/data/GO_analysis/go-basic.obo' # DAG containing GO terms" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "id": "f71d1e64-0ce8-4bed-ab39-904b1e65ccad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/g/arendt/Fabian/PhD/Computational/spongeprot/data/GO_analysis/go-basic.obo: fmt(1.2) rel(2022-07-01) 50,918 Terms\n" + ] + } + ], + "source": [ + "from goatools.obo_parser import GODag\n", + "\n", + "obodag = GODag(fin_obo, load_obsolete=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 281, + "id": "32a4dec1-f25e-461e-9008-44babcb6c5f7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/tmp/ipykernel_125/2065038866.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " GOs_partial['MFO_max_depth'] = ''\n", + "/tmp/ipykernel_125/2065038866.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " GOs_partial['CCO_max_depth'] = ''\n", + "/tmp/ipykernel_125/2065038866.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " GOs_partial['BPO_max_depth'] = ''\n" + ] + } + ], + "source": [ + "GOs_partial['MFO_max_depth'] = ''\n", + "GOs_partial['CCO_max_depth'] = ''\n", + "GOs_partial['BPO_max_depth'] = ''\n", + "\n", + "for index, row in GOs_partial.iterrows():\n", + " MFO_max_depth = []\n", + " CCO_max_depth = []\n", + " BPO_max_depth = []\n", + " for GO in row['overlap_GOs']:\n", + " if obodag[GO].namespace == 'molecular_function':\n", + " MFO_max_depth.append(obodag[GO].depth)\n", + " elif obodag[GO].namespace == 'cellular_component':\n", + " CCO_max_depth.append(obodag[GO].depth)\n", + " elif obodag[GO].namespace == 'biological_process':\n", + " BPO_max_depth.append(obodag[GO].depth)\n", + " else: print('something is wrong')\n", + " GOs_partial.at[index, 'MFO_max_depth'] = MFO_max_depth\n", + " GOs_partial.at[index, 'CCO_max_depth'] = 
CCO_max_depth\n", + " GOs_partial.at[index, 'BPO_max_depth'] = BPO_max_depth\n", + " GOs_partial.at[index, 'MFO_max_depth'] = max(GOs_partial.at[index, 'MFO_max_depth'], default=None)\n", + " GOs_partial.at[index, 'CCO_max_depth'] = max(GOs_partial.at[index, 'CCO_max_depth'], default=None)\n", + " GOs_partial.at[index, 'BPO_max_depth'] = max(GOs_partial.at[index, 'BPO_max_depth'], default=None)" + ] + }, + { + "cell_type": "markdown", + "id": "c13e82ac-9548-420d-8671-f4b1f4a50145", + "metadata": {}, + "source": [ + "Format dataframe for plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 285, + "id": "a48026bc-002f-462d-8747-b4cba1d1276b", + "metadata": {}, + "outputs": [], + "source": [ + "GO_depth_plotting = GOs_partial[['overlap', 'MFO_max_depth', 'CCO_max_depth', 'BPO_max_depth']]" + ] + }, + { + "cell_type": "code", + "execution_count": 288, + "id": "3cdb7257-f13a-402a-8e36-de344cf2790e", + "metadata": {}, + "outputs": [], + "source": [ + "GO_depth_plotting_long = GO_depth_plotting.melt(id_vars='overlap', var_name='ontology')" + ] + }, + { + "cell_type": "code", + "execution_count": 300, + "id": "147fdf80-0b7a-43c1-8ac8-97c64228b51d", + "metadata": {}, + "outputs": [], + "source": [ + "GO_depth_plotting_long = GO_depth_plotting_long[~GO_depth_plotting_long['value'].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 317, + "id": "94786163-db0b-4f81-bd9b-f3dc726d8093", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_125/1019678659.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " GO_depth_plotting_long['overlap_condensed'] = ''\n" + ] + } + ], + "source": [ + 
"GO_depth_plotting_long['overlap_condensed'] = ''\n", + "\n", + "for index, row in GO_depth_plotting_long.iterrows():\n", + " if row['overlap'] == 'partial overlap - unique GOs':\n", + " GO_depth_plotting_long.at[index, 'overlap_condensed'] = 'unique GOs'\n", + " else: GO_depth_plotting_long.at[index, 'overlap_condensed'] = 'GOs expanded'" + ] + }, + { + "cell_type": "code", + "execution_count": 320, + "id": "f1f2a46a-bef6-45dd-ba4e-3d5fadcc1d9c", + "metadata": {}, + "outputs": [], + "source": [ + "color_reference = {\n", + " 'unique GOs': cm.tab20.colors[18],\n", + " 'GOs expanded': cm.tab20.colors[7]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 329, + "id": "02cc7560-1334-4eb5-a7f6-9a3d00f4ff3e", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 576x504 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "ontologies = ['MFO', 'COO', 'BPO']\n", + "\n", + "f, ax = plt.subplots(figsize=(8, 7))\n", + "\n", + "sns.boxplot(data=GO_depth_plotting_long, y='value', x='ontology', hue='overlap_condensed', hue_order=['GOs expanded', 'unique GOs'] , whis=[5, 95], palette=color_reference)\n", + "ax.tick_params(axis='both', which='major', labelsize=20)\n", + "ax.set_xticklabels(ontologies, size=20)\n", + "\n", + "ax.set_xlabel('GO term ontologies', size=20)\n", + "ax.set_ylabel('Maximum GO term depth',size=20)\n", + "\n", + "plt.legend(title = '', fontsize=15)\n", + "\n", + "plt.savefig('/g/arendt/Fabian/PhD/Computational/Spongefold/coffe-paper/figures/GO_depth.svg', bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "markdown", + "id": "e97dce55-30b0-43f0-aee1-9cc7c7982092", + "metadata": {}, + "source": [ + "Here we go. In general, however, it is difficult to interpret and compare GO depths. Similar GO depths can specify very different levels of detail in different branches of the hierarchy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b31eadd8-fa8c-40f4-b3d8-90a94e11e46c", "metadata": {}, "outputs": [], "source": [] diff --git a/analysis/revision-proteome_coverage.ipynb b/analysis/revision-proteome_coverage.ipynb index bb42fdbeb6ad1b0d7d760462e24cb4d14abce85e..b4d3446ca059db98a235d955d99f2659ebbbc728 100644 --- a/analysis/revision-proteome_coverage.ipynb +++ b/analysis/revision-proteome_coverage.ipynb @@ -658,7 +658,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/analysis/suppl-annotation_categories.ipynb b/analysis/suppl-annotation_categories.ipynb index fb38f74496160f60bf0d3946f6946cd398efba29..7ab0a4b499dff332530745839d66567477b5db0f 100755 --- a/analysis/suppl-annotation_categories.ipynb +++ b/analysis/suppl-annotation_categories.ipynb @@ -99,7 +99,7 @@ "id": "bd1879d9", "metadata": {}, "source": [ - "Use the same bit score cutoff for the CoFFE annotation:" + "Use the same bit score cutoff for the MorF annotation:" ] }, { @@ -632,7 +632,7 @@ "id": "eeabd82b", "metadata": {}, "source": [ - "This shows that bigger proteins have a higher tendency to have a CoFFE annotation." + "This shows that bigger proteins have a higher tendency to have a MorF annotation." ] }, {