From ed65d4946d8e3d1b25cf4af95925c432a50d8209 Mon Sep 17 00:00:00 2001 From: Fabian Ruperti <fabian.ruperti@embl.de> Date: Tue, 20 Dec 2022 15:53:17 +0100 Subject: [PATCH] update GO analysis with GO depth --- analysis/revision-GO_term_comparison.ipynb | 473 ++++++++++++++++----- analysis/suppl-annotation_categories.ipynb | 4 +- 2 files changed, 365 insertions(+), 112 deletions(-) diff --git a/analysis/revision-GO_term_comparison.ipynb b/analysis/revision-GO_term_comparison.ipynb index 17ea9d9..50a4a24 100644 --- a/analysis/revision-GO_term_comparison.ipynb +++ b/analysis/revision-GO_term_comparison.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 143, + "execution_count": 109, "id": "99e6a23c-f2a3-4369-977e-4d2c74707d0c", "metadata": {}, "outputs": [ @@ -10,7 +10,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2022-12-16 16:34\n" + "2022-12-20 10:26\n" ] } ], @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 110, "id": "a6e66eaf-c7a8-4b26-a99b-5032d49c25a3", "metadata": {}, "outputs": [], @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 111, "id": "0cd74ade-73c8-44f6-9d6e-f0f37bb760d3", "metadata": {}, "outputs": [], @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 112, "id": "c3d289af-b99b-4403-af5b-a8fc15f8269f", "metadata": {}, "outputs": [], @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 113, "id": "63dd7ec5-8841-4bfe-9e38-db55f67c298d", "metadata": {}, "outputs": [], @@ -107,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 114, "id": "469c03ab-31cf-474c-b6eb-fc9f613015ba", "metadata": {}, "outputs": [], @@ -140,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 115, "id": "fe53ea5e-26de-46cf-ab69-44288f1a2b26", "metadata": {}, "outputs": [], @@ -150,7 +150,7 @@ }, { "cell_type": 
"code", - "execution_count": 150, + "execution_count": 116, "id": "3761bb31-a522-40a0-8e86-c262b2b9e150", "metadata": {}, "outputs": [ @@ -158,13 +158,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_467/2049958699.py:1: SettingWithCopyWarning: \n", + "/tmp/ipykernel_125/2049958699.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " GOs['GOs_struct'] = GOs['GOs_struct'].str.split(',')\n", - "/tmp/ipykernel_467/2049958699.py:2: SettingWithCopyWarning: \n", + "/tmp/ipykernel_125/2049958699.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -180,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 117, "id": "1b45906f-bd6c-4424-93de-ffbef8fad67e", "metadata": {}, "outputs": [], @@ -212,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 118, "id": "90a4a44d-2931-4b87-a7a8-1fe677e8912b", "metadata": {}, "outputs": [], @@ -226,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 119, "id": "9c799814-04f9-49d4-9427-873c50e3a2cb", "metadata": {}, "outputs": [], @@ -240,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 120, "id": "bcc70854-7a7b-4bdc-b59f-c9244f9ffae8", "metadata": {}, "outputs": [ @@ -248,7 +248,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_467/1677403612.py:1: SettingWithCopyWarning: \n", + "/tmp/ipykernel_125/1677403612.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value 
instead\n", "\n", @@ -263,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 121, "id": "948c120b-f680-4a9d-ba9e-11ddc15be017", "metadata": {}, "outputs": [ @@ -271,7 +271,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_467/2972605403.py:1: SettingWithCopyWarning: \n", + "/tmp/ipykernel_125/2972605403.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -286,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": 122, "id": "ca29c96d-a8da-4c7c-bdf2-5ad1f380c4da", "metadata": {}, "outputs": [ @@ -294,7 +294,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_467/1789197005.py:1: SettingWithCopyWarning: \n", + "/tmp/ipykernel_125/1789197005.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -309,7 +309,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 123, "id": "9195a8b5-f5e4-4e10-a8e6-fcf190fdd42f", "metadata": {}, "outputs": [ @@ -324,7 +324,7 @@ "Name: overlap, dtype: int64" ] }, - "execution_count": 157, + "execution_count": 123, "metadata": {}, "output_type": "execute_result" } @@ -335,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 178, + "execution_count": 124, "id": "6a4352ef-1326-4102-9ca6-799217ba6be4", "metadata": {}, "outputs": [], @@ -353,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": 179, + "execution_count": 125, "id": "ae8bac54-f10d-4020-8019-6221c13b56b7", "metadata": {}, "outputs": [ @@ -382,7 +382,7 @@ "piechart = ax.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=False, colors=colors, textprops={'fontsize': 14})\n", "ax.axis('equal');\n", "\n", - "#plt.savefig('./figures/analysis-sequence_structure_agreement.pdf')" + 
"plt.savefig('/g/arendt/Fabian/PhD/Computational/Spongefold/coffe-paper/figures/GO_count.svg', bbox_inches=\"tight\")" ] }, { @@ -398,7 +398,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 126, "id": "ab2a9480-aaee-43d3-af23-ca05886b3695", "metadata": {}, "outputs": [ @@ -408,7 +408,7 @@ "<AxesSubplot:xlabel='coverage_struct', ylabel='Count'>" ] }, - "execution_count": 18, + "execution_count": 126, "metadata": {}, "output_type": "execute_result" }, @@ -431,7 +431,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 127, "id": "f628c3eb-422c-403d-b267-ec10489171df", "metadata": {}, "outputs": [ @@ -441,7 +441,7 @@ "0.4821899253270338" ] }, - "execution_count": 19, + "execution_count": 127, "metadata": {}, "output_type": "execute_result" } @@ -460,7 +460,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 128, "id": "93eca691-0659-4516-8945-32a9b854c7e9", "metadata": {}, "outputs": [ @@ -470,7 +470,7 @@ "<AxesSubplot:xlabel='coverage_seq', ylabel='Count'>" ] }, - "execution_count": 20, + "execution_count": 128, "metadata": {}, "output_type": "execute_result" }, @@ -493,7 +493,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 129, "id": "7036de66-bc32-4445-a88c-53ba9d37dae3", "metadata": {}, "outputs": [ @@ -503,7 +503,7 @@ "0.48056632457400206" ] }, - "execution_count": 21, + "execution_count": 129, "metadata": {}, "output_type": "execute_result" } @@ -522,7 +522,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 130, "id": "ae564ff4-2ed4-48bf-83c3-ca9f27cf7acb", "metadata": { "tags": [] @@ -534,7 +534,7 @@ "<AxesSubplot:xlabel='coverage_seq', ylabel='Count'>" ] }, - "execution_count": 22, + "execution_count": 130, "metadata": {}, "output_type": "execute_result" }, @@ -557,7 +557,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 131, "id": "93c9e292-8a15-4fcb-9695-bfc77e13048c", "metadata": {}, "outputs": [ @@ 
-567,7 +567,7 @@ "0.7126174034458369" ] }, - "execution_count": 23, + "execution_count": 131, "metadata": {}, "output_type": "execute_result" } @@ -586,7 +586,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 132, "id": "75c5c9ba-9bfa-4934-8656-a43ac2bcb0e7", "metadata": {}, "outputs": [ @@ -596,7 +596,7 @@ "<AxesSubplot:xlabel='coverage_struct', ylabel='Count'>" ] }, - "execution_count": 24, + "execution_count": 132, "metadata": {}, "output_type": "execute_result" }, @@ -619,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 133, "id": "143b5d56-fad4-449e-ae57-4c35afa6fefd", "metadata": {}, "outputs": [ @@ -629,7 +629,7 @@ "0.6018304051791242" ] }, - "execution_count": 25, + "execution_count": 133, "metadata": {}, "output_type": "execute_result" } @@ -664,7 +664,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 258, "id": "119a9f49-871c-445c-a729-db5fe80d0cf4", "metadata": {}, "outputs": [], @@ -674,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 259, "id": "2123153f-fec7-41d7-a6ec-5f8e50f3b51e", "metadata": {}, "outputs": [], @@ -684,7 +684,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 213, "id": "2a78b465-9630-4b42-976a-d94ecf6f6ff5", "metadata": {}, "outputs": [], @@ -710,7 +710,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 137, "id": "bb190be1-e0c8-440d-92bf-fea4b44c7374", "metadata": {}, "outputs": [], @@ -720,7 +720,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 138, "id": "9dc03262-9257-4447-80d6-304743793b4a", "metadata": {}, "outputs": [], @@ -748,7 +748,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 139, "id": "101e4b9a-e315-406c-8cee-ba1115e55208", "metadata": {}, "outputs": [], @@ -758,7 +758,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 140, "id": 
"23985e19-becf-40ed-8047-cbcd3b1258f4", "metadata": {}, "outputs": [], @@ -768,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 141, "id": "854e6e6d-5834-418e-8fcc-24c33846a5ef", "metadata": {}, "outputs": [], @@ -778,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 142, "id": "60e1b05f-761a-4153-a38f-586143883f34", "metadata": {}, "outputs": [], @@ -788,7 +788,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 143, "id": "86603707-9eab-4fee-ade7-42077ed24068", "metadata": {}, "outputs": [], @@ -809,7 +809,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 144, "id": "fe401867-0ab5-46df-88ba-4b560b00cf7d", "metadata": {}, "outputs": [ @@ -965,7 +965,7 @@ "[2360 rows x 5 columns]" ] }, - "execution_count": 37, + "execution_count": 144, "metadata": {}, "output_type": "execute_result" } @@ -976,7 +976,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 145, "id": "c7dfe425-7158-4f16-97a1-eb6ea0542d03", "metadata": {}, "outputs": [], @@ -986,7 +986,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 146, "id": "05de5391-becd-4500-bac2-450632bbf482", "metadata": {}, "outputs": [ @@ -996,7 +996,7 @@ "<AxesSubplot:xlabel='BPO', ylabel='Count'>" ] }, - "execution_count": 39, + "execution_count": 146, "metadata": {}, "output_type": "execute_result" }, @@ -1019,7 +1019,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 147, "id": "d717eedf-d4db-48c2-a2ed-baa0529c40a1", "metadata": {}, "outputs": [ @@ -1029,7 +1029,7 @@ "0.6656481398476022" ] }, - "execution_count": 40, + "execution_count": 147, "metadata": {}, "output_type": "execute_result" } @@ -1040,7 +1040,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 148, "id": "f388fa0c-5b4d-49ce-9b48-386d3f33acfe", "metadata": {}, "outputs": [ @@ -1050,7 +1050,7 @@ "<AxesSubplot:xlabel='CCO', ylabel='Count'>" 
] }, - "execution_count": 41, + "execution_count": 148, "metadata": {}, "output_type": "execute_result" }, @@ -1074,7 +1074,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 149, "id": "13d7959f-32ce-4688-b387-22e1f9db6d09", "metadata": {}, "outputs": [ @@ -1084,7 +1084,7 @@ "0.8254611161939618" ] }, - "execution_count": 42, + "execution_count": 149, "metadata": {}, "output_type": "execute_result" } @@ -1095,7 +1095,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 150, "id": "d85ef2a1-e21d-4a1a-b5e9-926544becf5b", "metadata": {}, "outputs": [ @@ -1105,7 +1105,7 @@ "<AxesSubplot:xlabel='MFO', ylabel='Count'>" ] }, - "execution_count": 43, + "execution_count": 150, "metadata": {}, "output_type": "execute_result" }, @@ -1129,7 +1129,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 151, "id": "7a07389d-92e8-42c4-9331-412b5fd29079", "metadata": {}, "outputs": [ @@ -1139,7 +1139,7 @@ "0.7859392819429791" ] }, - "execution_count": 44, + "execution_count": 151, "metadata": {}, "output_type": "execute_result" } @@ -1158,7 +1158,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 152, "id": "84d8e039-e16d-4482-8552-9099501bf47e", "metadata": {}, "outputs": [], @@ -1168,7 +1168,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 153, "id": "b49002d7-ffb8-4e2c-a3bd-b0c16835318c", "metadata": {}, "outputs": [ @@ -1337,7 +1337,7 @@ "[2203 rows x 5 columns]" ] }, - "execution_count": 50, + "execution_count": 153, "metadata": {}, "output_type": "execute_result" } @@ -1356,7 +1356,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 154, "id": "e1d45be9-ee8c-4b86-9163-ba054ba9ed5c", "metadata": {}, "outputs": [], @@ -1366,7 +1366,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 155, "id": "5c758b4e-9833-40eb-910c-51b8e2ee1216", "metadata": {}, "outputs": [], @@ -1394,7 +1394,7 @@ }, { "cell_type": 
"code", - "execution_count": 53, + "execution_count": 156, "id": "b2670268-3990-430d-bb82-e7bba51bf81b", "metadata": {}, "outputs": [], @@ -1404,7 +1404,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 157, "id": "55340e34-3b2f-48b3-b4cf-f96e50b56504", "metadata": {}, "outputs": [], @@ -1414,7 +1414,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 158, "id": "4aa34798-6754-4b95-a61a-653bf5fc235f", "metadata": {}, "outputs": [], @@ -1424,7 +1424,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 159, "id": "8553a1e2-868e-4f19-96bf-c840ca7a6db9", "metadata": {}, "outputs": [], @@ -1434,7 +1434,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 160, "id": "81a05e12-6558-4867-9796-79bef9b3e3de", "metadata": {}, "outputs": [], @@ -1455,7 +1455,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 161, "id": "ea64dbe5-1c66-46e9-95df-7ed602c6eb47", "metadata": {}, "outputs": [ @@ -1611,7 +1611,7 @@ "[2203 rows x 5 columns]" ] }, - "execution_count": 58, + "execution_count": 161, "metadata": {}, "output_type": "execute_result" } @@ -1622,7 +1622,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 162, "id": "0be39243-e149-430c-8dd9-6dc46c3bac81", "metadata": {}, "outputs": [], @@ -1632,7 +1632,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 163, "id": "22a18199-4907-4423-a321-2d2c98517bad", "metadata": {}, "outputs": [ @@ -1642,7 +1642,7 @@ "<AxesSubplot:xlabel='BPO', ylabel='Count'>" ] }, - "execution_count": 60, + "execution_count": 163, "metadata": {}, "output_type": "execute_result" }, @@ -1665,7 +1665,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 164, "id": "a0cd16ae-8fb2-49fa-ae00-d4f0c651b781", "metadata": {}, "outputs": [ @@ -1675,7 +1675,7 @@ "0.8768518696069056" ] }, - "execution_count": 61, + "execution_count": 164, "metadata": {}, "output_type": 
"execute_result" } @@ -1686,7 +1686,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 165, "id": "1125e2fe-6176-4a50-a144-7567bb5a366b", "metadata": {}, "outputs": [ @@ -1696,7 +1696,7 @@ "<AxesSubplot:xlabel='CCO', ylabel='Count'>" ] }, - "execution_count": 62, + "execution_count": 165, "metadata": {}, "output_type": "execute_result" }, @@ -1720,7 +1720,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 166, "id": "c5d6aaf5-dbd8-4a7f-8f35-f1d6ed438100", "metadata": {}, "outputs": [ @@ -1730,7 +1730,7 @@ "0.9267089588377744" ] }, - "execution_count": 63, + "execution_count": 166, "metadata": {}, "output_type": "execute_result" } @@ -1741,7 +1741,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 167, "id": "e92d5b43-f861-4fc5-809e-6f91a4744310", "metadata": {}, "outputs": [ @@ -1751,7 +1751,7 @@ "<AxesSubplot:xlabel='MFO', ylabel='Count'>" ] }, - "execution_count": 66, + "execution_count": 167, "metadata": {}, "output_type": "execute_result" }, @@ -1775,7 +1775,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 168, "id": "a815cfe3-6610-4436-8eb0-cfbd90efe671", "metadata": {}, "outputs": [ @@ -1785,7 +1785,7 @@ "0.9309306777030519" ] }, - "execution_count": 67, + "execution_count": 168, "metadata": {}, "output_type": "execute_result" } @@ -1804,7 +1804,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 169, "id": "24226d52-2db6-4c9e-af7a-e0d8921ff8d1", "metadata": {}, "outputs": [], @@ -1814,7 +1814,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 170, "id": "effa91b7-eefc-4f44-89ca-91e4d35c1358", "metadata": {}, "outputs": [], @@ -1827,7 +1827,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 171, "id": "deab143b-1d8b-435e-88cc-2f381cd3ca0e", "metadata": {}, "outputs": [], @@ -1837,7 +1837,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 172, "id": 
"faea203a-1579-43b9-9116-80a0d22c2c6c", "metadata": {}, "outputs": [], @@ -1850,7 +1850,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 173, "id": "9895083b-e84c-4055-a7ba-19a872efd149", "metadata": {}, "outputs": [], @@ -1860,7 +1860,7 @@ }, { "cell_type": "code", - "execution_count": 181, + "execution_count": 174, "id": "affe2f61-4040-4163-9ebc-2c8c82a99f41", "metadata": {}, "outputs": [], @@ -1873,20 +1873,10 @@ }, { "cell_type": "code", - "execution_count": 183, + "execution_count": 175, "id": "ac7bba95-aec3-49c0-a6ed-c990a0c44db2", "metadata": {}, "outputs": [ - { - "data": { - "text/plain": [ - "<matplotlib.legend.Legend at 0x7f259eca0ac0>" - ] - }, - "execution_count": 183, - "metadata": {}, - "output_type": "execute_result" - }, { "data": { "image/png": "\n", @@ -1910,13 +1900,276 @@ "ax.set_xlabel('GO term ontologies', size=20)\n", "ax.set_ylabel('Semantic Similarity',size=20)\n", "\n", - "plt.legend(title = '', fontsize=15)" + "plt.legend(title = '', fontsize=15)\n", + "\n", + "plt.savefig('/g/arendt/Fabian/PhD/Computational/Spongefold/coffe-paper/figures/GO_semantic_similarities.svg', bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "markdown", + "id": "4af2e08f-5945-436d-b6fc-ff6062667207", + "metadata": {}, + "source": [ + "## GO depth analysis and comparison" + ] + }, + { + "cell_type": "markdown", + "id": "9f0e0092-714b-494b-8ea9-e07d5bb32397", + "metadata": {}, + "source": [ + "As a last measure, we would love to compare GO term depths in the partially overlapping GO term categories. This is the plan:\n", + "\n", + " 1. Make new column with overlap between GO term assignments from sequence- and structure based annotations from the partial overlap category\n", + " 2. Assign GO term depths for each GO term within each GO ontology using GOATOOLS (https://www.nature.com/articles/s41598-018-28948-z)\n", + " 3. Return maxiumum GO depths within each GO ontology\n", + " 4. 
Plot\n", + " \n", + " \n", + "This will answer the question: \"How deep did the common annotation between sequence- and structure-based annotations go in each of the GO ontologies?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 260, + "id": "db844512-5486-42eb-846d-b972959fb02d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_125/2522461405.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " GOs_partial['overlap_GOs'] = ''\n" + ] + } + ], + "source": [ + "GOs_partial['overlap_GOs'] = ''\n", + "\n", + "for index, row in GOs_partial.iterrows():\n", + " lst1 = row['GOs_struct']\n", + " lst2 = row['GOs_seq']\n", + " GOs_partial.at[index, 'overlap_GOs'] = list(set(lst1) & set(lst2))" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "93490009-b7e5-4d48-be71-93a869774f25", + "metadata": {}, + "outputs": [], + "source": [ + "fin_obo = '/g/arendt/Fabian/PhD/Computational/spongeprot/data/GO_analysis/go-basic.obo' # DAG containing GO terms" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "id": "f71d1e64-0ce8-4bed-ab39-904b1e65ccad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/g/arendt/Fabian/PhD/Computational/spongeprot/data/GO_analysis/go-basic.obo: fmt(1.2) rel(2022-07-01) 50,918 Terms\n" + ] + } + ], + "source": [ + "from goatools.obo_parser import GODag\n", + "\n", + "obodag = GODag(fin_obo, load_obsolete=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 281, + "id": "32a4dec1-f25e-461e-9008-44babcb6c5f7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/tmp/ipykernel_125/2065038866.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " GOs_partial['MFO_max_depth'] = ''\n", + "/tmp/ipykernel_125/2065038866.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " GOs_partial['CCO_max_depth'] = ''\n", + "/tmp/ipykernel_125/2065038866.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " GOs_partial['BPO_max_depth'] = ''\n" + ] + } + ], + "source": [ + "GOs_partial['MFO_max_depth'] = ''\n", + "GOs_partial['CCO_max_depth'] = ''\n", + "GOs_partial['BPO_max_depth'] = ''\n", + "\n", + "for index, row in GOs_partial.iterrows():\n", + " MFO_max_depth = []\n", + " CCO_max_depth = []\n", + " BPO_max_depth = []\n", + " for GO in row['overlap_GOs']:\n", + " if obodag[GO].namespace == 'molecular_function':\n", + " MFO_max_depth.append(obodag[GO].depth)\n", + " elif obodag[GO].namespace == 'cellular_component':\n", + " CCO_max_depth.append(obodag[GO].depth)\n", + " elif obodag[GO].namespace == 'biological_process':\n", + " BPO_max_depth.append(obodag[GO].depth)\n", + " else: print('something is wrong')\n", + " GOs_partial.at[index, 'MFO_max_depth'] = MFO_max_depth\n", + " GOs_partial.at[index, 'CCO_max_depth'] = 
CCO_max_depth\n", + " GOs_partial.at[index, 'BPO_max_depth'] = BPO_max_depth\n", + " GOs_partial.at[index, 'MFO_max_depth'] = max(GOs_partial.at[index, 'MFO_max_depth'], default=None)\n", + " GOs_partial.at[index, 'CCO_max_depth'] = max(GOs_partial.at[index, 'CCO_max_depth'], default=None)\n", + " GOs_partial.at[index, 'BPO_max_depth'] = max(GOs_partial.at[index, 'BPO_max_depth'], default=None)" + ] + }, + { + "cell_type": "markdown", + "id": "c13e82ac-9548-420d-8671-f4b1f4a50145", + "metadata": {}, + "source": [ + "Format dataframe for plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 285, + "id": "a48026bc-002f-462d-8747-b4cba1d1276b", + "metadata": {}, + "outputs": [], + "source": [ + "GO_depth_plotting = GOs_partial[['overlap', 'MFO_max_depth', 'CCO_max_depth', 'BPO_max_depth']]" + ] + }, + { + "cell_type": "code", + "execution_count": 288, + "id": "3cdb7257-f13a-402a-8e36-de344cf2790e", + "metadata": {}, + "outputs": [], + "source": [ + "GO_depth_plotting_long = GO_depth_plotting.melt(id_vars='overlap', var_name='ontology')" + ] + }, + { + "cell_type": "code", + "execution_count": 300, + "id": "147fdf80-0b7a-43c1-8ac8-97c64228b51d", + "metadata": {}, + "outputs": [], + "source": [ + "GO_depth_plotting_long = GO_depth_plotting_long[~GO_depth_plotting_long['value'].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 317, + "id": "94786163-db0b-4f81-bd9b-f3dc726d8093", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_125/1019678659.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " GO_depth_plotting_long['overlap_condensed'] = ''\n" + ] + } + ], + "source": [ + 
"GO_depth_plotting_long['overlap_condensed'] = ''\n", + "\n", + "for index, row in GO_depth_plotting_long.iterrows():\n", + " if row['overlap'] == 'partial overlap - unique GOs':\n", + " GO_depth_plotting_long.at[index, 'overlap_condensed'] = 'unique GOs'\n", + " else: GO_depth_plotting_long.at[index, 'overlap_condensed'] = 'GOs expanded'" + ] + }, + { + "cell_type": "code", + "execution_count": 320, + "id": "f1f2a46a-bef6-45dd-ba4e-3d5fadcc1d9c", + "metadata": {}, + "outputs": [], + "source": [ + "color_reference = {\n", + " 'unique GOs': cm.tab20.colors[18],\n", + " 'GOs expanded': cm.tab20.colors[7]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 329, + "id": "02cc7560-1334-4eb5-a7f6-9a3d00f4ff3e", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 576x504 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "ontologies = ['MFO', 'COO', 'BPO']\n", + "\n", + "f, ax = plt.subplots(figsize=(8, 7))\n", + "\n", + "sns.boxplot(data=GO_depth_plotting_long, y='value', x='ontology', hue='overlap_condensed', hue_order=['GOs expanded', 'unique GOs'] , whis=[5, 95], palette=color_reference)\n", + "ax.tick_params(axis='both', which='major', labelsize=20)\n", + "ax.set_xticklabels(ontologies, size=20)\n", + "\n", + "ax.set_xlabel('GO term ontologies', size=20)\n", + "ax.set_ylabel('Maximum GO term depth',size=20)\n", + "\n", + "plt.legend(title = '', fontsize=15)\n", + "\n", + "plt.savefig('/g/arendt/Fabian/PhD/Computational/Spongefold/coffe-paper/figures/GO_depth.svg', bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "markdown", + "id": "e97dce55-30b0-43f0-aee1-9cc7c7982092", + "metadata": {}, + "source": [ + "Here we go. In general however, it is difficult to interpret and compare GO depths. Similar GO depths can specify very different levels of details in different branches of the hierarchy." 
] }, { "cell_type": "code", "execution_count": null, - "id": "4a3f2065-8115-415f-a8e9-fc73ce178f9e", + "id": "b31eadd8-fa8c-40f4-b3d8-90a94e11e46c", "metadata": {}, "outputs": [], "source": [] diff --git a/analysis/suppl-annotation_categories.ipynb b/analysis/suppl-annotation_categories.ipynb index fb38f74..7ab0a4b 100755 --- a/analysis/suppl-annotation_categories.ipynb +++ b/analysis/suppl-annotation_categories.ipynb @@ -99,7 +99,7 @@ "id": "bd1879d9", "metadata": {}, "source": [ - "Use the same bit score cutoff for the CoFFE annotation:" + "Use the same bit score cutoff for the MorF annotation:" ] }, { @@ -632,7 +632,7 @@ "id": "eeabd82b", "metadata": {}, "source": [ - "This shows that bigger proteins have a higher tendency to have a CoFFE annotation." + "This shows that bigger proteins have a higher tendency to have a MorF annotation." ] }, { -- GitLab