From 9d321e6af9c98c45b4e501745ba734e591f8f292 Mon Sep 17 00:00:00 2001 From: Nikolaos Papadopoulos <npapadop@embl.de> Date: Wed, 17 Aug 2022 12:56:22 +0200 Subject: [PATCH] fixed ModelArchive bugs (protein names/models, protein query length); added AFDB self comparison as per editors' request --- analysis/suppl-ModelArchive_metadata.ipynb | 217 +++-- analysis/suppl-ModelArchive_preparation.ipynb | 2 +- analysis/suppl-compare_seq_profile.ipynb | 898 ++++++++++++++++++ scripts/fs_self_search_afdb.sh | 23 + 4 files changed, 1064 insertions(+), 76 deletions(-) create mode 100644 analysis/suppl-compare_seq_profile.ipynb create mode 100755 scripts/fs_self_search_afdb.sh diff --git a/analysis/suppl-ModelArchive_metadata.ipynb b/analysis/suppl-ModelArchive_metadata.ipynb index f30f805..d59fb11 100644 --- a/analysis/suppl-ModelArchive_metadata.ipynb +++ b/analysis/suppl-ModelArchive_metadata.ipynb @@ -10,7 +10,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2022-07-28 08:02\n" + "2022-08-17 11:07\n" ] } ], @@ -41,43 +41,24 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd" + "import pandas as pd\n", + "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "d7d72594", "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: '../ol_data/Slacustris_eggnog.tsv'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_136/1368120052.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msequence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../ol_data/Slacustris_eggnog.tsv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'\\t'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mstructure\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_parquet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../old_data/structure_annotation.parquet'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstacklevel\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 310\u001b[0m )\n\u001b[0;32m--> 311\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 312\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 313\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 585\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 587\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 480\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 481\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 482\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 483\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 484\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 809\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"has_index_names\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"has_index_names\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 810\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 811\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 812\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 1038\u001b[0m )\n\u001b[1;32m 1039\u001b[0m \u001b[0;31m# error: Too many arguments for \"ParserBase\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1040\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmapping\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[call-arg]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1041\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1042\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_failover_to_python\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;31m# open handles\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_open_handles\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 52\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandles\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/parsers/base_parser.py\u001b[0m in \u001b[0;36m_open_handles\u001b[0;34m(self, src, kwds)\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[0mLet\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mreaders\u001b[0m \u001b[0mopen\u001b[0m \u001b[0mIOHandles\u001b[0m \u001b[0mafter\u001b[0m \u001b[0mthey\u001b[0m \u001b[0mare\u001b[0m \u001b[0mdone\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtheir\u001b[0m \u001b[0mpotential\u001b[0m \u001b[0mraises\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 221\u001b[0m \"\"\"\n\u001b[0;32m--> 222\u001b[0;31m self.handles = get_handle(\n\u001b[0m\u001b[1;32m 223\u001b[0m \u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/io/common.py\u001b[0m in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 699\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoding\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 700\u001b[0m \u001b[0;31m# Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 701\u001b[0;31m handle = open(\n\u001b[0m\u001b[1;32m 702\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../ol_data/Slacustris_eggnog.tsv'" - ] - } - ], + "outputs": [], "source": [ - "sequence = pd.read_csv('../old_data/Slacustris_eggnog.tsv', sep='\\t')\n", - "structure = pd.read_parquet('../old_data/structure_annotation.parquet')" + "sequence = pd.read_csv('../data/Slacustris_eggnog.tsv', sep='\\t')\n", + "structure = pd.read_parquet('../data/structure_annotation.parquet')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "6532f2fc", "metadata": {}, "outputs": [], @@ -87,39 +68,65 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, + "id": "98d46a4b-3ee2-48b0-9100-cbdfaf5a863f", + "metadata": {}, + "outputs": [], + "source": [ + "merged['plddt'].replace('-', np.NaN, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "6bec4cc1-3dff-48fe-91a6-0546f94be850", + "metadata": {}, + "source": [ + "amend the fact that we counted * characters when calculating sequence length:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "741d43fc-b176-4d23-9770-41348c1b2725", + "metadata": {}, + "outputs": [], + "source": [ + "merged['query length'] = merged['query length'] - merged['has_end']" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "id": "4a137f30", "metadata": {}, "outputs": [], "source": [ "def generate_description(row):\n", - " desc = f\"Preferred name [EggNOG]: {row['Preferred_name_seq']}\\n\" + \\\n", - " f\"Description [EggNOG]: {row['Description_seq']}\\n\" + \\\n", - " f\"CoFFE preferred name [EggNOG]: {row['Preferred_name_str']}\\n\" + \\\n", - " f\"CoFFE description [EggNOG]: {row['Description_str']}\\n\" + \\\n", - " f\"CoFFE UniProt function: {row['Function [CC]']}\\n\\n\" + \\\n", - " f\"isoform ID: {row['isoform']}\\n\" + \\\n", + " desc = f\"isoform ID: {row['isoform']}\\n\" + \\\n", " f\"peptide length: {row['query length']}\\n\" + \\\n", - " f\"input MSA size [MMseqs2]: {row['MSA size']}\\n\" + \\\n", + " f\"input MSA size [MMseqs2]: {row['MSA size']:.0f}\\n\" + \\\n", " f\"gene ID: {row['gene_id_str']}\\n\" + \\\n", " f\"protein ID: {row['protein_id_str']}\\n\" + \\\n", " f\"mRNA has start codon [TransDecoder]: {row['has_start']}\\n\" + \\\n", " f\"mRNA has stop codon [TransDecoder]: {row['has_end']}\\n\\n\" + \\\n", - " f\"average pLDDT [ColabFold]: {row['plddt']}\\n\\n\" + \\\n", + " f\"Preferred name [EggNOG]: {row['Preferred_name_seq']}\\n\" + \\\n", + " f\"Description [EggNOG]: {row['Description_seq']}\\n\" + \\\n", + " f\"CoFFE preferred name [EggNOG]: {row['Preferred_name_str']}\\n\" + \\\n", + " f\"CoFFE description [EggNOG]: {row['Description_str']}\\n\" + \\\n", + " f\"CoFFE UniProt function: {row['Function [CC]']}\\n\\n\" + \\\n", + " f\"average pLDDT [ColabFold]: {row['plddt']:.2f}\\n\\n\" + \\\n", " f\"best structural hit [FoldSeek]: {row['target']}\\n\" + \\\n", " f\"structural state %identity [FoldSeek]: {row['seq. id.']}\\n\" + \\\n", " f\"bit score [FoldSeek]: {row['bit score']}\\n\\n\" + \\\n", " f\"UniProt ID of best structural hit (CoFFE): {row['uniprot']}\\n\" + \\\n", " f\"CoFFE Emapper bit score [EggNOG]: {row['score_str']}\\n\" + \\\n", - " f\"CoFFE orthogroups [EggNOG]: {row['eggNOG_OGs_str']}\\n\" + \\\n", + " f\"CoFFE orthogroups [EggNOG]: {row['eggNOG_OGs_str'].replace(',', ', ')}\\n\" + \\\n", " f\"CoFFE maximum annotation level [EggNOG]: {row['max_annot_lvl_str']}\\n\" + \\\n", - "# f\"CoFFE GO terms [EggNOG]: {row['GOs_str']}\\n\" + \\\n", - " f\"CoFFE PFAM domains [EggNOG]: {row['PFAMs_str']}\\n\\n\" + \\\n", + " f\"CoFFE PFAM domains [EggNOG]: {row['PFAMs_str'].replace(',', ', ')}\\n\\n\" + \\\n", " f\"best sequence hit bit score [Emapper]: {row['score_seq']}\\n\" + \\\n", - " f\"Orthogroups [EggNOG]: {row['eggNOG_OGs_seq']}\\n\" + \\\n", + " f\"Orthogroups [EggNOG]: {row['eggNOG_OGs_seq'].replace(',', ', ')}\\n\" + \\\n", " f\"Maximum annotation level [EggNOG]: {row['max_annot_lvl_seq']}\\n\" + \\\n", - "# f\"Emapper GO terms [EggNOG]: {row['GOs_seq']}\\n\" + \\\n", - " f\"PFAM domains [EggNOG]: {row['PFAMs_seq']}\\n\"\n", + " f\"PFAM domains [EggNOG]: {row['PFAMs_seq'].replace(',', ', ')}\\n\"\n", " return desc" ] }, @@ -133,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "935a770c", "metadata": {}, "outputs": [ @@ -141,44 +148,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "Emapper preferred name [EggNOG]: -\n", - "Emapper description [EggNOG]: -\n", - "CoFFE preferred name [EggNOG]: PLA2G15\n", - "CoFFE description [EggNOG]: Lecithin:cholesterol acyltransferase\n", - "CoFFE UniProt function: -\n", - "\n", - "isoform ID: c103531_g3_i1\n", - "peptide length: 202\n", - "input MSA size [MMseqs2]: 1924.0\n", - "gene ID: c103531_g3\n", - "protein ID: 66482\n", + "isoform ID: c12419_g1_i1\n", + "peptide length: 125\n", + "input MSA size [MMseqs2]: 973\n", + "gene ID: c12419_g1\n", + "protein ID: 1183\n", "mRNA has start codon [TransDecoder]: False\n", - "mRNA has stop codon [TransDecoder]: True\n", + "mRNA has stop codon [TransDecoder]: False\n", + "\n", + "Preferred name [EggNOG]: DBR1\n", + "Description [EggNOG]: Debranching enzyme homolog 1 (S. cerevisiae)\n", + "CoFFE preferred name [EggNOG]: USE1\n", + "CoFFE description [EggNOG]: Unconventional SNARE in the ER 1 homolog (S. cerevisiae)\n", + "CoFFE UniProt function: FUNCTION: SNARE that may be involved in targeting and fusion of Golgi-derived retrograde transport vesicles with the ER. {ECO:0000250}.\n", "\n", - "average pLDDT [ColabFold]: 88.31183168316831\n", + "average pLDDT [ColabFold]: 64.27\n", "\n", - "best structural hit [FoldSeek]: AF-A0A077YYC2-F1\n", - "structural state %identity [FoldSeek]: 0.295\n", - "bit score [FoldSeek]: 659.0\n", + "best structural hit [FoldSeek]: AF-Q7ZTY7-F1\n", + "structural state %identity [FoldSeek]: 0.134\n", + "bit score [FoldSeek]: 73.0\n", "\n", - "UniProt ID of best structural hit (CoFFE): A0A077YYC2\n", - "CoFFE Emapper bit score [EggNOG]: 526.0\n", - "CoFFE orthogroups [EggNOG]: KOG2369@1|root,KOG3236@1|root,KOG2369@2759|Eukaryota,KOG3236@2759|Eukaryota,38DE3@33154|Opisthokonta,3BDIK@33208|Metazoa,3CTGA@33213|Bilateria,40D2W@6231|Nematoda\n", + "UniProt ID of best structural hit (CoFFE): Q7ZTY7\n", + "CoFFE Emapper bit score [EggNOG]: 487.0\n", + "CoFFE orthogroups [EggNOG]: KOG2678@1|root, KOG2678@2759|Eukaryota, 38T50@33154|Opisthokonta, 3BE5T@33208|Metazoa, 3CWSZ@33213|Bilateria, 4852S@7711|Chordata, 48XPJ@7742|Vertebrata, 49UTR@7898|Actinopterygii\n", "CoFFE maximum annotation level [EggNOG]: 33208|Metazoa\n", - "CoFFE GO terms [EggNOG]: GO:0000323,GO:0003674,GO:0003824,GO:0004620,GO:0004622,GO:0004623,GO:0005488,GO:0005543,GO:0005575,GO:0005576,GO:0005615,GO:0005622,GO:0005623,GO:0005634,GO:0005654,GO:0005737,GO:0005764,GO:0005773,GO:0006082,GO:0006629,GO:0006631,GO:0006643,GO:0006644,GO:0006650,GO:0006665,GO:0006672,GO:0006793,GO:0006796,GO:0006807,GO:0008150,GO:0008152,GO:0008289,GO:0008374,GO:0009056,GO:0009062,GO:0009395,GO:0009987,GO:0016042,GO:0016054,GO:0016298,GO:0016740,GO:0016746,GO:0016747,GO:0016787,GO:0016788,GO:0019637,GO:0019752,GO:0031974,GO:0031981,GO:0032787,GO:0034638,GO:0034641,GO:0043167,GO:0043168,GO:0043226,GO:0043227,GO:0043229,GO:0043231,GO:0043233,GO:0043436,GO:0043603,GO:0044237,GO:0044238,GO:0044242,GO:0044248,GO:0044255,GO:0044281,GO:0044282,GO:0044421,GO:0044422,GO:0044424,GO:0044428,GO:0044444,GO:0044446,GO:0044464,GO:0046337,GO:0046338,GO:0046395,GO:0046434,GO:0046470,GO:0046475,GO:0046486,GO:0046503,GO:0047499,GO:0052689,GO:0070013,GO:0071704,GO:0072329,GO:0097164,GO:1901564,GO:1901565,GO:1901575\n", - "CoFFE PFAM domains [EggNOG]: LCAT\n", + "CoFFE PFAM domains [EggNOG]: Use1\n", "\n", - "best sequence hit bit score [Emapper]: -\n", - "Emapper orthogroups [EggNOG]: -\n", - "Emapper maximum annotation level [EggNOG]: -\n", - "Emapper GO terms [EggNOG]: -\n", - "Emapper PFAM domains [EggNOG]: -\n", + "best sequence hit bit score [Emapper]: 453.0\n", + "Orthogroups [EggNOG]: KOG2863@1|root, KOG2863@2759|Eukaryota, 38B4T@33154|Opisthokonta, 3BA79@33208|Metazoa, 3CU1D@33213|Bilateria, 47Z8A@7711|Chordata, 48Y57@7742|Vertebrata, 49SNU@7898|Actinopterygii\n", + "Maximum annotation level [EggNOG]: 33208|Metazoa\n", + "PFAM domains [EggNOG]: DBR1, Metallophos\n", "\n" ] } ], "source": [ - "print(generate_description(merged.loc[10001]))" + "print(generate_description(merged.loc[20360]))" ] }, { @@ -191,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "1c5133d7", "metadata": {}, "outputs": [], @@ -209,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "fa278548", "metadata": {}, "outputs": [], @@ -221,12 +226,74 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "1f718df1", "metadata": {}, "outputs": [], "source": [ - "merged.to_csv('/g/arendt/npapadop/data/spongfold_publish/model_archive_metadata.csv')" + "merged.to_csv('../data/model_archive_metadata.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "69a67bca-9839-4711-95f3-313f65cdab04", + "metadata": {}, + "source": [ + "We also need an overview table that lists all the proteins. We are going to use the same columns as for the description but save them as a table instead:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1858268f-5bed-467a-89ac-a203fbdd1942", + "metadata": {}, + "outputs": [], + "source": [ + "keep = ['isoform', 'query length', 'MSA size', 'gene_id_str',\n", + " 'protein_id_str', 'has_start', 'has_end',\n", + " 'Preferred_name_seq', 'Description_seq', 'Preferred_name_str',\n", + " 'Description_str', 'Function [CC]', 'plddt', 'target',\n", + " 'seq. id.', 'bit score', 'uniprot', 'score_str',\n", + " 'eggNOG_OGs_str', 'max_annot_lvl_str', 'PFAMs_str',\n", + " 'score_seq', 'eggNOG_OGs_seq', 'max_annot_lvl_seq',\n", + " 'PFAMs_seq']\n", + "\n", + "columns = ['isoform ID', 'peptide length', 'input MSA size [MMseqs2]', 'gene ID',\n", + " 'protein ID', 'mRNA has start codon [TransDecoder]',\n", + " 'mRNA has stop codon [TransDecoder]',\n", + " 'Preferred name [EggNOG]', 'Description [EggNOG]',\n", + " 'CoFFE preferred name [EggNOG]', 'CoFFE description [EggNOG]',\n", + " 'CoFFE UniProt function', 'average pLDDT [ColabFold]',\n", + " 'best structural hit [FoldSeek]', 'structural state %identity [FoldSeek]',\n", + " 'bit score [FoldSeek]', 'UniProt ID of best structural hit (CoFFE)',\n", + " 'CoFFE Emapper bit score [EggNOG]', 'CoFFE orthogroups [EggNOG]',\n", + " 'CoFFE maximum annotation level [EggNOG]', 'CoFFE PFAM domains [EggNOG]',\n", + " 'best sequence hit bit score [Emapper]', 'Orthogroups [EggNOG]',\n", + " 'Maximum annotation level [EggNOG]', 'PFAM domains [EggNOG]']" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "76b04936-1189-47ac-9a19-cabd1bea79cb", + "metadata": {}, + "outputs": [], + "source": [ + "merged = structure.join(sequence, on='protein_id', lsuffix='_str', rsuffix='_seq').fillna('-')\n", + "merged['query length'] = merged['query length'] - merged['has_end']\n", + "overview = merged[keep]\n", + "overview.columns = columns\n", + "overview.reset_index(inplace=True, drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "4f5ce19c-b264-4652-bcac-ef92651d5c02", + "metadata": {}, + "outputs": [], + "source": [ + "overview.to_csv('../data/model_archive_overview.csv')" ] } ], @@ -246,7 +313,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/analysis/suppl-ModelArchive_preparation.ipynb b/analysis/suppl-ModelArchive_preparation.ipynb index 518b90f..a673989 100644 --- a/analysis/suppl-ModelArchive_preparation.ipynb +++ b/analysis/suppl-ModelArchive_preparation.ipynb @@ -10,7 +10,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2022-06-08 14:02\n" + "2022-08-16 07:52\n" ] } ], diff --git a/analysis/suppl-compare_seq_profile.ipynb b/analysis/suppl-compare_seq_profile.ipynb new file mode 100644 index 0000000..f51fa7d --- /dev/null +++ b/analysis/suppl-compare_seq_profile.ipynb @@ -0,0 +1,898 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "273c6a8d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-07-29 15:06\n" + ] + } + ], + "source": [ + "from datetime import datetime, timezone\n", + "import pytz\n", + "\n", + "utc_dt = datetime.now(timezone.utc) # UTC time\n", + "dt = utc_dt.astimezone()\n", + "tz = pytz.timezone('Europe/Berlin')\n", + "berlin_now = datetime.now(tz)\n", + "print(f'{berlin_now:%Y-%m-%d %H:%M}')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "09defaef-09b9-4d8e-84a1-dc3c7a56b80a", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "from os.path import exists\n", + "from tqdm import tqdm\n", + "import requests\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8aaa0695", + "metadata": {}, + "outputs": [], + "source": [ + "structural_annotation = pd.read_parquet('../old_data/structure_annotation.parquet')\n", + "sequence_annotation = pd.read_csv('../old_data/Slacustris_eggnog.tsv', sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "95457088", + "metadata": {}, + "outputs": [], + "source": [ + "mmseqs = pd.read_csv('../data/cfres_mmseqs_s75_e1.m8', sep=\"\\s+\", header=None)\n", + "mmseqs.columns = [\"query\", \"target\", \"seq. id.\", \"alignment length\", \"no. mismatches\",\n", + " \"no. gap open\", \"query start\", \"query end\", \"target start\", \"target end\",\n", + " \"e value\", \"bit score\"]\n", + "\n", + "mmseqs['gene_id'] = mmseqs['query'].str.split('_').str[:2].apply(lambda x: '_'.join(x))\n", + "mmseqs['normalized bit score'] = mmseqs['bit score'] / mmseqs['alignment length']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "093bff38", + "metadata": {}, + "outputs": [], + "source": [ + "def best_bit_score(df, sort_by='normalized bit score', tiebreak='alignment length'):\n", + " have_max = df[sort_by] == np.max(df[sort_by])\n", + " max_ali = df[have_max][tiebreak] == np.max(df[have_max][tiebreak])\n", + " return df[have_max][max_ali].index.values[0]\n", + "\n", + "def keep_best(df, groupby='gene_id'):\n", + " df.reset_index(inplace=True)\n", + " idx = df.groupby(groupby).apply(best_bit_score)\n", + " res = df.loc[idx].copy()\n", + " return res" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "042ff08e", + "metadata": {}, + "outputs": [], + "source": [ + "mmseqs_filtered = keep_best(mmseqs)\n", + "del mmseqs" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9645fec8", + "metadata": {}, + "outputs": [], + "source": [ + "hhblits = pd.read_csv('../data/cfres_hhblits_e1.m8', sep=\"\\s+\", header=None)\n", + "hhblits.columns = [\"query\", \"target\", \"seq. id.\", \"alignment length\", \"no. mismatches\",\n", + " \"no. gap open\", \"query start\", \"query end\", \"target start\", \"target end\",\n", + " \"e value\", \"bit score\"]\n", + "\n", + "hhblits['gene_id'] = hhblits['query'].str.split('_').str[:2].apply(lambda x: '_'.join(x))\n", + "hhblits['normalized bit score'] = hhblits['bit score'] / hhblits['alignment length']" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7e9d0601", + "metadata": {}, + "outputs": [], + "source": [ + "hhblits_filtered = keep_best(hhblits)\n", + "del hhblits" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "da58c74f-5cd2-435a-8281-6195ed7f15af", + "metadata": {}, + "outputs": [], + "source": [ + "mmseqs_filtered.drop(columns=['index'], inplace=True)\n", + "hhblits_filtered.drop(columns=['index'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "260c26ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['evalue', 'score', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',\n", + " 'Description', 'Preferred_name', 'GOs', 'PFAMs', 'gene_id',\n", + " 'protein_id'],\n", + " dtype='object')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sequence_annotation.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c92b4a01", + "metadata": {}, + "outputs": [], + "source": [ + "structural_annotation['normalized bit score'] = structural_annotation['bit score'] / structural_annotation['alignment length']" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5ceaaff2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<matplotlib.legend.Legend at 0x7f7975465070>" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZdUlEQVR4nO3df3BV5b3v8feXgAUUxJLcthJs0jOIAgkBg3oLF+GgGNCRudoZRUFOW0qZi0due0XwtMcWOu30jJxbD1WhDKWWuVSoSCn3CtpSsIK1hx+6G34oGmlKQlqNoShyoPzwe//ITs4m7JCdZO2svdf+vGYyZK317LW/e0/45Mmzn/Usc3dERCT7dQu7ABERCYYCXUQkIhToIiIRoUAXEYkIBbqISER0D+uJ8/PzvaioKKynFxHJSnv27Hnf3QuSHQst0IuKiti9e3dYTy8ikpXM7E+tHdOQi4hIRCjQRUQiQoEuIhIRoY2hi0juOHPmDLW1tZw6dSrsUrJGz549KSwspEePHik/RoEuImlXW1tLnz59KCoqwszCLifjuTsNDQ3U1tZSXFyc8uM05CIiaXfq1Cn69++vME+RmdG/f/92/0WjQBeRLqEwb5+OvF8KdBGRiNAYuoh0uS0H3g30fDcP+VSg58tW6qFfzMHNYVcgIpIyBbqI5ITq6mquueYaZs6cybBhw7jvvvvYsmULo0ePZtCgQezcuZNvf/vbzJgxg4kTJ1JUVMT69et5+OGHKSkpoaKigjNnzgCwYMEChgwZQmlpKQ899BAA9fX13HXXXYwaNYpRo0bxyiuvANDQ0MDEiRMZMWIEX/3qV/nsZz/L+++/z4kTJ7jtttsYPnw4w4YNY+3atZ1+jQp0EckZVVVVzJ07l8rKSt58801+9rOfsWPHDhYvXsz3vvc9AN555x2ef/55fvnLXzJt2jTGjx/P3r176dWrF88//zxHjx7lF7/4Bfv376eyspJvfvObAMydO5evfe1r7Nq1i+eee46ZM2cCsHDhQsaMGcPrr7/OHXfcweHDhwF44YUXuPLKK/nDH/7Avn37qKio6PTrU6CLSM4oLi6mpKSEbt26MXToUCZMmICZUVJSQnV1NQCTJk2iR48elJSUcO7cueagbWrTt29fevbsycyZM1m/fj29e/cGYMuWLTzwwAOUlZVxxx138OGHH3L8+HFefvllpk2bBsBtt93GFVdc0Xy+LVu2MH/+fLZv387ll1/e6denQL+IWM2xsEsQkQB94hOfaP6+W7duzdvdunXj7Nmz57Xp1q0bPXr0aJ4+2NSme/fu7Ny5k7vuuosNGzY0B/7HH3/Mq6++SiwWIxaLceTIEfr06QMkn4J49dVXs2fPHkpKSnjkkUdYtGhRp1+fAr01+kBURJL46KOP+OCDD5g8eTKPP/44sVgMgIkTJ/LEE080t2vaP3bsWFavXg3A5s2b+etf/wpAXV0dvXv3Ztq0aTz00EO89tprna5N0xZFpMtl8zTD48ePM2XKFE6dOoW784Mf/ACAJUuWMGfOHEpLSzl79ixjx45l2bJlfOtb32Lq1KmMHDmSm266iauuugqAvXv3Mm/evOa/BJYuXdrp2szdO32SjigvL/eMvsHFwc3Eao5RdvPUsCsRyXpvvPEG1157bdhlZISmm/vk5+e32TbZ+2Zme9y9PFn7NodczGylmb1nZvtaOX6fmVXGv35nZsPbrFJERAKXyhj608DF5tP8EbjJ3UuB7wDLA6hLRCSSqqurU+qdd0SbY+ju/rKZFV3k+O8SNn8PFAZQl4iItFPQs1y+DLQ6PcTMZpnZbjPbXV9fH/BTB0tTFkUk2wQW6GY2nsZAn99aG3df7u7l7l5eUFAQ1FN3maAXFBIRCVIg0xbNrBRYAUxy94YgzikiIu3T6UA3s6uA9cB0d3+r8yVlrvy6rTBE0xhFOi3oC/cGT2qzSXV1Nbfffjv79p0/YW/cuHEsXryY8vLzZwI+/fTT7N69+7yLhZpcdtllfPTRR9TV1fHggw+ybt06YrEYdXV1TJ48uXOvpRNSmbb4DPAqMNjMas3sy2Y228xmx5s8CvQHnjKzmJll8OTyTtCVoyLSwpVXXsm6deuAxitDN23aFGo9bQa6u09198+4ew93L3T3H7v7MndfFj8+092vcPey+FfSCe/Z5LyxcgW5SGScO3eOr3zlKwwdOpSJEydy8uRJAJ599lmuv/56rr76arZv397cvqamhoqKCgYPHszChQsvOF91dTXDhg3j9OnTPProo6xdu5aysjLWrl3Lb3/7W8rKyigrK2PEiBEcP3487a9Pa7k0aQrug5sbh1biNNtFJDrefvtt5syZw/79++nXrx/PPfccAGfPnmXnzp08/vjj5wX3zp07Wb16NbFYjGeffZbWrm6/5JJLWLRoEXfffTexWIy7776bxYsX8+STTxKLxdi+fTu9evVK++tToItIziguLqasrAyA6667rnnJ3DvvvPOCfQC33HIL/fv3p1evXtx5553s2LEj5ecaPXo0X//611myZAnHjh2je/f0L52lQBeRnJG4fG5eXt4FS+Ym7oMLl71NtgxuaxYsWMCKFSs4efIkN954I2+++WZnSk+JAl1EpBW//vWvOXr0KCdPnmTDhg2MHj261bZ9+vQ5b5z8nXfeoaSkhPnz51NeXt4lga7lc0Wk66UwzTATjBkzhunTp1NVVcW99957wdTGROPHj+f73/8+ZWVlPPLII+zYsYNt27aRl5fHkCFDmDQp/a9Zy+c2Obi58YcsvmxuorKbp2o5XZFO0PK5HRP48rnSSLNdRCTTKdBFRCJCgZ5IFxGJSBZToIuIRIQCXUQkIhToIiIRoXno7bTlwLvcPORTYZchktVeqnkp0PONGzgusHPV19dz++23c/r0aZYsWcJf/vIXHn30UT796U+zbdu2wJ4nHRToCWI1xygb2C/sMkQkRL/5zW+45ppr+OlPfwpARUUFTz31FOPHjw+5srZpyEVEcsKqVasoLS1l+PDhTJ8+nT/96U9MmDCB0tJSJkyYwOHDh4nFYjz88MNs2rSJsrIyFi5cyI4dO5g9ezbz5s3j3LlzzJs3j1GjRlFaWsqPfvSjsF/WedRD74imq0pFJCvs37+f7373u7zyyivk5+dz9OhRZsyYwf3338+MGTNYuXIlDz74IBs2bGDRokXn3alo27ZtzXc0Wr58OZdffjm7du3ib3/7G6NHj2bixIkUFxeH/AobqYcepytBRaJr69atfOELXyA/Px+AT37yk7z66qvce++9AEyfPj2lpXF/9atfsWrVKsrKyrjhhhtoaGjg7bffTmvt7aEeejvl120FjbOLZBV3b3Pp21SWxnV3fvjDH3LrrbcGVVqg1EMXkcibMGECP//5z2loaADg6NGjfP7zn2fNmjUArF69mjFjxrR5nltvvZWlS5dy5swZAN566y1OnDiRvsLbST30FMS2PBN2CSKREuQ0w1QMHTqUb3zjG9x0003k5eUxYsQIlixZwpe+9CUee+wxCgoK+MlPftLmeWbOnEl1dTUjR47E3SkoKGDDhg3pfwEp0vK5cU2hXTawX5vj6WUD++lDUZF20PK5HaPlc0VEcpQCXUQkIhToItIlwhrezVYdeb/aDHQzW2lm75nZvlaOm5ktMbMqM6s0s5HtriKLbTnwbtgliGS8nj170tDQoFBPkbvT0NBAz5492/W4VGa5PA08Aaxq5fgkYFD86wZgafxfEREACgsLqa2tpb6+PuxSskbPnj0pLCxs12PaDHR3f9nMii7SZAqwyht/9f7ezPqZ2Wfc/c/tqkREIqtHjx4Zc3l8lAUxhj4AqEnYro3vu4CZzTKz3Wa2O1N/U2sJABHJVkEEerLrZZMOlLn7cncvd/fygoKCAJ5aRESaBBHotcDAhO1CoC6A83aZznywmV+3NcBKREQ6LohA3wjcH5/tciPwQc6Mnx/cHHYFIiLN2vxQ1MyeAcYB+WZWC3wL6AHg7suATcBkoAr4D+CL6So2LQ5uJr/uWNhViIh0WiqzXKa2cdyBOYFVJCIiHaIrRUVEIkKB3gma4igimUSB3gEKchHJRAp0EZGIUKCLiESEAl1EJCJyPtA1Hi4iUZHzgS4iEhUKdBGRiFCgB0FruohIBlCgd5BuPScimUaBLiISEQp0EZGIUKCLiESEAl1EJCIU6CIiEaFAD4CuNhWRTKBAFxGJCAW6iEhE5Hagd+IKz/y6rQEWIiLSebkd6AHSlaMiEjYFuohIROR0oGt2iohESUqBbmYVZnbQzKrMbEGS45eb2f81sz+Y2X4z+2LwpYqIyMW0Gehmlgc8CUwChgBTzWxIi2ZzgAPuPhwYB/yrmV0ScK0iInIRqfTQrweq3P2Qu58G1gBTWrRxoI+ZGXAZcBQ4G2ilIiJyUakE+gCgJmG7Nr4v0RPAtUAdsBeY6+4fB1JhmmhWiohETSqBbkn2eYvtW4EYcCVQBjxhZn0vOJHZLDPbbWa76+vr21lqsDSPXESiJpVArwUGJmwX0tgTT/RFYL03qgL+CFzT8kTuvtzdy929vKCgoKM1Zy7dik5EQpRKoO8CBplZcfyDznuAjS3aHAYmAJjZp4DBwKEgCxURkYvr3lYDdz9rZg8ALwJ5wEp3329ms+PHlwHfAZ42s700DtHMd/f301h3xsmv2woD+4VdhojksDYDHcDdNwGbWuxblvB9HTAx2NJERKQ9cvpK0aDpylMRCZMCXUQkIhToIiIRoUBPB01fFJEQ5GSg6ypREYminAx0EZEoUqAHTL1/EQmLAl1EJCIU6CIiEaFAFxGJCAW6iEhEKNBFRCJCgS4iEhEKdBGRiFCgi4hEhAJdRCQiFOgiIhGRc4G+5cC7jbeLSyPd6EJEwpBzgS4iElUKdBGRiFCgi4hEhAI9YOkenxcRaY0CXUQkIhToIiIRkVKgm1mFmR00syozW9BKm3FmFjOz/Wb222DLDI6GREQkqrq31cDM8oAngVuAWmCXmW109wMJbfoBTwEV7n7YzP5LmuoVEZFWpNJDvx6ocvdD7n4aWANMadHmXmC9ux8GcPf3gi0z++jeoiLS1VIJ9AFATcJ2bXxfoquBK8zsJTPbY2b3JzuRmc0ys91mtru+vr5jFYuISFKpBLol2ecttrsD1wG3AbcC/2xmV1/wIPfl7l7u7uUFBQXtLrbTDm7usqfSWL2IdLU2x9Bp7JEPTNguBOqStHnf3U8AJ8zsZWA48FYgVYqISJtS6aHvAgaZWbGZXQLcA2xs0eaXwH8zs+5m1hu4AXgj2FJFRORi2uyhu/tZM3sAeBHIA1a6+34zmx0/vszd3zCzF4BK4GNghbvvS2fhIiJyvlSGXHD3TcCmFvuWtdh+DHgsuNKCp2VtRSTKdKWoiEhEKNBFRCJCgZ5uXThVUkRymwI9nQ5u1ri9iHQZBbqISEQo0NNIvXMR6UoKdBGRiEhpHnqUvXby7aT7R/Ya1MWViIh0jnroIiIRoUAXEYmInBxyaW2YRUQkm6mHLiISEQr0rqCrRUWkCyjQRUQiQoEuIhIRCvQuoCtGRaQrKNBFRCIiZ6YtbjnwLvntaJ84tVFXjYpINlAPXUQkIhToXUnTF0UkjXIm0PPrtoZdgohIWuVMoIuIRJ0CvatouEVE0ixnZrlkwoJcsZpjlA0OuwoRiaqUeuhmVmFmB82syswWXKTdKDM7Z2ZfCK7EAKh3LCI5oM0eupnlAU8CtwC1wC4z2+juB5K0+xfgxXQUGqYg5qTralERSbdUeujXA1XufsjdTwNrgClJ2v0j8BzwXoD1BUJhKiK5IJVAHwDUJGzXxvc1M7MBwH8Hll3sRGY2y8x2m9nu+vr69tYqIiIXkUqgW5J93mL7cWC+u5+72Incfbm7l7t7eUFBQYoliohIKlKZ5VILDEzYLgTqWrQpB9aYGUA+MNnMzrr7hiCKFBGRtqXSQ98FDDKzYjO7BLgH2JjYwN2L3b3I3YuAdcD/UJgnt+XAu2GXICIR1WYP3d3PmtkDNM5eyQNWuvt+M5sdP37RcXMREekaKV1Y5O6bgE0t9iUNcnf/h86XFXEHN8PgSWFXISIRo0v/Q6BplCKSDpEP9NiWZ8IuQUSkS0Q+0EVEcoUCXUQkIhToIiIRkTPL5walswt16c5JIpIu6qGHRUv6ikjAFOgiIhER6SGXl2pe4lAG3KlIRKQrqIcuIhIRCnQRkYiI9JBLrOYYfdN4/iBuTSciEhT10EVEIiLSgd63oTLsElqlBbpEJGiRDnQRkVyiQBcRiYjIBno23OotG2oUkewR2UDPFgp1EQmKAl1EJCIU6AF57eTbzV/tkbj6onrrItIZkb2wKL9uK4fDLiJVzSsvjgy1DBHJbuqhZwDNSReRICjQM4RCXUQ6K6VAN7MKMztoZlVmtiDJ8fvMrDL+9TszGx58qdkj1fF03b1IRILUZqCbWR7wJDAJGAJMNbMhLZr9EbjJ3UuB7wDLgy40FyjgRaQzUumhXw9Uufshdz8NrAGmJDZw99+5+1/jm78HCoMtU0RE2pLKLJcBQE3Cdi1ww0XafxlIesNMM5sFzAK46qqrUiyxfV6qeQkgY+5U1KEldg9uhsGT0lSRiERVKj10S7LPkzY0G09joM9Pdtzdl7t7ubuXFxQUpF5lDtFcdBHpqFQCvRYYmLBdCNS1bGRmpcAKYIq7NwRTXm7SjBcR6YhUhlx2AYPMrBg4AtwD3JvYwMyuAtYD0939rcCrjIjWZr3obkciEoQ2A93dz5rZA8CLQB6w0t33m9ns+PFlwKNAf+ApMwM46+7l6StbRERaSunSf3ffBGxqsW9ZwvczgZnBlpY7zvvgNGEwq+kDXoBxA8d1WT0ikp0iu5ZLtmoK92M1L523P5VwT2zTUuJjWmunXxoi2S2SgR6rOUbfsItIo4703C8W9kG0b+0XRnt/+eiXikjHRTLQc0l7gzdd50rlsUHWKiIXilyga8pfZlGIi3SdyK222LehMuwSRERCEbkeOkQn1A/FXuZzZWPDLqNLpTK2rtk/IslFMtAj4ciesCvIKK0FvcJd5D8p0DPUofoTYZeQ1RT0kosU6BJ5qc7PF8l2WR/omkUhTfSzILku6wM98o7sgQHXhV1FTtAwjWS7aAV6BD9IPFR/gs+hUO9qCnfJRtEK9Ig6VH+CDz8+RtnAfmGXEjmducJVQS+ZRoGeJfo2VEK3S9VTz1Dq0UsmiFSgR32q36H6E3xuQNhVSBPNjZdME6lAzwWxGg29ZBOFu3SlyAT6odjLYZfQJfo2VMLA3FoOIIo0Li/pEJlAzyXqpWcnfQAr6aZAF8kyGsaR1ijQs1CyGS/qtUebPoCVVEQj0CN4QVG7HNkD/F3YVUjIFO6S9YEeqzlG34ZoT1dMpuliI4CyyN2mRDorlfF6hX70ZH2gR+VmFh3Rt6GSD/uXNn/PwLEaepGUXaxHrw9ns1PWB7okiA+95OKdjqRzUl2pUkGf2VIKdDOrAP4NyANWuPv3Wxy3+PHJwH8A/+DurwVcazMtk/qf+jZUQsGlQPxK2f6N+5tulq3eunSF1nr77d0vndNmoJtZHvAkcAtQC+wys43ufiCh2SRgUPzrBmBp/N+0idUco6zbO+l8iqyRbMmD5qGobpcS+/jvGoP9yJ7Gsff4MI3CXtIhlRk5qexX0LdfKj3064Eqdz8EYGZrgClAYqBPAVa5uwO/N7N+ZvYZd/9z4BXH9W2o5FC6Tp7FWn6mcKj+BH2p5FDDhW0ONcCH/UsV7JKRgvpLvLW/DFprk81SCfQBQE3Cdi0X9r6TtRkAnBfoZjYLmBXf/MjMDrarWsgH3m/nY3KB3pfk9L5cSO9Jctn0vny2tQOpBLol2ecdaIO7LweWp/CcyQsx2+3u5R19fFTpfUlO78uF9J4kF5X3JZUZzLXAwITtQqCuA21ERCSNUgn0XcAgMys2s0uAe4CNLdpsBO63RjcCH6Rz/FxERC7U5pCLu581sweAF2mctrjS3feb2ez48WXAJhqnLFbROG3xi2mqt8PDNRGn9yU5vS8X0nuSXCTeF2ucmCIiItlOq4CIiESEAl1EJCKyJtDNrMLMDppZlZktCLuesJnZQDPbZmZvmNl+M5sbdk2ZxMzyzOx1M/t/YdeSKeIX/K0zszfjPzf/NeyawmZmX4v//9lnZs+YWc+wa+qMrAj0hOUHJgFDgKlmNiTcqkJ3Fvhf7n4tcCMwR+/JeeYCb4RdRIb5N+AFd78GGE6Ovz9mNgB4ECh392E0Tvq4J9yqOicrAp2E5Qfc/TTQtPxAznL3PzctgObux2n8zzkg3Koyg5kVArcBK8KuJVOYWV9gLPBjAHc/7e7HQi0qM3QHeplZd6A3WX79TLYEemtLCwhgZkXACODfQy4lUzwOPAx8HHIdmeRzQD3wk/hQ1AozuzTsosLk7keAxcBhGpcp+cDdfxVuVZ2TLYGe0tICucjMLgOeA/6nu38Ydj1hM7PbgffcPcfvS3iB7sBIYKm7jwBOADn9WZSZXUHjX/rFwJXApWY2LdyqOidbAl1LCyRhZj1oDPPV7r4+7HoyxGjgDjOrpnFo7u/N7P+EW1JGqAVq3b3pr7h1NAZ8LrsZ+KO717v7GWA98PmQa+qUbAn0VJYfyCnxm4r8GHjD3f932PVkCnd/xN0L3b2Ixp+Tre6e1b2uILj7X4AaMxsc3zWB85fAzkWHgRvNrHf8/9MEsvyD4qy4BV1ryw+EXFbYRgPTgb1mFovv+yd33xReSZLh/hFYHe8UHSJ9S3RkBXf/dzNbB7xG46yx18nyJQB06b+ISERky5CLiIi0QYEuIhIRCnQRkYhQoIuIRIQCXUQkIhToIiIRoUAXEYmI/w9iflt0knsXdQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.hist(mmseqs_filtered['normalized bit score'], bins=100, label='mmseqs', alpha=0.3, density=True);\n", + "ax.hist(hhblits_filtered['normalized bit score'], bins=100, label='hhblits', alpha=0.3, density=True);\n", + "ax.hist(structural_annotation['normalized bit score'], bins=100, label='coffe', alpha=0.3, density=True);\n", + "ax.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7649fb1a", + "metadata": {}, + "outputs": [], + "source": [ + "unique_up_id = pd.concat([hhblits_filtered['target'].drop_duplicates(),\n", + " mmseqs_filtered['target'].drop_duplicates()])\n", + "unique_up_id.drop_duplicates(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e8d0cfb7-c86d-41a6-99e7-d514bc1f3873", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "148" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "request_size = 500\n", + "no_chunks = np.ceil((len(unique_up_id) / request_size) * 3).astype(int)\n", + "no_chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "85231f72-12ad-4cc7-897a-544e0be27228", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 148/148 [01:45<00:00, 1.41it/s]\n" + ] + } + ], + "source": [ + "request_size = 500\n", + "no_chunks = np.ceil((len(unique_up_id) / request_size) * 3).astype(int)\n", + "\n", + "with open('../data/profile_sequences.fasta', 'w') as result:\n", + " for i in tqdm(range(no_chunks)):\n", + " a = (i * request_size) // 3\n", + " b = ((i+1) * request_size) // 3\n", + " chunk = unique_up_id[a:b]\n", + " bait50 = ['UniRef50_' + c for c in chunk]\n", + " bait90 = ['UniRef90_' + c for c in chunk]\n", + " bait100 = ['UniRef100_' + c for c in chunk]\n", + " expanded_chunk = bait50 + bait90 + bait100\n", + " url = f\"https://rest.uniprot.org/uniref/ids?ids={','.join(expanded_chunk)}&format=fasta\"\n", + " response = requests.get(url)\n", + " result.write(response.text)" + ] + }, + { + "cell_type": "markdown", + "id": "c3d17ee4-6176-443a-b320-a54aaa7ff8cd", + "metadata": {}, + "source": [ + "(wait for EggNOG-mapper to annotate the sequences)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "eb0c48d8-ef71-440f-8ea0-6e7f3118f957", + "metadata": {}, + "outputs": [], + "source": [ + "file = '/g/arendt/npapadop/data/spongfold_publish/MM_ffr33uy6.emapper.annotations.tsv'\n", + "sensitive = pd.read_csv(file, sep='\\t', skiprows=4, skipfooter=3, engine='python')" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "0a97dfb7-4783-44fa-a330-f07b0585fcf8", + "metadata": {}, + "outputs": [], + "source": [ + "test = sensitive['#query'].str.split('_').str[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "942d4a2d-2a8b-44ff-89b4-8525f3ad926e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "A0A6A6LEY7 3\n", + "A0A1X7U3S9 3\n", + "A0A3N5U8U0 3\n", + "A0A6J0B743 3\n", + "A0A482W475 3\n", + " ..\n", + "A0A6I8SCD7 1\n", + "A0A1A8MN43 1\n", + "A0A0C2FDV2 1\n", + "A0A2J8INC7 1\n", + "A0A0K0D6S2 1\n", + "Name: #query, Length: 20360, dtype: int64" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "909804dd-4006-4dcc-a4ba-308a5b3975d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>#query</th>\n", + " </tr>\n", + " <tr>\n", + " <th>seed_ortholog</th>\n", + " </tr>\n", + " <tr>\n", + " <th>evalue</th>\n", + " </tr>\n", + " <tr>\n", + " <th>score</th>\n", + " </tr>\n", + " <tr>\n", + " <th>eggNOG_OGs</th>\n", + " </tr>\n", + " <tr>\n", + " <th>max_annot_lvl</th>\n", + " </tr>\n", + " <tr>\n", + " <th>COG_category</th>\n", + " </tr>\n", + " <tr>\n", + " <th>Description</th>\n", + " </tr>\n", + " <tr>\n", + " <th>Preferred_name</th>\n", + " </tr>\n", + " <tr>\n", + " <th>GOs</th>\n", + " </tr>\n", + " <tr>\n", + " <th>EC</th>\n", + " </tr>\n", + " <tr>\n", + " <th>KEGG_ko</th>\n", + " </tr>\n", + " <tr>\n", + " <th>KEGG_Pathway</th>\n", + " </tr>\n", + " <tr>\n", + " <th>KEGG_Module</th>\n", + " </tr>\n", + " <tr>\n", + " <th>KEGG_Reaction</th>\n", + " </tr>\n", + " <tr>\n", + " <th>KEGG_rclass</th>\n", + " </tr>\n", + " <tr>\n", + " <th>BRITE</th>\n", + " </tr>\n", + " <tr>\n", + " <th>KEGG_TC</th>\n", + " </tr>\n", + " <tr>\n", + " <th>CAZy</th>\n", + " </tr>\n", + " <tr>\n", + " <th>BiGG_Reaction</th>\n", + " </tr>\n", + " <tr>\n", + " <th>PFAMs</th>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: [#query, seed_ortholog, evalue, score, eggNOG_OGs, max_annot_lvl, COG_category, Description, Preferred_name, GOs, EC, KEGG_ko, KEGG_Pathway, KEGG_Module, KEGG_Reaction, KEGG_rclass, BRITE, KEGG_TC, CAZy, BiGG_Reaction, PFAMs]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sensitive[sensitive['#query'].str.contains('UPI00005B2EF3')].T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6d1eb54-d942-465e-a868-517d442fe1dd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "1c156c27-d2ec-47d6-b764-2b54816922a0", + "metadata": {}, + "outputs": [], + "source": [ + "sensitive['uniprot'] = sensitive['#query'].str.split('_').str[1]\n", + "sensitive.reset_index(drop=True, inplace=True)\n", + "# remove unnecessary columns\n", + "dead_weight = ['#query', 'seed_ortholog', 'EC', 'KEGG_ko', 'KEGG_Pathway',\n", + " 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE',\n", + " 'KEGG_TC', 'CAZy', 'BiGG_Reaction']\n", + "sensitive.drop(dead_weight, axis=1, inplace=True)\n", + "# convert rest to categorical to save space\n", + "to_categorical = ['uniprot', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',\n", + " 'Description', 'Preferred_name', 'GOs', 'PFAMs']\n", + "sensitive[to_categorical] = sensitive[to_categorical].astype(\"category\")\n", + "# finally save in parquet format\n", + "sensitive.drop_duplicates(inplace=True)\n", + "sensitive.to_parquet('../data/uniprot_profiles.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "d68690ea-060a-480f-802c-aac6baae570d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(20360, 10)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sensitive.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "f60ac7bf-f031-4bd6-99f3-eb3117b50cf1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7417224 A0A409VGI9\n", + "7453017 A0A6A6LEY7\n", + "3711344 A0A1A8QDB6\n", + "5432158 A0A1X7UJF6\n", + "1658532 S9X2M4\n", + " ... \n", + "144364 A0A2F0BPH8\n", + "144949 L1J317\n", + "4535244 A0A0K0D6S2\n", + "2883206 UPI0003F0A19A\n", + "2883636 A0A7S2RZY7\n", + "Name: target, Length: 24540, dtype: object" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_up_id" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "fd08dceb-c307-4ad6-bd23-91b70aaf7ef4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>query</th>\n", + " <th>target</th>\n", + " <th>seq. id.</th>\n", + " <th>alignment length</th>\n", + " <th>no. mismatches</th>\n", + " <th>no. gap open</th>\n", + " <th>query start</th>\n", + " <th>query end</th>\n", + " <th>target start</th>\n", + " <th>target end</th>\n", + " <th>e value</th>\n", + " <th>bit score</th>\n", + " <th>gene_id</th>\n", + " <th>normalized bit score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>3537669</th>\n", + " <td>c100005_g1_i4_m.41851</td>\n", + " <td>UPI000C6D6B72</td>\n", + " <td>0.262</td>\n", + " <td>138</td>\n", + " <td>100</td>\n", + " <td>0</td>\n", + " <td>13</td>\n", + " <td>150</td>\n", + " <td>7</td>\n", + " <td>142</td>\n", + " <td>1.464000e-04</td>\n", + " <td>53</td>\n", + " <td>c100005_g1</td>\n", + " <td>0.384058</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3711020</th>\n", + " <td>c100012_g4_i1_m.41921</td>\n", + " <td>A0A4V1IVK5</td>\n", + " <td>0.681</td>\n", + " <td>133</td>\n", + " <td>41</td>\n", + " <td>0</td>\n", + " <td>32</td>\n", + " <td>164</td>\n", + " <td>73</td>\n", + " <td>203</td>\n", + " <td>3.106000e-49</td>\n", + " <td>183</td>\n", + " <td>c100012_g4</td>\n", + " <td>1.375940</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3108519</th>\n", + " <td>c100014_g2_i1_m.41927</td>\n", + " <td>A0A1D1W000</td>\n", + " <td>0.456</td>\n", + " <td>59</td>\n", + " <td>32</td>\n", + " <td>0</td>\n", + " <td>4</td>\n", + " <td>62</td>\n", + " <td>118</td>\n", + " <td>176</td>\n", + " <td>4.196000e-04</td>\n", + " <td>52</td>\n", + " <td>c100014_g2</td>\n", + " <td>0.881356</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7530038</th>\n", + " <td>c100023_g1_i1_m.41965</td>\n", + " <td>UPI0009E61F51</td>\n", + " <td>0.342</td>\n", + " <td>90</td>\n", + " <td>57</td>\n", + " <td>0</td>\n", + " <td>3</td>\n", + " <td>92</td>\n", + " <td>2743</td>\n", + " <td>2829</td>\n", + " <td>4.556000e-04</td>\n", + " <td>53</td>\n", + " <td>c100023_g1</td>\n", + " <td>0.588889</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5277692</th>\n", + " <td>c100036_g1_i4_m.42040</td>\n", + " <td>A0A7S3BMQ4</td>\n", + " <td>0.523</td>\n", + " <td>99</td>\n", + " <td>46</td>\n", + " <td>0</td>\n", + " <td>71</td>\n", + " <td>169</td>\n", + " <td>1</td>\n", + " <td>98</td>\n", + " <td>2.166000e-19</td>\n", + " <td>100</td>\n", + " <td>c100036_g1</td>\n", + " <td>1.010101</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8219227</th>\n", + " <td>c99972_g1_i2_m.41694</td>\n", + " <td>UPI0009E28D88</td>\n", + " <td>0.466</td>\n", + " <td>63</td>\n", + " <td>33</td>\n", + " <td>0</td>\n", + " <td>8</td>\n", + " <td>70</td>\n", + " <td>392</td>\n", + " <td>453</td>\n", + " <td>1.130000e-05</td>\n", + " <td>57</td>\n", + " <td>c99972_g1</td>\n", + " <td>0.904762</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6087296</th>\n", + " <td>c99980_g3_i1_m.41746</td>\n", + " <td>UPI00005B2EF3</td>\n", + " <td>0.434</td>\n", + " <td>62</td>\n", + " <td>35</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>63</td>\n", + " <td>7</td>\n", + " <td>68</td>\n", + " <td>4.330000e-05</td>\n", + " <td>51</td>\n", + " <td>c99980_g3</td>\n", + " <td>0.822581</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3238079</th>\n", + " <td>c99987_g1_i2_m.41772</td>\n", + " <td>A0A1X7URN4</td>\n", + " <td>0.529</td>\n", + " <td>220</td>\n", + " <td>97</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>221</td>\n", + " <td>37</td>\n", + " <td>243</td>\n", + " <td>1.868000e-61</td>\n", + " <td>221</td>\n", + " <td>c99987_g1</td>\n", + " <td>1.004545</td>\n", + " </tr>\n", + " <tr>\n", + " <th>574005</th>\n", + " <td>c99993_g2_i1_m.41786</td>\n", + " <td>UPI00177B4952</td>\n", + " <td>0.663</td>\n", + " <td>137</td>\n", + " <td>41</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>137</td>\n", + " <td>23</td>\n", + " <td>145</td>\n", + " <td>1.384000e-49</td>\n", + " <td>183</td>\n", + " <td>c99993_g2</td>\n", + " <td>1.335766</td>\n", + " </tr>\n", + " <tr>\n", + " <th>574028</th>\n", + " <td>c99993_g3_i1_m.41790</td>\n", + " <td>UPI00106CF215</td>\n", + " <td>0.265</td>\n", + " <td>133</td>\n", + " <td>95</td>\n", + " <td>0</td>\n", + " <td>3</td>\n", + " <td>135</td>\n", + " <td>93</td>\n", + " <td>222</td>\n", + " <td>5.938000e-04</td>\n", + " <td>52</td>\n", + " <td>c99993_g3</td>\n", + " <td>0.390977</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3347 rows × 14 columns</p>\n", + "</div>" + ], + "text/plain": [ + " query target seq. id. alignment length \\\n", + "3537669 c100005_g1_i4_m.41851 UPI000C6D6B72 0.262 138 \n", + "3711020 c100012_g4_i1_m.41921 A0A4V1IVK5 0.681 133 \n", + "3108519 c100014_g2_i1_m.41927 A0A1D1W000 0.456 59 \n", + "7530038 c100023_g1_i1_m.41965 UPI0009E61F51 0.342 90 \n", + "5277692 c100036_g1_i4_m.42040 A0A7S3BMQ4 0.523 99 \n", + "... ... ... ... ... \n", + "8219227 c99972_g1_i2_m.41694 UPI0009E28D88 0.466 63 \n", + "6087296 c99980_g3_i1_m.41746 UPI00005B2EF3 0.434 62 \n", + "3238079 c99987_g1_i2_m.41772 A0A1X7URN4 0.529 220 \n", + "574005 c99993_g2_i1_m.41786 UPI00177B4952 0.663 137 \n", + "574028 c99993_g3_i1_m.41790 UPI00106CF215 0.265 133 \n", + "\n", + " no. mismatches no. gap open query start query end target start \\\n", + "3537669 100 0 13 150 7 \n", + "3711020 41 0 32 164 73 \n", + "3108519 32 0 4 62 118 \n", + "7530038 57 0 3 92 2743 \n", + "5277692 46 0 71 169 1 \n", + "... ... ... ... ... ... \n", + "8219227 33 0 8 70 392 \n", + "6087296 35 0 2 63 7 \n", + "3238079 97 0 2 221 37 \n", + "574005 41 0 1 137 23 \n", + "574028 95 0 3 135 93 \n", + "\n", + " target end e value bit score gene_id normalized bit score \n", + "3537669 142 1.464000e-04 53 c100005_g1 0.384058 \n", + "3711020 203 3.106000e-49 183 c100012_g4 1.375940 \n", + "3108519 176 4.196000e-04 52 c100014_g2 0.881356 \n", + "7530038 2829 4.556000e-04 53 c100023_g1 0.588889 \n", + "5277692 98 2.166000e-19 100 c100036_g1 1.010101 \n", + "... ... ... ... ... ... \n", + "8219227 453 1.130000e-05 57 c99972_g1 0.904762 \n", + "6087296 68 4.330000e-05 51 c99980_g3 0.822581 \n", + "3238079 243 1.868000e-61 221 c99987_g1 1.004545 \n", + "574005 145 1.384000e-49 183 c99993_g2 1.335766 \n", + "574028 222 5.938000e-04 52 c99993_g3 0.390977 \n", + "\n", + "[3347 rows x 14 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hhblits_filtered[~hhblits_filtered['target'].isin(sensitive['uniprot'])]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0f35e75-1a3f-4bab-8afe-160d3347738f", + "metadata": {}, + "outputs": [], + "source": [ + "mmseqs_filtered" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f99bd75f-ae5a-4d00-8a01-5588c711f45c", + "metadata": {}, + "outputs": [], + "source": [ + "foldseek_full = pd.read_parquet('../old_data/fs_targets.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1359934-a10c-4301-9ffa-000acb4159e4", + "metadata": {}, + "outputs": [], + "source": [ + "mmseqs_filtered = mmseqs_filtered.merge(sensitive, on='uniprot', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72eed180-8cb0-4f76-b316-f5717f56b026", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "hhblits = hhblits.merge(sensitive, on='uniprot', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5b356e4-10f9-4782-a39c-4f19a221ebdb", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/fs_self_search_afdb.sh b/scripts/fs_self_search_afdb.sh new file mode 100755 index 0000000..a586869 --- /dev/null +++ b/scripts/fs_self_search_afdb.sh @@ -0,0 +1,23 @@ +#!/bin/bash -ex +#SBATCH -J fs_self_afdb +#SBATCH -t 10:00:00 +#SBATCH -c 32 +#SBATCH -N 1 +#SBATCH --mem=128G +#SBATCH -o /g/arendt/npapadop/cluster/fs_self.out +#SBATCH -e /g/arendt/npapadop/cluster/fs_self.err + +module load GCC +module load bzip2 + +FOLDSEEK="/g/arendt/npapadop/repos/foldseek/build/bin/foldseek" +QUERYDB="/scratch/npapadop/foldseek_target/afdb" +TARGETDB="/scratch/npapadop/foldseek_target/afdb" +ALIGNMENT="/scratch/npapadop/foldseek_results/self/self" + +"$FOLDSEEK" search "$QUERYDB" "$TARGETDB" "$ALIGNMENT" tmpFolder -a --threads 16 +"$FOLDSEEK" aln2tmscore "$QUERYDB" "$TARGETDB" "$ALIGNMENT" "$ALIGNMENT"_aln_tmscore +"$FOLDSEEK" createtsv "$QUERYDB" "$TARGETDB" "$ALIGNMENT"_aln_tmscore "$ALIGNMENT"_aln_tmscore.tsv +"$FOLDSEEK" convertalis "$QUERYDB" "$TARGETDB" "$ALIGNMENT" "$ALIGNMENT"_score.tsv + +module unload -- GitLab