diff --git a/analysis/analysis.ipynb b/analysis/analysis.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2139425e6e75906e78a226662a9d3d154858ab86 --- /dev/null +++ b/analysis/analysis.ipynb @@ -0,0 +1,225 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "51c84479-ccaf-4d51-8600-9b2b61a85718", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-05-11 14:31:18.997326+02:00\n" + ] + } + ], + "source": [ + "from datetime import datetime, timezone\n", + "import pytz\n", + "\n", + "utc_dt = datetime.now(timezone.utc) # UTC time\n", + "dt = utc_dt.astimezone()\n", + "tz = pytz.timezone('Europe/Berlin')\n", + "berlin_now = datetime.now(tz)\n", + "print(f'{berlin_now:%Y-%m-%d %H:%M}')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "35c6fe67-281b-44ae-878f-05245275ab44", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import glob\n", + "import os\n", + "\n", + "import urllib.parse\n", + "import urllib.request\n", + "\n", + "from tqdm import tqdm\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib_venn import venn2, venn3\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "id": "6eed7550-ac81-4f0d-a894-de9972364e1a", + "metadata": {}, + "source": [ + "## Placing the Spongilla proteome in context\n", + "\n", + "Genes don't do shit. They just sit on the genome. Proteins, on the other hand, do shit. When doing molecular biology with an organism we often treat proteins as the important agents that perform a function, and we often merge in our minds the proteins that arise from a gene and the gene itself.\n", + "\n", + "It is super important, then, to know what proteins do. Unfortunately, it is not possible to conduct experiments for all proteins in all organisms. We know that many proteins (genes) are shared between species (were present in the last common ancestor) and we even have evidence that they still perform the same function. Generalizing from this, we figure that if proteins are much more similar to each other than chance would allow, they probably have very similar structures and perform very similar (if not identical) functions.\n", + "\n", + "People have used sequence similarity to do this with great success. In fact, in the year of our Lord 2022, when working with a non-model species it is standard procedure to use a sequence similarity search against well-annotated proteomes (such as human/fly/mouse/worm) and transfer the functional annotations where the sequence similarity is above a certain threshold. Sadly, the model species represent a very narrow subset of animal life, let alone life in general. When working with more exotic species, the portion of genes that are annotated this way is rather low.\n", + "\n", + "Consider the _Spongilla lacustris_ proteome:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "788c1f04-d76e-4f1f-afdb-41128cea3ea1", + "metadata": {}, + "outputs": [], + "source": [ + "# Multiple sequence alignments\n", + "sequence_info = pd.read_csv('../data/sequence_info.csv', index_col='query')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d60c02b1-661e-465f-b75b-89951bc35fa7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(41943, 29664)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(sequence_info), len(sequence_info['gene_id'].unique())" + ] + }, + { + "cell_type": "markdown", + "id": "3fcc888f-6973-4dda-9a09-41ba92f3d708", + "metadata": {}, + "source": [ + "A reasonable transcriptome assembly has ~42k predicted peptides from ~30k predicted genes. If we use a state-of-the-art sequence-based annotation pipeline like [EggNOG](http://eggnog5.embl.de/), the results are somewhat OK:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "269642b4-43bc-41b5-90b0-0f1637fea9a9", + "metadata": {}, + "outputs": [], + "source": [ + "# sequence-based annotation (emapper)\n", + "eggnog = pd.read_csv('../data/Slacustris_eggnog.tsv', sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "49f32c15-24b4-40f7-890b-f3fdb76fb0cc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(17990, 17394)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(eggnog), np.sum(eggnog['Description'] != \"-\")" + ] + }, + { + "cell_type": "markdown", + "id": "d0835d92-e319-4513-8b00-f6576efb551f", + "metadata": {}, + "source": [ + "Close to 18k (so ~42%) of this transcriptome gets _some_ sort of annotation, and about 41% gets a description. This means that we have some idea about their function (but no specifics) or, at the very least, that these genes could at least be assigned to a group of orthologous genes, even though nothing might be known about them. Think about this: we have no reasonable information about 60% of the _Spongilla_ proteome!" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b68b8c21-ac96-4ef4-8cbb-798be6938f1a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12673" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.sum(eggnog['Preferred_name'] != \"-\")" + ] + }, + { + "cell_type": "markdown", + "id": "f8b183b0-ebf8-4874-977a-f930a4014ab3", + "metadata": {}, + "source": [ + "On the other hand, ~13k (predicted) proteins get a name; this is close to one third (30%) of the proteome. This is less than model species (of course), but par for the course for metazoans (??)\n", + "\n", + "|organism|%described|%named|\n", + "|---|---|---|\n", + "|<span style='background :yellow'>H_sapiens</span>|96.43|94.91|\n", + "|<span style='background :yellow'>M_musculus</span>|96.39|90.48|\n", + "|<span style='background :yellow'>D_rerio</span>|94.02|76.98|\n", + "|<span style='background :yellow'>S_cerevisiae</span>|90.82|59.22|\n", + "|<span style='background :yellow'>A_thaliana</span>|85.36|7.96|\n", + "|<span style='background :yellow'>D_melanogaster</span>|82.55|52.80|\n", + "|<span style='background :yellow'>C_elegans</span>|70.58|31.68|\n", + "|D_discoideum|61.07|2.92|\n", + "|S_lacustris|~41|~30|" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e8e14aad", + "metadata": {}, + "outputs": [], + "source": [ + "# AlphaFold predictions\n", + "alphafold = pd.read_csv('../data/structure_predictions.csv', index_col=0)\n", + "alphafold.set_index('query', inplace=True)\n", + "# FoldSeek-based annotation\n", + "foldseek = pd.read_parquet('../data/structure_annotation.parquet')\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analysis/eggnog_examples.ipynb b/analysis/eggnog_examples.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..367a0f460e59bf147b922dfb10b2fb29694439fc --- /dev/null +++ b/analysis/eggnog_examples.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b9897261-aab3-4f6d-9c3f-43443f9a1800", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-05-11 15:18\n" + ] + } + ], + "source": [ + "from datetime import datetime, timezone\n", + "import pytz\n", + "\n", + "utc_dt = datetime.now(timezone.utc) # UTC time\n", + "dt = utc_dt.astimezone()\n", + "tz = pytz.timezone('Europe/Berlin')\n", + "berlin_now = datetime.now(tz)\n", + "\n", + "print(f'{berlin_now:%Y-%m-%d %H:%M}')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1120af6d-8163-4307-becc-4529bafd0299", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "23c1a35a-4478-4fe3-871c-99324337d571", + "metadata": {}, + "outputs": [], + "source": [ + "eggnog = '/g/arendt/npapadop/data/spongfold_publish/eggnog/'\n", + "data = {}\n", + "for f in os.listdir(eggnog):\n", + " name = '_'.join(f.split('_')[:2])\n", + " data[name] = pd.read_csv(eggnog + f, sep='\\t', skiprows=4, skipfooter=3, engine='python')" + ] + }, + { + "cell_type": "markdown", + "id": "bd199201-ec2a-45a4-8c5c-3243232b4f48", + "metadata": {}, + "source": [ + "Protein count of Uniprot reference proteome. Used as input for emapper." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "357fbc79-0705-4fb1-963d-fbef77d3a19d", + "metadata": {}, + "outputs": [], + "source": [ + "proteomes = {'A_thaliana': 27473,\n", + " 'S_cerevisiae': 6059,\n", + " 'D_discoideum': 12727,\n", + " 'M_musculus': 21985,\n", + " 'D_rerio': 25707,\n", + " 'C_elegans': 19818,\n", + " 'D_melanogaster': 13821,\n", + " 'H_sapiens': 20577\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "55d83c30-b075-4337-be13-2838a53ea3bc", + "metadata": {}, + "outputs": [], + "source": [ + "def annotation_quality(df):\n", + " \"\"\"\n", + " Given an EggNOG output dataframe, it will count how many entries have a description and a name.\n", + " \"\"\"\n", + " has_desc = np.sum(df['Description'] != '-')\n", + " has_name = np.sum(df['Preferred_name'] != '-')\n", + " return has_desc, has_name" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fbdbf025-dcc2-4566-a287-80dcd25ed69d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A_thaliana\t85.36%\t7.96%\n", + "S_cerevisiae\t90.82%\t59.22%\n", + "D_discoideum\t61.07%\t2.92%\n", + "M_musculus\t96.39%\t90.48%\n", + "D_rerio\t94.02%\t76.98%\n", + "C_elegans\t70.58%\t31.68%\n", + "D_melanogaster\t82.55%\t52.80%\n", + "H_sapiens\t96.43%\t94.91%\n" + ] + } + ], + "source": [ + "for organism in proteomes.keys():\n", + " n_genes = proteomes[organism]\n", + " described, named = annotation_quality(data[organism])\n", + " print(f'{organism}\\t{described / n_genes * 100:.2f}%\\t{named / n_genes * 100:.2f}%')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e57869a7-9753-46c6-835e-62c5dfa661ff", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analysis/preprocess.ipynb b/analysis/preprocess.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e405334eaa66415d9a57e0e93ec4260ba62cc002 --- /dev/null +++ b/analysis/preprocess.ipynb @@ -0,0 +1,5747 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "a077b63c-aeae-450f-b1a6-e5c714332be8", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "26794124-d6be-46a6-8766-778853af4586", + "metadata": {}, + "outputs": [], + "source": [ + "# this column will hold NaNs later so convert it to Int64, which can deal with nulls.\n", + "sequence_info[\"protein_id\"] = sequence_info[\"protein_id\"].astype(\"Int64\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "96226044-425e-4170-86d8-140594c6d2b6", + "metadata": {}, + "outputs": [], + "source": [ + "pdb = pd.read_parquet(fs_pdb)\n", + "afdb = pd.read_parquet(fs_afdb)\n", + "swp = pd.read_parquet(fs_swp)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b8869935-e18a-454c-8b30-303fdf158e78", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(41619,)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "afdb['query'].unique().shape" + ] + }, + { + "cell_type": "markdown", + "id": "aacd0206-2017-4963-832a-8bea033e6417", + "metadata": {}, + "source": [ + "this means that there are ~300 proteins that get no FoldSeek hits, either because they have no predicted structure or because there was just no match for them in the databases. --> Look into!" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "179e7156-2552-4bf6-8545-64cf0c3110a3", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_unannotated(df):\n", + " no_desc = df['Description'] == \"-\"\n", + " no_func = df['Function [CC]'].isnull()\n", + " no_annot = no_desc & no_func\n", + " return df[~no_annot]\n", + "\n", + "def best_bit_score(df):\n", + " have_max = df['bit score'] == np.max(df['bit score'])\n", + " max_ali = df[have_max]['alignment length'] == np.max(df[have_max]['alignment length'])\n", + " return df[have_max][max_ali].index.values[0]\n", + "\n", + "def keep_best_annotated(df):\n", + " df = remove_unannotated(df)\n", + " idx = df.groupby('query').apply(best_bit_score)\n", + " res = df.loc[idx].copy()\n", + " return res\n", + "\n", + "def tag_origin(df, prefix):\n", + " df.columns = prefix + \" \" + df.columns\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "72712260-4726-4107-a97b-c8b901cd97a0", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_best = keep_best_annotated(pdb)\n", + "pdb_best['origin'] = 'PDB'\n", + "afdb_best = keep_best_annotated(afdb)\n", + "afdb_best['origin'] = \"AFDB\"\n", + "swp_best = keep_best_annotated(swp)\n", + "swp_best['origin'] = 'SwissProt'" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "2c769295-4889-4f01-96e0-7b0ce401ec0c", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_best.to_parquet('../data/fs_best_pdb.parquet')\n", + "afdb_best.to_parquet('../data/fs_best_afdb.parquet')\n", + "swp_best.to_parquet('../data/fs_best_swp.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9e087051-1fb3-4309-ae1d-bf0b79695d86", + "metadata": {}, + "outputs": [], + "source": [ + "pdb = pd.read_parquet('../data/fs_best_pdb.parquet')\n", + "afdb = pd.read_parquet('../data/fs_best_afdb.parquet')\n", + "swp = pd.read_parquet('../data/fs_best_swp.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "75703e76-6fc8-4c80-a088-437031e7f863", + "metadata": {}, + "outputs": [], + "source": [ + "best = pd.concat([pdb, afdb, swp])\n", + "best = best.sort_values('bit score', ascending=False).drop_duplicates(['query'])\n", + "best.set_index('query', inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d005fa61-5a94-471c-a00e-1c596aaefcfc", + "metadata": {}, + "outputs": [], + "source": [ + "structural_annotation = sequence_info.join(best).join(alphafold)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b79c93ce-2af4-473f-9607-a3b5177c98be", + "metadata": {}, + "outputs": [], + "source": [ + "structural_annotation.to_parquet('../data/structure_annotation.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6a778a8-2f3f-4b19-8ca5-1a7aab5739f1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7600e81-3807-4446-af22-27b23b16ba2c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe2cb947-965c-43db-9b77-8c3ddf274335", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45fcb9e0-b88f-42bc-aeda-a736393a000d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10aa1498-0863-4590-9650-3babc6b7ec8f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "51c84479-ccaf-4d51-8600-9b2b61a85718", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-05-09 07:34:32.598829+02:00\n" + ] + } + ], + "source": [ + "from datetime import datetime, timezone\n", + "import pytz\n", + "\n", + "utc_dt = datetime.now(timezone.utc) # UTC time\n", + "dt = utc_dt.astimezone()\n", + "tz = pytz.timezone('Europe/Berlin')\n", + "berlin_now = datetime.now(tz)\n", + "print(berlin_now)" + ] + }, + { + "cell_type": "markdown", + "id": "5b9f35aa-76b1-488f-b8b8-18f44d966fab", + "metadata": {}, + "source": [ + "# Expanding functional annotation of a non-model species using predicted protein structure similarity\n", + "\n", + "***WARNING:*** The starting point of this analysis is a **predicted proteome** produced by the Trinity <i>de-novo</i> assembler. We produced multiple sequence alignments for each **isoform** with MMSeqs2 against the ColabFoldDB and UniRef30 (see the [ColabFold website](https://colabfold.mmseqs.com)); the MSAs were used as input for AlphaFold; the predicted protein structures were queried by FoldSeek against PDB, SwissProt, and AlphaFoldDB. The same sequences were queried by EggNOG. However, ultimately the analysis we conduct here is going to be on the **gene level**; we will therefore summarize results on the isoform level, either by concatenating them or by keeping the best-scoring isoform.\n", + "\n", + "This notebook gathers the preparatory steps for the analysis. This is the following:\n", + "\n", + "1. **Parse MSA files** and extract MSA size, query length, _Spongilla_ transcript name\n", + "2. **Parse AlphaFold predictions** \n", + " 2a. extract average pLDDT score per prediction \n", + " 2b. keep best-scoring isoform per gene \n", + "3. **Read and filter FoldSeek predictions**. For each database (PDB, SwissProt, AlphaFoldDB) we will only keep the best hit per gene.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "We are going to use\n", + "* `numpy` and `pandas` for I/O and data wrangling;\n", + "* `matplotlib` and `seaborn` for visualisation;\n", + "* `tqdm` for progress bars;\n", + "* `glob`, `os` for file navigation;\n", + "* `json` to parse some AlphaFold output." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b713a10d", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import glob\n", + "import os\n", + "\n", + "import urllib.parse\n", + "import urllib.request\n", + "\n", + "from tqdm import tqdm\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib_venn import venn2, venn3\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "id": "ef88bd0e-d746-4106-b285-6b24e4876e00", + "metadata": {}, + "source": [ + "## 0. Setup\n", + "\n", + "We are going to gather the input here; we need\n", + "\n", + "* input MSAs for _Spongilla lacustris_ proteins (actually Transdecoder-predicted peptides). These were the input to AlphaFold.\n", + "* AlphaFold predicted structures; we obtain a list of structures (input to FoldSeek) as well as auxiliary files with e.g. prediction quality scores.\n", + "* `foldseek` search results for the _Spongilla_ predicted structures against three target databases: AlphaFoldDB, PDB, and SwissProt." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e8e14aad", + "metadata": {}, + "outputs": [], + "source": [ + "# MSA location\n", + "msas = \"/scratch/npapadop/msas/\"\n", + "# FOLDSEEK scores\n", + "fs_pdb = \"/scratch/npapadop/foldseek_results/pdb_score.tsv\"\n", + "fs_afdb = \"/scratch/npapadop/foldseek_results/afdb_score.tsv\"\n", + "fs_swp = \"/scratch/npapadop/foldseek_results/swissprot_score.tsv\"\n", + "# AlphaFold predictions\n", + "structure_list = \"/g/arendt/npapadop/data/spongilla_af/best_models\"\n", + "metadata = \"/g/arendt/npapadop/data/spongilla_af/best_model_metadata/\"" + ] + }, + { + "cell_type": "markdown", + "id": "de3a52aa-8ecc-44bd-9002-7f1412200dc9", + "metadata": {}, + "source": [ + "## 1. Multiple sequence alignments\n", + "\n", + "First read the MSAs and extract the number of sequences in each as well as the sequence length and the _Spongilla_ transcript name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f300273-15c7-41f3-acb3-658129284f25", + "metadata": {}, + "outputs": [], + "source": [ + "N = len(glob.glob(metadata+\"*.a3m\"))\n", + "seq_id = [\"\"] * N\n", + "no_seqs = [0] * N\n", + "seq_length = [0] * N\n", + "gene_name = [\"\"] * N\n", + "\n", + "for i, alignment in enumerate(tqdm(glob.glob(metadata+\"*.a3m\"))):\n", + " try:\n", + " with open(alignment, \"r\") as f:\n", + " seq_id[i] = alignment.split(\"/\")[-1].split(\".\")[0]\n", + " lines = f.readlines()\n", + " no_seqs[i] = (len(lines) - 3) / 2\n", + " seq_length[i] = lines[0].rstrip()[1:].split()[0]\n", + " gene_name[i] = lines[1].rstrip()[1:]\n", + " # print(no_seqs, seq_length, gene_name)\n", + " except FileNotFoundError:\n", + " continue\n", + "\n", + "sequence_info = pd.DataFrame({\"query\": seq_id, \"MSA size\": no_seqs, \"query length\": seq_length, \"gene name\": gene_name})\n", + "sequence_info.set_index(\"query\", inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e946268-f439-4203-a815-48a5384ccf31", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sequence_info[\"protein_id\"] = sequence_info[\"gene name\"].str.split(\".\").str[1]\n", + "sequence_info[\"gene_id\"] = sequence_info[\"gene name\"].str.split(\".\").str[0].str.split(\"_\").str[:2].str.join(\"_\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f1731c0-af6a-4caa-bc1f-b665b59c05c9", + "metadata": {}, + "outputs": [], + "source": [ + "sequence_info.to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/sequence_info.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "c06f7868-dbc8-4760-9c3c-c9f8b6be58ea", + "metadata": {}, + "source": [ + "## 2. AlphaFold predictions\n", + "\n", + "Next, read the per-residue pLDDT score from AlphaFold and average it; then keep the best-scoring isoform per gene ID." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eed723f4-314d-4b34-89dd-fc339fd89505", + "metadata": {}, + "outputs": [], + "source": [ + "N = len(os.listdir(structure_list))\n", + "proteins = [\"\"] * N\n", + "scores = [0.] * N\n", + "\n", + "for i, protein in enumerate(tqdm(os.listdir(structure_list))):\n", + " full_name = protein.split(\".\")[0]\n", + " metadata_loc = metadata + full_name + \"_scores.json\"\n", + " with open(metadata_loc, \"r\") as f:\n", + " score = json.load(f)\n", + " name = full_name.split(\"_\")[0]\n", + " proteins[i] = name\n", + " scores[i] = np.mean(score[\"plddt\"])\n", + "\n", + "alphafold = pd.DataFrame({\"query\": proteins, \"plddt\": scores})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78ef2d0f-3a3b-46a3-ae53-b5093a61a5f9", + "metadata": {}, + "outputs": [], + "source": [ + "alphafold.to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/structure_predictions.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "f0ed1ef2-591e-4465-abd3-c3de9d7cce3f", + "metadata": {}, + "source": [ + "Read sequence information for _Spongilla_ and the summary of the structure predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "861be87d-8cda-4074-8e9c-aa904f328e2f", + "metadata": {}, + "outputs": [], + "source": [ + "sequence_info = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/sequence_info.csv\", index_col=\"query\")\n", + "alphafold = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/structure_predictions.csv\", index_col=\"query\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0426d007-88f1-4e7e-8177-abc425f8122e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>plddt</th>\n", + " <th>MSA size</th>\n", + " <th>query length</th>\n", + " <th>gene name</th>\n", + " <th>protein_id</th>\n", + " <th>gene_id</th>\n", + " </tr>\n", + " <tr>\n", + " <th>query</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>20236</th>\n", + " <td>98.147179</td>\n", + " <td>7655.0</td>\n", + " <td>78</td>\n", + " <td>c114736_g1_i1_m.91624</td>\n", + " <td>91624</td>\n", + " <td>c114736_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8471</th>\n", + " <td>98.115354</td>\n", + " <td>16675.0</td>\n", + " <td>127</td>\n", + " <td>c103108_g2_i1_m.62395</td>\n", + " <td>62395</td>\n", + " <td>c103108_g2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10371</th>\n", + " <td>98.018750</td>\n", + " <td>2519.0</td>\n", + " <td>352</td>\n", + " <td>c103630_g1_i2_m.67428</td>\n", + " <td>67428</td>\n", + " <td>c103630_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29803</th>\n", + " <td>97.815000</td>\n", + " <td>3461.0</td>\n", + " <td>120</td>\n", + " <td>c91796_g1_i1_m.16975</td>\n", + " <td>16975</td>\n", + " <td>c91796_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20860</th>\n", + " <td>97.811325</td>\n", + " <td>14752.0</td>\n", + " <td>151</td>\n", + " <td>c2715_g1_i1_m.171</td>\n", + " <td>171</td>\n", + " <td>c2715_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26721</th>\n", + " <td>26.866756</td>\n", + " <td>3.0</td>\n", + " <td>712</td>\n", + " <td>c87685_g1_i1_m.11391</td>\n", + " <td>11391</td>\n", + " <td>c87685_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32956</th>\n", + " <td>26.821802</td>\n", + " <td>1.0</td>\n", + " <td>172</td>\n", + " <td>c94712_g2_i1_m.22872</td>\n", + " <td>22872</td>\n", + " <td>c94712_g2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31710</th>\n", + " <td>26.688600</td>\n", + " <td>165.0</td>\n", + " <td>200</td>\n", + " <td>c93707_g1_i1_m.20473</td>\n", + " <td>20473</td>\n", + " <td>c93707_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1202</th>\n", + " <td>26.157910</td>\n", + " <td>2.0</td>\n", + " <td>201</td>\n", + " <td>c100524_g1_i1_m.44523</td>\n", + " <td>44523</td>\n", + " <td>c100524_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1796</th>\n", + " <td>24.582727</td>\n", + " <td>10.0</td>\n", + " <td>297</td>\n", + " <td>c100770_g2_i1_m.45897</td>\n", + " <td>45897</td>\n", + " <td>c100770_g2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>29662 rows × 6 columns</p>\n", + "</div>" + ], + "text/plain": [ + " plddt MSA size query length gene name protein_id \\\n", + "query \n", + "20236 98.147179 7655.0 78 c114736_g1_i1_m.91624 91624 \n", + "8471 98.115354 16675.0 127 c103108_g2_i1_m.62395 62395 \n", + "10371 98.018750 2519.0 352 c103630_g1_i2_m.67428 67428 \n", + "29803 97.815000 3461.0 120 c91796_g1_i1_m.16975 16975 \n", + "20860 97.811325 14752.0 151 c2715_g1_i1_m.171 171 \n", + "... ... ... ... ... ... \n", + "26721 26.866756 3.0 712 c87685_g1_i1_m.11391 11391 \n", + "32956 26.821802 1.0 172 c94712_g2_i1_m.22872 22872 \n", + "31710 26.688600 165.0 200 c93707_g1_i1_m.20473 20473 \n", + "1202 26.157910 2.0 201 c100524_g1_i1_m.44523 44523 \n", + "1796 24.582727 10.0 297 c100770_g2_i1_m.45897 45897 \n", + "\n", + " gene_id \n", + "query \n", + "20236 c114736_g1 \n", + "8471 c103108_g2 \n", + "10371 c103630_g1 \n", + "29803 c91796_g1 \n", + "20860 c2715_g1 \n", + "... ... \n", + "26721 c87685_g1 \n", + "32956 c94712_g2 \n", + "31710 c93707_g1 \n", + "1202 c100524_g1 \n", + "1796 c100770_g2 \n", + "\n", + "[29662 rows x 6 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alphafold" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7efb172b-6adb-4f50-84f7-bb63adecd3c9", + "metadata": {}, + "outputs": [], + "source": [ + "# this column will hold NaNs later so convert it to Int64, which can deal with nulls.\n", + "sequence_info[\"protein_id\"] = sequence_info[\"protein_id\"].astype(\"Int64\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2ad95435-9023-4d97-a0ce-cd7612e57a66", + "metadata": {}, + "outputs": [], + "source": [ + "pdb = read_foldseek(fs_pdb)\n", + "pdb[\"query\"] = pdb[\"query\"].values.astype(int)\n", + "afdb = read_foldseek(fs_afdb)\n", + "afdb[\"query\"] = afdb[\"query\"].values.astype(int)\n", + "swp = read_foldseek(fs_swp)\n", + "swp[\"query\"] = swp[\"query\"].values.astype(int)" + ] + }, + { + "cell_type": "markdown", + "id": "4159e6d3-f586-4ffc-a8fa-bc2a8de343df", + "metadata": {}, + "source": [ + "Convert AFDB identifiers to UniProt identifiers" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "499f01a6-91c8-43d8-bb53-1bd86c3c6823", + "metadata": {}, + "outputs": [], + "source": [ + "afdb[\"uniprot\"] = afdb[\"target\"].str.split(\"-\").str[1]\n", + "swp[\"uniprot\"] = swp[\"target\"].str.split(\"-\").str[1]" + ] + }, + { + "cell_type": "markdown", + "id": "e95086d4-191d-4785-bd09-332ef87fb28a", + "metadata": {}, + "source": [ + "Translate PDB IDs to UniProt" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e21be25d-cc9b-43a7-9acb-5da985fbb554", + "metadata": {}, + "outputs": [], + "source": [ + "def get_from_uniprot(df, column, uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\", request_size=40000):\n", + " ids = df[column][~df[column].isnull()]\n", + " url = 'https://www.uniprot.org/uploadlists/'\n", + "\n", + " params = {\n", + " 'from': uniprot_from,\n", + " 'to': uniprot_to,\n", + " 'format': 'tab',\n", + " 'query': \" \".join(ids.unique())\n", + " }\n", + "\n", + " data = urllib.parse.urlencode(params)\n", + " data = data.encode('utf-8')\n", + " req = urllib.request.Request(url, data)\n", + " with urllib.request.urlopen(req) as f:\n", + " response = f.read()\n", + " return response\n", + "\n", + "def create_id_map(response, id_in, id_out):\n", + " map_from = []\n", + " map_to = []\n", + " for row in tqdm(response.decode(\"utf-8\").split(\"\\n\")[:-1]):\n", + " col1, col2 = row.split(\"\\t\")\n", + " map_from.append(col1)\n", + " map_to.append(col2)\n", + "\n", + " res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})\n", + " res = pd.DataFrame(res.groupby(id_in)[id_out].apply(\",\".join))\n", + "# res.set_index(id_in, inplace=True)\n", + " return res\n", + "\n", + "def enrich_from_uniprot(df, column_from, column_to, uniprot_from=\"PDB_ID\", uniprot_to=\"ACC\"):\n", + " response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)\n", + " df_map = create_id_map(response, column_from, column_to)\n", + " return df.join(df_map, on=column_from)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "47c101a2-47b5-40db-9693-f6af15031437", + "metadata": {}, + "outputs": [], + "source": [ + "df = afdb[:1000]\n", + "column=\"uniprot\"\n", + "ids = df[column][~df[column].isnull()]\n", + "url = 'https://www.uniprot.org/uploadlists/'" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c31956e6-ed67-4be5-8faf-f7c0ff4d7c58", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(998,)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ids.unique().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b3e5038f-c7c9-4cef-961a-4e5682236f1c", + "metadata": {}, + "outputs": [], + "source": [ + "request_size = 400\n", + "unique_ids = ids.unique()\n", + "steps = unique_ids.shape[0] // request_size + 1\n", + "response = [[]] * steps" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "507753ca-7ef1-4e82-be1d-bbc23d7ec553", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3/3 [00:53<00:00, 17.97s/it]\n" + ] + } + ], + "source": [ + "for i in tqdm(range(steps)):\n", + " params = {\n", + " 'from': \"ACC\",\n", + " 'to': \"GENENAME\",\n", + " 'format': 'tab',\n", + " 'query': \" \".join(unique_ids[(i*request_size):((i+1)*request_size)])\n", + " }\n", + "\n", + " data = urllib.parse.urlencode(params)\n", + " data = data.encode('utf-8')\n", + " req = urllib.request.Request(url, data)\n", + " with urllib.request.urlopen(req) as f:\n", + " response[i] = f.read()" + ] + }, + { + "cell_type": "markdown", + "id": "f50296d5-ac0f-48fc-80ef-85b952b843f4", + "metadata": {}, + "source": [ + "First translate PDB IDs to UniProt IDs:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "0acd0a47-df91-4ffd-9e8d-bc8cb8882005", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 142581/142581 [00:00<00:00, 1929888.50it/s]\n" + ] + } + ], + "source": [ + "pdb = enrich_from_uniprot(pdb, \"target\", \"uniprot\", uniprot_from=\"PDB_ID\", uniprot_to=\"ACC\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "07e0cb24-97e1-49d5-8c2f-1f757d5b2a5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "418323" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb[\"uniprot\"].str.contains(\",\").sum()" + ] + }, + { + "cell_type": "markdown", + "id": "3f03690e-ac94-4502-b372-f77da44ba6f3", + "metadata": {}, + "source": [ + "418323 rows in the pdb file have multiple entrances in the uniprot column, separated by a comma. I will separate them and duplicate the rest of the rows." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "6233210d-ba4b-4ca3-afb4-1b4203550298", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pdb = pdb.drop('uniprot', axis=1).join(pdb['uniprot'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('uniprot'))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "574b2698-4bcc-4c9d-8db8-61e327362d3e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb[\"uniprot\"].str.contains(\",\").sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "8525155a-ffdc-47e4-8bb6-ce720f0dbc49", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4600922" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(pdb)" + ] + }, + { + "cell_type": "markdown", + "id": "7cc67429-33c7-4596-b269-7d6f71aed6f4", + "metadata": {}, + "source": [ + "I will try and use the tool UNIMAPI to retrieve information from the UniProtIDs, including the sequences that we will need for a new emapper run. \n", + "UNIMAPI takes a csv file as an imput. I will export those from the foldseek result files as csv files.\n", + "\n", + "I just realized that I need to add the commas and make it a one-liner. Because I don't know how to do this in Python, I will just use regex and safe the files as afdb_uniprotIDs.txt, etc. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0c4592d-1465-4ff8-bf7a-b20ac08b2586", + "metadata": {}, + "outputs": [], + "source": [ + "afdb[\"uniprot\"].to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/afdb_uniprotIDs.csv\", index=False, index_label=False, header=False, sep=\",\")\n", + "pdb[\"uniprot\"].to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/pdb_uniprotIDs.csv\", index=False, index_label=False, header=False, sep=\",\")\n", + "swp[\"uniprot\"].to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/swp_uniprotIDs.csv\", index=False, index_label=False, header=False, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c4f36d35-8a75-4577-a948-15f9d6a76945", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Q8WZ42 13900\n", + "Q99996 3626\n", + "Q8WXI7 3612\n", + "Q9Y6V0 3004\n", + "Q15149 2975\n", + " ... \n", + "A0A1P6CI10 1\n", + "K7LJC4 1\n", + "K7LPC1 1\n", + "A0A0N7KCM8 1\n", + "P71601 1\n", + "Name: uniprot, Length: 524585, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "afdb.uniprot.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "33436cf6-558d-42a1-9e3e-25f2c86d9e0f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "E3WDQ9 2605\n", + "Q5RC05 2472\n", + "Q5M7C3 2382\n", + "P59597 2346\n", + "P02467 2107\n", + " ... \n", + "Q6LN45 1\n", + "Q56647 1\n", + "B1JQI1 1\n", + "C5FM58 1\n", + "G3KIM4 1\n", + "Name: uniprot, Length: 405846, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "swp.uniprot.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a84a9095-e6ba-4902-b7b0-192005816f5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "P01116 18010\n", + "P0CX51 10846\n", + "O13516 10730\n", + "P05756 10359\n", + "Q3E7X9 10323\n", + " ... \n", + "P80379 1\n", + "Q56691 1\n", + "Q7SIH1 1\n", + "Q63041 1\n", + "Q484B6 1\n", + "Name: uniprot, Length: 35260, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb.uniprot.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "49605efa-c6ef-4f62-8c65-9c5fb87e07b8", + "metadata": { + "tags": [] + }, + "source": [ + "afdb contains 524585 unique uniprot IDs. \n", + "swp contains 405846 unique uniprot IDs.\n", + "pdb contains 35260 unique uniprot IDs.\n", + "\n", + "```\n", + "tr -d '\\n' < afdb_uniprotIDs.txt > afdb_uniprotIDs_oneline.txt\n", + "tr -d '\\n' < pdb_uniprotIDs.txt > pdb_uniprotIDs_oneline.txt\n", + "tr -d '\\n' < swp_uniprotIDs.txt > swp_uniprotIDs_oneline.txt\n", + "```\n", + "\n", + "For the pdb oneliner file, I additionally had to remove those thingies -> \" because sometimes a pdb entrance has multiple uniprot ids. So in principle I had to delete the cases which didn't have uniprot ids (\"\",) and afterwards delete the empty lines. After that I had to delete leftover quotation marks. \n", + "```\n", + "tr -d '\"' < pdb_uniprotIDs_oneline.txt > pdb_uniprotIDs_oneline.txt\n", + "```\n", + "\n", + "After that, I can run the command for each file using terminal, retrieving specificly picked information.\n", + "\n", + "First, I will make use of the --fasta option, that will just return a fasta file for the sequences. This fasta file can then be used to run EggNOG mapper manually.\n", + "\n", + "```\n", + "upimapi.py -i afdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/afdb/ --fasta\n", + "upimapi.py -i pdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/pdb/ --fasta\n", + "upimapi.py -i swp_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/swp/ --fasta\n", + "```\n", + "\n", + "Fasta files are saved in the respective folders as: uniprotinfo.fasta\n", + "\n", + "After UPIMAPI is translating UniprotID to fasta, it tries to check if all uniprotIDs have a corresponding fasta sequence. However, while the translation happens using the unique UniprotIDs, for some reason it checks if all UniprotIDs (in the case of afdb 12.5 million) are represented. I got a timeout after 24 hours for afdb and swp. Only pdb ran though. I will check how many \">\" are in the fasta files:\n", + "\n", + "* afdb: 525613\n", + "* swp: 405935\n", + "* pdb: 35280\n", + "\n", + "How can there be more fasta entrances than unique uniprotIDs...?\n", + "\n", + "For the pdb fasta I could immediatelly run e-mapper. For afdb and swp I had to divide the fasta files into 100000 entries files (maximum input for emapper). I did this using the script fasta splitter (http://kirill-kryukov.com/study/tools/fasta-splitter/) with the following commands:\n", + "```\n", + "perl fasta-splitter.pl --part-size 100000 ./afdb/uniprotinfo.fasta --nopad --measure count --out-dir ./afdb/fasta_split/\n", + "perl fasta-splitter.pl --part-size 100000 ./swp/uniprotinfo.fasta --nopad --measure count --out-dir ./swp/fasta_split/\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "5e43cbad-cc59-4814-8ab6-bf1bae920a3e", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_emapper = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/pdb/emapper/pdb_emapper.tsv\", sep='\\t', skiprows=4, skipfooter=3, engine='python')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "747a5c72-a19b-4afe-ae1c-b896c298d7ee", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_emapper[\"uniprot\"] = pdb_emapper[\"#query\"].str.split(\"|\").str[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "e3a291be-38e8-495e-b455-72b0770631fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "P62238 1\n", + "Q9Y324 1\n", + "D5DKI8 1\n", + "Q6TEK8 1\n", + "P04157 1\n", + " ..\n", + "P04181 1\n", + "A0A1H6Q8Z5 1\n", + "P16038 1\n", + "Q4JB24 1\n", + "P38326 1\n", + "Name: uniprot, Length: 34551, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_emapper.uniprot.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "50150ed8-4d19-4cad-9a7c-5528e3f622f2", + "metadata": {}, + "source": [ + "Although I have 35280 entries in the fasta file, emapper only scans though 34554 (tail of emapper file). Unique uniprot IDs in the pdb emapper files are then 34551. The question is where the other 700 entries went..." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c522cd0d-704a-486f-a528-a31cf09315de", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_merge = pd.merge(pdb, pdb_emapper, on=\"uniprot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "7387e9ab-8037-4a6e-8a9a-ec18166247d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "P01116 18010\n", + "P0CX51 10846\n", + "O13516 10730\n", + "P05756 10359\n", + "Q3E7X9 10323\n", + " ... \n", + "A0KKT0 1\n", + "Q06672 1\n", + "P86179 1\n", + "A0A090BWT0 1\n", + "Q484B6 1\n", + "Name: uniprot, Length: 34551, dtype: int64" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_merge.uniprot.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "08775c42-6ab6-4ab0-8104-e9848d8029f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4500383" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(pdb_merge)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "0f0bd0c2-79de-44cf-95ec-3eeb7687be46", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['query',\n", + " 'target',\n", + " 'seq. id.',\n", + " 'alignment length',\n", + " 'no. mismatches',\n", + " 'no. gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end',\n", + " 'e value',\n", + " 'bit score',\n", + " 'uniprot',\n", + " '#query',\n", + " 'seed_ortholog',\n", + " 'evalue',\n", + " 'score',\n", + " 'eggNOG_OGs',\n", + " 'max_annot_lvl',\n", + " 'COG_category',\n", + " 'Description',\n", + " 'Preferred_name',\n", + " 'GOs',\n", + " 'EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction',\n", + " 'PFAMs']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(pdb_merge.columns)" + ] + }, + { + "cell_type": "markdown", + "id": "fe4e7ac9-697d-4a67-a4ec-04ea44f56b48", + "metadata": {}, + "source": [ + "I additionally will run UPIMAPI with pdb, afdb and swp input and retrieving information that is stored in Uniprot.\n", + "\n", + "```\n", + "upimapi.py -i afdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/afdb/ -dbs \"evolutionary genealogy of genes: Non-supervised Orthologous Groups\"\n", + "\n", + "upimapi.py -i pdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/pdb/ -dbs \"evolutionary genealogy of genes: Non-supervised Orthologous Groups\"\n", + "\n", + "upimapi.py -i swp_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/swp/ -dbs \"evolutionary genealogy of genes: Non-supervised Orthologous Groups\"\n", + "```\n", + "\n", + "I will run these on the cluster. The bash scripts can be found in the respective folders." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7dbb6daf-fbf9-46bf-9f4b-1ef6805ec2e4", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_uniprot_info = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/pdb/pdb_upimapi_mapping_eggnog.tsv\", sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "cc007880-726c-457a-b078-e43d251eebc1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pdb_uniprot_info.rename(columns={'Entry': 'uniprot'}, inplace=True)\n", + "pdb_uniprot_info = pdb_uniprot_info[[\"uniprot\", \"Entry name\", \"Gene names\", \"Function [CC]\", \"Taxonomic lineage (PHYLUM)\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "f17d7bc1-a6c4-4bd6-85f1-a95fb134901a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>uniprot</th>\n", + " <th>Entry name</th>\n", + " <th>Gene names</th>\n", + " <th>Function [CC]</th>\n", + " <th>Taxonomic lineage (PHYLUM)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>P21553</td>\n", + " <td>CISY_THEAC</td>\n", + " <td>gltA Ta0169</td>\n", + " <td>NaN</td>\n", + " <td>Candidatus Thermoplasmatota</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Q61179</td>\n", + " <td>IRF9_MOUSE</td>\n", + " <td>Irf9 Isgf3g</td>\n", + " <td>FUNCTION: Transcription factor that plays an e...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>O34873</td>\n", + " <td>HMGCL_BACSU</td>\n", + " <td>yngG BSU18230</td>\n", + " <td>FUNCTION: Involved in the catabolism of branch...</td>\n", + " <td>Firmicutes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Q9BU02</td>\n", + " <td>THTPA_HUMAN</td>\n", + " <td>THTPA</td>\n", + " <td>FUNCTION: Hydrolase highly specific for thiami...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Q9LLQ2</td>\n", + " <td>P102B_LUPLU</td>\n", + " <td>PR10.2B</td>\n", + " <td>FUNCTION: Class II ribonuclease (RNase) (By si...</td>\n", + " <td>Streptophyta</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35255</th>\n", + " <td>Q7JZW2</td>\n", + " <td>Q7JZW2_DROME</td>\n", + " <td>RpS15 anon-EST:Posey137 anon-EST:Posey185 Dmel...</td>\n", + " <td>NaN</td>\n", + " <td>Arthropoda</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35256</th>\n", + " <td>Q9HW09</td>\n", + " <td>PANE_PSEAE</td>\n", + " <td>panE PA4397</td>\n", + " <td>FUNCTION: Catalyzes the NADPH-dependent reduct...</td>\n", + " <td>Proteobacteria</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35257</th>\n", + " <td>F2Z508</td>\n", + " <td>F2Z508_PIG</td>\n", + " <td>STMN4</td>\n", + " <td>NaN</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35258</th>\n", + " <td>Q8ZM82</td>\n", + " <td>IDI_SALTY</td>\n", + " <td>idi STM3039</td>\n", + " <td>FUNCTION: Catalyzes the 1,3-allylic rearrangem...</td>\n", + " <td>Proteobacteria</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35259</th>\n", + " <td>P12873</td>\n", + " <td>RL29_BACSU</td>\n", + " <td>rpmC BSU01240</td>\n", + " <td>NaN</td>\n", + " <td>Firmicutes</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>35260 rows × 5 columns</p>\n", + "</div>" + ], + "text/plain": [ + " uniprot Entry name \\\n", + "0 P21553 CISY_THEAC \n", + "1 Q61179 IRF9_MOUSE \n", + "2 O34873 HMGCL_BACSU \n", + "3 Q9BU02 THTPA_HUMAN \n", + "4 Q9LLQ2 P102B_LUPLU \n", + "... ... ... \n", + "35255 Q7JZW2 Q7JZW2_DROME \n", + "35256 Q9HW09 PANE_PSEAE \n", + "35257 F2Z508 F2Z508_PIG \n", + "35258 Q8ZM82 IDI_SALTY \n", + "35259 P12873 RL29_BACSU \n", + "\n", + " Gene names \\\n", + "0 gltA Ta0169 \n", + "1 Irf9 Isgf3g \n", + "2 yngG BSU18230 \n", + "3 THTPA \n", + "4 PR10.2B \n", + "... ... \n", + "35255 RpS15 anon-EST:Posey137 anon-EST:Posey185 Dmel... \n", + "35256 panE PA4397 \n", + "35257 STMN4 \n", + "35258 idi STM3039 \n", + "35259 rpmC BSU01240 \n", + "\n", + " Function [CC] \\\n", + "0 NaN \n", + "1 FUNCTION: Transcription factor that plays an e... \n", + "2 FUNCTION: Involved in the catabolism of branch... \n", + "3 FUNCTION: Hydrolase highly specific for thiami... \n", + "4 FUNCTION: Class II ribonuclease (RNase) (By si... \n", + "... ... \n", + "35255 NaN \n", + "35256 FUNCTION: Catalyzes the NADPH-dependent reduct... \n", + "35257 NaN \n", + "35258 FUNCTION: Catalyzes the 1,3-allylic rearrangem... \n", + "35259 NaN \n", + "\n", + " Taxonomic lineage (PHYLUM) \n", + "0 Candidatus Thermoplasmatota \n", + "1 Chordata \n", + "2 Firmicutes \n", + "3 Chordata \n", + "4 Streptophyta \n", + "... ... \n", + "35255 Arthropoda \n", + "35256 Proteobacteria \n", + "35257 Chordata \n", + "35258 Proteobacteria \n", + "35259 Firmicutes \n", + "\n", + "[35260 rows x 5 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_uniprot_info" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "afb95431-1ec2-491d-83cf-b288322e7fd6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pdb_merge = pd.merge(pdb_merge, pdb_uniprot_info, on=\"uniprot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "cc0ab000-40e8-423a-8d75-6c3b59753150", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "P01116 18010\n", + "P0CX51 10846\n", + "O13516 10730\n", + "P05756 10359\n", + "Q3E7X9 10323\n", + " ... \n", + "A0KKT0 1\n", + "Q06672 1\n", + "P86179 1\n", + "A0A090BWT0 1\n", + "Q484B6 1\n", + "Name: uniprot, Length: 34551, dtype: int64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_merge.uniprot.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "8560bfda-d99c-463f-8ed5-f834449d252f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['query',\n", + " 'target',\n", + " 'seq. id.',\n", + " 'alignment length',\n", + " 'no. mismatches',\n", + " 'no. gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end',\n", + " 'e value',\n", + " 'bit score',\n", + " 'uniprot',\n", + " '#query',\n", + " 'seed_ortholog',\n", + " 'evalue',\n", + " 'score',\n", + " 'eggNOG_OGs',\n", + " 'max_annot_lvl',\n", + " 'COG_category',\n", + " 'Description',\n", + " 'Preferred_name',\n", + " 'GOs',\n", + " 'EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction',\n", + " 'PFAMs',\n", + " 'Entry name',\n", + " 'Gene names',\n", + " 'Function [CC]',\n", + " 'Taxonomic lineage (PHYLUM)']" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(pdb_merge.columns)" + ] + }, + { + "cell_type": "markdown", + "id": "09ae8063-9c39-4654-8042-031148dca85d", + "metadata": {}, + "source": [ + "This list seems quiet confusing but in general it now contains the following information:\n", + " - Foldseek query number of that protein and target. The next columns all have to do with foldseek alignment quality, including evalue and bit score.\n", + " - uniprot ID of foldseek pdb, translated with the uniprot API.\n", + " - \"#query\" is the long query name of the fasta that UPIMAPI pulled out from uniprot using the uniprot ID. All columns until PFAM stem from the eggnog search using the fasta file.\n", + " - Entry name, gene names, Function and Taxonomic lineage (PHYLUM) additionally stem from UPIMAPI retrieving additional infomration from uniprot." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "3d1e3600-dbe4-4185-86b1-fd439e04ff1c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>query</th>\n", + " <th>target</th>\n", + " <th>seq. id.</th>\n", + " <th>alignment length</th>\n", + " <th>no. mismatches</th>\n", + " <th>no. gap open</th>\n", + " <th>query start</th>\n", + " <th>target start</th>\n", + " <th>query end</th>\n", + " <th>target end</th>\n", + " <th>...</th>\n", + " <th>KEGG_rclass</th>\n", + " <th>BRITE</th>\n", + " <th>KEGG_TC</th>\n", + " <th>CAZy</th>\n", + " <th>BiGG_Reaction</th>\n", + " <th>PFAMs</th>\n", + " <th>Entry name</th>\n", + " <th>Gene names</th>\n", + " <th>Function [CC]</th>\n", + " <th>Taxonomic lineage (PHYLUM)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>4731</td>\n", + " <td>5htk</td>\n", + " <td>0.601</td>\n", + " <td>251</td>\n", + " <td>97</td>\n", + " <td>3</td>\n", + " <td>5</td>\n", + " <td>254</td>\n", + " <td>176</td>\n", + " <td>424</td>\n", + " <td>...</td>\n", + " <td>RC00152</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>6PF2K,His_Phos_1</td>\n", + " <td>F262_HUMAN</td>\n", + " <td>PFKFB2</td>\n", + " <td>FUNCTION: Synthesis and degradation of fructos...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4731</td>\n", + " <td>5htk</td>\n", + " <td>0.629</td>\n", + " <td>240</td>\n", + " <td>87</td>\n", + " <td>2</td>\n", + " <td>9</td>\n", + " <td>248</td>\n", + " <td>181</td>\n", + " <td>418</td>\n", + " <td>...</td>\n", + " <td>RC00152</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>6PF2K,His_Phos_1</td>\n", + " <td>F262_HUMAN</td>\n", + " <td>PFKFB2</td>\n", + " <td>FUNCTION: Synthesis and degradation of fructos...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>3214</td>\n", + " <td>5htk</td>\n", + " <td>0.127</td>\n", + " <td>212</td>\n", + " <td>128</td>\n", + " <td>12</td>\n", + " <td>135</td>\n", + " <td>293</td>\n", + " <td>1</td>\n", + " <td>208</td>\n", + " <td>...</td>\n", + " <td>RC00152</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>6PF2K,His_Phos_1</td>\n", + " <td>F262_HUMAN</td>\n", + " <td>PFKFB2</td>\n", + " <td>FUNCTION: Synthesis and degradation of fructos...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3214</td>\n", + " <td>5htk</td>\n", + " <td>0.132</td>\n", + " <td>212</td>\n", + " <td>127</td>\n", + " <td>12</td>\n", + " <td>135</td>\n", + " <td>293</td>\n", + " <td>1</td>\n", + " <td>208</td>\n", + " <td>...</td>\n", + " <td>RC00152</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>6PF2K,His_Phos_1</td>\n", + " <td>F262_HUMAN</td>\n", + " <td>PFKFB2</td>\n", + " <td>FUNCTION: Synthesis and degradation of fructos...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>3311</td>\n", + " <td>5htk</td>\n", + " <td>0.180</td>\n", + " <td>211</td>\n", + " <td>112</td>\n", + " <td>11</td>\n", + " <td>9</td>\n", + " <td>182</td>\n", + " <td>3</td>\n", + " <td>189</td>\n", + " <td>...</td>\n", + " <td>RC00152</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>6PF2K,His_Phos_1</td>\n", + " <td>F262_HUMAN</td>\n", + " <td>PFKFB2</td>\n", + " <td>FUNCTION: Synthesis and degradation of fructos...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4500378</th>\n", + " <td>26834</td>\n", + " <td>4c65</td>\n", + " <td>0.157</td>\n", + " <td>484</td>\n", + " <td>270</td>\n", + " <td>25</td>\n", + " <td>7</td>\n", + " <td>423</td>\n", + " <td>6</td>\n", + " <td>418</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Amidohydro_1</td>\n", + " <td>OTASE_ASPNC</td>\n", + " <td>Am2 An14g02080</td>\n", + " <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n", + " <td>Ascomycota</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4500379</th>\n", + " <td>26834</td>\n", + " <td>4c5y</td>\n", + " <td>0.142</td>\n", + " <td>497</td>\n", + " <td>265</td>\n", + " <td>26</td>\n", + " <td>7</td>\n", + " <td>424</td>\n", + " <td>7</td>\n", + " <td>421</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Amidohydro_1</td>\n", + " <td>OTASE_ASPNC</td>\n", + " <td>Am2 An14g02080</td>\n", + " <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n", + " <td>Ascomycota</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4500380</th>\n", + " <td>26834</td>\n", + " <td>4c5z</td>\n", + " <td>0.158</td>\n", + " <td>493</td>\n", + " <td>269</td>\n", + " <td>27</td>\n", + " <td>2</td>\n", + " <td>424</td>\n", + " <td>5</td>\n", + " <td>421</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Amidohydro_1</td>\n", + " <td>OTASE_ASPNA</td>\n", + " <td>Am2 ASPNIDRAFT_41631</td>\n", + " <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n", + " <td>Ascomycota</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4500381</th>\n", + " <td>26834</td>\n", + " <td>4c5z</td>\n", + " <td>0.155</td>\n", + " <td>495</td>\n", + " <td>267</td>\n", + " <td>24</td>\n", + " <td>2</td>\n", + " <td>424</td>\n", + " <td>6</td>\n", + " <td>421</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Amidohydro_1</td>\n", + " <td>OTASE_ASPNA</td>\n", + " <td>Am2 ASPNIDRAFT_41631</td>\n", + " <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n", + " <td>Ascomycota</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4500382</th>\n", + " <td>26834</td>\n", + " <td>5xgx</td>\n", + " <td>0.129</td>\n", + " <td>477</td>\n", + " <td>250</td>\n", + " <td>32</td>\n", + " <td>5</td>\n", + " <td>423</td>\n", + " <td>1</td>\n", + " <td>370</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko01000,ko01002</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Amidohydro_1,Amidohydro_3</td>\n", + " <td>Q484B6_COLP3</td>\n", + " <td>iadA CPS_1869</td>\n", + " <td>FUNCTION: Catalyzes the hydrolytic cleavage of...</td>\n", + " <td>Proteobacteria</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>4500383 rows × 38 columns</p>\n", + "</div>" + ], + "text/plain": [ + " query target seq. id. alignment length no. mismatches \\\n", + "0 4731 5htk 0.601 251 97 \n", + "1 4731 5htk 0.629 240 87 \n", + "2 3214 5htk 0.127 212 128 \n", + "3 3214 5htk 0.132 212 127 \n", + "4 3311 5htk 0.180 211 112 \n", + "... ... ... ... ... ... \n", + "4500378 26834 4c65 0.157 484 270 \n", + "4500379 26834 4c5y 0.142 497 265 \n", + "4500380 26834 4c5z 0.158 493 269 \n", + "4500381 26834 4c5z 0.155 495 267 \n", + "4500382 26834 5xgx 0.129 477 250 \n", + "\n", + " no. gap open query start target start query end target end ... \\\n", + "0 3 5 254 176 424 ... \n", + "1 2 9 248 181 418 ... \n", + "2 12 135 293 1 208 ... \n", + "3 12 135 293 1 208 ... \n", + "4 11 9 182 3 189 ... \n", + "... ... ... ... ... ... ... \n", + "4500378 25 7 423 6 418 ... \n", + "4500379 26 7 424 7 421 ... \n", + "4500380 27 2 424 5 421 ... \n", + "4500381 24 2 424 6 421 ... \n", + "4500382 32 5 423 1 370 ... \n", + "\n", + " KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction \\\n", + "0 RC00152 ko00000,ko00001,ko01000 - - - \n", + "1 RC00152 ko00000,ko00001,ko01000 - - - \n", + "2 RC00152 ko00000,ko00001,ko01000 - - - \n", + "3 RC00152 ko00000,ko00001,ko01000 - - - \n", + "4 RC00152 ko00000,ko00001,ko01000 - - - \n", + "... ... ... ... ... ... \n", + "4500378 - - - - - \n", + "4500379 - - - - - \n", + "4500380 - - - - - \n", + "4500381 - - - - - \n", + "4500382 - ko00000,ko01000,ko01002 - - - \n", + "\n", + " PFAMs Entry name Gene names \\\n", + "0 6PF2K,His_Phos_1 F262_HUMAN PFKFB2 \n", + "1 6PF2K,His_Phos_1 F262_HUMAN PFKFB2 \n", + "2 6PF2K,His_Phos_1 F262_HUMAN PFKFB2 \n", + "3 6PF2K,His_Phos_1 F262_HUMAN PFKFB2 \n", + "4 6PF2K,His_Phos_1 F262_HUMAN PFKFB2 \n", + "... ... ... ... \n", + "4500378 Amidohydro_1 OTASE_ASPNC Am2 An14g02080 \n", + "4500379 Amidohydro_1 OTASE_ASPNC Am2 An14g02080 \n", + "4500380 Amidohydro_1 OTASE_ASPNA Am2 ASPNIDRAFT_41631 \n", + "4500381 Amidohydro_1 OTASE_ASPNA Am2 ASPNIDRAFT_41631 \n", + "4500382 Amidohydro_1,Amidohydro_3 Q484B6_COLP3 iadA CPS_1869 \n", + "\n", + " Function [CC] \\\n", + "0 FUNCTION: Synthesis and degradation of fructos... \n", + "1 FUNCTION: Synthesis and degradation of fructos... \n", + "2 FUNCTION: Synthesis and degradation of fructos... \n", + "3 FUNCTION: Synthesis and degradation of fructos... \n", + "4 FUNCTION: Synthesis and degradation of fructos... \n", + "... ... \n", + "4500378 FUNCTION: Carboxypeptidase that catalyzes the ... \n", + "4500379 FUNCTION: Carboxypeptidase that catalyzes the ... \n", + "4500380 FUNCTION: Carboxypeptidase that catalyzes the ... \n", + "4500381 FUNCTION: Carboxypeptidase that catalyzes the ... \n", + "4500382 FUNCTION: Catalyzes the hydrolytic cleavage of... \n", + "\n", + " Taxonomic lineage (PHYLUM) \n", + "0 Chordata \n", + "1 Chordata \n", + "2 Chordata \n", + "3 Chordata \n", + "4 Chordata \n", + "... ... \n", + "4500378 Ascomycota \n", + "4500379 Ascomycota \n", + "4500380 Ascomycota \n", + "4500381 Ascomycota \n", + "4500382 Proteobacteria \n", + "\n", + "[4500383 rows x 38 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_merge" + ] + }, + { + "cell_type": "markdown", + "id": "81c73dbe-3b82-465e-a6c0-27a0465a003a", + "metadata": {}, + "source": [ + "NOw lets do the same for afdb and swp" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "414f25d8-73d5-4c87-9cc9-e0afbfd0644b", + "metadata": {}, + "outputs": [], + "source": [ + "# get data file names\n", + "path = \"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/afdb/emapper/\"\n", + "filenames = glob.glob(path + \"/*.tsv\")\n", + "\n", + "dfs = []\n", + "for filename in filenames:\n", + " dfs.append(pd.read_csv(filename, sep='\\t', skiprows=4, skipfooter=3, engine='python'))\n", + "\n", + "# Concatenate all data into one DataFrame\n", + "afdb_emapper = pd.concat(dfs, ignore_index=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "ae989a35-adec-4930-a614-488a37029c05", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_emapper[\"uniprot\"] = afdb_emapper[\"#query\"].str.split(\"|\").str[1]\n", + "\n", + "afdb_merge = pd.merge(afdb, afdb_emapper, on=\"uniprot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "8841c12f-8f85-45e8-a048-5d276b6c7bef", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_uniprot_info = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/afdb/afdb_upimapi_mapping_eggnog.tsv\", sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "66c30db2-7cd5-4e13-a2c3-d2b2a5ef648d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "afdb_uniprot_info.rename(columns={'Entry': 'uniprot'}, inplace=True)\n", + "afdb_uniprot_info = afdb_uniprot_info[[\"uniprot\", \"Entry name\", \"Gene names\", \"Function [CC]\", \"Taxonomic lineage (PHYLUM)\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "1d541f5d-7556-4095-9f31-d1f5c23cbbbd", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_merge = pd.merge(afdb_merge, afdb_uniprot_info, on=\"uniprot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "552d4713-5a5c-450a-9f58-d21e6cd7d501", + "metadata": {}, + "outputs": [], + "source": [ + "# get data file names\n", + "path = \"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/swp/emapper/\"\n", + "filenames = glob.glob(path + \"/*.tsv\")\n", + "\n", + "dfs = []\n", + "for filename in filenames:\n", + " dfs.append(pd.read_csv(filename, sep='\\t', skiprows=4, skipfooter=3, engine='python'))\n", + "\n", + "# Concatenate all data into one DataFrame\n", + "swp_emapper = pd.concat(dfs, ignore_index=True)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "81f84c74-3372-4b27-84ab-2c31d1bb4126", + "metadata": {}, + "outputs": [], + "source": [ + "swp_emapper[\"uniprot\"] = swp_emapper[\"#query\"].str.split(\"|\").str[1]\n", + "\n", + "swp_merge = pd.merge(swp, swp_emapper, on=\"uniprot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "9865cc34-8bd4-480f-9375-36f82401b07d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "swp_uniprot_info = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/swp/swp_upimapi_mapping_eggnog.tsv\", sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "12cc6569-4954-4ee9-a352-ae4353f71aef", + "metadata": {}, + "outputs": [], + "source": [ + "swp_uniprot_info.rename(columns={'Entry': 'uniprot'}, inplace=True)\n", + "swp_uniprot_info = swp_uniprot_info[[\"uniprot\", \"Entry name\", \"Gene names\", \"Function [CC]\", \"Taxonomic lineage (PHYLUM)\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "f216f616-b9c6-4371-a2eb-9476584f24c0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "swp_merge = pd.merge(swp_merge, swp_uniprot_info, on=\"uniprot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "39f10d29-f245-4685-a999-5440a165be3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>query</th>\n", + " <th>target</th>\n", + " <th>seq. id.</th>\n", + " <th>alignment length</th>\n", + " <th>no. mismatches</th>\n", + " <th>no. gap open</th>\n", + " <th>query start</th>\n", + " <th>target start</th>\n", + " <th>query end</th>\n", + " <th>target end</th>\n", + " <th>...</th>\n", + " <th>KEGG_rclass</th>\n", + " <th>BRITE</th>\n", + " <th>KEGG_TC</th>\n", + " <th>CAZy</th>\n", + " <th>BiGG_Reaction</th>\n", + " <th>PFAMs</th>\n", + " <th>Entry name</th>\n", + " <th>Gene names</th>\n", + " <th>Function [CC]</th>\n", + " <th>Taxonomic lineage (PHYLUM)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>10000</td>\n", + " <td>AF-O35840-F1</td>\n", + " <td>0.251</td>\n", + " <td>139</td>\n", + " <td>91</td>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>136</td>\n", + " <td>161</td>\n", + " <td>292</td>\n", + " <td>...</td>\n", + " <td>RC00020,RC00037,RC00041,RC00055</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>LCAT</td>\n", + " <td>LCAT_GERGM</td>\n", + " <td>LCAT</td>\n", + " <td>FUNCTION: Central enzyme in the extracellular ...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1414</td>\n", + " <td>AF-O35840-F1</td>\n", + " <td>0.314</td>\n", + " <td>343</td>\n", + " <td>177</td>\n", + " <td>9</td>\n", + " <td>62</td>\n", + " <td>397</td>\n", + " <td>1</td>\n", + " <td>292</td>\n", + " <td>...</td>\n", + " <td>RC00020,RC00037,RC00041,RC00055</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>LCAT</td>\n", + " <td>LCAT_GERGM</td>\n", + " <td>LCAT</td>\n", + " <td>FUNCTION: Central enzyme in the extracellular ...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1413</td>\n", + " <td>AF-O35840-F1</td>\n", + " <td>0.350</td>\n", + " <td>134</td>\n", + " <td>84</td>\n", + " <td>3</td>\n", + " <td>44</td>\n", + " <td>175</td>\n", + " <td>160</td>\n", + " <td>292</td>\n", + " <td>...</td>\n", + " <td>RC00020,RC00037,RC00041,RC00055</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>LCAT</td>\n", + " <td>LCAT_GERGM</td>\n", + " <td>LCAT</td>\n", + " <td>FUNCTION: Central enzyme in the extracellular ...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9999</td>\n", + " <td>AF-O35840-F1</td>\n", + " <td>0.235</td>\n", + " <td>140</td>\n", + " <td>99</td>\n", + " <td>5</td>\n", + " <td>73</td>\n", + " <td>209</td>\n", + " <td>158</td>\n", + " <td>292</td>\n", + " <td>...</td>\n", + " <td>RC00020,RC00037,RC00041,RC00055</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>LCAT</td>\n", + " <td>LCAT_GERGM</td>\n", + " <td>LCAT</td>\n", + " <td>FUNCTION: Central enzyme in the extracellular ...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>26188</td>\n", + " <td>AF-O35840-F1</td>\n", + " <td>0.414</td>\n", + " <td>82</td>\n", + " <td>47</td>\n", + " <td>1</td>\n", + " <td>74</td>\n", + " <td>154</td>\n", + " <td>158</td>\n", + " <td>239</td>\n", + " <td>...</td>\n", + " <td>RC00020,RC00037,RC00041,RC00055</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>LCAT</td>\n", + " <td>LCAT_GERGM</td>\n", + " <td>LCAT</td>\n", + " <td>FUNCTION: Central enzyme in the extracellular ...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11935340</th>\n", + " <td>7835</td>\n", + " <td>AF-Q8DKL5-F1</td>\n", + " <td>0.145</td>\n", + " <td>48</td>\n", + " <td>41</td>\n", + " <td>0</td>\n", + " <td>7</td>\n", + " <td>54</td>\n", + " <td>240</td>\n", + " <td>287</td>\n", + " <td>...</td>\n", + " <td>RC00004,RC00039,RC00041</td>\n", + " <td>ko00000,ko00001,ko00002,ko01000,ko01004</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>FA_synthesis</td>\n", + " <td>PLSX_THEVB</td>\n", + " <td>plsX tlr0844</td>\n", + " <td>FUNCTION: Catalyzes the reversible formation o...</td>\n", + " <td>Cyanobacteria</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11935341</th>\n", + " <td>7835</td>\n", + " <td>AF-Q5N5X4-F1</td>\n", + " <td>0.090</td>\n", + " <td>44</td>\n", + " <td>40</td>\n", + " <td>0</td>\n", + " <td>7</td>\n", + " <td>50</td>\n", + " <td>236</td>\n", + " <td>279</td>\n", + " <td>...</td>\n", + " <td>RC00004,RC00039,RC00041</td>\n", + " <td>ko00000,ko00001,ko00002,ko01000,ko01004</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>FA_synthesis</td>\n", + " <td>PLSX_SYNP6</td>\n", + " <td>plsX syc0103_c</td>\n", + " <td>FUNCTION: Catalyzes the reversible formation o...</td>\n", + " <td>Cyanobacteria</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11935342</th>\n", + " <td>8001</td>\n", + " <td>AF-Q5F786-F1</td>\n", + " <td>0.250</td>\n", + " <td>24</td>\n", + " <td>18</td>\n", + " <td>0</td>\n", + " <td>13</td>\n", + " <td>36</td>\n", + " <td>220</td>\n", + " <td>243</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>ko00000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>DUF711</td>\n", + " <td>Y1297_NEIG1</td>\n", + " <td>NGO1297</td>\n", + " <td>NaN</td>\n", + " <td>Proteobacteria</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11935343</th>\n", + " <td>8001</td>\n", + " <td>AF-C5CE71-F1</td>\n", + " <td>0.105</td>\n", + " <td>104</td>\n", + " <td>72</td>\n", + " <td>7</td>\n", + " <td>12</td>\n", + " <td>109</td>\n", + " <td>137</td>\n", + " <td>225</td>\n", + " <td>...</td>\n", + " <td>RC00002,RC00078</td>\n", + " <td>ko00000,ko00001,ko00002,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>CobS</td>\n", + " <td>COBS_KOSOT</td>\n", + " <td>cobS Kole_0456</td>\n", + " <td>FUNCTION: Joins adenosylcobinamide-GDP and alp...</td>\n", + " <td>Thermotogae</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11935344</th>\n", + " <td>8001</td>\n", + " <td>AF-G3KIM4-F1</td>\n", + " <td>0.033</td>\n", + " <td>120</td>\n", + " <td>71</td>\n", + " <td>6</td>\n", + " <td>23</td>\n", + " <td>111</td>\n", + " <td>161</td>\n", + " <td>266</td>\n", + " <td>...</td>\n", + " <td>RC00002,RC00818,RC01839</td>\n", + " <td>ko00000,ko00001,ko00002,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>HGD-D</td>\n", + " <td>LCDA_ANAPI</td>\n", + " <td>lcdA</td>\n", + " <td>FUNCTION: Involved in the acrylate pathway for...</td>\n", + " <td>Firmicutes</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>11935345 rows × 38 columns</p>\n", + "</div>" + ], + "text/plain": [ + " query target seq. id. alignment length no. mismatches \\\n", + "0 10000 AF-O35840-F1 0.251 139 91 \n", + "1 1414 AF-O35840-F1 0.314 343 177 \n", + "2 1413 AF-O35840-F1 0.350 134 84 \n", + "3 9999 AF-O35840-F1 0.235 140 99 \n", + "4 26188 AF-O35840-F1 0.414 82 47 \n", + "... ... ... ... ... ... \n", + "11935340 7835 AF-Q8DKL5-F1 0.145 48 41 \n", + "11935341 7835 AF-Q5N5X4-F1 0.090 44 40 \n", + "11935342 8001 AF-Q5F786-F1 0.250 24 18 \n", + "11935343 8001 AF-C5CE71-F1 0.105 104 72 \n", + "11935344 8001 AF-G3KIM4-F1 0.033 120 71 \n", + "\n", + " no. gap open query start target start query end target end ... \\\n", + "0 4 4 136 161 292 ... \n", + "1 9 62 397 1 292 ... \n", + "2 3 44 175 160 292 ... \n", + "3 5 73 209 158 292 ... \n", + "4 1 74 154 158 239 ... \n", + "... ... ... ... ... ... ... \n", + "11935340 0 7 54 240 287 ... \n", + "11935341 0 7 50 236 279 ... \n", + "11935342 0 13 36 220 243 ... \n", + "11935343 7 12 109 137 225 ... \n", + "11935344 6 23 111 161 266 ... \n", + "\n", + " KEGG_rclass \\\n", + "0 RC00020,RC00037,RC00041,RC00055 \n", + "1 RC00020,RC00037,RC00041,RC00055 \n", + "2 RC00020,RC00037,RC00041,RC00055 \n", + "3 RC00020,RC00037,RC00041,RC00055 \n", + "4 RC00020,RC00037,RC00041,RC00055 \n", + "... ... \n", + "11935340 RC00004,RC00039,RC00041 \n", + "11935341 RC00004,RC00039,RC00041 \n", + "11935342 - \n", + "11935343 RC00002,RC00078 \n", + "11935344 RC00002,RC00818,RC01839 \n", + "\n", + " BRITE KEGG_TC CAZy BiGG_Reaction \\\n", + "0 ko00000,ko00001,ko01000 - - - \n", + "1 ko00000,ko00001,ko01000 - - - \n", + "2 ko00000,ko00001,ko01000 - - - \n", + "3 ko00000,ko00001,ko01000 - - - \n", + "4 ko00000,ko00001,ko01000 - - - \n", + "... ... ... ... ... \n", + "11935340 ko00000,ko00001,ko00002,ko01000,ko01004 - - - \n", + "11935341 ko00000,ko00001,ko00002,ko01000,ko01004 - - - \n", + "11935342 ko00000 - - - \n", + "11935343 ko00000,ko00001,ko00002,ko01000 - - - \n", + "11935344 ko00000,ko00001,ko00002,ko01000 - - - \n", + "\n", + " PFAMs Entry name Gene names \\\n", + "0 LCAT LCAT_GERGM LCAT \n", + "1 LCAT LCAT_GERGM LCAT \n", + "2 LCAT LCAT_GERGM LCAT \n", + "3 LCAT LCAT_GERGM LCAT \n", + "4 LCAT LCAT_GERGM LCAT \n", + "... ... ... ... \n", + "11935340 FA_synthesis PLSX_THEVB plsX tlr0844 \n", + "11935341 FA_synthesis PLSX_SYNP6 plsX syc0103_c \n", + "11935342 DUF711 Y1297_NEIG1 NGO1297 \n", + "11935343 CobS COBS_KOSOT cobS Kole_0456 \n", + "11935344 HGD-D LCDA_ANAPI lcdA \n", + "\n", + " Function [CC] \\\n", + "0 FUNCTION: Central enzyme in the extracellular ... \n", + "1 FUNCTION: Central enzyme in the extracellular ... \n", + "2 FUNCTION: Central enzyme in the extracellular ... \n", + "3 FUNCTION: Central enzyme in the extracellular ... \n", + "4 FUNCTION: Central enzyme in the extracellular ... \n", + "... ... \n", + "11935340 FUNCTION: Catalyzes the reversible formation o... \n", + "11935341 FUNCTION: Catalyzes the reversible formation o... \n", + "11935342 NaN \n", + "11935343 FUNCTION: Joins adenosylcobinamide-GDP and alp... \n", + "11935344 FUNCTION: Involved in the acrylate pathway for... \n", + "\n", + " Taxonomic lineage (PHYLUM) \n", + "0 Chordata \n", + "1 Chordata \n", + "2 Chordata \n", + "3 Chordata \n", + "4 Chordata \n", + "... ... \n", + "11935340 Cyanobacteria \n", + "11935341 Cyanobacteria \n", + "11935342 Proteobacteria \n", + "11935343 Thermotogae \n", + "11935344 Firmicutes \n", + "\n", + "[11935345 rows x 38 columns]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "swp_merge" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "511b875a-bdc8-43f9-8e33-34ad491d0487", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "alphafold['query'] = alphafold.index" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "f4407b86-a8b0-4ef1-8c98-3309c0084b15", + "metadata": {}, + "outputs": [], + "source": [ + "alphafold.reset_index(drop=True, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "5df8b919-554a-4df0-9e37-1e70f974f3e6", + "metadata": {}, + "source": [ + "Merge tables with alphafold results" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "b43d8449-d8be-468b-96b4-ee72531cb3ad", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_res = pd.merge(pdb_merge, alphafold, on=\"query\")" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "22c0968a-140f-4ce4-a65a-cc4d668a424a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "c3376_g1 6368\n", + "c105400_g1 6368\n", + "c94782_g1 6076\n", + "c43224_g1 6002\n", + "c89761_g1 5787\n", + " ... \n", + "c87906_g2 1\n", + "c111030_g1 1\n", + "c99629_g1 1\n", + "c96209_g1 1\n", + "c74333_g1 1\n", + "Name: gene_id, Length: 10157, dtype: int64" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_res.gene_id.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "b8f492f6-450d-484f-ba07-8a396ea6c314", + "metadata": {}, + "source": [ + "Translate the UniProt IDs to gene names; whatever obtained a UniProt ID should have a gene name, and this will be our fallback option if emapper annotation is not present:" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "a567e55d-761f-4ca7-a2c1-8a77f24749a6", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_res = pd.merge(afdb_merge, alphafold, on=\"query\")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "7dec1532-b05e-496c-aa06-0dfa1b70e637", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "c102030_g2 330\n", + "c98824_g1 328\n", + "c96693_g1 326\n", + "c106372_g1 326\n", + "c104839_g2 325\n", + " ... \n", + "c95444_g1 38\n", + "c44058_g1 34\n", + "c105808_g1 21\n", + "c112778_g1 5\n", + "c78729_g1 5\n", + "Name: gene_id, Length: 29386, dtype: int64" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "afdb_res.gene_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "dc9c73b6-ec0c-4744-9d3c-f34a46d86908", + "metadata": {}, + "outputs": [], + "source": [ + "swp_res = pd.merge(swp_merge, alphafold, on=\"query\")" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "0ccb5e8c-0ecf-474a-8464-71700626ccdd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "c94230_g1 311\n", + "c103036_g1 310\n", + "c94975_g1 308\n", + "c105925_g1 308\n", + "c103624_g1 307\n", + " ... \n", + "c101908_g2 10\n", + "c44058_g1 9\n", + "c95444_g1 7\n", + "c103292_g1 6\n", + "c105808_g1 4\n", + "Name: gene_id, Length: 29385, dtype: int64" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "swp_res.gene_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "cdda4572-1cf5-4fd1-9c3b-5cf5f50c3e2f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['query',\n", + " 'target',\n", + " 'seq. id.',\n", + " 'alignment length',\n", + " 'no. mismatches',\n", + " 'no. gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end',\n", + " 'e value',\n", + " 'bit score',\n", + " 'uniprot',\n", + " '#query',\n", + " 'seed_ortholog',\n", + " 'evalue',\n", + " 'score',\n", + " 'eggNOG_OGs',\n", + " 'max_annot_lvl',\n", + " 'COG_category',\n", + " 'Description',\n", + " 'Preferred_name',\n", + " 'GOs',\n", + " 'EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction',\n", + " 'PFAMs',\n", + " 'Entry name',\n", + " 'Gene names',\n", + " 'Function [CC]',\n", + " 'Taxonomic lineage (PHYLUM)',\n", + " 'plddt',\n", + " 'MSA size',\n", + " 'query length',\n", + " 'gene name',\n", + " 'protein_id',\n", + " 'gene_id']" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(pdb_res.columns)" + ] + }, + { + "cell_type": "markdown", + "id": "94bd0480-b089-48fe-8841-9f839beaf7b8", + "metadata": { + "tags": [] + }, + "source": [ + "Before I add the eggnog information from the sponge proteome, I need to drop and rename some of the columns" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "b866d26f-94b5-4875-ba4f-45003d74ce1f", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_res.rename(columns = {'query':'fs_query', 'target':'fs_target', 'e value':'fs_e value', 'bit score':\"fs_bit score\", 'uniprot':'fs_target_uniprot', 'evalue':'fs_target_eggnog_evalue', \n", + " 'score':'fs_target_eggnog_score', 'eggNOG_OGs':'fs_target_eggnogOGs', 'max_annot_lvl':'fs_target_max_annot_lvl','COG_category':'fs_target_COG_category',\n", + " 'Description':'fs_target_Description', 'Preferred_name':'fs_target_Preferred_name', 'GOs':'fs_target_GOs', 'Entry name':'fs_target_Entry name', \n", + " 'Gene names':'fs_target_Gene names', 'Function [CC]':'fs_target_Function [CC]', 'Taxonomic lineage (PHYLUM)':'fs_target_Taxonomic lineage (PHYLUM)', 'PFAMs':'fs_target_PFAMs'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "3b7c0cc2-a740-42cb-bf08-09154963e4fd", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_res.drop(['no. mismatches',\n", + " 'no. gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end','#query',\n", + " 'seed_ortholog','EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "0d510c96-ebaf-48f0-aa59-e6070dcecba0", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_res.rename(columns = {'query':'fs_query', 'target':'fs_target', 'e value':'fs_e value', 'bit score':\"fs_bit score\", 'uniprot':'fs_target_uniprot', 'evalue':'fs_target_eggnog_evalue', \n", + " 'score':'fs_target_eggnog_score', 'eggNOG_OGs':'fs_target_eggnogOGs', 'max_annot_lvl':'fs_target_max_annot_lvl','COG_category':'fs_target_COG_category',\n", + " 'Description':'fs_target_Description', 'Preferred_name':'fs_target_Preferred_name', 'GOs':'fs_target_GOs', 'Entry name':'fs_target_Entry name', \n", + " 'Gene names':'fs_target_Gene names', 'Function [CC]':'fs_target_Function [CC]', 'Taxonomic lineage (PHYLUM)':'fs_target_Taxonomic lineage (PHYLUM)', 'PFAMs':'fs_target_PFAMs'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "d384509d-212c-4ad7-9fe8-897d7048cdf0", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_res.drop(['no. mismatches',\n", + " 'no. gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end','#query',\n", + " 'seed_ortholog','EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "78d517da-65a2-4ac6-a007-d3d3c6c3ddae", + "metadata": {}, + "outputs": [], + "source": [ + "swp_res.rename(columns = {'query':'fs_query', 'target':'fs_target', 'e value':'fs_e value', 'bit score':\"fs_bit score\", 'uniprot':'fs_target_uniprot', 'evalue':'fs_target_eggnog_evalue', \n", + " 'score':'fs_target_eggnog_score', 'eggNOG_OGs':'fs_target_eggnogOGs', 'max_annot_lvl':'fs_target_max_annot_lvl','COG_category':'fs_target_COG_category',\n", + " 'Description':'fs_target_Description', 'Preferred_name':'fs_target_Preferred_name', 'GOs':'fs_target_GOs', 'Entry name':'fs_target_Entry name', \n", + " 'Gene names':'fs_target_Gene names', 'Function [CC]':'fs_target_Function [CC]', 'Taxonomic lineage (PHYLUM)':'fs_target_Taxonomic lineage (PHYLUM)', 'PFAMs':'fs_target_PFAMs'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "00cdb496-4bd6-40ac-a22d-ac1f5e4ba960", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "swp_res.drop(['no. mismatches',\n", + " 'no. gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end','#query',\n", + " 'seed_ortholog','EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "808506eb-e776-422f-8c5f-de068d8ba97b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['fs_query',\n", + " 'fs_target',\n", + " 'seq. id.',\n", + " 'alignment length',\n", + " 'fs_e value',\n", + " 'fs_bit score',\n", + " 'fs_target_uniprot',\n", + " 'fs_target_eggnog_evalue',\n", + " 'fs_target_eggnog_score',\n", + " 'fs_target_eggnogOGs',\n", + " 'fs_target_max_annot_lvl',\n", + " 'fs_target_COG_category',\n", + " 'fs_target_Description',\n", + " 'fs_target_Preferred_name',\n", + " 'fs_target_GOs',\n", + " 'fs_target_PFAMs',\n", + " 'fs_target_Entry name',\n", + " 'fs_target_Gene names',\n", + " 'fs_target_Function [CC]',\n", + " 'fs_target_Taxonomic lineage (PHYLUM)',\n", + " 'plddt',\n", + " 'MSA size',\n", + " 'query length',\n", + " 'gene name',\n", + " 'protein_id',\n", + " 'gene_id']" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(afdb_res.columns)" + ] + }, + { + "cell_type": "markdown", + "id": "74a8129b-5ae0-4b7f-9a26-7bf0947670b5", + "metadata": {}, + "source": [ + "Now the question is how to subset these huge tables a little bit more so we can work with them properly. The best way would be to filter based on fs bit score and/or fs_target_eggnogOGs. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d37e7a82-8017-4b16-9419-1cb45d41858d", + "metadata": {}, + "outputs": [], + "source": [ + "#pdb_seq = enrich_from_uniprot(pdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\n", + "# swp_seq = enrich_from_uniprot(swp, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\n", + "#afdb_seq = enrich_from_uniprot(afdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de9d5f80-9ac8-4a14-a628-1e99af934403", + "metadata": {}, + "outputs": [], + "source": [ + "#pdb_seq = enrich_from_uniprot(pdb_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")\n", + "#swp_seq = enrich_from_uniprot(swp_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")\n", + "#afdb_seq = enrich_from_uniprot(afdb_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")" + ] + }, + { + "cell_type": "markdown", + "id": "8ec832ac-8525-438d-819a-cab2232f574f", + "metadata": {}, + "source": [ + "Cross-reference the eggNOG IDs with the eggNOG annotation that gives a nice name/description for each (most) orthogroup:" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "33c69bec-6a43-49c2-8ea8-3f011fcf86c4", + "metadata": {}, + "outputs": [], + "source": [ + "eggnog = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/spongilla_eggnog.tsv\", sep=\"\\t\", skiprows=4, skipfooter=3, engine='python')" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "4247ce9f-18b7-4fa1-a6a6-58218ef6e797", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>#query</th>\n", + " <th>seed_ortholog</th>\n", + " <th>evalue</th>\n", + " <th>score</th>\n", + " <th>eggNOG_OGs</th>\n", + " <th>max_annot_lvl</th>\n", + " <th>COG_category</th>\n", + " <th>Description</th>\n", + " <th>Preferred_name</th>\n", + " <th>GOs</th>\n", + " <th>...</th>\n", + " <th>KEGG_ko</th>\n", + " <th>KEGG_Pathway</th>\n", + " <th>KEGG_Module</th>\n", + " <th>KEGG_Reaction</th>\n", + " <th>KEGG_rclass</th>\n", + " <th>BRITE</th>\n", + " <th>KEGG_TC</th>\n", + " <th>CAZy</th>\n", + " <th>BiGG_Reaction</th>\n", + " <th>PFAMs</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>c100000_g1_i1_m.41809</td>\n", + " <td>400682.PAC_15712888</td>\n", + " <td>6.690000e-72</td>\n", + " <td>242.0</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>c100000_g1_i2_m.41814</td>\n", + " <td>400682.PAC_15712888</td>\n", + " <td>1.640000e-13</td>\n", + " <td>77.8</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>c100000_g2_i1_m.41818</td>\n", + " <td>400682.PAC_15712888</td>\n", + " <td>3.350000e-33</td>\n", + " <td>135.0</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>c100001_g1_i2_m.41826</td>\n", + " <td>400682.PAC_15716590</td>\n", + " <td>5.880000e-48</td>\n", + " <td>176.0</td>\n", + " <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>A</td>\n", + " <td>RNA secondary structure unwinding</td>\n", + " <td>DDX46</td>\n", + " <td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n", + " <td>...</td>\n", + " <td>ko:K12811</td>\n", + " <td>ko03040,map03040</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko00001,ko01000,ko03041</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>DEAD,Helicase_C</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>c100001_g2_i1_m.41829</td>\n", + " <td>400682.PAC_15716590</td>\n", + " <td>1.220000e-305</td>\n", + " <td>868.0</td>\n", + " <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>A</td>\n", + " <td>RNA secondary structure unwinding</td>\n", + " <td>DDX46</td>\n", + " <td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n", + " <td>...</td>\n", + " <td>ko:K12811</td>\n", + " <td>ko03040,map03040</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko00001,ko01000,ko03041</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>DEAD,Helicase_C</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24193</th>\n", + " <td>c99995_g1_i1_m.41796</td>\n", + " <td>45351.EDO40823</td>\n", + " <td>3.090000e-120</td>\n", + " <td>352.0</td>\n", + " <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>L</td>\n", + " <td>protein-DNA loading ATPase activity</td>\n", + " <td>RFC3</td>\n", + " <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n", + " <td>...</td>\n", + " <td>ko:K10756</td>\n", + " <td>ko03030,ko03420,ko03430,map03030,map03420,map0...</td>\n", + " <td>M00289,M00295</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko00001,ko00002,ko03032,ko03036,ko03400</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>DNA_pol3_delta2,Rep_fac_C</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24194</th>\n", + " <td>c99995_g2_i1_m.41797</td>\n", + " <td>109478.XP_005883212.1</td>\n", + " <td>1.570000e-71</td>\n", + " <td>226.0</td>\n", + " <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>L</td>\n", + " <td>protein-DNA loading ATPase activity</td>\n", + " <td>RFC3</td>\n", + " <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n", + " <td>...</td>\n", + " <td>ko:K10756</td>\n", + " <td>ko03030,ko03420,ko03430,map03030,map03420,map0...</td>\n", + " <td>M00289,M00295</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko00001,ko00002,ko03032,ko03036,ko03400</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>DNA_pol3_delta2,Rep_fac_C</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24195</th>\n", + " <td>c99997_g2_i1_m.41801</td>\n", + " <td>7739.XP_002612114.1</td>\n", + " <td>1.150000e-205</td>\n", + " <td>583.0</td>\n", + " <td>COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>C</td>\n", + " <td>FAD binding domain</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>FAD_binding_3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24196</th>\n", + " <td>c99998_g1_i1_m.41804</td>\n", + " <td>400682.PAC_15712215</td>\n", + " <td>2.240000e-15</td>\n", + " <td>89.7</td>\n", + " <td>COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>T</td>\n", + " <td>positive regulation of MDA-5 signaling pathway</td>\n", + " <td>ANKRD17</td>\n", + " <td>GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO...</td>\n", + " <td>...</td>\n", + " <td>ko:K16726</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko03036</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Ank_2,Ank_4,KH_1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24197</th>\n", + " <td>c99999_g1_i1_m.41806</td>\n", + " <td>400682.PAC_15718953</td>\n", + " <td>5.030000e-29</td>\n", + " <td>111.0</td>\n", + " <td>2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>24198 rows × 21 columns</p>\n", + "</div>" + ], + "text/plain": [ + " #query seed_ortholog evalue score \\\n", + "0 c100000_g1_i1_m.41809 400682.PAC_15712888 6.690000e-72 242.0 \n", + "1 c100000_g1_i2_m.41814 400682.PAC_15712888 1.640000e-13 77.8 \n", + "2 c100000_g2_i1_m.41818 400682.PAC_15712888 3.350000e-33 135.0 \n", + "3 c100001_g1_i2_m.41826 400682.PAC_15716590 5.880000e-48 176.0 \n", + "4 c100001_g2_i1_m.41829 400682.PAC_15716590 1.220000e-305 868.0 \n", + "... ... ... ... ... \n", + "24193 c99995_g1_i1_m.41796 45351.EDO40823 3.090000e-120 352.0 \n", + "24194 c99995_g2_i1_m.41797 109478.XP_005883212.1 1.570000e-71 226.0 \n", + "24195 c99997_g2_i1_m.41801 7739.XP_002612114.1 1.150000e-205 583.0 \n", + "24196 c99998_g1_i1_m.41804 400682.PAC_15712215 2.240000e-15 89.7 \n", + "24197 c99999_g1_i1_m.41806 400682.PAC_15718953 5.030000e-29 111.0 \n", + "\n", + " eggNOG_OGs max_annot_lvl \\\n", + "0 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "1 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "2 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "3 COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33... 33208|Metazoa \n", + "4 COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33... 33208|Metazoa \n", + "... ... ... \n", + "24193 COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33... 33208|Metazoa \n", + "24194 COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33... 33208|Metazoa \n", + "24195 COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315... 33208|Metazoa \n", + "24196 COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk... 33208|Metazoa \n", + "24197 2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|... 33208|Metazoa \n", + "\n", + " COG_category Description \\\n", + "0 I CRAL/TRIO domain \n", + "1 I CRAL/TRIO domain \n", + "2 I CRAL/TRIO domain \n", + "3 A RNA secondary structure unwinding \n", + "4 A RNA secondary structure unwinding \n", + "... ... ... \n", + "24193 L protein-DNA loading ATPase activity \n", + "24194 L protein-DNA loading ATPase activity \n", + "24195 C FAD binding domain \n", + "24196 T positive regulation of MDA-5 signaling pathway \n", + "24197 - - \n", + "\n", + " Preferred_name GOs ... \\\n", + "0 MOSPD2 - ... \n", + "1 MOSPD2 - ... \n", + "2 MOSPD2 - ... \n", + "3 DDX46 GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO... ... \n", + "4 DDX46 GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO... ... \n", + "... ... ... ... \n", + "24193 RFC3 GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO... ... \n", + "24194 RFC3 GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO... ... \n", + "24195 - - ... \n", + "24196 ANKRD17 GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO... ... \n", + "24197 - - ... \n", + "\n", + " KEGG_ko KEGG_Pathway \\\n", + "0 - - \n", + "1 - - \n", + "2 - - \n", + "3 ko:K12811 ko03040,map03040 \n", + "4 ko:K12811 ko03040,map03040 \n", + "... ... ... \n", + "24193 ko:K10756 ko03030,ko03420,ko03430,map03030,map03420,map0... \n", + "24194 ko:K10756 ko03030,ko03420,ko03430,map03030,map03420,map0... \n", + "24195 - - \n", + "24196 ko:K16726 - \n", + "24197 - - \n", + "\n", + " KEGG_Module KEGG_Reaction KEGG_rclass \\\n", + "0 - - - \n", + "1 - - - \n", + "2 - - - \n", + "3 - - - \n", + "4 - - - \n", + "... ... ... ... \n", + "24193 M00289,M00295 - - \n", + "24194 M00289,M00295 - - \n", + "24195 - - - \n", + "24196 - - - \n", + "24197 - - - \n", + "\n", + " BRITE KEGG_TC CAZy \\\n", + "0 - - - \n", + "1 - - - \n", + "2 - - - \n", + "3 ko00000,ko00001,ko01000,ko03041 - - \n", + "4 ko00000,ko00001,ko01000,ko03041 - - \n", + "... ... ... ... \n", + "24193 ko00000,ko00001,ko00002,ko03032,ko03036,ko03400 - - \n", + "24194 ko00000,ko00001,ko00002,ko03032,ko03036,ko03400 - - \n", + "24195 - - - \n", + "24196 ko00000,ko03036 - - \n", + "24197 - - - \n", + "\n", + " BiGG_Reaction PFAMs \n", + "0 - CRAL_TRIO,Motile_Sperm \n", + "1 - CRAL_TRIO,Motile_Sperm \n", + "2 - CRAL_TRIO,Motile_Sperm \n", + "3 - DEAD,Helicase_C \n", + "4 - DEAD,Helicase_C \n", + "... ... ... \n", + "24193 - DNA_pol3_delta2,Rep_fac_C \n", + "24194 - DNA_pol3_delta2,Rep_fac_C \n", + "24195 - FAD_binding_3 \n", + "24196 - Ank_2,Ank_4,KH_1 \n", + "24197 - - \n", + "\n", + "[24198 rows x 21 columns]" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eggnog[\"gene_id\"] = eggnog[\"#query\"].str.split(\"_\").str[:2].str.join(\"_\")\n", + "eggnog[['gene_id', 'protein_id']] = eggnog['#query'].str.split('_')., 1, expand=True\n", + "eggnog[\"#query\"].str.split(\"_\").str[:2].str.join(\"_\")" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "f3756118-c34c-4e6d-8954-c94473601758", + "metadata": {}, + "outputs": [], + "source": [ + "eggnog[\"gene_id\"] = eggnog[\"#query\"].str.split(\"_\").str[:2].str.join(\"_\")\n", + "eggnog[\"protein_id\"] = eggnog[\"#query\"].str.split(\".\").str[1]\n", + "eggnog.drop(['#query',\n", + " 'seed_ortholog',\n", + " 'EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "18507e2d-7055-44d7-ac36-0d439c05e34c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>evalue</th>\n", + " <th>score</th>\n", + " <th>eggNOG_OGs</th>\n", + " <th>max_annot_lvl</th>\n", + " <th>COG_category</th>\n", + " <th>Description</th>\n", + " <th>Preferred_name</th>\n", + " <th>GOs</th>\n", + " <th>PFAMs</th>\n", + " <th>gene_id</th>\n", + " <th>protein_id</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>6.690000e-72</td>\n", + " <td>242.0</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", + " <td>c100000_g1</td>\n", + " <td>41809</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1.640000e-13</td>\n", + " <td>77.8</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", + " <td>c100000_g1</td>\n", + " <td>41814</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>3.350000e-33</td>\n", + " <td>135.0</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", + " <td>c100000_g2</td>\n", + " <td>41818</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>5.880000e-48</td>\n", + " <td>176.0</td>\n", + " <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>A</td>\n", + " <td>RNA secondary structure unwinding</td>\n", + " <td>DDX46</td>\n", + " <td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n", + " <td>DEAD,Helicase_C</td>\n", + " <td>c100001_g1</td>\n", + " <td>41826</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1.220000e-305</td>\n", + " <td>868.0</td>\n", + " <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>A</td>\n", + " <td>RNA secondary structure unwinding</td>\n", + " <td>DDX46</td>\n", + " <td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n", + " <td>DEAD,Helicase_C</td>\n", + " <td>c100001_g2</td>\n", + " <td>41829</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24193</th>\n", + " <td>3.090000e-120</td>\n", + " <td>352.0</td>\n", + " <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>L</td>\n", + " <td>protein-DNA loading ATPase activity</td>\n", + " <td>RFC3</td>\n", + " <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n", + " <td>DNA_pol3_delta2,Rep_fac_C</td>\n", + " <td>c99995_g1</td>\n", + " <td>41796</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24194</th>\n", + " <td>1.570000e-71</td>\n", + " <td>226.0</td>\n", + " <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>L</td>\n", + " <td>protein-DNA loading ATPase activity</td>\n", + " <td>RFC3</td>\n", + " <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n", + " <td>DNA_pol3_delta2,Rep_fac_C</td>\n", + " <td>c99995_g2</td>\n", + " <td>41797</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24195</th>\n", + " <td>1.150000e-205</td>\n", + " <td>583.0</td>\n", + " <td>COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>C</td>\n", + " <td>FAD binding domain</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>FAD_binding_3</td>\n", + " <td>c99997_g2</td>\n", + " <td>41801</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24196</th>\n", + " <td>2.240000e-15</td>\n", + " <td>89.7</td>\n", + " <td>COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>T</td>\n", + " <td>positive regulation of MDA-5 signaling pathway</td>\n", + " <td>ANKRD17</td>\n", + " <td>GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO...</td>\n", + " <td>Ank_2,Ank_4,KH_1</td>\n", + " <td>c99998_g1</td>\n", + " <td>41804</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24197</th>\n", + " <td>5.030000e-29</td>\n", + " <td>111.0</td>\n", + " <td>2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>c99999_g1</td>\n", + " <td>41806</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>24198 rows × 11 columns</p>\n", + "</div>" + ], + "text/plain": [ + " evalue score \\\n", + "0 6.690000e-72 242.0 \n", + "1 1.640000e-13 77.8 \n", + "2 3.350000e-33 135.0 \n", + "3 5.880000e-48 176.0 \n", + "4 1.220000e-305 868.0 \n", + "... ... ... \n", + "24193 3.090000e-120 352.0 \n", + "24194 1.570000e-71 226.0 \n", + "24195 1.150000e-205 583.0 \n", + "24196 2.240000e-15 89.7 \n", + "24197 5.030000e-29 111.0 \n", + "\n", + " eggNOG_OGs max_annot_lvl \\\n", + "0 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "1 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "2 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "3 COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33... 33208|Metazoa \n", + "4 COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33... 33208|Metazoa \n", + "... ... ... \n", + "24193 COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33... 33208|Metazoa \n", + "24194 COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33... 33208|Metazoa \n", + "24195 COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315... 33208|Metazoa \n", + "24196 COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk... 33208|Metazoa \n", + "24197 2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|... 33208|Metazoa \n", + "\n", + " COG_category Description \\\n", + "0 I CRAL/TRIO domain \n", + "1 I CRAL/TRIO domain \n", + "2 I CRAL/TRIO domain \n", + "3 A RNA secondary structure unwinding \n", + "4 A RNA secondary structure unwinding \n", + "... ... ... \n", + "24193 L protein-DNA loading ATPase activity \n", + "24194 L protein-DNA loading ATPase activity \n", + "24195 C FAD binding domain \n", + "24196 T positive regulation of MDA-5 signaling pathway \n", + "24197 - - \n", + "\n", + " Preferred_name GOs \\\n", + "0 MOSPD2 - \n", + "1 MOSPD2 - \n", + "2 MOSPD2 - \n", + "3 DDX46 GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO... \n", + "4 DDX46 GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO... \n", + "... ... ... \n", + "24193 RFC3 GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO... \n", + "24194 RFC3 GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO... \n", + "24195 - - \n", + "24196 ANKRD17 GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO... \n", + "24197 - - \n", + "\n", + " PFAMs gene_id protein_id \n", + "0 CRAL_TRIO,Motile_Sperm c100000_g1 41809 \n", + "1 CRAL_TRIO,Motile_Sperm c100000_g1 41814 \n", + "2 CRAL_TRIO,Motile_Sperm c100000_g2 41818 \n", + "3 DEAD,Helicase_C c100001_g1 41826 \n", + "4 DEAD,Helicase_C c100001_g2 41829 \n", + "... ... ... ... \n", + "24193 DNA_pol3_delta2,Rep_fac_C c99995_g1 41796 \n", + "24194 DNA_pol3_delta2,Rep_fac_C c99995_g2 41797 \n", + "24195 FAD_binding_3 c99997_g2 41801 \n", + "24196 Ank_2,Ank_4,KH_1 c99998_g1 41804 \n", + "24197 - c99999_g1 41806 \n", + "\n", + "[24198 rows x 11 columns]" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eggnog" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "60c99abb-679f-43a9-af6f-0fb37159f68a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['evalue',\n", + " 'seq_eggnog_score',\n", + " 'seq_eggnogOGs',\n", + " 'seq_max_annot_lvl',\n", + " 'seq_COG_category',\n", + " 'seq_Description',\n", + " 'seq_Preferred_name',\n", + " 'seq_GOs',\n", + " 'seq_PFAMs',\n", + " 'gene_id',\n", + " 'protein_id']" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(eggnog.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "06223fb1-1492-4723-b621-3d9a13b0b3fe", + "metadata": {}, + "outputs": [], + "source": [ + "eggnog.rename(columns = {'e value':'seq_e value',\n", + " 'score':'seq_eggnog_score', 'eggNOG_OGs':'seq_eggnogOGs', 'max_annot_lvl':'seq_max_annot_lvl','COG_category':'seq_COG_category',\n", + " 'Description':'seq_Description', 'Preferred_name':'seq_Preferred_name', 'GOs':'seq_GOs', 'PFAMs':'seq_PFAMs'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "10d606c3-1bc9-4c78-81c7-279d607b8e90", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>fs_query</th>\n", + " <th>fs_target</th>\n", + " <th>seq. id.</th>\n", + " <th>alignment length</th>\n", + " <th>fs_e value</th>\n", + " <th>fs_bit score</th>\n", + " <th>fs_target_uniprot</th>\n", + " <th>fs_target_eggnog_evalue</th>\n", + " <th>fs_target_eggnog_score</th>\n", + " <th>fs_target_eggnogOGs</th>\n", + " <th>...</th>\n", + " <th>fs_target_Entry name</th>\n", + " <th>fs_target_Gene names</th>\n", + " <th>fs_target_Function [CC]</th>\n", + " <th>fs_target_Taxonomic lineage (PHYLUM)</th>\n", + " <th>plddt</th>\n", + " <th>MSA size</th>\n", + " <th>query length</th>\n", + " <th>gene name</th>\n", + " <th>protein_id</th>\n", + " <th>gene_id</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1442503</th>\n", + " <td>41612</td>\n", + " <td>AF-Q6NYT3-F1</td>\n", + " <td>0.173</td>\n", + " <td>156</td>\n", + " <td>1.632</td>\n", + " <td>90</td>\n", + " <td>Q6NYT3</td>\n", + " <td>1.470000e-217</td>\n", + " <td>600.0</td>\n", + " <td>28N8T@1|root,2QUU4@2759|Eukaryota,39TGP@33154|...</td>\n", + " <td>...</td>\n", + " <td>IER5L_DANRE</td>\n", + " <td>ier5l si:ch211-208h16.10 zgc:77455</td>\n", + " <td>NaN</td>\n", + " <td>Chordata</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442600</th>\n", + " <td>41612</td>\n", + " <td>AF-Q8CCI5-F1</td>\n", + " <td>0.141</td>\n", + " <td>155</td>\n", + " <td>6.330</td>\n", + " <td>68</td>\n", + " <td>Q8CCI5</td>\n", + " <td>8.230000e-132</td>\n", + " <td>377.0</td>\n", + " <td>KOG4477@1|root,KOG4477@2759|Eukaryota,39UAE@33...</td>\n", + " <td>...</td>\n", + " <td>RYBP_MOUSE</td>\n", + " <td>Rybp Dedaf</td>\n", + " <td>FUNCTION: Component of a Polycomb group (PcG) ...</td>\n", + " <td>Chordata</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442521</th>\n", + " <td>41612</td>\n", + " <td>AF-I1MQL7-F1</td>\n", + " <td>0.154</td>\n", + " <td>226</td>\n", + " <td>10.360</td>\n", + " <td>60</td>\n", + " <td>I1MQL7</td>\n", + " <td>0.000000e+00</td>\n", + " <td>947.0</td>\n", + " <td>KOG0724@1|root,KOG0724@2759|Eukaryota,37Q80@33...</td>\n", + " <td>...</td>\n", + " <td>I1MQL7_SOYBN</td>\n", + " <td>778089 GLYMA_16G217700</td>\n", + " <td>NaN</td>\n", + " <td>Streptophyta</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442614</th>\n", + " <td>41612</td>\n", + " <td>AF-Q9XUQ0-F1</td>\n", + " <td>0.140</td>\n", + " <td>235</td>\n", + " <td>11.020</td>\n", + " <td>59</td>\n", + " <td>Q9XUQ0</td>\n", + " <td>0.000000e+00</td>\n", + " <td>961.0</td>\n", + " <td>2ASSX@1|root,2RZQJ@2759|Eukaryota,39UW6@33154|...</td>\n", + " <td>...</td>\n", + " <td>Q9XUQ0_CAEEL</td>\n", + " <td>pqn-67 CELE_T16G1.1 T16G1.1</td>\n", + " <td>NaN</td>\n", + " <td>Nematoda (roundworms)</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442500</th>\n", + " <td>41612</td>\n", + " <td>AF-C1H5I7-F1</td>\n", + " <td>0.119</td>\n", + " <td>151</td>\n", + " <td>11.020</td>\n", + " <td>59</td>\n", + " <td>C1H5I7</td>\n", + " <td>0.000000e+00</td>\n", + " <td>1177.0</td>\n", + " <td>2CNBF@1|root,2QUZZ@2759|Eukaryota,39NHE@33154|...</td>\n", + " <td>...</td>\n", + " <td>C1H5I7_PARBA</td>\n", + " <td>PAAG_06028</td>\n", + " <td>NaN</td>\n", + " <td>Ascomycota</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442707</th>\n", + " <td>41612</td>\n", + " <td>AF-Q0J235-F1</td>\n", + " <td>0.107</td>\n", + " <td>140</td>\n", + " <td>481.100</td>\n", + " <td>-17</td>\n", + " <td>Q0J235</td>\n", + " <td>4.250000e-263</td>\n", + " <td>734.0</td>\n", + " <td>28J99@1|root,2QQZR@2759|Eukaryota,37SB3@33090|...</td>\n", + " <td>...</td>\n", + " <td>ROLL9_ORYSJ</td>\n", + " <td>RL9 SLL1 Os09g0395300 LOC_Os09g23200 B1040D06.24</td>\n", + " <td>FUNCTION: Probable transcription factor that r...</td>\n", + " <td>Streptophyta</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442619</th>\n", + " <td>41612</td>\n", + " <td>AF-P54258-F1</td>\n", + " <td>0.160</td>\n", + " <td>150</td>\n", + " <td>481.100</td>\n", + " <td>-17</td>\n", + " <td>P54258</td>\n", + " <td>0.000000e+00</td>\n", + " <td>1649.0</td>\n", + " <td>KOG2133@1|root,KOG2133@2759|Eukaryota,39T1J@33...</td>\n", + " <td>...</td>\n", + " <td>ATN1_RAT</td>\n", + " <td>Atn1 Drpla</td>\n", + " <td>FUNCTION: Transcriptional corepressor. Recruit...</td>\n", + " <td>Chordata</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442625</th>\n", + " <td>41612</td>\n", + " <td>AF-Q8IM56-F1</td>\n", + " <td>0.121</td>\n", + " <td>148</td>\n", + " <td>481.100</td>\n", + " <td>-19</td>\n", + " <td>Q8IM56</td>\n", + " <td>0.000000e+00</td>\n", + " <td>2878.0</td>\n", + " <td>2CMI8@1|root,2QQEE@2759|Eukaryota,3YC5G@5794|A...</td>\n", + " <td>...</td>\n", + " <td>Q8IM56_PLAF7</td>\n", + " <td>PF3D7_1403800</td>\n", + " <td>NaN</td>\n", + " <td>Apicomplexa</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442455</th>\n", + " <td>41612</td>\n", + " <td>AF-P13611-F8</td>\n", + " <td>0.192</td>\n", + " <td>151</td>\n", + " <td>481.100</td>\n", + " <td>-21</td>\n", + " <td>P13611</td>\n", + " <td>0.000000e+00</td>\n", + " <td>6472.0</td>\n", + " <td>28IZN@1|root,2QRBE@2759|Eukaryota,38FU8@33154|...</td>\n", + " <td>...</td>\n", + " <td>CSPG2_HUMAN</td>\n", + " <td>VCAN CSPG2</td>\n", + " <td>FUNCTION: May play a role in intercellular sig...</td>\n", + " <td>Chordata</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442557</th>\n", + " <td>41612</td>\n", + " <td>AF-Q96KW2-F1</td>\n", + " <td>0.105</td>\n", + " <td>247</td>\n", + " <td>481.100</td>\n", + " <td>-26</td>\n", + " <td>Q96KW2</td>\n", + " <td>0.000000e+00</td>\n", + " <td>1989.0</td>\n", + " <td>28YNY@1|root,2RWWZ@2759|Eukaryota,3AER2@33154|...</td>\n", + " <td>...</td>\n", + " <td>P12L2_HUMAN</td>\n", + " <td>POM121L2 POM121L</td>\n", + " <td>NaN</td>\n", + " <td>Chordata</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>265 rows × 26 columns</p>\n", + "</div>" + ], + "text/plain": [ + " fs_query fs_target seq. id. alignment length fs_e value \\\n", + "1442503 41612 AF-Q6NYT3-F1 0.173 156 1.632 \n", + "1442600 41612 AF-Q8CCI5-F1 0.141 155 6.330 \n", + "1442521 41612 AF-I1MQL7-F1 0.154 226 10.360 \n", + "1442614 41612 AF-Q9XUQ0-F1 0.140 235 11.020 \n", + "1442500 41612 AF-C1H5I7-F1 0.119 151 11.020 \n", + "... ... ... ... ... ... \n", + "1442707 41612 AF-Q0J235-F1 0.107 140 481.100 \n", + "1442619 41612 AF-P54258-F1 0.160 150 481.100 \n", + "1442625 41612 AF-Q8IM56-F1 0.121 148 481.100 \n", + "1442455 41612 AF-P13611-F8 0.192 151 481.100 \n", + "1442557 41612 AF-Q96KW2-F1 0.105 247 481.100 \n", + "\n", + " fs_bit score fs_target_uniprot fs_target_eggnog_evalue \\\n", + "1442503 90 Q6NYT3 1.470000e-217 \n", + "1442600 68 Q8CCI5 8.230000e-132 \n", + "1442521 60 I1MQL7 0.000000e+00 \n", + "1442614 59 Q9XUQ0 0.000000e+00 \n", + "1442500 59 C1H5I7 0.000000e+00 \n", + "... ... ... ... \n", + "1442707 -17 Q0J235 4.250000e-263 \n", + "1442619 -17 P54258 0.000000e+00 \n", + "1442625 -19 Q8IM56 0.000000e+00 \n", + "1442455 -21 P13611 0.000000e+00 \n", + "1442557 -26 Q96KW2 0.000000e+00 \n", + "\n", + " fs_target_eggnog_score \\\n", + "1442503 600.0 \n", + "1442600 377.0 \n", + "1442521 947.0 \n", + "1442614 961.0 \n", + "1442500 1177.0 \n", + "... ... \n", + "1442707 734.0 \n", + "1442619 1649.0 \n", + "1442625 2878.0 \n", + "1442455 6472.0 \n", + "1442557 1989.0 \n", + "\n", + " fs_target_eggnogOGs ... \\\n", + "1442503 28N8T@1|root,2QUU4@2759|Eukaryota,39TGP@33154|... ... \n", + "1442600 KOG4477@1|root,KOG4477@2759|Eukaryota,39UAE@33... ... \n", + "1442521 KOG0724@1|root,KOG0724@2759|Eukaryota,37Q80@33... ... \n", + "1442614 2ASSX@1|root,2RZQJ@2759|Eukaryota,39UW6@33154|... ... \n", + "1442500 2CNBF@1|root,2QUZZ@2759|Eukaryota,39NHE@33154|... ... \n", + "... ... ... \n", + "1442707 28J99@1|root,2QQZR@2759|Eukaryota,37SB3@33090|... ... \n", + "1442619 KOG2133@1|root,KOG2133@2759|Eukaryota,39T1J@33... ... \n", + "1442625 2CMI8@1|root,2QQEE@2759|Eukaryota,3YC5G@5794|A... ... \n", + "1442455 28IZN@1|root,2QRBE@2759|Eukaryota,38FU8@33154|... ... \n", + "1442557 28YNY@1|root,2RWWZ@2759|Eukaryota,3AER2@33154|... ... \n", + "\n", + " fs_target_Entry name \\\n", + "1442503 IER5L_DANRE \n", + "1442600 RYBP_MOUSE \n", + "1442521 I1MQL7_SOYBN \n", + "1442614 Q9XUQ0_CAEEL \n", + "1442500 C1H5I7_PARBA \n", + "... ... \n", + "1442707 ROLL9_ORYSJ \n", + "1442619 ATN1_RAT \n", + "1442625 Q8IM56_PLAF7 \n", + "1442455 CSPG2_HUMAN \n", + "1442557 P12L2_HUMAN \n", + "\n", + " fs_target_Gene names \\\n", + "1442503 ier5l si:ch211-208h16.10 zgc:77455 \n", + "1442600 Rybp Dedaf \n", + "1442521 778089 GLYMA_16G217700 \n", + "1442614 pqn-67 CELE_T16G1.1 T16G1.1 \n", + "1442500 PAAG_06028 \n", + "... ... \n", + "1442707 RL9 SLL1 Os09g0395300 LOC_Os09g23200 B1040D06.24 \n", + "1442619 Atn1 Drpla \n", + "1442625 PF3D7_1403800 \n", + "1442455 VCAN CSPG2 \n", + "1442557 POM121L2 POM121L \n", + "\n", + " fs_target_Function [CC] \\\n", + "1442503 NaN \n", + "1442600 FUNCTION: Component of a Polycomb group (PcG) ... \n", + "1442521 NaN \n", + "1442614 NaN \n", + "1442500 NaN \n", + "... ... \n", + "1442707 FUNCTION: Probable transcription factor that r... \n", + "1442619 FUNCTION: Transcriptional corepressor. Recruit... \n", + "1442625 NaN \n", + "1442455 FUNCTION: May play a role in intercellular sig... \n", + "1442557 NaN \n", + "\n", + " fs_target_Taxonomic lineage (PHYLUM) plddt MSA size query length \\\n", + "1442503 Chordata 63.615046 747.0 218 \n", + "1442600 Chordata 63.615046 747.0 218 \n", + "1442521 Streptophyta 63.615046 747.0 218 \n", + "1442614 Nematoda (roundworms) 63.615046 747.0 218 \n", + "1442500 Ascomycota 63.615046 747.0 218 \n", + "... ... ... ... ... \n", + "1442707 Streptophyta 63.615046 747.0 218 \n", + "1442619 Chordata 63.615046 747.0 218 \n", + "1442625 Apicomplexa 63.615046 747.0 218 \n", + "1442455 Chordata 63.615046 747.0 218 \n", + "1442557 Chordata 63.615046 747.0 218 \n", + "\n", + " gene name protein_id gene_id \n", + "1442503 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442600 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442521 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442614 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442500 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "... ... ... ... \n", + "1442707 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442619 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442625 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442455 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442557 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "\n", + "[265 rows x 26 columns]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "afdb_res[afdb_res['gene_id'].str.contains(\"c99854_g1\")].sort_values(by=\"fs_bit score\", ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "2c216345-b501-419c-b3a4-2a71894a5899", + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'gene_id'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_77/3048031996.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mafdb_res\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mafdb_merge\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meggnog\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mon\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"gene_id\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36mmerge\u001b[0;34m(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0mvalidate\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 106\u001b[0m ) -> DataFrame:\n\u001b[0;32m--> 107\u001b[0;31m op = _MergeOperation(\n\u001b[0m\u001b[1;32m 108\u001b[0m \u001b[0mleft\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0mright\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[1;32m 698\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mright_join_keys\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 699\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin_names\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 700\u001b[0;31m ) = self._get_merge_keys()\n\u001b[0m\u001b[1;32m 701\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0;31m# validate the merge keys dtypes. We may need to coerce\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36m_get_merge_keys\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1103\u001b[0m \u001b[0mright_keys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1104\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlk\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1105\u001b[0;31m \u001b[0mleft_keys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mleft\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_label_or_level_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1106\u001b[0m \u001b[0mjoin_names\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1107\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_get_label_or_level_values\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1774\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maxes\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_level_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1775\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1776\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1777\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1778\u001b[0m \u001b[0;31m# Check for duplicates\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'gene_id'" + ] + } + ], + "source": [ + "afdb_res = pd.merge(afdb_merge, eggnog, on=\"gene_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a459c82-b4cc-4442-93c5-24860a19deba", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_seq = pdb_seq.join(eggnog, on=\"eggnog\")\n", + "swp_seq = swp_seq.join(eggnog, on=\"eggnog\")\n", + "afdb_seq = afdb_seq.join(eggnog, on=\"eggnog\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b02bf3b-6dee-4bf6-86c3-15f04a136ebf", + "metadata": {}, + "outputs": [], + "source": [ + "def summarize_fs(df):\n", + " res = {}\n", + " for query in tqdm(df[\"query\"].unique()):\n", + "# print(query)\n", + " tmp = df[df[\"query\"] == query]\n", + "\n", + " bit_threshold = tmp.groupby(\"name\")[\"bit score\"].mean().max()\n", + " genes = tmp.groupby(\"gene name\")[\"bit score\"].mean()\n", + " keep = genes[genes >= bit_threshold].index.values\n", + " select = tmp[tmp[\"gene name\"].isin(keep)].fillna(\"-\")\n", + " if bit_threshold is np.NaN: # this means that all scores are NaN; just take the average instead\n", + " bit_threshold = tmp[\"bit score\"].mean()\n", + " keep = tmp[tmp[\"bit score\"] >= bit_threshold].index.values\n", + " select = tmp.loc[keep].fillna(\"\")\n", + "\n", + " uniprot_ids = np.unique(\",\".join(select[\"uniprot\"].values).split(\",\"))\n", + "\n", + " res[query] = [np.mean(select[\"bit score\"]),\n", + " \",\".join(uniprot_ids),\n", + " \",\".join(np.unique(select[\"gene name\"])),\n", + " \",\".join(np.unique(select[\"eggnog\"])),\n", + " \"|\".join(np.unique(select[\"name\"]))]\n", + "# print(res[query])\n", + " res = pd.DataFrame(res).T\n", + " res.columns = [\"selection avg bit score\", \"uniprot\", \"gene name\", \"orthogroups\", \"eggnog name\"]\n", + " return res" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9db46cc0-615b-435d-824a-e3902f82529e", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_annot = summarize_fs(pdb_seq)\n", + "swp_annot = summarize_fs(swp_seq)\n", + "afdb_annot = summarize_fs(afdb_seq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5047b46b-e6e3-4999-8b9b-ff73c15e38ad", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84ca4449-c788-4b1f-a20e-e4f981cd6f8f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "907102ae-4392-4de4-9c40-61cbea7e3b12", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aebc95cf-6926-49e0-be01-5ac8faf0bfe4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fefcf956-b79d-4090-9719-7d15cc4aa123", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "105a2ff0-8df5-4919-b1d8-6f6b3cfa585e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91e4b127-d802-4ea6-a3f9-ee47c92d99f0", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc6eca76-a585-4ff4-9542-6a43ab90e76e", + "metadata": {}, + "outputs": [], + "source": [ + "# keep best annotated e-value hit per trinity ID (e.g. keep best-scoring annotated isoform)\n", + "def annotate_foldseek(df, eggnog):\n", + " test = df.join(eggnog, on=\"protein_id\")\n", + "\n", + " name_isnan = test[\"Preferred_name\"].isnull()\n", + " name_missing = test[\"Preferred_name\"] == \"-\"\n", + " desc_isnan = test[\"Description\"].isnull()\n", + " desc_missing = test[\"Description\"] == \"-\"\n", + "\n", + " has_name = ~(name_isnan | name_missing)\n", + " has_desc = ~(desc_isnan | desc_missing)\n", + " annotated = has_name | has_desc\n", + "\n", + " return test[annotated].sort_values('evalue', ascending=False).drop_duplicates(['gene_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a6aaa90-f6da-4ee9-bf6d-5a4a615da4cf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pdb_ann = annotate_foldseek(pdb, eggnog)\n", + "swp_ann = annotate_foldseek(swp, eggnog)\n", + "afdb_ann = annotate_foldseek(afdb, eggnog)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "426f604d-7b72-4fa1-9e35-8ec785665e67", + "metadata": {}, + "outputs": [], + "source": [ + "name_isnan = test[\"Preferred_name\"].isnull()\n", + "name_missing = test[\"Preferred_name\"] == \"-\"\n", + "desc_isnan = test[\"Description\"].isnull()\n", + "desc_missing = test[\"Description\"] == \"-\"\n", + "\n", + "has_name = ~(name_isnan | name_missing)\n", + "has_desc = ~(desc_isnan | desc_missing)\n", + "annotated = has_name | has_desc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5118ce30-cf3a-449f-97a9-69c3897f9bfd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57ba2ea3-0c1d-410c-b8bf-0ef86365eb1c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03a91b25-8bb2-42e4-acda-d4d0eb096348", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4a80767-5225-43aa-9ae0-3195d9770758", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92dce0ff-537a-436d-906e-50f295c93153", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fc57780-f8d5-4f47-aa42-6b10fe5be964", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c3a456-2be9-498f-b61c-63afcc268a38", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79e8f599-3fff-488c-8e59-5b3e6a2f7060", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e115347-7e0f-4fbe-97c5-cb7b47c9b63d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "34d0479f-5efd-414d-90b2-c5149ca99811", + "metadata": {}, + "source": [ + "We will filter the tables and only keep the best match per protein; when multiple hits get the same bit score we'll only keep one (lexicographically first?)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ef2393f-3125-4108-b21a-6729f1c536a0", + "metadata": {}, + "outputs": [], + "source": [ + "def keep_best_match(df, prefix):\n", + " df = df.sort_values('bit score', ascending=False).drop_duplicates(['gene_id'])\n", + " df.drop(columns=[\"gene name\"], inplace=True)\n", + " df[\"alphafold id\"] = df.index.values\n", + " df.set_index(\"gene_id\", inplace=True)\n", + " df.columns = prefix + \" \" + df.columns\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1d9ef06-d55b-48bb-a966-5d4d8e9b6fba", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pdb = keep_best_match(pdb, \"pdb\")\n", + "afdb = keep_best_match(afdb, \"afdb\")\n", + "swp = keep_best_match(swp, \"swp\")" + ] + }, + { + "cell_type": "markdown", + "id": "8a73bb0d-a4dd-4f08-bcfb-9333adca6fa2", + "metadata": {}, + "source": [ + "We now have five tables that are all indexed by the query IDs. We can merge those and obtain the summary table of the run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f07f22f8-bf4b-474a-a0c1-4c7e184b0b4b", + "metadata": {}, + "outputs": [], + "source": [ + "# concatenate foldseek results\n", + "foldseek = pdb.join(afdb, how=\"outer\").join(swp, how=\"outer\")\n", + "# add structure prediction confidence\n", + "foldseek = foldseek.join(alphafold.set_index(\"gene_id\"), how=\"outer\")" + ] + }, + { + "cell_type": "markdown", + "id": "a5f5f3b3-944e-41f1-be5f-71b635e83da9", + "metadata": {}, + "source": [ + "Before we continue, keep track of which proteins got any foldseek prediction. This is the first bottleneck on the road to expanding functional annotation!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5051de7c-97fb-48b0-bcb1-9b41545ac9f1", + "metadata": {}, + "outputs": [], + "source": [ + "# create a categorical variable that keeps track of which query proteins didn't get any FoldSeek hits\n", + "foldseek[\"foldsought\"] = ~foldseek[\"pdb target\"].isnull() | ~foldseek[\"afdb target\"].isnull() | ~foldseek[\"swp target\"].isnull()" + ] + }, + { + "cell_type": "markdown", + "id": "494ef8dc-c28d-4507-8c5e-100389c72ba5", + "metadata": {}, + "source": [ + "Drop empty rows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0f86145-636b-4f85-a499-a2ef25a6730e", + "metadata": {}, + "outputs": [], + "source": [ + "foldseek = foldseek.drop(index=foldseek.index[foldseek[\"plddt\"].isnull()])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3feff69f-32ce-46c8-82b4-970d4eef5448", + "metadata": {}, + "outputs": [], + "source": [ + "sns.jointplot(x=\"MSA size\", y=\"plddt\", hue=\"foldsought\", data=foldseek, kind = \"kde\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f12248e-b573-4747-935c-4977b56466fd", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "x = foldseek[\"MSA size\"][~foldseek[\"foldsought\"]].values\n", + "y = foldseek[\"MSA size\"][foldseek[\"foldsought\"]].values\n", + "ax.hist(x[x<500], label=\"False\", alpha=0.5);\n", + "ax.hist(y[y<500], label=\"True\", alpha=0.5);" + ] + }, + { + "cell_type": "markdown", + "id": "1afb5af0-7d59-42b9-998f-42aa3060b777", + "metadata": {}, + "source": [ + "there seems to be a bit of a positive correlation here; clearly more so for the cases where FoldSeek finds something\n", + "\n", + "### How does query size influence AF prediction quality?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7452f14-1b16-49a4-bd17-56ce972b6a9e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "foldseek[\"query length\"] = foldseek[\"query length\"].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2210f460-7966-4625-bd48-4081a050164d", + "metadata": {}, + "outputs": [], + "source": [ + "sns.displot(foldseek, x=\"query length\", hue=\"foldsought\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b3f01ca-6ab8-4519-91cf-42f93f371414", + "metadata": {}, + "outputs": [], + "source": [ + "fs_hits_per_querylen = foldseek.groupby(\"query length\")[\"foldsought\"].apply(lambda x: np.sum(x))\n", + "fs_hits_per_querylen_cumsum = np.cumsum(fs_hits_per_querylen) / foldseek.shape[0]\n", + "\n", + "x = fs_hits_per_querylen_cumsum.index\n", + "y = fs_hits_per_querylen_cumsum.values * 100\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.plot(x, y)\n", + "ax.set_xlabel(\"query length\")\n", + "ax.set_ylabel(\"% foldseek hit\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da7c7cd7-bd73-4ac0-a007-47957b8a714a", + "metadata": {}, + "outputs": [], + "source": [ + "fs_hits_per_MSA = foldseek.groupby(\"MSA size\")[\"foldsought\"].apply(lambda x: np.sum(x))\n", + "fs_hits_per_MSA_cumsum = np.cumsum(fs_hits_per_MSA) / foldseek.shape[0]\n", + "\n", + "x = fs_hits_per_MSA_cumsum.index\n", + "y = fs_hits_per_MSA_cumsum.values * 100\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.plot(x, y)\n", + "ax.set_xlabel(\"MSA size\")\n", + "ax.set_ylabel(\"% foldseek hit\")" + ] + }, + { + "cell_type": "markdown", + "id": "09754a27-80af-46df-902f-8f5057b0bb5b", + "metadata": {}, + "source": [ + "Many short (70 aa) proteins do not have a Foldseek hit. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "250a7e1e-f7ce-4d5e-ae1a-f61bd39e0c34", + "metadata": {}, + "outputs": [], + "source": [ + "sns.jointplot(x=\"query length\", y=\"plddt\", hue=\"foldsought\", data=foldseek, kind=\"kde\", xlim=[0, 800]);" + ] + }, + { + "cell_type": "markdown", + "id": "6ee91e3e-a1c2-4a49-b3bc-19e1c3d5383c", + "metadata": {}, + "source": [ + "there doesn't seem to be a very obvious relationship between query length and prediction confidence.\n", + "\n", + "### How do the FoldSeek evalues correlate between databases?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3e212ba-1fa3-4a64-8eca-649a06345056", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(ncols=3, figsize=(15, 3))\n", + "sns.scatterplot(x=\"pdb e value\", y=\"afdb e value\", data=foldseek, ax=ax[0], alpha=0.05, linewidth=0)\n", + "ax[0].set(xscale=\"log\", yscale=\"log\");\n", + "sns.scatterplot(x=\"pdb e value\", y=\"swp e value\", data=foldseek, ax=ax[1], alpha=0.05, linewidth=0)\n", + "ax[1].set(xscale=\"log\", yscale=\"log\");\n", + "sns.scatterplot(x=\"afdb e value\", y=\"swp e value\", data=foldseek, ax=ax[2], alpha=0.05, linewidth=0)\n", + "ax[2].set(xscale=\"log\", yscale=\"log\");" + ] + }, + { + "cell_type": "markdown", + "id": "a3fbb451-bf04-4cdf-bb57-a036f5f42f90", + "metadata": {}, + "source": [ + "### How do the FoldSeek bit scores correlate between databases?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c950076-2fe9-42a1-8e9e-1804f366d6f9", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(ncols=3, figsize=(15, 3))\n", + "sns.scatterplot(x=\"pdb bit score\", y=\"afdb bit score\", data=foldseek, ax=ax[0], alpha=0.05, linewidth=0)\n", + "ax[0].set(xscale=\"log\", yscale=\"log\");\n", + "sns.scatterplot(x=\"pdb bit score\", y=\"swp bit score\", data=foldseek, ax=ax[1], alpha=0.05, linewidth=0)\n", + "ax[1].set(xscale=\"log\", yscale=\"log\");\n", + "sns.scatterplot(x=\"afdb bit score\", y=\"swp bit score\", data=foldseek, ax=ax[2], alpha=0.05, linewidth=0)\n", + "ax[2].set(xscale=\"log\", yscale=\"log\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aeebd09f-95fc-48e2-aecb-9d51bc8b7084", + "metadata": {}, + "outputs": [], + "source": [ + "g = sns.JointGrid(data=foldseek, x=\"pdb bit score\", y=\"afdb bit score\")\n", + "g.plot_joint(sns.scatterplot, alpha=0.05, linewidth=0)\n", + "g.plot_marginals(sns.boxplot)" + ] + }, + { + "cell_type": "markdown", + "id": "fb8640d3-f52d-460a-adf7-5eb454890c7a", + "metadata": {}, + "source": [ + "### Is there any relationship between AF score and FoldSeek e-values?\n", + "\n", + "since FoldSeek e-vals correlate so well we will only use AFDB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b83a719-97f4-421e-aad6-05b89a3f58cc", + "metadata": {}, + "outputs": [], + "source": [ + "sns.jointplot(x=foldseek[\"afdb bit score\"], y=foldseek[\"plddt\"], kind=\"kde\", xlim=[0, 3000]);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92ad21f6-8e34-43e9-97fd-5cbfe83a81d3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "foldseek[\"query length\"] = foldseek[\"query length\"].astype(int)" + ] + }, + { + "cell_type": "markdown", + "id": "663ea290-1dcd-4a48-acb1-f2a924636746", + "metadata": {}, + "source": [ + "## 4. Sequence-based annotation from EggNOG" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "d76ac42c-3172-4e92-9b88-2b804df08974", + "metadata": {}, + "outputs": [], + "source": [ + "eggnog = pd.read_csv(\"/g/arendt/npapadop/repos/spongfold/data/MM_ipw26ffh.emapper.annotations.tsv\", sep=\"\\t\", skiprows=4, skipfooter=3, engine=\"python\")\n", + "\n", + "# subset to spongilla genes\n", + "eggnog[\"protein_id\"] = eggnog[\"#query\"].str.split(\".\").str[1].copy()\n", + "# only keep informative columns\n", + "# eggnog = eggnog[[\"protein_id\", \"evalue\", \"score\", \"Preferred_name\", \"Description\", \"GOs\", \"eggNOG_OGs\"]].copy()\n", + "# eggnog.set_index(\"protein_id\", inplace=True)\n", + "# # merge with sequence info so that we have access to gene IDs\n", + "# eggnog = sequence_info.set_index(\"protein_id\").join(eggnog.set_index(\"protein_id\"))\n", + "# # keep best evalue hit per TRINITY gene ID (e.g. keep best-scoring isoform)\n", + "# eggnog = eggnog.sort_values('evalue', ascending=False).drop_duplicates(['gene_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "5eb5d832-91ca-4a87-98bd-f49a2b920713", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['-', 'A', 'AB', 'ABO', 'AD', 'ADK', 'AI', 'AJ', 'AL', 'AT', 'ATY',\n", + " 'B', 'BD', 'BDL', 'BDLT', 'BDLTU', 'BDT', 'BK', 'BL', 'BQ', 'BT',\n", + " 'C', 'CD', 'CE', 'CG', 'CH', 'CIQ', 'CK', 'CO', 'CU', 'D', 'DJ',\n", + " 'DK', 'DKL', 'DKLT', 'DKT', 'DL', 'DN', 'DO', 'DP', 'DT', 'DTZ',\n", + " 'DU', 'DUZ', 'DV', 'DY', 'DZ', 'E', 'EF', 'EG', 'EGP', 'EH', 'EI',\n", + " 'EIOV', 'EJ', 'EO', 'EPT', 'ET', 'EU', 'F', 'FG', 'FQ', 'FT', 'G',\n", + " 'GI', 'GK', 'GM', 'GMO', 'GMW', 'GO', 'GOT', 'GOU', 'GQ', 'GT',\n", + " 'H', 'I', 'IKT', 'IM', 'IMO', 'IN', 'IO', 'IOT', 'IQ', 'IT', 'IU',\n", + " 'J', 'JK', 'JKL', 'JM', 'JO', 'JT', 'JU', 'JUY', 'K', 'KL', 'KLO',\n", + " 'KLT', 'KMO', 'KO', 'KOT', 'KT', 'KU', 'L', 'LO', 'LT', 'M', 'MO',\n", + " 'MOT', 'MU', 'MV', 'MW', 'N', 'NT', 'NU', 'O', 'OP', 'OT', 'OTU',\n", + " 'OU', 'P', 'PQ', 'PT', 'Q', 'S', 'T', 'TU', 'TUZ', 'TV', 'TW',\n", + " 'TZ', 'U', 'UY', 'UZ', 'V', 'W', 'Y', 'Z'], dtype=object)" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.unique(eggnog[\"COG_category\"])" + ] + }, + { + "cell_type": "markdown", + "id": "e78917f2-05d5-482e-bf45-1ca65aafb801", + "metadata": {}, + "source": [ + "Subset to a few columns. We'll keep e-value and bit score as proxies for sequence similarity; preferred name as a top level annotation, narrow orthogroup description as a second level, and GO terms/EC number as a third." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8e1668c-20fd-44e6-9709-00b68280759c", + "metadata": {}, + "outputs": [], + "source": [ + "foldseek[\"alphafold query\"] = foldseek.index.values" + ] + }, + { + "cell_type": "markdown", + "id": "ec550336-8348-42fc-b963-8b2a0a987759", + "metadata": {}, + "source": [ + "Actually merge with the foldseek results:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1881c868-054e-4572-b377-1c953bf6f6d3", + "metadata": {}, + "outputs": [], + "source": [ + "spongilla = foldseek.join(eggnog.set_index(\"gene_id\").drop(columns=[\"MSA size\", \"query length\", \"gene name\"]))" + ] + }, + { + "cell_type": "markdown", + "id": "9d5ee661-cddc-43d6-a40d-de237b5a7817", + "metadata": {}, + "source": [ + "Replace the `NaN`s with a dash in all the string-based columns:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89e0a74c-52d4-4ad4-ae68-6ec087305992", + "metadata": {}, + "outputs": [], + "source": [ + "spongilla[\"Preferred_name\"].fillna(\"-\", inplace=True)\n", + "spongilla[\"Description\"].fillna(\"-\", inplace=True)\n", + "spongilla[\"GOs\"].fillna(\"-\", inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "82d8f3b7-51c5-495d-a68d-c3ad8d6aa9ef", + "metadata": {}, + "source": [ + "# Comparing sequence-based and structure-based annotation\n", + "\n", + "## 1) What part of the proteome is annotated?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dae0824-7673-4237-9b4e-23acdd3d039d", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'- {np.sum(spongilla[\"Preferred_name\"] != \"-\") / spongilla.shape[0] * 100 :.2f}% of proteins have an EggNOG name.')\n", + "print(f'- {np.sum(spongilla[\"Description\"] != \"-\") / spongilla.shape[0] * 100 :.2f}% of proteins have an EggNOG description.')\n", + "print(f'- {np.sum(spongilla[\"GOs\"] != \"-\") / spongilla.shape[0] * 100 :.2f}% of proteins get EggNOG GO terms.')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d61a398a-a99f-4ffc-b43d-9edeed28f2c9", + "metadata": {}, + "outputs": [], + "source": [ + "has_name = spongilla[\"Preferred_name\"] != \"-\"\n", + "has_desc = spongilla[\"Description\"] != \"-\"\n", + "has_GO = spongilla[\"GOs\"] != \"-\"\n", + "\n", + "A = np.sum(has_name & ~has_desc & ~has_GO)\n", + "B = np.sum(~has_name & has_desc & ~has_GO)\n", + "AB = np.sum(has_name & has_desc & ~has_GO)\n", + "C = np.sum(~has_name & ~has_desc & has_GO)\n", + "AC = np.sum(has_name & ~has_desc & has_GO)\n", + "BC = np.sum(~has_name & has_desc & has_GO)\n", + "ABC = np.sum(has_name & has_desc & has_GO)\n", + "\n", + "venn3(subsets=(A, B, AB, C, AC, BC, ABC), set_labels=[\"name\", \"description\", \"GO\"])" + ] + }, + { + "cell_type": "markdown", + "id": "e3dbe6d3-c228-4389-a528-d292f9c60b23", + "metadata": {}, + "source": [ + "We will treat description as the more general category.\n", + "\n", + "How does FoldSeek fare?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4b87ebe-81cb-42ea-a214-6d598a38b1e4", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'- {np.sum(spongilla[\"foldsought\"]) / spongilla.shape[0] * 100 :.2f}% of proteins have a FoldSeek hit.')" + ] + }, + { + "cell_type": "markdown", + "id": "9f6ea360-879a-4b5e-9586-06b380ea3bea", + "metadata": {}, + "source": [ + "## 2) How does structure-based annotation compare to sequence-based annotation?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3f8376e-97d3-4c85-b840-95b684700d2b", + "metadata": {}, + "outputs": [], + "source": [ + "has_seq = spongilla[\"Description\"] != \"-\"\n", + "has_struct = spongilla[\"foldsought\"]\n", + "\n", + "A = np.sum(has_seq & ~has_struct)\n", + "B = np.sum(~has_seq & has_struct)\n", + "AB = np.sum(has_seq & has_struct)\n", + "\n", + "venn2(subsets=(A, B, AB), set_labels=[\"has sequence annotation\", \"has FoldSeek hit\"])" + ] + }, + { + "cell_type": "markdown", + "id": "b0caaf16-0ad1-4afa-a408-b6bb41f598c0", + "metadata": {}, + "source": [ + "Interesting to see that there is a significant number of proteins with a sequence annotation but no structural one!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9f1e5ac-a763-4a64-8118-49bbc714e25d", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.hist(spongilla[has_seq & ~has_struct][\"plddt\"], bins=50);" + ] + }, + { + "cell_type": "markdown", + "id": "89ca9e4b-ca7d-44a7-a2a6-d5a5059b79d9", + "metadata": {}, + "source": [ + "Why no FoldSeek hits? Structures are partly really good, sequence annotation is available." + ] + }, + { + "cell_type": "markdown", + "id": "a64b993a-5771-4722-b74c-8076fa8ab9fd", + "metadata": {}, + "source": [ + "## what happens with PDB?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8db39c06-50cf-4dd6-9f8e-f4c1fbb13c97", + "metadata": {}, + "outputs": [], + "source": [ + "import urllib.parse\n", + "import urllib.request\n", + "\n", + "url = 'https://www.uniprot.org/uploadlists/'\n", + "\n", + "params = {\n", + "'from': 'PDB_ID',\n", + "'to': 'ACC',\n", + "'format': 'tab',\n", + "'query': \" \".join(foldseek[\"pdb target\"][~foldseek[\"pdb target\"].isnull()].unique())\n", + "}\n", + "\n", + "data = urllib.parse.urlencode(params)\n", + "data = data.encode('utf-8')\n", + "req = urllib.request.Request(url, data)\n", + "with urllib.request.urlopen(req) as f:\n", + " response = f.read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b0867b7-9bf2-4ebc-89c2-52d09724e4f7", + "metadata": {}, + "outputs": [], + "source": [ + "pdb = []\n", + "uniprot = []\n", + "for row in tqdm(response.decode(\"utf-8\").split(\"\\n\")[:-1]):\n", + " col1, col2 = row.split(\"\\t\")\n", + " pdb.append(col1)\n", + " uniprot.append(col2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c29c7a8-8e90-4996-8b01-aeec246bd69e", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_map = pd.DataFrame({\"pdb\": pdb[1:], \"uniprot\": uniprot[1:]})\n", + "pdb_map = pd.DataFrame(pdb_map.groupby(\"pdb\")[\"uniprot\"].apply(\",\".join))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "799c5363-7897-4e6a-a295-bb054d4ecc2a", + "metadata": {}, + "outputs": [], + "source": [ + "# pdb_map[\"pdb target\"] = pdb_map.index.values\n", + "pdb_map.columns = [\"pdb uniprot\"]\n", + "pdb_map" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6410b627-f4b4-408f-a391-710daf4bea36", + "metadata": {}, + "outputs": [], + "source": [ + "spongilla = spongilla.join(pdb_map, on=\"pdb target\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "186c6256-8dfb-4f02-8301-ea169b60ab12", + "metadata": {}, + "outputs": [], + "source": [ + "def best_names(df):\n", + " names = df[[\"pdb uniprot\", \"swp target\", \"afdb target\"]].copy()\n", + " scores = df[[\"pdb bit score\", \"swp bit score\", \"afdb bit score\"]].copy()\n", + "\n", + " names.replace(to_replace=\".*,.*\", regex=True, value=np.NaN, inplace=True)\n", + " names.columns = [\"pdb\", \"swp\", \"afdb\"]\n", + "\n", + " scores.columns = [\"pdb\", \"swp\", \"afdb\"]\n", + " scores = scores[~names.isnull()]\n", + "\n", + " names.dropna(how=\"all\", inplace=True)\n", + " scores.dropna(how=\"all\", inplace=True)\n", + "\n", + " bla = np.array(scores)\n", + " best_loc = np.array(bla.T == np.array(scores.max(axis=1))).T\n", + "\n", + " multiple_hits = np.sum(best_loc, axis=1) > 1\n", + "\n", + " has_swp = best_loc[:, 1]\n", + " best_loc[multiple_hits & has_swp] *= np.array([False, True, False])\n", + " best_loc[multiple_hits & has_swp]\n", + " has_pdb = best_loc[:, 2]\n", + " best_loc[multiple_hits & has_pdb] *= np.array([True, False, False])\n", + "\n", + " names[\"best foldseek UniProt ID\"] = np.array(names)[best_loc]\n", + " return names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa565ee5-1718-404f-9c29-d2d9f808acba", + "metadata": {}, + "outputs": [], + "source": [ + "names = best_names(spongilla)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "193e188d-273f-46be-8ac1-b8ff71ff1602", + "metadata": {}, + "outputs": [], + "source": [ + "spongilla = spongilla.join(names[\"best foldseek UniProt ID\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7259353-5035-4375-a0b8-a6718d0b0f9c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "spongilla[\"sequence annotation\"] = (spongilla[\"Preferred_name\"] != \"-\") | (spongilla[\"Description\"] != \"-\")" + ] + }, + { + "cell_type": "markdown", + "id": "e199b258-a4ca-418b-8518-5b45da67b5fa", + "metadata": {}, + "source": [ + "Based on either EggNOG Preferred name or EggNOG description, we call sequence annotation TRUE or FALSE." + ] + }, + { + "cell_type": "markdown", + "id": "5c9819cc-6431-4d94-a718-67c912c43d57", + "metadata": {}, + "source": [ + "## 3) How does structure annotation compare to sequence annotation?\n", + "\n", + "We will query the cases where the structure pipeline (MAF) _and_ the sequence pipeline (EggNOG) have both annotated a protein." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07fcbee2-680d-4afe-8cc8-529630e2688b", + "metadata": {}, + "outputs": [], + "source": [ + "spongilla[\"best foldseek UniProt ID\"] = spongilla[\"best foldseek UniProt ID\"].str.split(\"-\").str[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f290b907-6c73-459d-a2a1-998f04500155", + "metadata": {}, + "outputs": [], + "source": [ + "url = 'https://www.uniprot.org/uploadlists/'\n", + "\n", + "ids = spongilla[\"best foldseek UniProt ID\"]\n", + "unique_ids = ids[~ids.isnull()].unique()\n", + "ask_for = \" \".join(unique_ids)\n", + "\n", + "params = {\n", + "'from': 'ACC',\n", + "'to': 'EGGNOG_ID',\n", + "'format': 'tab',\n", + "'query': ask_for\n", + "}\n", + "\n", + "data = urllib.parse.urlencode(params)\n", + "data = data.encode('utf-8')\n", + "req = urllib.request.Request(url, data)\n", + "with urllib.request.urlopen(req) as f:\n", + " response = f.read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0438145-ede8-4deb-9d79-c620c6f88c75", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "uniprot = []\n", + "eggnog = []\n", + "for row in tqdm(response.decode(\"utf-8\").split(\"\\n\")[:-1]):\n", + " col1, col2 = row.split(\"\\t\")\n", + " if col2.startswith(\"ENOG50\"):\n", + " col2 = col2[6:]\n", + " uniprot.append(col1)\n", + " eggnog.append(col2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22abae46-5dac-45d3-ba6e-434c64b534e3", + "metadata": {}, + "outputs": [], + "source": [ + "eggnog_map = pd.DataFrame({\"uniprot\": uniprot[1:], \"eggnog\": eggnog[1:]})\n", + "eggnog_map = pd.DataFrame(eggnog_map.groupby(\"uniprot\")[\"eggnog\"].apply(\",\".join))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95434c69-bc1c-406c-8ef8-3a1188eca722", + "metadata": {}, + "outputs": [], + "source": [ + "# pdb_map[\"pdb target\"] = pdb_map.index.values\n", + "eggnog_map.columns = [\"best foldseek eggnog\"]\n", + "eggnog_map" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "449636b5-f221-449f-89b4-60bf7b9b9656", + "metadata": {}, + "outputs": [], + "source": [ + "spongilla = spongilla.join(eggnog_map, on=\"best foldseek UniProt ID\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40f65a3e-3dc7-4b34-9051-ac723de4b176", + "metadata": {}, + "outputs": [], + "source": [ + "spongilla" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ce50a7f-8689-4431-a165-c96a1e0228f4", + "metadata": {}, + "outputs": [], + "source": [ + "has_fs = spongilla[\"foldsought\"]\n", + "has_up = ~spongilla[\"best foldseek UniProt ID\"].isnull()\n", + "has_eg = ~spongilla[\"best foldseek eggnog\"].isnull()\n", + "\n", + "A = np.sum(has_fs & ~has_up & ~has_eg)\n", + "B = np.sum(~has_fs & has_up & ~has_eg)\n", + "AB = np.sum(has_fs & has_up & ~has_eg)\n", + "C = np.sum(~has_fs & ~has_up & has_eg)\n", + "AC = np.sum(has_fs & ~has_up & has_eg)\n", + "BC = np.sum(~has_fs & has_up & has_eg)\n", + "ABC = np.sum(has_fs & has_up & has_eg)\n", + "\n", + "venn3(subsets=(A, B, AB, C, AC, BC, ABC), set_labels=[\"FoldSeek\", \"UniProt\", \"EggNOG\"])" + ] + }, + { + "cell_type": "markdown", + "id": "a80194eb-b820-48d9-a67f-5ba27f182efc", + "metadata": {}, + "source": [ + "We lose a lot in the conversion between UniProt and EggNOG; this is unfortunate but expected, because EggNOG is enriched for model species and FoldSeek will mostly find things in AFDB." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a25a5794-4d6c-4d97-9bbd-b3e01708ee03", + "metadata": {}, + "outputs": [], + "source": [ + "def find_id_in_OG(row):\n", + " x = row[\"best foldseek eggnog\"]\n", + " y = row[\"eggNOG_OGs\"]\n", + " if x is np.NaN or y is np.NaN:\n", + " return False\n", + " if \",\" in x:\n", + " ids = x.split(\",\")\n", + " for i in ids:\n", + " if i in y:\n", + " return True\n", + " return False\n", + " else:\n", + " return x in y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8314fb8c-c057-4310-bb34-6c7b6884acc3", + "metadata": {}, + "outputs": [], + "source": [ + "spongilla[\"foldseek==eggNOG\"] = spongilla.apply(find_id_in_OG, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea4df93f-5e87-4cc0-8735-08d24c49ba3e", + "metadata": {}, + "outputs": [], + "source": [ + "has_eg = ~spongilla[\"eggNOG_OGs\"].isnull() & ~spongilla[\"best foldseek eggnog\"].isnull()\n", + "agrees = spongilla[\"foldseek==eggNOG\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ece1f37-3b25-489a-a746-09a826a900c6", + "metadata": {}, + "outputs": [], + "source": [ + "np.sum(spongilla[\"foldseek==eggNOG\"]) / np.sum(has_eg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af1d4f5b-beae-4a29-8f01-7c88df38e553", + "metadata": {}, + "outputs": [], + "source": [ + "sns.jointplot(x=\"query length\", y=\"plddt\", hue=\"foldseek==eggNOG\", data=spongilla[has_eg], kind=\"kde\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54ce7ab6-78ce-4fb0-b9f6-12fc2a1b36bb", + "metadata": {}, + "outputs": [], + "source": [ + "sns.jointplot(x=\"MSA size\", y=\"afdb bit score\", hue=\"foldseek==eggNOG\", data=spongilla[has_eg], kind=\"kde\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0fd7427-38a0-4fa1-a654-bbb49ef4095e", + "metadata": {}, + "outputs": [], + "source": [ + "sns.jointplot(x=\"score\", y=\"afdb bit score\", hue=\"foldseek==eggNOG\", data=spongilla[has_eg], kind=\"kde\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "284130c1-25af-4e65-bfb4-c927ad55efd0", + "metadata": {}, + "outputs": [], + "source": [ + "spongilla[has_eg & ~agrees]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f56a203-3e58-4b11-a013-08ee88e7baa3", + "metadata": {}, + "outputs": [], + "source": [ + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "015cb582-b2fb-4fa3-8e56-8bd369115a9d", + "metadata": {}, + "outputs": [], + "source": [ + "url = 'http://eggnogapi5.embl.de/nog_data/json/domains/'\n", + "\n", + "names = []\n", + "seq = []\n", + "struct = []\n", + "\n", + "for row in tqdm(spongilla[has_eg & ~agrees].iterrows()):\n", + " index, data = row\n", + " seq_query = data[\"eggNOG_OGs\"].split(\"@\")[0]\n", + " struct_query = data[\"best foldseek eggnog\"]\n", + " r = requests.get(url + seq_query)\n", + " seq.append(r.json()[\"domains\"])\n", + " r = requests.get(url + struct_query)\n", + " struct.append(r.json()[\"domains\"])\n", + " names.append(index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07f44d04-bcdc-4528-b820-6c163f2764c2", + "metadata": {}, + "outputs": [], + "source": [ + "domain_overlap = {}\n", + "for name, seq_dom, struct_dom in zip(names, seq, struct):\n", + " comment = \"\"\n", + " try:\n", + " seq_print = [x[0] for x in seq_dom[\"PFAM\"]]\n", + " except KeyError:\n", + " seq_print = [\"\"]\n", + " comment = \"missing seq\"\n", + " try:\n", + " struct_print = [x[0] for x in struct_dom[\"PFAM\"]]\n", + " except KeyError:\n", + " struct_print = [\"\"]\n", + " comment = \"missing struct\"\n", + " \n", + " overlap = np.intersect1d(seq_print, struct_print)\n", + " struct_overlap = len(overlap)/len(struct_print)*100\n", + " seq_overlap = len(overlap)/len(seq_print)*100\n", + " domain_overlap[name] = [\",\".join(struct_print), \",\".join(seq_print), struct_overlap, seq_overlap, len(struct_print), len(seq_print), comment]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec241bfc-19c1-4dc2-881c-2ac1349989dd", + "metadata": {}, + "outputs": [], + "source": [ + "domain_overlap = pd.DataFrame(domain_overlap).T\n", + "domain_overlap.columns = [\"struct. domains\", \"seq. domains\", \"overlap/struct\", \"overlap/seq\", \"#struct domains\", \"#seq domains\", \"comments\"]\n", + "domain_overlap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a32302a7-5c46-4d60-b796-c2d1bb9b13a4", + "metadata": {}, + "outputs": [], + "source": [ + "domain_overlap = domain_overlap.astype({'overlap/struct': 'float', 'overlap/seq': 'float'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9df4add-ca06-4caa-a34f-cb4c31cd70fe", + "metadata": {}, + "outputs": [], + "source": [ + "great = np.sum((domain_overlap[\"overlap/struct\"] >= 50) & (domain_overlap[\"overlap/seq\"] >= 50)) / domain_overlap.shape[0]\n", + "some = np.sum((domain_overlap[\"overlap/struct\"] > 0) & (domain_overlap[\"overlap/seq\"] > 0)) / domain_overlap.shape[0]\n", + "none = np.sum((domain_overlap[\"overlap/struct\"] == 0) & (domain_overlap[\"overlap/seq\"] == 0)) / domain_overlap.shape[0]\n", + "print(\"Proteins with sequence _and_ structure annotation whose EggNOG IDs do not overlap:\")\n", + "print(f\"- {some * 100:.2f}% share at least one domain\")\n", + "print(f\"- {great * 100:.2f}% share >50% of their domains\")\n", + "print(f\"- {none * 100:.2f}% share no domains\")" + ] + }, + { + "cell_type": "markdown", + "id": "7509fd23-fced-4e0e-8fe5-e864705c64d3", + "metadata": {}, + "source": [ + "**We can confidently say that in the overwhelming majority of cases, whenever MAF and EggNOG have annotated a protein, the annotation agrees.**\n", + "\n", + "## 3b) A MAF prediction is present but no EggNOG annotation could be assigned to the UniProt IDs.\n", + "\n", + "manual inspection shows that most of these map well into the same fold/family, but for some reason EggNOG cannot assign IDs to them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69287162-6b55-4a5a-a855-3d9c33810f5d", + "metadata": {}, + "outputs": [], + "source": [ + "has_eg = spongilla[\"sequence annotation\"]\n", + "maf_no_eg = ~spongilla[\"best foldseek UniProt ID\"].isnull() & spongilla[\"best foldseek eggnog\"].isnull()\n", + "spongilla[has_eg & maf_no_eg]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f41b4689-0d4a-469f-9a86-5b9bf65f3e51", + "metadata": {}, + "outputs": [], + "source": [ + "url = 'https://www.uniprot.org/uploadlists/'\n", + "\n", + "not_null = ~spongilla[\"best foldseek UniProt ID\"].isnull()\n", + "to_map = spongilla[has_eg & maf_no_eg & not_null][\"best foldseek UniProt ID\"].values\n", + "\n", + "params = {\n", + "'from': 'ACC',\n", + "'to': 'GENENAME',\n", + "'format': 'tab',\n", + "'query': \" \".join(to_map)\n", + "}\n", + "\n", + "data = urllib.parse.urlencode(params)\n", + "data = data.encode('utf-8')\n", + "req = urllib.request.Request(url, data)\n", + "with urllib.request.urlopen(req) as f:\n", + " response = f.read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "722e8066-eaa0-4c4b-a48c-5b1a65d9e790", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "uniprot = []\n", + "names = []\n", + "for row in tqdm(response.decode(\"utf-8\").split(\"\\n\")[:-1]):\n", + " col1, col2 = row.split(\"\\t\")\n", + " uniprot.append(col1)\n", + " names.append(col2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1761aeac-d65b-4f20-a2c2-b5c4b7f39a55", + "metadata": {}, + "outputs": [], + "source": [ + "names_map = pd.DataFrame({\"uniprot\": uniprot[1:], \"names\": names[1:]})\n", + "names_map = pd.DataFrame(names_map.groupby(\"uniprot\")[\"names\"].apply(\",\".join))\n", + "\n", + "# pdb_map[\"pdb target\"] = pdb_map.index.values\n", + "names_map.columns = [\"foldseek uniprot name\"]\n", + "# names_map" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17376a07-5b77-4657-9d96-854ed23a9aff", + "metadata": {}, + "outputs": [], + "source": [ + "test = spongilla[has_eg & maf_no_eg].copy()\n", + "test = test.join(names_map, on=\"best foldseek UniProt ID\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d05542ce-f666-4bdb-ad8a-e92aae6812b6", + "metadata": {}, + "outputs": [], + "source": [ + "test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c2c2614-5678-4760-89e3-f6acbdefd6e4", + "metadata": {}, + "outputs": [], + "source": [ + "np.sum(test[\"foldseek uniprot name\"].str.lower().str.replace(\"-\", \"\") == test[\"Preferred_name\"].str.lower())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e66f247-31dc-4cdc-a052-d5dc7b17bf0e", + "metadata": {}, + "outputs": [], + "source": [ + "test[[\"foldseek uniprot name\", \"Preferred_name\", \"Description\"]][40:80]" + ] + }, + { + "cell_type": "markdown", + "id": "b297e84b-490a-4b66-9d30-c5c7a02c3988", + "metadata": {}, + "source": [ + "## 3c) FoldSeek enrichment: MAF prediction exists, but no sequence prediction was produced." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96f53f50-3f25-4340-9fc6-5a1cf56beeb0", + "metadata": {}, + "outputs": [], + "source": [ + "spongilla.to_csv(\"../data/spongilla.tsv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5461722f-ae6f-42dd-96b9-67e10bc699e4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "has_eg = ~spongilla[\"sequence annotation\"]\n", + "maf = ~spongilla[\"best foldseek UniProt ID\"].isnull() & spongilla[\"best foldseek eggnog\"].isnull()\n", + "spongilla[has_eg & maf]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c5a9e90-720e-49e2-be53-c873dd0a55b7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analysis/read-write.ipynb b/analysis/read-write.ipynb index dfb2d4eb254c809a148d1e1a551a0eb77280bf87..cb63ea22d0355e4041aa07e540ed4e28e61b8370 100644 --- a/analysis/read-write.ipynb +++ b/analysis/read-write.ipynb @@ -3,8 +3,36 @@ { "cell_type": "code", "execution_count": 1, - "id": "453497ce-69c3-4707-a742-488ca5114bca", + "id": "a5e450d0-6b47-405d-b6f4-fdbb3edfb641", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-05-12 08:55\n" + ] + } + ], + "source": [ + "from datetime import datetime, timezone\n", + "import pytz\n", + "\n", + "utc_dt = datetime.now(timezone.utc) # UTC time\n", + "dt = utc_dt.astimezone()\n", + "tz = pytz.timezone('Europe/Berlin')\n", + "berlin_now = datetime.now(tz)\n", + "\n", + "print(f'{berlin_now:%Y-%m-%d %H:%M}')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1eae5786-023a-45ab-ac4a-ccb3a1db5760", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import json\n", @@ -40,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "c58419bf-f9d3-40f6-ae23-a4bf978f2b35", "metadata": {}, "outputs": [], @@ -60,15 +88,22 @@ }, { "cell_type": "markdown", - "id": "2756e053-9223-4d69-8a4e-0c50da18b079", + "id": "9a44be9c-3fdf-4c62-9e16-bea75e383d00", "metadata": {}, "source": [ - "Read the peptides and keep track of which sequences start with a methionine and end with a star (i.e. complete transcripts assembled by Trinity/TransDecoder)" + "## 1. Sequence information\n", + "\n", + "\n", + "### 1.1 Input peptides\n", + "\n", + "The input for the sequence annotation pipeline was predicted peptides, as generated from the assembled genome. Read the peptides and keep track of which sequences start with a methionine and end with a star (i.e. complete transcripts assembled by Trinity/TransDecoder).\n", + "\n", + "Note that there will be multiple (predicted) isoforms per (predicted) gene. We produced one MSA per isoform." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "40fd2798-4dba-48f6-bbd1-fc5758c84835", "metadata": {}, "outputs": [ @@ -76,7 +111,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "223492it [00:00, 1385068.21it/s]\n" + "223492it [00:00, 1306495.29it/s]\n" ] } ], @@ -103,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 5, "id": "2e2fafde-8653-4286-aeb4-cecb55804e4f", "metadata": {}, "outputs": [], @@ -118,17 +153,15 @@ }, { "cell_type": "markdown", - "id": "d5377c66-80fc-4ead-8afc-68010c787a8b", + "id": "9ef9eeba-0082-4109-ad9b-33df3db9b129", "metadata": {}, "source": [ - "## 1. Multiple sequence alignments\n", - "\n", - "First read the MSAs and extract the number of sequences in each as well as the sequence length and the _Spongilla_ transcript name" + "The peptides were aligned against UniRef30 and ColabFoldDB using MMSeqs2. Read the MSAs and extract the number of sequences in each as well as the sequence length and the _Spongilla_ transcript name." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "cc2f89f4-1b24-4b1d-b1ef-b1c40ea99ff4", "metadata": {}, "outputs": [ @@ -136,7 +169,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 41943/41943 [11:06<00:00, 62.89it/s] \n" + "100%|██████████| 41943/41943 [06:08<00:00, 113.92it/s]\n" ] } ], @@ -155,16 +188,14 @@ " no_seqs[i] = (len(lines) - 3) / 2\n", " seq_length[i] = lines[0].rstrip()[1:].split()[0]\n", " gene_name[i] = lines[1].rstrip()[1:]\n", - " # print(no_seqs, seq_length, gene_name)\n", + " # print(no_seqs, seq_length, gene_name)\n", " except FileNotFoundError:\n", - " continue\n", - "\n", - "sequence_info = pd.DataFrame({\"query\": seq_id, \"MSA size\": no_seqs, \"query length\": seq_length, \"gene name\": gene_name})" + " continue" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 7, "id": "64a7ca10-f99c-402e-a9bf-42826933ae9e", "metadata": {}, "outputs": [], @@ -186,17 +217,19 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 8, "id": "5ae5dfb2-84a6-420f-8ac2-2c79d4f95e16", "metadata": {}, "outputs": [], "source": [ - "sequence_info = sequence_info.join(fasta)" + "sequence_info = sequence_info.join(fasta)\n", + "# this column will hold NaNs later so convert it to Int64, which can deal with nulls.\n", + "sequence_info[\"protein_id\"] = sequence_info[\"protein_id\"].astype(int).astype(\"Int64\")" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 9, "id": "c61347d2-f602-4f76-a656-70528242a4be", "metadata": {}, "outputs": [], @@ -209,14 +242,14 @@ "id": "1b716527-0af8-4945-b70f-a11302c11685", "metadata": {}, "source": [ - "## 2. AlphaFold predictions\n", + "### 1.2 AlphaFold-predicted structures\n", "\n", - "Next, read the per-residue pLDDT score from AlphaFold and average it; then keep the best-scoring isoform per gene ID." + "We used the MSAs to predict protein structures using ColabFold. Here, we read the per-residue pLDDT score from the best predicted structure per peptide and average it over all residues; then we merge _Spongilla_ isoforms by keeping the best score per gene ID." ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 10, "id": "92560aed-4440-4577-91ca-2b4bc6c325ea", "metadata": {}, "outputs": [ @@ -224,7 +257,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 41932/41932 [19:00<00:00, 36.76it/s] \n" + "100%|██████████| 41932/41932 [23:33<00:00, 29.67it/s] \n" ] } ], @@ -245,18 +278,17 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 11, "id": "def42203-df23-4f4b-ad7b-a01dde971452", "metadata": {}, "outputs": [], "source": [ - "alphafold = pd.DataFrame({\"query\": proteins, \"plddt\": scores})\n", - "alphafold.set_index(\"query\", inplace=True)" + "alphafold = pd.DataFrame({\"query\": proteins, \"plddt\": scores})" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 12, "id": "3831f7ff-07b6-42ae-8276-ab899ab6ffa0", "metadata": {}, "outputs": [], @@ -271,13 +303,13 @@ "source": [ "## 3. FoldSeek predictions\n", "\n", - "Finally, read the FoldSeek output. We will do some minor string manipulation to obtain cleaner protein names. After reading the files we will immediately concatenate to the sequence information to obtain a mapping between the AlphaFold query numbers and the gene/protein names." + "We searched against AlphaFoldDB (predicted), PDB (crystal), and SwissProt (predicted) protein structures to detect structural similarity to our predicted _Spongilla_ peptide structures using FoldSeek. Here we read the FoldSeek output. In order to obtain useful phylogenetic information as well as functional annotation, we will translate all structural hits to their UniProt IDs and then query the EggNOG database and UniProt for annotation." ] }, { "cell_type": "code", - "execution_count": 54, - "id": "01ea3860-9614-4651-ab18-25ab13565a91", + "execution_count": 13, + "id": "f9a0fd51-3e71-4c32-95d6-6f0dbeb4cb3c", "metadata": {}, "outputs": [], "source": [ @@ -296,8 +328,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "id": "ecfa5e2e-6236-42bb-ad71-e2f34a7ce9e2", + "execution_count": 14, + "id": "4e8efa67-66a8-4f4b-87c3-e0ae57991860", "metadata": {}, "outputs": [], "source": [ @@ -311,198 +343,406 @@ }, { "cell_type": "code", - "execution_count": 58, - "id": "26e441c5-9274-42bf-af51-45b25f028f89", + "execution_count": 15, + "id": "122b7f1c-adab-4a0d-8486-41bebcd9dfd3", "metadata": {}, "outputs": [], "source": [ - "def keep_best_match(df, prefix):\n", - " df = df.sort_values('bit score', ascending=False).drop_duplicates(['gene_id'])\n", - " df.drop(columns=[\"gene name\"], inplace=True)\n", - " df[\"alphafold id\"] = df.index.values\n", - " df.set_index(\"gene_id\", inplace=True)\n", - " df.columns = prefix + \" \" + df.columns\n", - " return df" + "afdb[\"uniprot\"] = afdb[\"target\"].str.split(\"-\").str[1]\n", + "swp[\"uniprot\"] = swp[\"target\"].str.split(\"-\").str[1]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "89064170-95fe-4dc8-8f47-d370999750cd", + "execution_count": 16, + "id": "9b85e1a6-4df6-4975-9007-d91b8e25b724", "metadata": {}, "outputs": [], "source": [ - "pdb = pdb.join(sequence_info, on=\"query\")\n", - "afdb = afdb.join(sequence_info, on=\"query\")\n", - "swp = swp.join(sequence_info, on=\"query\")" + "def get_from_uniprot(df, column, uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\", request_size=40000):\n", + " ids = df[column][~df[column].isnull()]\n", + " url = 'https://www.uniprot.org/uploadlists/'\n", + "\n", + " params = {\n", + " 'from': uniprot_from,\n", + " 'to': uniprot_to,\n", + " 'format': 'tab',\n", + " 'query': \" \".join(ids.unique())\n", + " }\n", + "\n", + " data = urllib.parse.urlencode(params)\n", + " data = data.encode('utf-8')\n", + " req = urllib.request.Request(url, data)\n", + " with urllib.request.urlopen(req) as f:\n", + " response = f.read()\n", + " return response\n", + "\n", + "def create_id_map(response, id_in, id_out):\n", + " map_from = []\n", + " map_to = []\n", + " for row in tqdm(response.decode(\"utf-8\").split(\"\\n\")[:-1]):\n", + " col1, col2 = row.split(\"\\t\")\n", + " map_from.append(col1)\n", + " map_to.append(col2)\n", + "\n", + " res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})\n", + " res = pd.DataFrame(res.groupby(id_in)[id_out].apply(\",\".join))\n", + "# res.set_index(id_in, inplace=True)\n", + " return res\n", + "\n", + "def enrich_from_uniprot(df, column_from, column_to, uniprot_from=\"PDB_ID\", uniprot_to=\"ACC\"):\n", + " response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)\n", + " df_map = create_id_map(response, column_from, column_to)\n", + " return df.join(df_map, on=column_from)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b5e5fc27-3d71-4d14-a6d6-576b185d1901", + "execution_count": 17, + "id": "76ff0420-e724-4152-abbc-29d25bfd3f59", "metadata": { "tags": [] }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 142581/142581 [00:00<00:00, 2335162.30it/s]\n" + ] + } + ], + "source": [ + "pdb = enrich_from_uniprot(pdb, \"target\", \"uniprot\", uniprot_from=\"PDB_ID\", uniprot_to=\"ACC\")" + ] + }, + { + "cell_type": "markdown", + "id": "e51f2299-2aaa-4a80-836e-bf6fa09df562", + "metadata": {}, + "source": [ + "### 3.2 Obtaining sequences for the UniProt IDs" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7262d562-3c2d-4a7a-9361-1bd197d30a7c", + "metadata": {}, "outputs": [], "source": [ - "pdb = keep_best_match(pdb, \"pdb\")\n", - "afdb = keep_best_match(afdb, \"afdb\")\n", - "swp = keep_best_match(swp, \"swp\")" + "unique_up_id = pd.concat([pdb['uniprot'].drop_duplicates(),\n", + " afdb['uniprot'].drop_duplicates(),\n", + " swp['uniprot'].drop_duplicates()])\n", + "unique_up_id.drop_duplicates(inplace=True)" ] }, { "cell_type": "markdown", - "id": "83a56bba-076b-48b4-b886-68a306f9e3d6", + "id": "f8fbc904-aaf5-4f2a-823c-803b35162765", "metadata": {}, "source": [ - "We now have five tables that are all indexed by the query IDs. We can merge those and obtain the summary table of the run:" + "We will write the IDs in a text file without a header and an index, since we plan on passing this on to `upimapi.py`, a tool to programmatically query UniProt." ] }, { "cell_type": "code", - "execution_count": null, - "id": "4c0d87ff-0df4-43a4-8616-b0c34c88b199", + "execution_count": 19, + "id": "df0fdae3-52e6-4e7a-87e8-c81401278b09", "metadata": {}, "outputs": [], "source": [ - "# concatenate foldseek results\n", - "foldseek = pdb.join(afdb, how=\"outer\").join(swp, how=\"outer\")\n", - "# add structure prediction confidence\n", - "foldseek = foldseek.join(alphafold.set_index(\"gene_id\"), how=\"outer\")" + "unique_up_id.to_csv('../data/unique_uniprot_ids.csv', header=False, index=False)" ] }, { "cell_type": "markdown", - "id": "668068c9-bfad-46ea-9208-4ad769bdce66", + "id": "812f9ac5-4ef4-495a-bf60-b8b9eea0b9b6", "metadata": {}, "source": [ - "Before we continue, keep track of which proteins got any foldseek prediction. This is the first bottleneck on the road to expanding functional annotation!" + "While UPIMAPI will chunk automatically every 10k IDs, we want to use Emapper for annotation, and it has a hard limit at 100.000 IDs per request. Unfortunately, looking up UniProt IDs somehow seems to inflate their numbers, so splitting the table here, which would be the cleaner solution, is not possible. We will append all IDs to each other, separated by commas, and submit the whole file to `upimapi.py`." ] }, { "cell_type": "code", - "execution_count": null, - "id": "dd5dcb98-9f1f-49a8-9b4b-816f14a9b8b3", + "execution_count": 20, + "id": "ec3015c3-316e-48fe-a08a-4fd4323cb9ea", "metadata": {}, "outputs": [], "source": [ - "# create a categorical variable that keeps track of which query proteins didn't get any FoldSeek hits\n", - "foldseek[\"foldsought\"] = ~foldseek[\"pdb target\"].isnull() | ~foldseek[\"afdb target\"].isnull() | ~foldseek[\"swp target\"].isnull()" + "with open(f'../data/foldseek_unique_ids.txt', 'w') as tmp:\n", + " ids_as_str = unique_up_id.astype(str)[0]\n", + " tmp.write(','.join(ids_as_str))" + ] + }, + { + "cell_type": "markdown", + "id": "1da0e8ce-564e-42a5-a3b5-34c777dbc018", + "metadata": {}, + "source": [ + "<div class=\"alert alert-block alert-warning\"><b>DISCLAIMER:</b> To make this cell execute, you may need to copy the contents of the UPIMAPI/resource folder into the main repository directory.</div>" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "4650ac1f-4d11-4590-b39f-2a16468d808e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-05-12 07:30:01: ID mapping has begun.\n", + "Auto determined \"full id\" as: False\n", + "../data/uniprotinfo.fasta was found. Will perform mapping for the remaining IDs.\n", + "Maximum iterations were made. Results related to 3 IDs were not obtained. IDs with missing information are available at ../data/ids_unmapped.txt and information obtained is available at ../data/uniprotinfo.fasta\n", + "2022-05-12 07:30:05: UPIMAPI analysis finished in 00h00m03s\n" + ] + } + ], + "source": [ + "!python /g/arendt/npapadop/repos/UPIMAPI/upimapi.py -i ../data/foldseek_unique_ids.txt -o ../data/ --fasta --no-annotation" + ] + }, + { + "cell_type": "markdown", + "id": "57a1ea9a-b33b-4eeb-bdfc-d92c9ef145fd", + "metadata": {}, + "source": [ + "To submit to Emapper we first need to split in chunks of size $\\leq 100.000$. We will use `fasta-splitter.pl` by Kirill Kryukov." ] }, { "cell_type": "code", - "execution_count": null, - "id": "b2bcc4ed-6998-4fe1-b1c2-a932da32c5b2", + "execution_count": 22, + "id": "5b403921-8514-431a-a34d-f92fbf034ab8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "../data/uniprotinfo.fasta: 937447 sequences, 371443258 bp => dividing into 10 parts of <= 100000 sequences\n", + "All done, 17 seconds elapsed\n" + ] + } + ], + "source": [ + "%%bash\n", + "mkdir ../data/uniprot_fastas/ -p\n", + "perl ../scripts/fasta-splitter.pl --part-size 100000 ../data/uniprotinfo.fasta --nopad --measure count --out-dir ../data/uniprot_fastas/" + ] + }, + { + "cell_type": "markdown", + "id": "cc1e0086-8c3f-4881-938e-0270a9f9e53e", + "metadata": {}, + "source": [ + "These were manually submitted to emapper, and the results downloaded and saved.\n", + "\n", + "Let's process the retrieved emapper results:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "22630619-3006-425f-bace-980499b61afd", + "metadata": {}, + "outputs": [], + "source": [ + "uniprot_eggnog = []\n", + "for file in glob.glob('../data/uniprot_fastas/*.eggnog'):\n", + " tmp = pd.read_csv(file, sep='\\t', skiprows=4, skipfooter=3, engine='python')\n", + " uniprot_eggnog.append(tmp)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "65cd9d5e-fdac-4a14-94f7-787fdba690fa", "metadata": {}, "outputs": [], "source": [ - "foldseek.to_csv(\"../data/foldseek.csv\")" + "uniprot_annotation = pd.concat(uniprot_eggnog)\n", + "\n", + "uniprot_annotation['uniprot'] = uniprot_annotation['#query'].str.split('|').str[1]\n", + "uniprot_annotation.reset_index(drop=True, inplace=True)\n", + "# remove unnecessary columns\n", + "dead_weight = ['#query', 'seed_ortholog', 'EC', 'KEGG_ko', 'KEGG_Pathway',\n", + " 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE',\n", + " 'KEGG_TC', 'CAZy', 'BiGG_Reaction']\n", + "uniprot_annotation.drop(dead_weight, axis=1, inplace=True)\n", + "# convert rest to categorical to save space\n", + "to_categorical = ['uniprot', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',\n", + " 'Description', 'Preferred_name', 'GOs', 'PFAMs']\n", + "uniprot_annotation[to_categorical] = uniprot_annotation[to_categorical].astype(\"category\")\n", + "# finally save in parquet format\n", + "uniprot_annotation.to_parquet('../data/uniprot_annotation.parquet')" + ] + }, + { + "cell_type": "markdown", + "id": "8a2910c3-dbd5-4268-8eed-eef8c9796ca5", + "metadata": {}, + "source": [ + "We will also query UniProt to get useful gene names, protein names, functional annotation, and taxonomic information. This info is also included in Emapper, but here we get it in a more accessible form." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "c7d97917-53c6-4b51-98a5-97a55100aa37", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-05-12 07:30:53: ID mapping has begun.\n", + "Auto determined \"full id\" as: False\n", + "../data/uniprotinfo.tsv was found. Will perform mapping for the remaining IDs.\n", + "IDs present in uniprotinfo file: 936478\n", + "IDs missing: 0\n", + "Results for all IDs are available at ../data/uniprotinfo.tsv\n", + "2022-05-12 07:31:21: UPIMAPI analysis finished in 00h00m27s\n" + ] + } + ], + "source": [ + "!python /g/arendt/npapadop/repos/UPIMAPI/upimapi.py -i ../data/foldseek_unique_ids.txt -o ../data/ --no-annotation" ] }, { "cell_type": "markdown", - "id": "d83e6a8d-864a-405f-9ff2-2fc439ac883a", + "id": "abcfc14c-c236-47c1-ad7a-1bba7244e46b", "metadata": {}, "source": [ - "Now we need the fasta sequences of the hits so we can obtain eggNOG information for all of them:" + "The `csv` file really is huge though so we'll read it and convert to parquet format:" ] }, { "cell_type": "code", - "execution_count": null, - "id": "0be86c44-013e-45ba-a5e9-a681893db417", + "execution_count": 26, + "id": "87957304-19a6-451d-aa82-9ece85cc90de", "metadata": {}, "outputs": [], "source": [ - "foldseek = foldseek.drop(index=foldseek.index[foldseek[\"plddt\"].isnull()])" + "foldseek_full = pd.read_csv('../data/uniprotinfo.tsv', sep='\\t')\n", + "foldseek_full = foldseek_full[['Entry', 'Entry name', 'Gene names', 'Function [CC]', 'Taxonomic lineage (PHYLUM)']]\n", + "foldseek_full.columns = ['uniprot', 'Entry name', 'Gene names', 'Function [CC]', 'Taxonomic lineage (PHYLUM)']\n", + "foldseek_full.to_parquet('../data/fs_targets.parquet')" + ] + }, + { + "cell_type": "markdown", + "id": "86ebcf92-6b4f-4b4c-89dc-f3efc0dfb6b0", + "metadata": {}, + "source": [ + "Add the UniProt and EggNOG annotation to the FoldSeek results:" ] }, { "cell_type": "code", - "execution_count": null, - "id": "fb4ffc3c-ae6f-4610-9564-1663b7f24869", + "execution_count": 28, + "id": "019667f8-47b2-4959-abe8-38e5529b4e86", "metadata": {}, "outputs": [], "source": [ - "foldseek[\"afdb uniprot\"] = foldseek[\"afdb target\"].str.split(\"-\").str[1]\n", - "foldseek[\"swp uniprot\"] = foldseek[\"swp target\"].str.split(\"-\").str[1]" + "pdb = pdb.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')\n", + "afdb = afdb.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')\n", + "swp = swp.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')" + ] + }, + { + "cell_type": "markdown", + "id": "8d55b03f-b7f9-408b-9a48-10573f4984f7", + "metadata": {}, + "source": [ + "Prune matrices to make them smaller:" ] }, { "cell_type": "code", - "execution_count": null, - "id": "67390688-77fb-493f-86fa-20c2727dfe29", + "execution_count": 29, + "id": "5133f52a-26ec-41a6-b90a-5d88a3092c6b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dead_weight = ['no. mismatches', 'no. gap open', 'query start', 'target start',\n", + " 'query end', 'target end', 'target']\n", + "pdb.drop(dead_weight, axis=1, inplace=True)\n", + "afdb.drop(dead_weight, axis=1, inplace=True)\n", + "swp.drop(dead_weight, axis=1, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "bf6217eb-a982-425e-b68f-2ec57972e864", + "metadata": {}, + "source": [ + "Convert non-numeric columns to categorical; many annotations will be repeated (explore by using `.value_counts()` on any of these columns), therefore we will gain a lot of space by converting away from `object`." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "85f48253-2176-4b3b-ae78-e18dc8156d6f", "metadata": {}, "outputs": [], "source": [ - "def get_from_uniprot(df, column, uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\", request_size=40000):\n", - " ids = df[column][~df[column].isnull()]\n", - " url = 'https://www.uniprot.org/uploadlists/'\n", - "\n", - " params = {\n", - " 'from': uniprot_from,\n", - " 'to': uniprot_to,\n", - " 'format': 'tab',\n", - " 'query': \" \".join(ids.unique())\n", - " }\n", - "\n", - " data = urllib.parse.urlencode(params)\n", - " data = data.encode('utf-8')\n", - " req = urllib.request.Request(url, data)\n", - " with urllib.request.urlopen(req) as f:\n", - " response = f.read()\n", - " return response\n", - "\n", - "def create_id_map(response, id_in, id_out):\n", - " map_from = []\n", - " map_to = []\n", - " for row in tqdm(response.decode(\"utf-8\").split(\"\\n\")[:-1]):\n", - " col1, col2 = row.split(\"\\t\")\n", - " map_from.append(col1)\n", - " map_to.append(col2)\n", - "\n", - " res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})\n", - " res = pd.DataFrame(res.groupby(id_in)[id_out].apply(\",\".join))\n", - "# res.set_index(id_in, inplace=True)\n", - " return res\n", - "\n", - "def enrich_from_uniprot(df, column_from, column_to, uniprot_from=\"PDB_ID\", uniprot_to=\"ACC\"):\n", - " response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)\n", - " df_map = create_id_map(response, column_from, column_to)\n", - " return df.join(df_map, on=column_from)" + "to_categorical = ['uniprot', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',\n", + " 'Description', 'Preferred_name', 'GOs', 'PFAMs', 'Entry name',\n", + " 'Gene names', 'Function [CC]', 'Taxonomic lineage (PHYLUM)']\n", + "pdb[to_categorical] = pdb[to_categorical].astype(\"category\")\n", + "afdb[to_categorical] = afdb[to_categorical].astype(\"category\")\n", + "swp[to_categorical] = swp[to_categorical].astype(\"category\")" + ] + }, + { + "cell_type": "markdown", + "id": "4d2bbe21-58d1-4eb4-9bc8-4007b8accf77", + "metadata": {}, + "source": [ + "Finally save in parquet format:" ] }, { "cell_type": "code", - "execution_count": null, - "id": "1ae80176-5d9f-4024-bd77-65a0aac5bcc9", + "execution_count": 31, + "id": "3114c69d-cf36-41f9-93ba-6344b2b86b66", "metadata": {}, "outputs": [], "source": [ - "foldseek = enrich_from_uniprot(foldseek, \"pdb target\", \"pdb uniprot\", uniprot_from=\"PDB_ID\", uniprot_to=\"ACC\")" + "pdb.to_parquet('../data/fs_full_pdb.parquet')\n", + "afdb.to_parquet('../data/fs_full_afdb.parquet')\n", + "swp.to_parquet('../data/fs_full_swp.parquet')" + ] + }, + { + "cell_type": "markdown", + "id": "7026038b-d3e2-43dc-886f-1aa412a0d8ea", + "metadata": {}, + "source": [ + "Also process the sequence-based annotation for _Spongilla_ so that it is in a more useful format:" ] }, { "cell_type": "code", - "execution_count": null, - "id": "a069cf15-9c8e-4fba-9174-8dd7603cdbc7", + "execution_count": 32, + "id": "ab94e5ee-e275-4756-8913-458be78a26d3", "metadata": {}, "outputs": [], "source": [ - "params = {\n", - " 'from': \"ACC\",\n", - " 'to': \"GENENAME\",\n", - " 'format': 'tab',\n", - " 'query': \" \".join(unique_ids[(i*request_size):((i+1)*request_size)])\n", - "}\n", - "\n", - "data = urllib.parse.urlencode(params)\n", - "data = data.encode('utf-8')\n", - "req = urllib.request.Request(url, data)\n", - "with urllib.request.urlopen(req) as f:\n", - " response[i] = f.read()" + "eggnog = pd.read_csv('../data/MM_ipw26ffh.emapper.annotations.tsv', sep='\\t', skiprows=4, skipfooter=3, engine='python')\n", + "eggnog[\"gene_id\"] = eggnog[\"#query\"].str.split(\"_\").str[:2].str.join(\"_\")\n", + "eggnog[\"protein_id\"] = eggnog[\"#query\"].str.split(\".\").str[1]\n", + "eggnog.drop(['#query', 'seed_ortholog', 'EC', 'KEGG_ko', 'KEGG_Pathway',\n", + " 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE', 'KEGG_TC',\n", + " 'CAZy', 'BiGG_Reaction'], axis=1, inplace=True)\n", + "eggnog.to_csv('../data/Slacustris_eggnog.tsv', sep='\\t', index=False)" ] } ], diff --git a/scripts/fasta-splitter.pl b/scripts/fasta-splitter.pl new file mode 100644 index 0000000000000000000000000000000000000000..fa5e980e48c61c90b106f04fcd26e08f19aeaecd --- /dev/null +++ b/scripts/fasta-splitter.pl @@ -0,0 +1,297 @@ +#!/usr/bin/env perl +# +# FASTA Splitter - a script for partitioning a FASTA file into pieces +# +# Version 0.2.6 (August 1, 2017) +# +# Copyright (c) 2012-2017 Kirill Kryukov +# +# This software is provided 'as-is', without any express or implied +# warranty. In no event will the authors be held liable for any damages +# arising from the use of this software. +# +# Permission is granted to anyone to use this software for any purpose, +# including commercial applications, and to alter it and redistribute it +# freely, subject to the following restrictions: +# +# 1. The origin of this software must not be misrepresented; you must not +# claim that you wrote the original software. If you use this software +# in a product, an acknowledgment in the product documentation would be +# appreciated but is not required. +# 2. Altered source versions must be plainly marked as such, and must not be +# misrepresented as being the original software. +# 3. This notice may not be removed or altered from any source distribution. +# + +use strict; +use File::Basename qw(basename); +use File::Path qw(make_path); +use Getopt::Long qw(:config pass_through); + +$| = 1; + +my ($script_name,$script_version,$script_date,$script_years) = ('fasta-splitter','0.2.6','2017-08-01','2012-2017'); +my $start_time = time; + +my @files = (); +my ($opt_n_parts,$opt_part_size,$opt_part_num_prefix,$opt_measure,$opt_line_len,$opt_eol,$out_dir,$nopad,$ver,$help); +GetOptions('n-parts=i' => \$opt_n_parts, + 'part-size=i' => \$opt_part_size, + 'part-num-prefix=s' => \$opt_part_num_prefix, + 'measure=s' => \$opt_measure, + 'line-length=i' => \$opt_line_len, + 'eol=s' => \$opt_eol, + 'out-dir=s' => \$out_dir, + 'nopad' => \$nopad, + 'version' => \$ver, + 'help' => \$help); +for (my $i=0; $i<scalar(@ARGV); $i++) +{ + if (substr($ARGV[$i],0,1) eq '-' and $i < scalar(@ARGV)-1) + { + if ($ARGV[$i] eq '-n-parts-total' ) { $opt_n_parts = int($ARGV[++$i]); $opt_measure = 'all'; } + if ($ARGV[$i] eq '-n-parts-sequence' ) { $opt_n_parts = int($ARGV[++$i]); $opt_measure = 'seq'; } + if ($ARGV[$i] eq '-part-total-size' ) { $opt_part_size = int($ARGV[++$i]); $opt_measure = 'all'; } + if ($ARGV[$i] eq '-part-sequence-size') { $opt_part_size = int($ARGV[++$i]); $opt_measure = 'seq'; } + if ($ARGV[$i] eq '-line-length') { $opt_line_len = int($ARGV[++$i]); } + if ($ARGV[$i] eq '-eol' ) { $opt_eol = int($ARGV[++$i]); } + } + else { push @files, $ARGV[$i]; } +} + +my $ver_str = "$script_name, version $script_version, $script_date\nCopyright (c) $script_years Kirill Kryukov\n"; +my $help_str = qq{Usage: ${script_name} [options] <file>... +Options: + --n-parts <N> - Divide into <N> parts + --part-size <N> - Divide into parts of size <N> + --measure (all|seq|count) - Specify whether all data, sequence length, or + number of sequences is used for determining part + sizes ('all' by default). + --line-length - Set output sequence line length, 0 for single line + (default: 60). + --eol (dos|mac|unix) - Choose end-of-line character ('unix' by default). + --part-num-prefix T - Put T before part number in file names (def.: .part-) + --out-dir - Specify output directory. + --nopad - Don't pad part numbers with 0. + --version - Show version. + --help - Show help. +}; + +print (($ver ? $ver_str : ''), ($help ? $help_str : '')); +if (!defined($opt_n_parts) and !defined($opt_part_size) and !defined($opt_measure) and !defined($opt_line_len) and !defined($opt_eol)) +{ + if (!$help and !$ver) { print $ver_str, $help_str; } exit; +} +if (!defined($opt_n_parts) and !defined($opt_part_size)) { die "Splitting method is not specified\nUse -h for help\n"; } +if (!@files) { die "File for splitting is not specified\n"; } + +if (defined($opt_n_parts) and $opt_n_parts <= 0) { die "Non-positive number of parts\n"; } +if (defined($opt_part_size) and $opt_part_size <= 0) { die "Non-positive part size\n"; } +if (defined($opt_measure) and $opt_measure ne 'all' and $opt_measure ne 'seq' and $opt_measure ne 'count') { die "Unknown value of --measure option\n"; } +if (defined($opt_eol) and $opt_eol ne 'dos' and $opt_eol ne 'mac' and $opt_eol ne 'unix') { die "Unknown value of --eol option\n"; } +if (defined($out_dir)) +{ + $out_dir =~ s/[\/\\]+$//; + if (!-e $out_dir) { make_path($out_dir); } + if (!-e $out_dir || !-d $out_dir) { die "Can't create output directory \"$out_dir\"\n"; } + $out_dir .= '/'; +} + +my $n_parts = defined($opt_n_parts) ? $opt_n_parts : 0; +my $part_size = defined($opt_part_size) ? $opt_part_size : 0; +my $line_len = (defined($opt_line_len) and $opt_line_len >= 0) ? $opt_line_len : 60; +my $eol = defined($opt_eol) ? (($opt_eol eq 'dos') ? "\x0D\x0A" : ($opt_eol eq 'mac') ? "\x0D" : "\x0A") : "\x0A"; +my $eol_len = length($eol); +my $measure = defined($opt_measure) ? (($opt_measure eq 'count') ? 0 : ($opt_measure eq 'seq') ? 1 : 2) : 2; +my $part_num_prefix = defined($opt_part_num_prefix) ? $opt_part_num_prefix : '.part-'; +my @part_start = (); +my ($base,$ext,$num_len,$total_size); +my ($OUT,$name,$data,$written_total,$written_this_part,$part_end,$part); + +foreach my $infile (@files) { split_file($infile); } + +my $elapsed_time = time - $start_time; +print "All done, $elapsed_time second", (($elapsed_time==1)?'':'s'), " elapsed\n"; + +sub split_file +{ + my ($infile) = @_; + if (!-e $infile or !-f $infile) { print "Can't find file \"$infile\"\n"; return; } + print $infile; + + ($base,$ext) = (basename($infile),''); + if ($base =~ /^(.+?)(\.[^\.]+)$/) { ($base,$ext) = ($1,$2); } + + @part_start = (); + my ($n_seq,$total_seq_len,$n_parts_found) = (0,0,0); + + if ($part_size) + { + ($n_seq,$total_seq_len,$total_size,$n_parts_found) = get_file_size_and_part_boundaries($infile); + if (!$n_parts) { print ": $n_seq sequences, $total_seq_len bp"; } + print ' => ', ($n_parts ? 'extracting' : 'dividing into'), ' ', $n_parts_found, ' part', ($n_parts_found > 1 ? 's' : ''), + " of <= $part_size ", ($measure ? (($measure > 1) ? 'bytes' : 'bp') : 'sequences'), "\n"; + open(my $IN,'<',$infile) or die "Error: Can't open file \"$infile\"\n"; + binmode $IN; + $num_len = length($n_parts_found); + $OUT = undef; + my ($out_file,$part,$si,$buffer) = (undef,0,-1,''); + while (<$IN>) + { + $_ =~ s/[\x0D\x0A]+$//; + if (substr($_,0,1) eq '>') + { + if ($OUT) + { + if ($line_len == 0) { if ($si >= 0) { print $OUT $eol; } } + elsif ($buffer ne '') { print $OUT $buffer, $eol; $buffer = ''; } + } + $si++; + if ($si >= $part_start[$part+1]) + { + if ($OUT) { close $OUT; } + $part++; + if ($part > $n_parts_found) { last; } + $out_file = $out_dir . $base . $part_num_prefix . ($nopad ? $part : sprintf('%0*d',$num_len,$part)) . $ext; + open($OUT,'>',$out_file) or die "Can't create output file \"$out_file\"\n"; + binmode $OUT; + } + print $OUT $_, $eol; + next; + } + if ($line_len) + { + $buffer .= $_; + while (length($buffer) >= $line_len) { print $OUT substr($buffer,0,$line_len,''), $eol; } + } + else { print $OUT $_; } + } + close $IN; + if ($OUT) + { + if (!$line_len) { if ($si >= 0) { print $OUT $eol; } } + elsif ($buffer ne '') { print $OUT $buffer, $eol; $buffer = ''; } + close $OUT; + } + } + else + { + ($n_seq,$total_seq_len,$total_size) = get_file_size($infile); + print ": $n_seq sequences, $total_seq_len bp => dividing into $n_parts part", ($n_parts > 1 ? 's' : ''), " "; + open(my $IN,'<',$infile) or die "Error: Can't open file \"$infile\"\n"; + binmode $IN; + $num_len = length($n_parts); + ($OUT,$name,$data,$written_total,$written_this_part,$part_end,$part) = (undef,undef,'',0,0,int($total_size / $n_parts),1); + while(<$IN>) + { + $_ =~ s/[\x0D\x0A]+$//; + if (substr($_,0,1) eq '>') + { + if (defined $name) { dump_seq(); } + $name = $_; $data = ''; next; + } + $data .= $_; + } + if (defined $name) { dump_seq(); } + close $IN; + if ($OUT) { close $OUT; } + print " OK\n"; + } +} + +sub dump_seq +{ + my $slen = length($data); + my $seq_size = seq_size(length($name),$slen); + my $new_written_total = $written_total + $seq_size; + if ( !$OUT or + ($written_this_part and ($new_written_total > $part_end) and ($new_written_total - $part_end > $part_end - $written_total)) ) + { + if ($OUT) { close $OUT; $part++; $part_end = int($total_size / $n_parts * $part) + 1; } + + my $part_file = $out_dir . $base . $part_num_prefix . ($nopad ? $part : sprintf('%0*d',$num_len,$part)) . $ext; + + open($OUT,'>',$part_file) or die "Error: Can't create file \"$part_file\"\n"; + binmode $OUT; + $written_this_part = 0; + print "."; + } + print $OUT $name, $eol; + if ($line_len) { for (my $s=0; $s<$slen; $s+=$line_len) { print $OUT substr($data,$s,$line_len), $eol; } } + else { print $OUT $data, $eol; } + $written_this_part += $seq_size; + $written_total += $seq_size; +} + +sub get_file_size_and_part_boundaries +{ + my ($file) = @_; + open(my $IN,'<',$file) or die "Error: Can't open file \"$file\"\n"; + binmode $IN; + my ($nseq,$total_seq_length,$total_size,$n_parts_found,$this_part_size,$nlen,$slen,$stop) = (0,0,0,1,0,0,0,0); + $part_start[1] = 0; + while (<$IN>) + { + $_ =~ s/[\x0D\x0A]+$//; + my $len = length($_); + if (substr($_,0,1) eq '>') + { + if ($nlen) + { + my $seq_size = seq_size($nlen,$slen); + if ($part_size and $this_part_size and ($this_part_size + $seq_size > $part_size)) + { + if ($n_parts and $n_parts_found == $n_parts) { $stop = 1; last; } + else { $this_part_size = $seq_size; $n_parts_found++; $part_start[$n_parts_found] = $nseq; } + } + else { $this_part_size += $seq_size; } + $nseq++; $total_seq_length += $slen; $total_size += $seq_size; + } + ($nlen,$slen) = ($len,0); next; + } + if ($nlen) { $slen += $len; } + } + if ($nlen and !$stop) + { + my $seq_size = seq_size($nlen,$slen); + if ($part_size and $this_part_size and ($this_part_size + $seq_size > $part_size)) + { + if ($n_parts and $n_parts_found == $n_parts) { $stop = 1; } + else { $this_part_size = $seq_size; $n_parts_found++; $part_start[$n_parts_found] = $nseq; } + } + if (!$stop) { $nseq++; $total_seq_length += $slen; $total_size += $seq_size; } + } + close $IN; + $part_start[$n_parts_found+1] = $nseq; + return ($nseq,$total_seq_length,$total_size,$n_parts_found); +} + +sub get_file_size +{ + my ($file) = @_; + open(my $IN,'<',$file) or die "Error: Can't open file \"$file\"\n"; + binmode $IN; + my ($nseq,$total_seq_length,$total_size,$nlen,$slen) = (0,0,0,0,0); + while (<$IN>) + { + $_ =~ s/[\x0D\x0A]+$//; + my $len = length($_); + if (substr($_,0,1) eq '>') + { + if ($nlen) { $nseq++; $total_seq_length += $slen; $total_size += seq_size($nlen,$slen); } + ($nlen,$slen) = ($len,0); next; + } + if ($nlen) { $slen += $len; } + } + if ($nlen) { $nseq++; $total_seq_length += $slen; $total_size += seq_size($nlen,$slen); } + close $IN; + return ($nseq,$total_seq_length,$total_size); +} + +sub seq_size +{ + my ($nlen,$slen) = @_; + return ($measure == 0) ? 1 : + ($measure == 1) ? $slen : + $slen + $nlen + $eol_len*(1 + ($line_len ? int(($slen+$line_len-1)/$line_len) : 1)); +}