"""Look up the source organism of every protein in the self-alignment table.

The script reads the tabular self-alignment, collects every accession that
appears as a query or a target, asks the EBI Proteins API for the organism of
each accession in batches, and writes an accession -> species table.
"""
import sys

import numpy as np
import pandas as pd

ALIGNMENT_PATH = '/Users/npapadop/Documents/data/coffe/self_score.tsv'
SPECIES_PATH = '/Users/npapadop/Documents/data/coffe/afdb_proteomes_species.tsv'
PROTEINS_API = 'https://www.ebi.ac.uk/proteins/api/proteins'

# Column names assigned to the headerless alignment table, in file order.
ALIGNMENT_COLUMNS = ['query', 'target', 'perc_id', 'ali_length', 'no_mismatch', 'no_gapopen',
                     'q_start', 'q_end', 't_start', 't_end', 'eval', 'bit']


def load_alignment(path=ALIGNMENT_PATH):
    """Read the headerless alignment table and shorten the identifiers.

    The 'query' and 'target' columns are split on '-' and only the second
    field is kept.
    NOTE(review): presumably the raw ids look like 'AF-<accession>-F1-...';
    confirm against the input file.
    """
    afdb = pd.read_csv(path, sep='\t', header=None)
    afdb.columns = ALIGNMENT_COLUMNS
    for column in ('query', 'target'):
        afdb[column] = afdb[column].str.split('-').str[1]
    return afdb


def unique_accessions(afdb):
    """Return every accession found in 'query' or 'target' exactly once.

    Order is first appearance (all queries, then all targets). Missing values
    are dropped: an id without a '-' yields NaN after the split in
    ``load_alignment`` and would otherwise be sent to the API as 'nan'.
    """
    ids = pd.concat([afdb['query'], afdb['target']]).dropna()
    return ids.drop_duplicates().to_numpy()


def iter_chunks(items, size):
    """Yield consecutive slices of *items* holding at most *size* elements.

    Raises:
        ValueError: if *size* is smaller than 1.
    """
    if size < 1:
        raise ValueError(f'size must be a positive integer, got {size!r}')
    return (items[start:start + size] for start in range(0, len(items), size))


def build_url(chunk, request_size):
    """Build the Proteins API query URL for one batch of accessions.

    ``size`` follows *request_size* so the page size always matches the
    number of accessions requested.
    """
    accession_list = ','.join(str(c) for c in chunk)
    return f"{PROTEINS_API}?offset=0&size={request_size}&accession={accession_list}"


def parse_species(records):
    """Turn decoded API records into ``(accession, species)`` tuples.

    The species is ``record['organism']['names'][0]['value']``. A record that
    carries no organism name is kept with ``None`` as its species instead of
    raising, so one incomplete entry cannot abort a multi-hour download.
    """
    pairs = []
    for record in records:
        names = (record.get('organism') or {}).get('names') or []
        species = names[0]['value'] if names else None
        pairs.append((record['accession'], species))
    return pairs


def fetch_species(accessions, request_size=100, timeout=60):
    """Query the Proteins API for the organism of every accession.

    Args:
        accessions: sliceable sequence of accession strings.
        request_size: number of accessions sent per request.
        timeout: seconds to wait on each request before giving up.

    Returns:
        DataFrame with the columns 'id' and 'species', one row per record
        returned by the service.

    Raises:
        ValueError: if *request_size* is smaller than 1.
        requests.HTTPError: if the service answers with an error status.
    """
    # Third-party modules are imported here so the pure helpers above can be
    # imported (and tested) on a machine without them.
    import requests
    from tqdm import tqdm

    chunks = iter_chunks(accessions, request_size)
    no_chunks = int(np.ceil(len(accessions) / request_size))

    found = []
    # One session for all requests so the connection is reused.
    with requests.Session() as session:
        session.headers.update({"Accept": "application/json"})
        for chunk in tqdm(chunks, total=no_chunks):
            r = session.get(build_url(chunk, request_size), timeout=timeout)
            r.raise_for_status()
            found.extend(parse_species(r.json()))
    return pd.DataFrame(found, columns=['id', 'species'])


def main():
    """Run the full lookup and write the species table as TSV."""
    afdb = load_alignment()
    annot = fetch_species(unique_accessions(afdb))
    # The index is written as the first column.
    annot.to_csv(SPECIES_PATH, header=True, sep='\t')


if __name__ == '__main__':
    main()
"""Check, per species, how often the best cross-species hit shares an orthogroup.

For every species in the annotation table the script takes each query protein
of that species, picks its best hit among targets outside the species, and
reports how often query and target carry the same orthogroup label.
"""
import numpy as np
import pandas as pd

ALIGNMENT_PATH = '/Users/npapadop/Documents/data/coffe/self_score.tsv'
SPECIES_PATH = '/Users/npapadop/Documents/data/coffe/afdb_proteomes_species.tsv'
ORTHOGROUP_PATH = '/Users/npapadop/Documents/data/coffe/uni2egg_euk.Nov2018.tsv'

# Column names assigned to the headerless alignment table, in file order.
ALIGNMENT_COLUMNS = ['query', 'target', 'perc_id', 'ali_length', 'no_mismatch', 'no_gapopen',
                     'q_start', 'q_end', 't_start', 't_end', 'eval', 'bit']


def load_alignment(path=ALIGNMENT_PATH):
    """Read the headerless alignment table.

    'query' and 'target' are reduced to the second '-'-separated field.
    """
    afdb = pd.read_csv(path, sep='\t', header=None)
    afdb.columns = ALIGNMENT_COLUMNS
    for column in ('query', 'target'):
        afdb[column] = afdb[column].str.split('-').str[1]
    return afdb


def load_annotation(path=SPECIES_PATH):
    """Read the species table; its first column is used as the index."""
    return pd.read_csv(path, sep='\t', index_col=0)


def load_orthogroups(path=ORTHOGROUP_PATH):
    """Read the headerless two-column table as 'uniprot' and 'orthogroup'."""
    reference = pd.read_csv(path, sep='\t', header=None)
    reference.columns = ['uniprot', 'orthogroup']
    return reference


def non_conspecific_hits(alignment, annotation, species):
    """Flag alignment rows whose query is in *species* and whose target is not.

    Args:
        alignment: frame with 'query' and 'target' columns.
        annotation: frame with 'id' and 'species' columns.
        species: value of the 'species' column to select.

    Returns:
        Boolean Series aligned with *alignment*. A target that does not appear
        in *annotation* at all counts as outside the species.
    """
    members = annotation.loc[annotation['species'] == species, 'id']
    query_belongs = alignment['query'].isin(members)
    target_belongs = alignment['target'].isin(members)
    return query_belongs & ~target_belongs


def best_match(df, sort_by='bit', tiebreak='ali_length'):
    """Return the index label of the best row of *df*.

    The best row has the highest *sort_by* value; ties are broken by the
    highest *tiebreak* value, and remaining ties by position (first row wins).
    """
    top = df[df[sort_by] == df[sort_by].max()]
    top = top[top[tiebreak] == top[tiebreak].max()]
    return top.index[0]


def subset_comparable(annotation, indices, reference):
    """Select rows and keep those where both proteins have an orthogroup.

    Args:
        annotation: frame with 'query' and 'target' columns. Despite the
            parameter name, ``analyse_species`` passes the alignment here.
        indices: index labels of the rows to select.
        reference: frame with 'uniprot' and 'orthogroup' columns.

    Returns:
        Copy of the selected rows with the added columns 'query_og' and
        'target_og', restricted to rows where neither is missing.
    """
    species_df = annotation.loc[list(indices)].copy()

    # Build the lookup once. An accession listed more than once keeps its first
    # orthogroup; duplicate keys would otherwise make the assignment fail.
    lookup = reference.drop_duplicates('uniprot').set_index('uniprot')['orthogroup']
    species_df['query_og'] = species_df['query'].map(lookup)
    species_df['target_og'] = species_df['target'].map(lookup)

    comparable = species_df['query_og'].notna() & species_df['target_og'].notna()
    return species_df[comparable]


def perc_agreement(df, df_available):
    """Return the fraction of rows whose 'query_og' equals 'target_og'.

    *df* is accepted for call compatibility and is not used. An empty
    *df_available* gives 0.0.
    """
    if len(df_available) == 0:
        return 0.0
    return float((df_available['query_og'] == df_available['target_og']).mean())


def analyse_species(species, alignment, annotation, reference):
    """Summarise orthogroup agreement of the best cross-species hits.

    Returns:
        ``[n_queries, n_comparable, agreement]``: the number of query proteins
        of *species* with at least one hit outside the species, how many of
        their best hits have an orthogroup on both sides, and the fraction of
        those where the two orthogroups are equal.
    """
    is_species = non_conspecific_hits(alignment, annotation, species)
    if not is_species.any():
        # Nothing to group; grouping an empty frame does not yield row labels.
        return [0, 0, 0.0]

    candidates = alignment.loc[is_species, ['query', 'bit', 'ali_length']]
    # Only the scoring columns are handed to best_match, not the grouping key.
    species_df = candidates.groupby('query')[['bit', 'ali_length']].apply(best_match)
    species_av = subset_comparable(alignment, species_df, reference)
    perc = perc_agreement(species_df, species_av)
    return [len(species_df), len(species_av), perc]


def main():
    """Print one tab-separated summary line per species, alphabetically."""
    afdb = load_alignment()
    annot = load_annotation()
    uniprot_to_eggnog = load_orthogroups()

    # Missing species are dropped first: they cannot be ordered against strings.
    for species in sorted(annot['species'].dropna().unique()):
        total_proteins, have_og, og_agreement = analyse_species(species, afdb, annot, uniprot_to_eggnog)
        print(f'{species}\t{total_proteins}\t{have_og}\t{og_agreement * 100: .2f}%')


if __name__ == '__main__':
    main()