diff --git a/analysis/prepare.ipynb b/analysis/prepare.ipynb index 20d03d9662297e1c7548c87e030c5f66fa507113..c5407256a30fd97e37257c20a7a95aa1b7107074 100644 --- a/analysis/prepare.ipynb +++ b/analysis/prepare.ipynb @@ -107,12 +107,120 @@ "fs_swp = \"/scratch/npapadop/foldseek_results/swissprot_score.tsv\"\n", "# AlphaFold predictions\n", "structure_list = \"/g/arendt/npapadop/data/spongilla_af/best_models\"\n", - "metadata = \"/g/arendt/npapadop/data/spongilla_af/\"" + "metadata = \"/g/arendt/npapadop/data/spongilla_af/best_model_metadata/\"" ] }, { "cell_type": "markdown", - "id": "ddd1c39e-ade1-432d-b21d-ec12e43cfe8e", + "id": "de3a52aa-8ecc-44bd-9002-7f1412200dc9", + "metadata": {}, + "source": [ + "## 1. Multiple sequence alignments\n", + "\n", + "First read the MSAs and extract the number of sequences in each as well as the sequence length and the _Spongilla_ transcript name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f300273-15c7-41f3-acb3-658129284f25", + "metadata": {}, + "outputs": [], + "source": [ + "# Glob once (sorted) so the file list cannot change between sizing and iterating.\n", + "a3m_files = sorted(glob.glob(metadata+\"*.a3m\"))\n", + "msa_records = []\n", + "\n", + "for alignment in tqdm(a3m_files):\n", + "    try:\n", + "        with open(alignment, \"r\") as f:\n", + "            lines = f.readlines()\n", + "    except FileNotFoundError:\n", + "        continue  # file vanished after globbing: skip it rather than keep an empty placeholder row\n", + "    msa_records.append({\n", + "        \"query\": alignment.split(\"/\")[-1].split(\".\")[0],\n", + "        \"MSA size\": (len(lines) - 3) // 2,  # whole-number count (true division produced floats)\n", + "        \"query length\": int(lines[0].rstrip()[1:].split()[0]),  # leading header token, stored as an integer\n", + "        \"gene name\": lines[1].rstrip()[1:],  # second line without its first character\n", + "    })\n", + "\n", + "# Explicit columns keep the frame well-formed even when no alignment could be read.\n", + "sequence_info = pd.DataFrame(msa_records, columns=[\"query\", \"MSA size\", \"query length\", \"gene name\"])\n", + "sequence_info.set_index(\"query\", inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e946268-f439-4203-a815-48a5384ccf31", +
"metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sequence_info[\"protein_id\"] = sequence_info[\"gene name\"].str.split(\".\").str[1]\n", + "sequence_info[\"gene_id\"] = sequence_info[\"gene name\"].str.split(\".\").str[0].str.split(\"_\").str[:2].str.join(\"_\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f1731c0-af6a-4caa-bc1f-b665b59c05c9", + "metadata": {}, + "outputs": [], + "source": [ + "sequence_info.to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/sequence_info.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "c06f7868-dbc8-4760-9c3c-c9f8b6be58ea", + "metadata": {}, + "source": [ + "## 2. AlphaFold predictions\n", + "\n", + "Next, read the per-residue pLDDT score from AlphaFold and average it; then keep the best-scoring isoform per gene ID." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eed723f4-314d-4b34-89dd-fc339fd89505", + "metadata": {}, + "outputs": [], + "source": [ + "N = len(os.listdir(structure_list))\n", + "proteins = [\"\"] * N\n", + "scores = [0.] 
* N\n", + "\n", + "for i, protein in enumerate(tqdm(os.listdir(structure_list))):\n", + " full_name = protein.split(\".\")[0]\n", + " metadata_loc = metadata + full_name + \"_scores.json\"\n", + " with open(metadata_loc, \"r\") as f:\n", + " score = json.load(f)\n", + " name = full_name.split(\"_\")[0]\n", + " proteins[i] = name\n", + " scores[i] = np.mean(score[\"plddt\"])\n", + "\n", + "alphafold = pd.DataFrame({\"query\": proteins, \"plddt\": scores})\n", + "alphafold.set_index(\"query\", inplace=True)\n", + "alphafold = alphafold.join(sequence_info)\n", + "alphafold = alphafold.sort_values('plddt', ascending=False).drop_duplicates(['gene_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78ef2d0f-3a3b-46a3-ae53-b5093a61a5f9", + "metadata": {}, + "outputs": [], + "source": [ + "alphafold.to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/structure_predictions.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "f0ed1ef2-591e-4465-abd3-c3de9d7cce3f", "metadata": {}, "source": [ "Read sequence information for _Spongilla_ and the summary of the structure predictions" @@ -125,8 +233,223 @@ "metadata": {}, "outputs": [], "source": [ - "sequence_info = pd.read_csv(\"../data/sequence_info.csv\", index_col=\"query\")\n", - "alphafold = pd.read_csv(\"../data/structure_predictions.csv\", index_col=\"query\")" + "sequence_info = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/sequence_info.csv\", index_col=\"query\")\n", + "alphafold = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/structure_predictions.csv\", index_col=\"query\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0426d007-88f1-4e7e-8177-abc425f8122e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " 
.dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>plddt</th>\n", + " <th>MSA size</th>\n", + " <th>query length</th>\n", + " <th>gene name</th>\n", + " <th>protein_id</th>\n", + " <th>gene_id</th>\n", + " </tr>\n", + " <tr>\n", + " <th>query</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>20236</th>\n", + " <td>98.147179</td>\n", + " <td>7655.0</td>\n", + " <td>78</td>\n", + " <td>c114736_g1_i1_m.91624</td>\n", + " <td>91624</td>\n", + " <td>c114736_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8471</th>\n", + " <td>98.115354</td>\n", + " <td>16675.0</td>\n", + " <td>127</td>\n", + " <td>c103108_g2_i1_m.62395</td>\n", + " <td>62395</td>\n", + " <td>c103108_g2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10371</th>\n", + " <td>98.018750</td>\n", + " <td>2519.0</td>\n", + " <td>352</td>\n", + " <td>c103630_g1_i2_m.67428</td>\n", + " <td>67428</td>\n", + " <td>c103630_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29803</th>\n", + " <td>97.815000</td>\n", + " <td>3461.0</td>\n", + " <td>120</td>\n", + " <td>c91796_g1_i1_m.16975</td>\n", + " <td>16975</td>\n", + " <td>c91796_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20860</th>\n", + " <td>97.811325</td>\n", + " <td>14752.0</td>\n", + " <td>151</td>\n", + " <td>c2715_g1_i1_m.171</td>\n", + " <td>171</td>\n", + " <td>c2715_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26721</th>\n", + " <td>26.866756</td>\n", + " <td>3.0</td>\n", + " <td>712</td>\n", + " <td>c87685_g1_i1_m.11391</td>\n", + " <td>11391</td>\n", + " <td>c87685_g1</td>\n", + " </tr>\n", + 
" <tr>\n", + " <th>32956</th>\n", + " <td>26.821802</td>\n", + " <td>1.0</td>\n", + " <td>172</td>\n", + " <td>c94712_g2_i1_m.22872</td>\n", + " <td>22872</td>\n", + " <td>c94712_g2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31710</th>\n", + " <td>26.688600</td>\n", + " <td>165.0</td>\n", + " <td>200</td>\n", + " <td>c93707_g1_i1_m.20473</td>\n", + " <td>20473</td>\n", + " <td>c93707_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1202</th>\n", + " <td>26.157910</td>\n", + " <td>2.0</td>\n", + " <td>201</td>\n", + " <td>c100524_g1_i1_m.44523</td>\n", + " <td>44523</td>\n", + " <td>c100524_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1796</th>\n", + " <td>24.582727</td>\n", + " <td>10.0</td>\n", + " <td>297</td>\n", + " <td>c100770_g2_i1_m.45897</td>\n", + " <td>45897</td>\n", + " <td>c100770_g2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>29662 rows × 6 columns</p>\n", + "</div>" + ], + "text/plain": [ + " plddt MSA size query length gene name protein_id \\\n", + "query \n", + "20236 98.147179 7655.0 78 c114736_g1_i1_m.91624 91624 \n", + "8471 98.115354 16675.0 127 c103108_g2_i1_m.62395 62395 \n", + "10371 98.018750 2519.0 352 c103630_g1_i2_m.67428 67428 \n", + "29803 97.815000 3461.0 120 c91796_g1_i1_m.16975 16975 \n", + "20860 97.811325 14752.0 151 c2715_g1_i1_m.171 171 \n", + "... ... ... ... ... ... \n", + "26721 26.866756 3.0 712 c87685_g1_i1_m.11391 11391 \n", + "32956 26.821802 1.0 172 c94712_g2_i1_m.22872 22872 \n", + "31710 26.688600 165.0 200 c93707_g1_i1_m.20473 20473 \n", + "1202 26.157910 2.0 201 c100524_g1_i1_m.44523 44523 \n", + "1796 24.582727 10.0 297 c100770_g2_i1_m.45897 45897 \n", + "\n", + " gene_id \n", + "query \n", + "20236 c114736_g1 \n", + "8471 c103108_g2 \n", + "10371 c103630_g1 \n", + "29803 c91796_g1 \n", + "20860 c2715_g1 \n", + "... ... 
\n", + "26721 c87685_g1 \n", + "32956 c94712_g2 \n", + "31710 c93707_g1 \n", + "1202 c100524_g1 \n", + "1796 c100770_g2 \n", + "\n", + "[29662 rows x 6 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alphafold" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7efb172b-6adb-4f50-84f7-bb63adecd3c9", + "metadata": {}, + "outputs": [], + "source": [ + "sequence_info[\"protein_id\"] = sequence_info[\"protein_id\"].astype(\"Int64\") # this column will hold NaNs later so convert it to Int64, which can hold nulls." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2ad95435-9023-4d97-a0ce-cd7612e57a66", + "metadata": {}, + "outputs": [], + "source": [ + "pdb = read_foldseek(fs_pdb)\n", + "pdb[\"query\"] = pdb[\"query\"].values.astype(int)\n", + "afdb = read_foldseek(fs_afdb)\n", + "afdb[\"query\"] = afdb[\"query\"].values.astype(int)\n", + "swp = read_foldseek(fs_swp)\n", + "swp[\"query\"] = swp[\"query\"].values.astype(int)" ] }, { @@ -202,7 +525,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "id": "47c101a2-47b5-40db-9693-f6af15031437", "metadata": {}, "outputs": [], @@ -215,17 +538,17 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "id": "c31956e6-ed67-4be5-8faf-f7c0ff4d7c58", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(999,)" + "(998,)" ] }, - "execution_count": 15, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -236,7 +559,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 13, "id": "b3e5038f-c7c9-4cef-961a-4e5682236f1c", "metadata": {}, "outputs": [], @@ -249,7 +572,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "id": "507753ca-7ef1-4e82-be1d-bbc23d7ec553", "metadata": {}, "outputs": [ @@ -257,7 +580,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 3/3 
[00:22<00:00, 7.43s/it]\n" + "100%|██████████| 3/3 [00:53<00:00, 17.97s/it]\n" ] } ], @@ -277,16 +600,6 @@ " response[i] = f.read()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f8d91dc-ce25-4acd-ad0e-189076590080", - "metadata": {}, - "outputs": [], - "source": [ - "response" - ] - }, { "cell_type": "markdown", "id": "f50296d5-ac0f-48fc-80ef-85b952b843f4", @@ -297,15 +610,17 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "id": "0acd0a47-df91-4ffd-9e8d-bc8cb8882005", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 142581/142581 [00:00<00:00, 1999873.12it/s]\n" + "100%|██████████| 142581/142581 [00:00<00:00, 1929888.50it/s]\n" ] } ], @@ -313,86 +628,3467 @@ "pdb = enrich_from_uniprot(pdb, \"target\", \"uniprot\", uniprot_from=\"PDB_ID\", uniprot_to=\"ACC\")" ] }, + { + "cell_type": "code", + "execution_count": 16, + "id": "07e0cb24-97e1-49d5-8c2f-1f757d5b2a5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "418323" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb[\"uniprot\"].str.contains(\",\").sum()" + ] + }, { "cell_type": "markdown", - "id": "b8f492f6-450d-484f-ba07-8a396ea6c314", + "id": "3f03690e-ac94-4502-b372-f77da44ba6f3", "metadata": {}, "source": [ - "Translate the UniProt IDs to gene names; whatever obtained a UniProt ID should have a gene name, and this will be our fallback option if emapper annotation is not present:" + "418323 rows in the pdb file have multiple entrances in the uniprot column, separated by a comma. I will separate them and duplicate the rest of the rows." 
] }, { "cell_type": "code", - "execution_count": 14, - "id": "d37e7a82-8017-4b16-9419-1cb45d41858d", + "execution_count": 17, + "id": "6233210d-ba4b-4ca3-afb4-1b4203550298", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pdb = pdb.drop('uniprot', axis=1).join(pdb['uniprot'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('uniprot'))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "574b2698-4bcc-4c9d-8db8-61e327362d3e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb[\"uniprot\"].str.contains(\",\").sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "8525155a-ffdc-47e4-8bb6-ce720f0dbc49", "metadata": {}, "outputs": [ { - "ename": "HTTPError", - "evalue": "HTTP Error 413: ", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mHTTPError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_224/4090906344.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# pdb_seq = enrich_from_uniprot(pdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m# swp_seq = enrich_from_uniprot(swp, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mafdb_seq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menrich_from_uniprot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mafdb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"uniprot\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"gene 
name\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_from\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"ACC\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_to\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"GENENAME\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/tmp/ipykernel_224/1426142881.py\u001b[0m in \u001b[0;36menrich_from_uniprot\u001b[0;34m(df, column_from, column_to, uniprot_from, uniprot_to)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0menrich_from_uniprot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_from\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_to\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_from\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"PDB_ID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_to\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"ACC\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 33\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_from_uniprot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_from\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_from\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muniprot_from\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniprot_to\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muniprot_to\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 34\u001b[0m \u001b[0mdf_map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_id_map\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_from\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_to\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mon\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcolumn_from\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/tmp/ipykernel_224/1426142881.py\u001b[0m in \u001b[0;36mget_from_uniprot\u001b[0;34m(df, column, uniprot_from, uniprot_to)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'utf-8'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mreq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(url, data, timeout, cafile, capath, cadefault, context)\u001b[0m\n\u001b[1;32m 212\u001b[0m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0mopener\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_opener\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 214\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mopener\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minstall_opener\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopener\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(self, fullurl, data, timeout)\u001b[0m\n\u001b[1;32m 521\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mprocessor\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_response\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 522\u001b[0m \u001b[0mmeth\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprocessor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 523\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmeth\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mresponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 524\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36mhttp_response\u001b[0;34m(self, request, response)\u001b[0m\n\u001b[1;32m 630\u001b[0m \u001b[0;31m# request was successfully received, understood, and accepted.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 631\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m200\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mcode\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m300\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 632\u001b[0;31m response = self.parent.error(\n\u001b[0m\u001b[1;32m 633\u001b[0m 'http', request, response, code, msg, hdrs)\n\u001b[1;32m 634\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36merror\u001b[0;34m(self, proto, *args)\u001b[0m\n\u001b[1;32m 559\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhttp_err\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 560\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'default'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'http_error_default'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0morig_args\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 561\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_chain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 562\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 563\u001b[0m \u001b[0;31m# XXX probably also want an abstract factory that knows when it makes\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36m_call_chain\u001b[0;34m(self, chain, kind, meth_name, *args)\u001b[0m\n\u001b[1;32m 492\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mhandler\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mhandlers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 493\u001b[0m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhandler\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 494\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 495\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 496\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.9/urllib/request.py\u001b[0m in \u001b[0;36mhttp_error_default\u001b[0;34m(self, req, fp, code, msg, hdrs)\u001b[0m\n\u001b[1;32m 639\u001b[0m \u001b[0;32mclass\u001b[0m 
\u001b[0mHTTPDefaultErrorHandler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mBaseHandler\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 640\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mhttp_error_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhdrs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 641\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mHTTPError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfull_url\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhdrs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 642\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 643\u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0mHTTPRedirectHandler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mBaseHandler\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mHTTPError\u001b[0m: HTTP Error 413: " - ] + "data": { + "text/plain": [ + "4600922" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# pdb_seq = enrich_from_uniprot(pdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\n", - "# swp_seq = enrich_from_uniprot(swp, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\n", - "afdb_seq = enrich_from_uniprot(afdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")" + "len(pdb)" ] }, { "cell_type": "markdown", - "id":
"b80e0e49-c947-45a5-b386-ae3a4631780b", + "id": "7cc67429-33c7-4596-b269-7d6f71aed6f4", "metadata": {}, "source": [ - "Translate the UniProt IDs to eggNOG IDs:" + "I will try and use the tool UPIMAPI to retrieve information from the UniProtIDs, including the sequences that we will need for a new emapper run. \n", + "UPIMAPI takes a csv file as an input. I will export those from the foldseek result files as csv files.\n", + "\n", + "I just realized that I need to add the commas and make it a one-liner. Because I don't know how to do this in Python, I will just use regex and save the files as afdb_uniprotIDs.txt, etc. " ] }, { "cell_type": "code", "execution_count": null, - "id": "de9d5f80-9ac8-4a14-a628-1e99af934403", + "id": "f0c4592d-1465-4ff8-bf7a-b20ac08b2586", "metadata": {}, "outputs": [], "source": [ - "pdb_seq = enrich_from_uniprot(pdb_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")\n", - "swp_seq = enrich_from_uniprot(swp_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")\n", - "afdb_seq = enrich_from_uniprot(afdb_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")" + "afdb[\"uniprot\"].dropna().drop_duplicates().to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/afdb_uniprotIDs.csv\", index=False, index_label=False, header=False, sep=\",\")  # export each ID once and skip missing ones so the ID list stays small\n", + "pdb[\"uniprot\"].dropna().drop_duplicates().to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/pdb_uniprotIDs.csv\", index=False, index_label=False, header=False, sep=\",\")\n", + "swp[\"uniprot\"].dropna().drop_duplicates().to_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/swp_uniprotIDs.csv\", index=False, index_label=False, header=False, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c4f36d35-8a75-4577-a948-15f9d6a76945", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Q8WZ42 13900\n", + "Q99996 3626\n", + "Q8WXI7 3612\n", + "Q9Y6V0 3004\n", + "Q15149 2975\n", + " ...
\n", + "A0A1P6CI10 1\n", + "K7LJC4 1\n", + "K7LPC1 1\n", + "A0A0N7KCM8 1\n", + "P71601 1\n", + "Name: uniprot, Length: 524585, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "afdb.uniprot.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "33436cf6-558d-42a1-9e3e-25f2c86d9e0f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "E3WDQ9 2605\n", + "Q5RC05 2472\n", + "Q5M7C3 2382\n", + "P59597 2346\n", + "P02467 2107\n", + " ... \n", + "Q6LN45 1\n", + "Q56647 1\n", + "B1JQI1 1\n", + "C5FM58 1\n", + "G3KIM4 1\n", + "Name: uniprot, Length: 405846, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "swp.uniprot.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a84a9095-e6ba-4902-b7b0-192005816f5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "P01116 18010\n", + "P0CX51 10846\n", + "O13516 10730\n", + "P05756 10359\n", + "Q3E7X9 10323\n", + " ... \n", + "P80379 1\n", + "Q56691 1\n", + "Q7SIH1 1\n", + "Q63041 1\n", + "Q484B6 1\n", + "Name: uniprot, Length: 35260, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb.uniprot.value_counts()" ] }, { "cell_type": "markdown", - "id": "8ec832ac-8525-438d-819a-cab2232f574f", + "id": "49605efa-c6ef-4f62-8c65-9c5fb87e07b8", + "metadata": { + "tags": [] + }, + "source": [ + "afdb contains 524585 unique uniprot IDs. 
\n", + "swp contains 405846 unique uniprot IDs.\n", + "pdb contains 35260 unique uniprot IDs.\n", + "\n", + "\n", + "tr -d '\\n' < afdb_uniprotIDs.txt > afdb_uniprotIDs_oneline.txt\n", + "tr -d '\\n' < pdb_uniprotIDs.txt > pdb_uniprotIDs_oneline.txt\n", + "tr -d '\\n' < swp_uniprotIDs.txt > swp_uniprotIDs_oneline.txt\n", + "\n", + "For the pdb oneliner file, I additionally had to remove the quotation marks (\") because sometimes a pdb entry has multiple uniprot ids. So in principle I had to delete the cases which didn't have uniprot ids (\"\",) and afterwards delete the empty lines. After that I had to delete leftover quotation marks. \n", + "\n", + "tr -d '\"' < pdb_uniprotIDs_oneline.txt > pdb_uniprotIDs_oneline.tmp && mv pdb_uniprotIDs_oneline.tmp pdb_uniprotIDs_oneline.txt\n", + "\n", + "After that, I can run the command for each file using the terminal, retrieving specifically picked information.\n", + "\n", + "First, I will make use of the --fasta option, which will just return a fasta file for the sequences. This fasta file can then be used to run EggNOG mapper manually.\n", + "\n", + "upimapi.py -i afdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/afdb/ --fasta\n", + "upimapi.py -i pdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/pdb/ --fasta\n", + "upimapi.py -i swp_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/swp/ --fasta\n", + "\n", + "Fasta files are saved in the respective folders as: uniprotinfo.fasta\n", + "\n", + "After UPIMAPI has translated the UniProt IDs to fasta, it tries to check if all uniprotIDs have a corresponding fasta sequence. However, while the translation happens using the unique UniprotIDs, for some reason it checks if all UniprotIDs (in the case of afdb 12.5 million) are represented. I got a timeout after 24 hours for afdb and swp. Only pdb ran through.
I will check how many \">\" are in the fasta files:\n", + "\n", + "afdb: 525613\n", + "swp: 405935\n", + "pdb: 35280\n", + "\n", + "How can there be more fasta entries than unique uniprotIDs...?\n", + "\n", + "For the pdb fasta I could immediately run e-mapper. For afdb and swp I had to divide the fasta files into files of 100000 entries (maximum input for emapper). I did this using the script fasta splitter (http://kirill-kryukov.com/study/tools/fasta-splitter/) with the following commands:\n", + "\n", + "perl fasta-splitter.pl --part-size 100000 ./afdb/uniprotinfo.fasta --nopad --measure count --out-dir ./afdb/fasta_split/\n", + "\n", + "perl fasta-splitter.pl --part-size 100000 ./swp/uniprotinfo.fasta --nopad --measure count --out-dir ./swp/fasta_split/\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "5e43cbad-cc59-4814-8ab6-bf1bae920a3e", "metadata": {}, + "outputs": [], "source": [ - "Cross-reference the eggNOG IDs with the eggNOG annotation that gives a nice name/description for each (most) orthogroup:" + "pdb_emapper = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/pdb/emapper/pdb_emapper.tsv\", sep='\\t', skiprows=4, skipfooter=3, engine='python')" ] }, { "cell_type": "code", - "execution_count": 41, - "id": "33c69bec-6a43-49c2-8ea8-3f011fcf86c4", + "execution_count": 21, + "id": "747a5c72-a19b-4afe-ae1c-b896c298d7ee", "metadata": {}, "outputs": [], "source": [ - "eggnog = pd.read_csv(\"../data/e5.og_annotations.tsv\", sep=\"\\t\", header=None)\n", - "eggnog.columns = [\"taxon\", \"orthogroup\", \"evidence\", \"name\"]\n", - "eggnog.dropna(inplace=True)\n", - "\n", - "eggnog.set_index(\"orthogroup\", inplace=True)" + "pdb_emapper[\"uniprot\"] = pdb_emapper[\"#query\"].str.split(\"|\").str[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "e3a291be-38e8-495e-b455-72b0770631fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "P62238 1\n", + "Q9Y324 1\n", + "D5DKI8 1\n",
+ "Q6TEK8 1\n", + "P04157 1\n", + " ..\n", + "P04181 1\n", + "A0A1H6Q8Z5 1\n", + "P16038 1\n", + "Q4JB24 1\n", + "P38326 1\n", + "Name: uniprot, Length: 34551, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_emapper.uniprot.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "50150ed8-4d19-4cad-9a7c-5528e3f622f2", + "metadata": {}, + "source": [ + "Although I have 35280 entries in the fasta file, emapper only scans through 34554 (tail of emapper file). Unique uniprot IDs in the pdb emapper files are then 34551. The question is where the remaining ~700 entries went..." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c522cd0d-704a-486f-a528-a31cf09315de", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_merge = pd.merge(pdb, pdb_emapper, on=\"uniprot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "7387e9ab-8037-4a6e-8a9a-ec18166247d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "P01116 18010\n", + "P0CX51 10846\n", + "O13516 10730\n", + "P05756 10359\n", + "Q3E7X9 10323\n", + " ... \n", + "A0KKT0 1\n", + "Q06672 1\n", + "P86179 1\n", + "A0A090BWT0 1\n", + "Q484B6 1\n", + "Name: uniprot, Length: 34551, dtype: int64" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_merge.uniprot.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "08775c42-6ab6-4ab0-8104-e9848d8029f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4500383" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(pdb_merge)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "0f0bd0c2-79de-44cf-95ec-3eeb7687be46", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['query',\n", + " 'target',\n", + " 'seq.
id.',\n", + " 'alignment length',\n", + " 'no. mismatches',\n", + " 'no. gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end',\n", + " 'e value',\n", + " 'bit score',\n", + " 'uniprot',\n", + " '#query',\n", + " 'seed_ortholog',\n", + " 'evalue',\n", + " 'score',\n", + " 'eggNOG_OGs',\n", + " 'max_annot_lvl',\n", + " 'COG_category',\n", + " 'Description',\n", + " 'Preferred_name',\n", + " 'GOs',\n", + " 'EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction',\n", + " 'PFAMs']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(pdb_merge.columns)" + ] + }, + { + "cell_type": "markdown", + "id": "fe4e7ac9-697d-4a67-a4ec-04ea44f56b48", + "metadata": {}, + "source": [ + "Additionally, I will run UPIMAPI with the pdb, afdb and swp inputs to retrieve information that is stored in Uniprot.\n", + "\n", + "upimapi.py -i afdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/afdb/ -dbs \"evolutionary genealogy of genes: Non-supervised Orthologous Groups\"\n", + "\n", + "upimapi.py -i pdb_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/pdb/ -dbs \"evolutionary genealogy of genes: Non-supervised Orthologous Groups\"\n", + "\n", + "upimapi.py -i swp_uniprotIDs_oneline.txt -o /Volumes/arendt/Fabian/PhD/Computational/Spongefold/UNIMAPI/swp/ -dbs \"evolutionary genealogy of genes: Non-supervised Orthologous Groups\"\n", + "\n", + "I will run these on the cluster. The bash scripts can be found in the respective folders."
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7dbb6daf-fbf9-46bf-9f4b-1ef6805ec2e4", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_uniprot_info = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/pdb/pdb_upimapi_mapping_eggnog.tsv\", sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "cc007880-726c-457a-b078-e43d251eebc1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pdb_uniprot_info.rename(columns={'Entry': 'uniprot'}, inplace=True)\n", + "pdb_uniprot_info = pdb_uniprot_info[[\"uniprot\", \"Entry name\", \"Gene names\", \"Function [CC]\", \"Taxonomic lineage (PHYLUM)\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "f17d7bc1-a6c4-4bd6-85f1-a95fb134901a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>uniprot</th>\n", + " <th>Entry name</th>\n", + " <th>Gene names</th>\n", + " <th>Function [CC]</th>\n", + " <th>Taxonomic lineage (PHYLUM)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>P21553</td>\n", + " <td>CISY_THEAC</td>\n", + " <td>gltA Ta0169</td>\n", + " <td>NaN</td>\n", + " <td>Candidatus Thermoplasmatota</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Q61179</td>\n", + " <td>IRF9_MOUSE</td>\n", + " <td>Irf9 Isgf3g</td>\n", + " <td>FUNCTION: Transcription factor that plays an e...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>O34873</td>\n", + " <td>HMGCL_BACSU</td>\n", + " <td>yngG 
BSU18230</td>\n", + " <td>FUNCTION: Involved in the catabolism of branch...</td>\n", + " <td>Firmicutes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Q9BU02</td>\n", + " <td>THTPA_HUMAN</td>\n", + " <td>THTPA</td>\n", + " <td>FUNCTION: Hydrolase highly specific for thiami...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Q9LLQ2</td>\n", + " <td>P102B_LUPLU</td>\n", + " <td>PR10.2B</td>\n", + " <td>FUNCTION: Class II ribonuclease (RNase) (By si...</td>\n", + " <td>Streptophyta</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35255</th>\n", + " <td>Q7JZW2</td>\n", + " <td>Q7JZW2_DROME</td>\n", + " <td>RpS15 anon-EST:Posey137 anon-EST:Posey185 Dmel...</td>\n", + " <td>NaN</td>\n", + " <td>Arthropoda</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35256</th>\n", + " <td>Q9HW09</td>\n", + " <td>PANE_PSEAE</td>\n", + " <td>panE PA4397</td>\n", + " <td>FUNCTION: Catalyzes the NADPH-dependent reduct...</td>\n", + " <td>Proteobacteria</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35257</th>\n", + " <td>F2Z508</td>\n", + " <td>F2Z508_PIG</td>\n", + " <td>STMN4</td>\n", + " <td>NaN</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35258</th>\n", + " <td>Q8ZM82</td>\n", + " <td>IDI_SALTY</td>\n", + " <td>idi STM3039</td>\n", + " <td>FUNCTION: Catalyzes the 1,3-allylic rearrangem...</td>\n", + " <td>Proteobacteria</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35259</th>\n", + " <td>P12873</td>\n", + " <td>RL29_BACSU</td>\n", + " <td>rpmC BSU01240</td>\n", + " <td>NaN</td>\n", + " <td>Firmicutes</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>35260 rows × 5 columns</p>\n", + "</div>" + ], + "text/plain": [ + " uniprot Entry name \\\n", + "0 P21553 CISY_THEAC \n", + "1 Q61179 IRF9_MOUSE \n", + "2 O34873 HMGCL_BACSU \n", + "3 Q9BU02 THTPA_HUMAN \n", + 
"4 Q9LLQ2 P102B_LUPLU \n", + "... ... ... \n", + "35255 Q7JZW2 Q7JZW2_DROME \n", + "35256 Q9HW09 PANE_PSEAE \n", + "35257 F2Z508 F2Z508_PIG \n", + "35258 Q8ZM82 IDI_SALTY \n", + "35259 P12873 RL29_BACSU \n", + "\n", + " Gene names \\\n", + "0 gltA Ta0169 \n", + "1 Irf9 Isgf3g \n", + "2 yngG BSU18230 \n", + "3 THTPA \n", + "4 PR10.2B \n", + "... ... \n", + "35255 RpS15 anon-EST:Posey137 anon-EST:Posey185 Dmel... \n", + "35256 panE PA4397 \n", + "35257 STMN4 \n", + "35258 idi STM3039 \n", + "35259 rpmC BSU01240 \n", + "\n", + " Function [CC] \\\n", + "0 NaN \n", + "1 FUNCTION: Transcription factor that plays an e... \n", + "2 FUNCTION: Involved in the catabolism of branch... \n", + "3 FUNCTION: Hydrolase highly specific for thiami... \n", + "4 FUNCTION: Class II ribonuclease (RNase) (By si... \n", + "... ... \n", + "35255 NaN \n", + "35256 FUNCTION: Catalyzes the NADPH-dependent reduct... \n", + "35257 NaN \n", + "35258 FUNCTION: Catalyzes the 1,3-allylic rearrangem... \n", + "35259 NaN \n", + "\n", + " Taxonomic lineage (PHYLUM) \n", + "0 Candidatus Thermoplasmatota \n", + "1 Chordata \n", + "2 Firmicutes \n", + "3 Chordata \n", + "4 Streptophyta \n", + "... ... 
\n", + "35255 Arthropoda \n", + "35256 Proteobacteria \n", + "35257 Chordata \n", + "35258 Proteobacteria \n", + "35259 Firmicutes \n", + "\n", + "[35260 rows x 5 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_uniprot_info" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "afb95431-1ec2-491d-83cf-b288322e7fd6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pdb_merge = pd.merge(pdb_merge, pdb_uniprot_info, on=\"uniprot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "cc0ab000-40e8-423a-8d75-6c3b59753150", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "P01116 18010\n", + "P0CX51 10846\n", + "O13516 10730\n", + "P05756 10359\n", + "Q3E7X9 10323\n", + " ... \n", + "A0KKT0 1\n", + "Q06672 1\n", + "P86179 1\n", + "A0A090BWT0 1\n", + "Q484B6 1\n", + "Name: uniprot, Length: 34551, dtype: int64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_merge.uniprot.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "8560bfda-d99c-463f-8ed5-f834449d252f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['query',\n", + " 'target',\n", + " 'seq. id.',\n", + " 'alignment length',\n", + " 'no. mismatches',\n", + " 'no. 
gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end',\n", + " 'e value',\n", + " 'bit score',\n", + " 'uniprot',\n", + " '#query',\n", + " 'seed_ortholog',\n", + " 'evalue',\n", + " 'score',\n", + " 'eggNOG_OGs',\n", + " 'max_annot_lvl',\n", + " 'COG_category',\n", + " 'Description',\n", + " 'Preferred_name',\n", + " 'GOs',\n", + " 'EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction',\n", + " 'PFAMs',\n", + " 'Entry name',\n", + " 'Gene names',\n", + " 'Function [CC]',\n", + " 'Taxonomic lineage (PHYLUM)']" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(pdb_merge.columns)" + ] + }, + { + "cell_type": "markdown", + "id": "09ae8063-9c39-4654-8042-031148dca85d", + "metadata": {}, + "source": [ + "This list seems quite confusing, but in general it now contains the following information:\n", + " - Foldseek query number of that protein and target. The next columns all have to do with foldseek alignment quality, including evalue and bit score.\n", + " - uniprot ID of foldseek pdb, translated with the uniprot API.\n", + " - \"#query\" is the long query name of the fasta that UPIMAPI pulled out from uniprot using the uniprot ID. All columns up to and including PFAMs stem from the eggnog search using the fasta file.\n", + " - Entry name, gene names, Function and Taxonomic lineage (PHYLUM) stem from UPIMAPI retrieving additional information from uniprot."
+ ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "3d1e3600-dbe4-4185-86b1-fd439e04ff1c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>query</th>\n", + " <th>target</th>\n", + " <th>seq. id.</th>\n", + " <th>alignment length</th>\n", + " <th>no. mismatches</th>\n", + " <th>no. gap open</th>\n", + " <th>query start</th>\n", + " <th>target start</th>\n", + " <th>query end</th>\n", + " <th>target end</th>\n", + " <th>...</th>\n", + " <th>KEGG_rclass</th>\n", + " <th>BRITE</th>\n", + " <th>KEGG_TC</th>\n", + " <th>CAZy</th>\n", + " <th>BiGG_Reaction</th>\n", + " <th>PFAMs</th>\n", + " <th>Entry name</th>\n", + " <th>Gene names</th>\n", + " <th>Function [CC]</th>\n", + " <th>Taxonomic lineage (PHYLUM)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>4731</td>\n", + " <td>5htk</td>\n", + " <td>0.601</td>\n", + " <td>251</td>\n", + " <td>97</td>\n", + " <td>3</td>\n", + " <td>5</td>\n", + " <td>254</td>\n", + " <td>176</td>\n", + " <td>424</td>\n", + " <td>...</td>\n", + " <td>RC00152</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>6PF2K,His_Phos_1</td>\n", + " <td>F262_HUMAN</td>\n", + " <td>PFKFB2</td>\n", + " <td>FUNCTION: Synthesis and degradation of fructos...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4731</td>\n", + " <td>5htk</td>\n", + " <td>0.629</td>\n", + " <td>240</td>\n", + " <td>87</td>\n", + " <td>2</td>\n", + " <td>9</td>\n", + " 
<td>248</td>\n", + " <td>181</td>\n", + " <td>418</td>\n", + " <td>...</td>\n", + " <td>RC00152</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>6PF2K,His_Phos_1</td>\n", + " <td>F262_HUMAN</td>\n", + " <td>PFKFB2</td>\n", + " <td>FUNCTION: Synthesis and degradation of fructos...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>3214</td>\n", + " <td>5htk</td>\n", + " <td>0.127</td>\n", + " <td>212</td>\n", + " <td>128</td>\n", + " <td>12</td>\n", + " <td>135</td>\n", + " <td>293</td>\n", + " <td>1</td>\n", + " <td>208</td>\n", + " <td>...</td>\n", + " <td>RC00152</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>6PF2K,His_Phos_1</td>\n", + " <td>F262_HUMAN</td>\n", + " <td>PFKFB2</td>\n", + " <td>FUNCTION: Synthesis and degradation of fructos...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3214</td>\n", + " <td>5htk</td>\n", + " <td>0.132</td>\n", + " <td>212</td>\n", + " <td>127</td>\n", + " <td>12</td>\n", + " <td>135</td>\n", + " <td>293</td>\n", + " <td>1</td>\n", + " <td>208</td>\n", + " <td>...</td>\n", + " <td>RC00152</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>6PF2K,His_Phos_1</td>\n", + " <td>F262_HUMAN</td>\n", + " <td>PFKFB2</td>\n", + " <td>FUNCTION: Synthesis and degradation of fructos...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>3311</td>\n", + " <td>5htk</td>\n", + " <td>0.180</td>\n", + " <td>211</td>\n", + " <td>112</td>\n", + " <td>11</td>\n", + " <td>9</td>\n", + " <td>182</td>\n", + " <td>3</td>\n", + " <td>189</td>\n", + " <td>...</td>\n", + " <td>RC00152</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>6PF2K,His_Phos_1</td>\n", + " 
<td>F262_HUMAN</td>\n", + " <td>PFKFB2</td>\n", + " <td>FUNCTION: Synthesis and degradation of fructos...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4500378</th>\n", + " <td>26834</td>\n", + " <td>4c65</td>\n", + " <td>0.157</td>\n", + " <td>484</td>\n", + " <td>270</td>\n", + " <td>25</td>\n", + " <td>7</td>\n", + " <td>423</td>\n", + " <td>6</td>\n", + " <td>418</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Amidohydro_1</td>\n", + " <td>OTASE_ASPNC</td>\n", + " <td>Am2 An14g02080</td>\n", + " <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n", + " <td>Ascomycota</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4500379</th>\n", + " <td>26834</td>\n", + " <td>4c5y</td>\n", + " <td>0.142</td>\n", + " <td>497</td>\n", + " <td>265</td>\n", + " <td>26</td>\n", + " <td>7</td>\n", + " <td>424</td>\n", + " <td>7</td>\n", + " <td>421</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Amidohydro_1</td>\n", + " <td>OTASE_ASPNC</td>\n", + " <td>Am2 An14g02080</td>\n", + " <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n", + " <td>Ascomycota</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4500380</th>\n", + " <td>26834</td>\n", + " <td>4c5z</td>\n", + " <td>0.158</td>\n", + " <td>493</td>\n", + " <td>269</td>\n", + " <td>27</td>\n", + " <td>2</td>\n", + " <td>424</td>\n", + " <td>5</td>\n", 
+ " <td>421</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Amidohydro_1</td>\n", + " <td>OTASE_ASPNA</td>\n", + " <td>Am2 ASPNIDRAFT_41631</td>\n", + " <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n", + " <td>Ascomycota</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4500381</th>\n", + " <td>26834</td>\n", + " <td>4c5z</td>\n", + " <td>0.155</td>\n", + " <td>495</td>\n", + " <td>267</td>\n", + " <td>24</td>\n", + " <td>2</td>\n", + " <td>424</td>\n", + " <td>6</td>\n", + " <td>421</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Amidohydro_1</td>\n", + " <td>OTASE_ASPNA</td>\n", + " <td>Am2 ASPNIDRAFT_41631</td>\n", + " <td>FUNCTION: Carboxypeptidase that catalyzes the ...</td>\n", + " <td>Ascomycota</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4500382</th>\n", + " <td>26834</td>\n", + " <td>5xgx</td>\n", + " <td>0.129</td>\n", + " <td>477</td>\n", + " <td>250</td>\n", + " <td>32</td>\n", + " <td>5</td>\n", + " <td>423</td>\n", + " <td>1</td>\n", + " <td>370</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko01000,ko01002</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Amidohydro_1,Amidohydro_3</td>\n", + " <td>Q484B6_COLP3</td>\n", + " <td>iadA CPS_1869</td>\n", + " <td>FUNCTION: Catalyzes the hydrolytic cleavage of...</td>\n", + " <td>Proteobacteria</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>4500383 rows × 38 columns</p>\n", + "</div>" + ], + "text/plain": [ + " query target seq. id. alignment length no. mismatches \\\n", + "0 4731 5htk 0.601 251 97 \n", + "1 4731 5htk 0.629 240 87 \n", + "2 3214 5htk 0.127 212 128 \n", + "3 3214 5htk 0.132 212 127 \n", + "4 3311 5htk 0.180 211 112 \n", + "... ... ... ... ... ... 
\n", + "4500378 26834 4c65 0.157 484 270 \n", + "4500379 26834 4c5y 0.142 497 265 \n", + "4500380 26834 4c5z 0.158 493 269 \n", + "4500381 26834 4c5z 0.155 495 267 \n", + "4500382 26834 5xgx 0.129 477 250 \n", + "\n", + " no. gap open query start target start query end target end ... \\\n", + "0 3 5 254 176 424 ... \n", + "1 2 9 248 181 418 ... \n", + "2 12 135 293 1 208 ... \n", + "3 12 135 293 1 208 ... \n", + "4 11 9 182 3 189 ... \n", + "... ... ... ... ... ... ... \n", + "4500378 25 7 423 6 418 ... \n", + "4500379 26 7 424 7 421 ... \n", + "4500380 27 2 424 5 421 ... \n", + "4500381 24 2 424 6 421 ... \n", + "4500382 32 5 423 1 370 ... \n", + "\n", + " KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction \\\n", + "0 RC00152 ko00000,ko00001,ko01000 - - - \n", + "1 RC00152 ko00000,ko00001,ko01000 - - - \n", + "2 RC00152 ko00000,ko00001,ko01000 - - - \n", + "3 RC00152 ko00000,ko00001,ko01000 - - - \n", + "4 RC00152 ko00000,ko00001,ko01000 - - - \n", + "... ... ... ... ... ... \n", + "4500378 - - - - - \n", + "4500379 - - - - - \n", + "4500380 - - - - - \n", + "4500381 - - - - - \n", + "4500382 - ko00000,ko01000,ko01002 - - - \n", + "\n", + " PFAMs Entry name Gene names \\\n", + "0 6PF2K,His_Phos_1 F262_HUMAN PFKFB2 \n", + "1 6PF2K,His_Phos_1 F262_HUMAN PFKFB2 \n", + "2 6PF2K,His_Phos_1 F262_HUMAN PFKFB2 \n", + "3 6PF2K,His_Phos_1 F262_HUMAN PFKFB2 \n", + "4 6PF2K,His_Phos_1 F262_HUMAN PFKFB2 \n", + "... ... ... ... \n", + "4500378 Amidohydro_1 OTASE_ASPNC Am2 An14g02080 \n", + "4500379 Amidohydro_1 OTASE_ASPNC Am2 An14g02080 \n", + "4500380 Amidohydro_1 OTASE_ASPNA Am2 ASPNIDRAFT_41631 \n", + "4500381 Amidohydro_1 OTASE_ASPNA Am2 ASPNIDRAFT_41631 \n", + "4500382 Amidohydro_1,Amidohydro_3 Q484B6_COLP3 iadA CPS_1869 \n", + "\n", + " Function [CC] \\\n", + "0 FUNCTION: Synthesis and degradation of fructos... \n", + "1 FUNCTION: Synthesis and degradation of fructos... \n", + "2 FUNCTION: Synthesis and degradation of fructos... 
\n", + "3 FUNCTION: Synthesis and degradation of fructos... \n", + "4 FUNCTION: Synthesis and degradation of fructos... \n", + "... ... \n", + "4500378 FUNCTION: Carboxypeptidase that catalyzes the ... \n", + "4500379 FUNCTION: Carboxypeptidase that catalyzes the ... \n", + "4500380 FUNCTION: Carboxypeptidase that catalyzes the ... \n", + "4500381 FUNCTION: Carboxypeptidase that catalyzes the ... \n", + "4500382 FUNCTION: Catalyzes the hydrolytic cleavage of... \n", + "\n", + " Taxonomic lineage (PHYLUM) \n", + "0 Chordata \n", + "1 Chordata \n", + "2 Chordata \n", + "3 Chordata \n", + "4 Chordata \n", + "... ... \n", + "4500378 Ascomycota \n", + "4500379 Ascomycota \n", + "4500380 Ascomycota \n", + "4500381 Ascomycota \n", + "4500382 Proteobacteria \n", + "\n", + "[4500383 rows x 38 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_merge" + ] + }, + { + "cell_type": "markdown", + "id": "81c73dbe-3b82-465e-a6c0-27a0465a003a", + "metadata": {}, + "source": [ + "Now let's do the same for afdb and swp" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "414f25d8-73d5-4c87-9cc9-e0afbfd0644b", + "metadata": {}, + "outputs": [], + "source": [ + "# get data file names\n", + "path = \"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/afdb/emapper/\"\n", + "filenames = glob.glob(path + \"/*.tsv\")\n", + "\n", + "dfs = []\n", + "for filename in filenames:\n", + " dfs.append(pd.read_csv(filename, sep='\\t', skiprows=4, skipfooter=3, engine='python'))\n", + "\n", + "# Concatenate all data into one DataFrame\n", + "afdb_emapper = pd.concat(dfs, ignore_index=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "ae989a35-adec-4930-a614-488a37029c05", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_emapper[\"uniprot\"] = afdb_emapper[\"#query\"].str.split(\"|\").str[1]\n", + "\n", + "afdb_merge = pd.merge(afdb, afdb_emapper, on=\"uniprot\")" + ] + }, +
{ + "cell_type": "code", + "execution_count": 36, + "id": "8841c12f-8f85-45e8-a048-5d276b6c7bef", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_uniprot_info = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/afdb/afdb_upimapi_mapping_eggnog.tsv\", sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "66c30db2-7cd5-4e13-a2c3-d2b2a5ef648d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "afdb_uniprot_info.rename(columns={'Entry': 'uniprot'}, inplace=True)\n", + "afdb_uniprot_info = afdb_uniprot_info[[\"uniprot\", \"Entry name\", \"Gene names\", \"Function [CC]\", \"Taxonomic lineage (PHYLUM)\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "1d541f5d-7556-4095-9f31-d1f5c23cbbbd", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_merge = pd.merge(afdb_merge, afdb_uniprot_info, on=\"uniprot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "552d4713-5a5c-450a-9f58-d21e6cd7d501", + "metadata": {}, + "outputs": [], + "source": [ + "# get data file names\n", + "path = \"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/swp/emapper/\"\n", + "filenames = glob.glob(path + \"/*.tsv\")\n", + "\n", + "dfs = []\n", + "for filename in filenames:\n", + " dfs.append(pd.read_csv(filename, sep='\\t', skiprows=4, skipfooter=3, engine='python'))\n", + "\n", + "# Concatenate all data into one DataFrame\n", + "swp_emapper = pd.concat(dfs, ignore_index=True)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "81f84c74-3372-4b27-84ab-2c31d1bb4126", + "metadata": {}, + "outputs": [], + "source": [ + "swp_emapper[\"uniprot\"] = swp_emapper[\"#query\"].str.split(\"|\").str[1]\n", + "\n", + "swp_merge = pd.merge(swp, swp_emapper, on=\"uniprot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "9865cc34-8bd4-480f-9375-36f82401b07d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + 
"swp_uniprot_info = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/UPIMAPI/swp/swp_upimapi_mapping_eggnog.tsv\", sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "12cc6569-4954-4ee9-a352-ae4353f71aef", + "metadata": {}, + "outputs": [], + "source": [ + "swp_uniprot_info.rename(columns={'Entry': 'uniprot'}, inplace=True)\n", + "swp_uniprot_info = swp_uniprot_info[[\"uniprot\", \"Entry name\", \"Gene names\", \"Function [CC]\", \"Taxonomic lineage (PHYLUM)\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "f216f616-b9c6-4371-a2eb-9476584f24c0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "swp_merge = pd.merge(swp_merge, swp_uniprot_info, on=\"uniprot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "39f10d29-f245-4685-a999-5440a165be3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>query</th>\n", + " <th>target</th>\n", + " <th>seq. id.</th>\n", + " <th>alignment length</th>\n", + " <th>no. mismatches</th>\n", + " <th>no. 
gap open</th>\n", + " <th>query start</th>\n", + " <th>target start</th>\n", + " <th>query end</th>\n", + " <th>target end</th>\n", + " <th>...</th>\n", + " <th>KEGG_rclass</th>\n", + " <th>BRITE</th>\n", + " <th>KEGG_TC</th>\n", + " <th>CAZy</th>\n", + " <th>BiGG_Reaction</th>\n", + " <th>PFAMs</th>\n", + " <th>Entry name</th>\n", + " <th>Gene names</th>\n", + " <th>Function [CC]</th>\n", + " <th>Taxonomic lineage (PHYLUM)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>10000</td>\n", + " <td>AF-O35840-F1</td>\n", + " <td>0.251</td>\n", + " <td>139</td>\n", + " <td>91</td>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>136</td>\n", + " <td>161</td>\n", + " <td>292</td>\n", + " <td>...</td>\n", + " <td>RC00020,RC00037,RC00041,RC00055</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>LCAT</td>\n", + " <td>LCAT_GERGM</td>\n", + " <td>LCAT</td>\n", + " <td>FUNCTION: Central enzyme in the extracellular ...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1414</td>\n", + " <td>AF-O35840-F1</td>\n", + " <td>0.314</td>\n", + " <td>343</td>\n", + " <td>177</td>\n", + " <td>9</td>\n", + " <td>62</td>\n", + " <td>397</td>\n", + " <td>1</td>\n", + " <td>292</td>\n", + " <td>...</td>\n", + " <td>RC00020,RC00037,RC00041,RC00055</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>LCAT</td>\n", + " <td>LCAT_GERGM</td>\n", + " <td>LCAT</td>\n", + " <td>FUNCTION: Central enzyme in the extracellular ...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1413</td>\n", + " <td>AF-O35840-F1</td>\n", + " <td>0.350</td>\n", + " <td>134</td>\n", + " <td>84</td>\n", + " <td>3</td>\n", + " <td>44</td>\n", + " <td>175</td>\n", + " <td>160</td>\n", + " <td>292</td>\n", + " <td>...</td>\n", + " <td>RC00020,RC00037,RC00041,RC00055</td>\n", + 
" <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>LCAT</td>\n", + " <td>LCAT_GERGM</td>\n", + " <td>LCAT</td>\n", + " <td>FUNCTION: Central enzyme in the extracellular ...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9999</td>\n", + " <td>AF-O35840-F1</td>\n", + " <td>0.235</td>\n", + " <td>140</td>\n", + " <td>99</td>\n", + " <td>5</td>\n", + " <td>73</td>\n", + " <td>209</td>\n", + " <td>158</td>\n", + " <td>292</td>\n", + " <td>...</td>\n", + " <td>RC00020,RC00037,RC00041,RC00055</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>LCAT</td>\n", + " <td>LCAT_GERGM</td>\n", + " <td>LCAT</td>\n", + " <td>FUNCTION: Central enzyme in the extracellular ...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>26188</td>\n", + " <td>AF-O35840-F1</td>\n", + " <td>0.414</td>\n", + " <td>82</td>\n", + " <td>47</td>\n", + " <td>1</td>\n", + " <td>74</td>\n", + " <td>154</td>\n", + " <td>158</td>\n", + " <td>239</td>\n", + " <td>...</td>\n", + " <td>RC00020,RC00037,RC00041,RC00055</td>\n", + " <td>ko00000,ko00001,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>LCAT</td>\n", + " <td>LCAT_GERGM</td>\n", + " <td>LCAT</td>\n", + " <td>FUNCTION: Central enzyme in the extracellular ...</td>\n", + " <td>Chordata</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11935340</th>\n", + 
" <td>7835</td>\n", + " <td>AF-Q8DKL5-F1</td>\n", + " <td>0.145</td>\n", + " <td>48</td>\n", + " <td>41</td>\n", + " <td>0</td>\n", + " <td>7</td>\n", + " <td>54</td>\n", + " <td>240</td>\n", + " <td>287</td>\n", + " <td>...</td>\n", + " <td>RC00004,RC00039,RC00041</td>\n", + " <td>ko00000,ko00001,ko00002,ko01000,ko01004</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>FA_synthesis</td>\n", + " <td>PLSX_THEVB</td>\n", + " <td>plsX tlr0844</td>\n", + " <td>FUNCTION: Catalyzes the reversible formation o...</td>\n", + " <td>Cyanobacteria</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11935341</th>\n", + " <td>7835</td>\n", + " <td>AF-Q5N5X4-F1</td>\n", + " <td>0.090</td>\n", + " <td>44</td>\n", + " <td>40</td>\n", + " <td>0</td>\n", + " <td>7</td>\n", + " <td>50</td>\n", + " <td>236</td>\n", + " <td>279</td>\n", + " <td>...</td>\n", + " <td>RC00004,RC00039,RC00041</td>\n", + " <td>ko00000,ko00001,ko00002,ko01000,ko01004</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>FA_synthesis</td>\n", + " <td>PLSX_SYNP6</td>\n", + " <td>plsX syc0103_c</td>\n", + " <td>FUNCTION: Catalyzes the reversible formation o...</td>\n", + " <td>Cyanobacteria</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11935342</th>\n", + " <td>8001</td>\n", + " <td>AF-Q5F786-F1</td>\n", + " <td>0.250</td>\n", + " <td>24</td>\n", + " <td>18</td>\n", + " <td>0</td>\n", + " <td>13</td>\n", + " <td>36</td>\n", + " <td>220</td>\n", + " <td>243</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>ko00000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>DUF711</td>\n", + " <td>Y1297_NEIG1</td>\n", + " <td>NGO1297</td>\n", + " <td>NaN</td>\n", + " <td>Proteobacteria</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11935343</th>\n", + " <td>8001</td>\n", + " <td>AF-C5CE71-F1</td>\n", + " <td>0.105</td>\n", + " <td>104</td>\n", + " <td>72</td>\n", + " <td>7</td>\n", + " <td>12</td>\n", + " <td>109</td>\n", + " <td>137</td>\n", + " <td>225</td>\n", 
+ " <td>...</td>\n", + " <td>RC00002,RC00078</td>\n", + " <td>ko00000,ko00001,ko00002,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>CobS</td>\n", + " <td>COBS_KOSOT</td>\n", + " <td>cobS Kole_0456</td>\n", + " <td>FUNCTION: Joins adenosylcobinamide-GDP and alp...</td>\n", + " <td>Thermotogae</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11935344</th>\n", + " <td>8001</td>\n", + " <td>AF-G3KIM4-F1</td>\n", + " <td>0.033</td>\n", + " <td>120</td>\n", + " <td>71</td>\n", + " <td>6</td>\n", + " <td>23</td>\n", + " <td>111</td>\n", + " <td>161</td>\n", + " <td>266</td>\n", + " <td>...</td>\n", + " <td>RC00002,RC00818,RC01839</td>\n", + " <td>ko00000,ko00001,ko00002,ko01000</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>HGD-D</td>\n", + " <td>LCDA_ANAPI</td>\n", + " <td>lcdA</td>\n", + " <td>FUNCTION: Involved in the acrylate pathway for...</td>\n", + " <td>Firmicutes</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>11935345 rows × 38 columns</p>\n", + "</div>" + ], + "text/plain": [ + " query target seq. id. alignment length no. mismatches \\\n", + "0 10000 AF-O35840-F1 0.251 139 91 \n", + "1 1414 AF-O35840-F1 0.314 343 177 \n", + "2 1413 AF-O35840-F1 0.350 134 84 \n", + "3 9999 AF-O35840-F1 0.235 140 99 \n", + "4 26188 AF-O35840-F1 0.414 82 47 \n", + "... ... ... ... ... ... \n", + "11935340 7835 AF-Q8DKL5-F1 0.145 48 41 \n", + "11935341 7835 AF-Q5N5X4-F1 0.090 44 40 \n", + "11935342 8001 AF-Q5F786-F1 0.250 24 18 \n", + "11935343 8001 AF-C5CE71-F1 0.105 104 72 \n", + "11935344 8001 AF-G3KIM4-F1 0.033 120 71 \n", + "\n", + " no. gap open query start target start query end target end ... \\\n", + "0 4 4 136 161 292 ... \n", + "1 9 62 397 1 292 ... \n", + "2 3 44 175 160 292 ... \n", + "3 5 73 209 158 292 ... \n", + "4 1 74 154 158 239 ... \n", + "... ... ... ... ... ... ... \n", + "11935340 0 7 54 240 287 ... \n", + "11935341 0 7 50 236 279 ... \n", + "11935342 0 13 36 220 243 ... 
\n", + "11935343 7 12 109 137 225 ... \n", + "11935344 6 23 111 161 266 ... \n", + "\n", + " KEGG_rclass \\\n", + "0 RC00020,RC00037,RC00041,RC00055 \n", + "1 RC00020,RC00037,RC00041,RC00055 \n", + "2 RC00020,RC00037,RC00041,RC00055 \n", + "3 RC00020,RC00037,RC00041,RC00055 \n", + "4 RC00020,RC00037,RC00041,RC00055 \n", + "... ... \n", + "11935340 RC00004,RC00039,RC00041 \n", + "11935341 RC00004,RC00039,RC00041 \n", + "11935342 - \n", + "11935343 RC00002,RC00078 \n", + "11935344 RC00002,RC00818,RC01839 \n", + "\n", + " BRITE KEGG_TC CAZy BiGG_Reaction \\\n", + "0 ko00000,ko00001,ko01000 - - - \n", + "1 ko00000,ko00001,ko01000 - - - \n", + "2 ko00000,ko00001,ko01000 - - - \n", + "3 ko00000,ko00001,ko01000 - - - \n", + "4 ko00000,ko00001,ko01000 - - - \n", + "... ... ... ... ... \n", + "11935340 ko00000,ko00001,ko00002,ko01000,ko01004 - - - \n", + "11935341 ko00000,ko00001,ko00002,ko01000,ko01004 - - - \n", + "11935342 ko00000 - - - \n", + "11935343 ko00000,ko00001,ko00002,ko01000 - - - \n", + "11935344 ko00000,ko00001,ko00002,ko01000 - - - \n", + "\n", + " PFAMs Entry name Gene names \\\n", + "0 LCAT LCAT_GERGM LCAT \n", + "1 LCAT LCAT_GERGM LCAT \n", + "2 LCAT LCAT_GERGM LCAT \n", + "3 LCAT LCAT_GERGM LCAT \n", + "4 LCAT LCAT_GERGM LCAT \n", + "... ... ... ... \n", + "11935340 FA_synthesis PLSX_THEVB plsX tlr0844 \n", + "11935341 FA_synthesis PLSX_SYNP6 plsX syc0103_c \n", + "11935342 DUF711 Y1297_NEIG1 NGO1297 \n", + "11935343 CobS COBS_KOSOT cobS Kole_0456 \n", + "11935344 HGD-D LCDA_ANAPI lcdA \n", + "\n", + " Function [CC] \\\n", + "0 FUNCTION: Central enzyme in the extracellular ... \n", + "1 FUNCTION: Central enzyme in the extracellular ... \n", + "2 FUNCTION: Central enzyme in the extracellular ... \n", + "3 FUNCTION: Central enzyme in the extracellular ... \n", + "4 FUNCTION: Central enzyme in the extracellular ... \n", + "... ... \n", + "11935340 FUNCTION: Catalyzes the reversible formation o... 
\n", + "11935341 FUNCTION: Catalyzes the reversible formation o... \n", + "11935342 NaN \n", + "11935343 FUNCTION: Joins adenosylcobinamide-GDP and alp... \n", + "11935344 FUNCTION: Involved in the acrylate pathway for... \n", + "\n", + " Taxonomic lineage (PHYLUM) \n", + "0 Chordata \n", + "1 Chordata \n", + "2 Chordata \n", + "3 Chordata \n", + "4 Chordata \n", + "... ... \n", + "11935340 Cyanobacteria \n", + "11935341 Cyanobacteria \n", + "11935342 Proteobacteria \n", + "11935343 Thermotogae \n", + "11935344 Firmicutes \n", + "\n", + "[11935345 rows x 38 columns]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "swp_merge" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "511b875a-bdc8-43f9-8e33-34ad491d0487", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "alphafold['query'] = alphafold.index" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "f4407b86-a8b0-4ef1-8c98-3309c0084b15", + "metadata": {}, + "outputs": [], + "source": [ + "alphafold.reset_index(drop=True, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "5df8b919-554a-4df0-9e37-1e70f974f3e6", + "metadata": {}, + "source": [ + "Merge tables with alphafold results" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "b43d8449-d8be-468b-96b4-ee72531cb3ad", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_res = pd.merge(pdb_merge, alphafold, on=\"query\")" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "22c0968a-140f-4ce4-a65a-cc4d668a424a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "c3376_g1 6368\n", + "c105400_g1 6368\n", + "c94782_g1 6076\n", + "c43224_g1 6002\n", + "c89761_g1 5787\n", + " ... 
\n", + "c87906_g2 1\n", + "c111030_g1 1\n", + "c99629_g1 1\n", + "c96209_g1 1\n", + "c74333_g1 1\n", + "Name: gene_id, Length: 10157, dtype: int64" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdb_res.gene_id.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "b8f492f6-450d-484f-ba07-8a396ea6c314", + "metadata": {}, + "source": [ + "Translate the UniProt IDs to gene names; whatever obtained a UniProt ID should have a gene name, and this will be our fallback option if emapper annotation is not present:" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "a567e55d-761f-4ca7-a2c1-8a77f24749a6", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_res = pd.merge(afdb_merge, alphafold, on=\"query\")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "7dec1532-b05e-496c-aa06-0dfa1b70e637", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "c102030_g2 330\n", + "c98824_g1 328\n", + "c96693_g1 326\n", + "c106372_g1 326\n", + "c104839_g2 325\n", + " ... \n", + "c95444_g1 38\n", + "c44058_g1 34\n", + "c105808_g1 21\n", + "c112778_g1 5\n", + "c78729_g1 5\n", + "Name: gene_id, Length: 29386, dtype: int64" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "afdb_res.gene_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "dc9c73b6-ec0c-4744-9d3c-f34a46d86908", + "metadata": {}, + "outputs": [], + "source": [ + "swp_res = pd.merge(swp_merge, alphafold, on=\"query\")" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "0ccb5e8c-0ecf-474a-8464-71700626ccdd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "c94230_g1 311\n", + "c103036_g1 310\n", + "c94975_g1 308\n", + "c105925_g1 308\n", + "c103624_g1 307\n", + " ... 
\n", + "c101908_g2 10\n", + "c44058_g1 9\n", + "c95444_g1 7\n", + "c103292_g1 6\n", + "c105808_g1 4\n", + "Name: gene_id, Length: 29385, dtype: int64" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "swp_res.gene_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "cdda4572-1cf5-4fd1-9c3b-5cf5f50c3e2f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['query',\n", + " 'target',\n", + " 'seq. id.',\n", + " 'alignment length',\n", + " 'no. mismatches',\n", + " 'no. gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end',\n", + " 'e value',\n", + " 'bit score',\n", + " 'uniprot',\n", + " '#query',\n", + " 'seed_ortholog',\n", + " 'evalue',\n", + " 'score',\n", + " 'eggNOG_OGs',\n", + " 'max_annot_lvl',\n", + " 'COG_category',\n", + " 'Description',\n", + " 'Preferred_name',\n", + " 'GOs',\n", + " 'EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction',\n", + " 'PFAMs',\n", + " 'Entry name',\n", + " 'Gene names',\n", + " 'Function [CC]',\n", + " 'Taxonomic lineage (PHYLUM)',\n", + " 'plddt',\n", + " 'MSA size',\n", + " 'query length',\n", + " 'gene name',\n", + " 'protein_id',\n", + " 'gene_id']" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(pdb_res.columns)" + ] + }, + { + "cell_type": "markdown", + "id": "94bd0480-b089-48fe-8841-9f839beaf7b8", + "metadata": { + "tags": [] + }, + "source": [ + "Before I add the eggnog information from the sponge proteome, I need to drop and rename some of the columns" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "b866d26f-94b5-4875-ba4f-45003d74ce1f", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_res.rename(columns = 
{'query':'fs_query', 'target':'fs_target', 'e value':'fs_e value', 'bit score':\"fs_bit score\", 'uniprot':'fs_target_uniprot', 'evalue':'fs_target_eggnog_evalue', \n", + " 'score':'fs_target_eggnog_score', 'eggNOG_OGs':'fs_target_eggnogOGs', 'max_annot_lvl':'fs_target_max_annot_lvl','COG_category':'fs_target_COG_category',\n", + " 'Description':'fs_target_Description', 'Preferred_name':'fs_target_Preferred_name', 'GOs':'fs_target_GOs', 'Entry name':'fs_target_Entry name', \n", + " 'Gene names':'fs_target_Gene names', 'Function [CC]':'fs_target_Function [CC]', 'Taxonomic lineage (PHYLUM)':'fs_target_Taxonomic lineage (PHYLUM)', 'PFAMs':'fs_target_PFAMs'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "3b7c0cc2-a740-42cb-bf08-09154963e4fd", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_res.drop(['no. mismatches',\n", + " 'no. gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end','#query',\n", + " 'seed_ortholog','EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "0d510c96-ebaf-48f0-aa59-e6070dcecba0", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_res.rename(columns = {'query':'fs_query', 'target':'fs_target', 'e value':'fs_e value', 'bit score':\"fs_bit score\", 'uniprot':'fs_target_uniprot', 'evalue':'fs_target_eggnog_evalue', \n", + " 'score':'fs_target_eggnog_score', 'eggNOG_OGs':'fs_target_eggnogOGs', 'max_annot_lvl':'fs_target_max_annot_lvl','COG_category':'fs_target_COG_category',\n", + " 'Description':'fs_target_Description', 'Preferred_name':'fs_target_Preferred_name', 'GOs':'fs_target_GOs', 'Entry name':'fs_target_Entry name', \n", + " 'Gene names':'fs_target_Gene names', 'Function [CC]':'fs_target_Function [CC]', 
'Taxonomic lineage (PHYLUM)':'fs_target_Taxonomic lineage (PHYLUM)', 'PFAMs':'fs_target_PFAMs'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "d384509d-212c-4ad7-9fe8-897d7048cdf0", + "metadata": {}, + "outputs": [], + "source": [ + "afdb_res.drop(['no. mismatches',\n", + " 'no. gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end','#query',\n", + " 'seed_ortholog','EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "78d517da-65a2-4ac6-a007-d3d3c6c3ddae", + "metadata": {}, + "outputs": [], + "source": [ + "swp_res.rename(columns = {'query':'fs_query', 'target':'fs_target', 'e value':'fs_e value', 'bit score':\"fs_bit score\", 'uniprot':'fs_target_uniprot', 'evalue':'fs_target_eggnog_evalue', \n", + " 'score':'fs_target_eggnog_score', 'eggNOG_OGs':'fs_target_eggnogOGs', 'max_annot_lvl':'fs_target_max_annot_lvl','COG_category':'fs_target_COG_category',\n", + " 'Description':'fs_target_Description', 'Preferred_name':'fs_target_Preferred_name', 'GOs':'fs_target_GOs', 'Entry name':'fs_target_Entry name', \n", + " 'Gene names':'fs_target_Gene names', 'Function [CC]':'fs_target_Function [CC]', 'Taxonomic lineage (PHYLUM)':'fs_target_Taxonomic lineage (PHYLUM)', 'PFAMs':'fs_target_PFAMs'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "00cdb496-4bd6-40ac-a22d-ac1f5e4ba960", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "swp_res.drop(['no. mismatches',\n", + " 'no. 
gap open',\n", + " 'query start',\n", + " 'target start',\n", + " 'query end',\n", + " 'target end','#query',\n", + " 'seed_ortholog','EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "808506eb-e776-422f-8c5f-de068d8ba97b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['fs_query',\n", + " 'fs_target',\n", + " 'seq. id.',\n", + " 'alignment length',\n", + " 'fs_e value',\n", + " 'fs_bit score',\n", + " 'fs_target_uniprot',\n", + " 'fs_target_eggnog_evalue',\n", + " 'fs_target_eggnog_score',\n", + " 'fs_target_eggnogOGs',\n", + " 'fs_target_max_annot_lvl',\n", + " 'fs_target_COG_category',\n", + " 'fs_target_Description',\n", + " 'fs_target_Preferred_name',\n", + " 'fs_target_GOs',\n", + " 'fs_target_PFAMs',\n", + " 'fs_target_Entry name',\n", + " 'fs_target_Gene names',\n", + " 'fs_target_Function [CC]',\n", + " 'fs_target_Taxonomic lineage (PHYLUM)',\n", + " 'plddt',\n", + " 'MSA size',\n", + " 'query length',\n", + " 'gene name',\n", + " 'protein_id',\n", + " 'gene_id']" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(afdb_res.columns)" + ] + }, + { + "cell_type": "markdown", + "id": "74a8129b-5ae0-4b7f-9a26-7bf0947670b5", + "metadata": {}, + "source": [ + "Now the question is how to subset these huge tables a little bit more so we can work with them properly. The best way would be to filter based on fs bit score and/or fs_target_eggnogOGs. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d37e7a82-8017-4b16-9419-1cb45d41858d", + "metadata": {}, + "outputs": [], + "source": [ + "#pdb_seq = enrich_from_uniprot(pdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\n", + "# swp_seq = enrich_from_uniprot(swp, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")\n", + "#afdb_seq = enrich_from_uniprot(afdb, \"uniprot\", \"gene name\", uniprot_from=\"ACC\", uniprot_to=\"GENENAME\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de9d5f80-9ac8-4a14-a628-1e99af934403", + "metadata": {}, + "outputs": [], + "source": [ + "#pdb_seq = enrich_from_uniprot(pdb_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")\n", + "#swp_seq = enrich_from_uniprot(swp_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")\n", + "#afdb_seq = enrich_from_uniprot(afdb_seq, \"uniprot\", \"eggnog\", uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\")" + ] + }, + { + "cell_type": "markdown", + "id": "8ec832ac-8525-438d-819a-cab2232f574f", + "metadata": {}, + "source": [ + "Cross-reference the eggNOG IDs with the eggNOG annotation that gives a nice name/description for each (most) orthogroup:" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "33c69bec-6a43-49c2-8ea8-3f011fcf86c4", + "metadata": {}, + "outputs": [], + "source": [ + "eggnog = pd.read_csv(\"/g/arendt/Fabian/PhD/Computational/Spongefold/spongilla_eggnog.tsv\", sep=\"\\t\", skiprows=4, skipfooter=3, engine='python')" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "4247ce9f-18b7-4fa1-a6a6-58218ef6e797", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " 
.dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>#query</th>\n", + " <th>seed_ortholog</th>\n", + " <th>evalue</th>\n", + " <th>score</th>\n", + " <th>eggNOG_OGs</th>\n", + " <th>max_annot_lvl</th>\n", + " <th>COG_category</th>\n", + " <th>Description</th>\n", + " <th>Preferred_name</th>\n", + " <th>GOs</th>\n", + " <th>...</th>\n", + " <th>KEGG_ko</th>\n", + " <th>KEGG_Pathway</th>\n", + " <th>KEGG_Module</th>\n", + " <th>KEGG_Reaction</th>\n", + " <th>KEGG_rclass</th>\n", + " <th>BRITE</th>\n", + " <th>KEGG_TC</th>\n", + " <th>CAZy</th>\n", + " <th>BiGG_Reaction</th>\n", + " <th>PFAMs</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>c100000_g1_i1_m.41809</td>\n", + " <td>400682.PAC_15712888</td>\n", + " <td>6.690000e-72</td>\n", + " <td>242.0</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>c100000_g1_i2_m.41814</td>\n", + " <td>400682.PAC_15712888</td>\n", + " <td>1.640000e-13</td>\n", + " <td>77.8</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", 
+ " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>c100000_g2_i1_m.41818</td>\n", + " <td>400682.PAC_15712888</td>\n", + " <td>3.350000e-33</td>\n", + " <td>135.0</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>c100001_g1_i2_m.41826</td>\n", + " <td>400682.PAC_15716590</td>\n", + " <td>5.880000e-48</td>\n", + " <td>176.0</td>\n", + " <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>A</td>\n", + " <td>RNA secondary structure unwinding</td>\n", + " <td>DDX46</td>\n", + " <td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n", + " <td>...</td>\n", + " <td>ko:K12811</td>\n", + " <td>ko03040,map03040</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko00001,ko01000,ko03041</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>DEAD,Helicase_C</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>c100001_g2_i1_m.41829</td>\n", + " <td>400682.PAC_15716590</td>\n", + " <td>1.220000e-305</td>\n", + " <td>868.0</td>\n", + " <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>A</td>\n", + " <td>RNA secondary structure unwinding</td>\n", + " <td>DDX46</td>\n", + " <td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n", + " <td>...</td>\n", + " <td>ko:K12811</td>\n", + " <td>ko03040,map03040</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko00001,ko01000,ko03041</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " 
<td>DEAD,Helicase_C</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24193</th>\n", + " <td>c99995_g1_i1_m.41796</td>\n", + " <td>45351.EDO40823</td>\n", + " <td>3.090000e-120</td>\n", + " <td>352.0</td>\n", + " <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>L</td>\n", + " <td>protein-DNA loading ATPase activity</td>\n", + " <td>RFC3</td>\n", + " <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n", + " <td>...</td>\n", + " <td>ko:K10756</td>\n", + " <td>ko03030,ko03420,ko03430,map03030,map03420,map0...</td>\n", + " <td>M00289,M00295</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko00001,ko00002,ko03032,ko03036,ko03400</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>DNA_pol3_delta2,Rep_fac_C</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24194</th>\n", + " <td>c99995_g2_i1_m.41797</td>\n", + " <td>109478.XP_005883212.1</td>\n", + " <td>1.570000e-71</td>\n", + " <td>226.0</td>\n", + " <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>L</td>\n", + " <td>protein-DNA loading ATPase activity</td>\n", + " <td>RFC3</td>\n", + " <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n", + " <td>...</td>\n", + " <td>ko:K10756</td>\n", + " <td>ko03030,ko03420,ko03430,map03030,map03420,map0...</td>\n", + " <td>M00289,M00295</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko00001,ko00002,ko03032,ko03036,ko03400</td>\n", + " 
<td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>DNA_pol3_delta2,Rep_fac_C</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24195</th>\n", + " <td>c99997_g2_i1_m.41801</td>\n", + " <td>7739.XP_002612114.1</td>\n", + " <td>1.150000e-205</td>\n", + " <td>583.0</td>\n", + " <td>COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>C</td>\n", + " <td>FAD binding domain</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>FAD_binding_3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24196</th>\n", + " <td>c99998_g1_i1_m.41804</td>\n", + " <td>400682.PAC_15712215</td>\n", + " <td>2.240000e-15</td>\n", + " <td>89.7</td>\n", + " <td>COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>T</td>\n", + " <td>positive regulation of MDA-5 signaling pathway</td>\n", + " <td>ANKRD17</td>\n", + " <td>GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO...</td>\n", + " <td>...</td>\n", + " <td>ko:K16726</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>ko00000,ko03036</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>Ank_2,Ank_4,KH_1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24197</th>\n", + " <td>c99999_g1_i1_m.41806</td>\n", + " <td>400682.PAC_15718953</td>\n", + " <td>5.030000e-29</td>\n", + " <td>111.0</td>\n", + " <td>2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>...</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + 
"<p>24198 rows × 21 columns</p>\n", + "</div>" + ], + "text/plain": [ + " #query seed_ortholog evalue score \\\n", + "0 c100000_g1_i1_m.41809 400682.PAC_15712888 6.690000e-72 242.0 \n", + "1 c100000_g1_i2_m.41814 400682.PAC_15712888 1.640000e-13 77.8 \n", + "2 c100000_g2_i1_m.41818 400682.PAC_15712888 3.350000e-33 135.0 \n", + "3 c100001_g1_i2_m.41826 400682.PAC_15716590 5.880000e-48 176.0 \n", + "4 c100001_g2_i1_m.41829 400682.PAC_15716590 1.220000e-305 868.0 \n", + "... ... ... ... ... \n", + "24193 c99995_g1_i1_m.41796 45351.EDO40823 3.090000e-120 352.0 \n", + "24194 c99995_g2_i1_m.41797 109478.XP_005883212.1 1.570000e-71 226.0 \n", + "24195 c99997_g2_i1_m.41801 7739.XP_002612114.1 1.150000e-205 583.0 \n", + "24196 c99998_g1_i1_m.41804 400682.PAC_15712215 2.240000e-15 89.7 \n", + "24197 c99999_g1_i1_m.41806 400682.PAC_15718953 5.030000e-29 111.0 \n", + "\n", + " eggNOG_OGs max_annot_lvl \\\n", + "0 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "1 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "2 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "3 COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33... 33208|Metazoa \n", + "4 COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33... 33208|Metazoa \n", + "... ... ... \n", + "24193 COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33... 33208|Metazoa \n", + "24194 COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33... 33208|Metazoa \n", + "24195 COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315... 33208|Metazoa \n", + "24196 COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk... 33208|Metazoa \n", + "24197 2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|... 33208|Metazoa \n", + "\n", + " COG_category Description \\\n", + "0 I CRAL/TRIO domain \n", + "1 I CRAL/TRIO domain \n", + "2 I CRAL/TRIO domain \n", + "3 A RNA secondary structure unwinding \n", + "4 A RNA secondary structure unwinding \n", + "... ... ... 
\n", + "24193 L protein-DNA loading ATPase activity \n", + "24194 L protein-DNA loading ATPase activity \n", + "24195 C FAD binding domain \n", + "24196 T positive regulation of MDA-5 signaling pathway \n", + "24197 - - \n", + "\n", + " Preferred_name GOs ... \\\n", + "0 MOSPD2 - ... \n", + "1 MOSPD2 - ... \n", + "2 MOSPD2 - ... \n", + "3 DDX46 GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO... ... \n", + "4 DDX46 GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO... ... \n", + "... ... ... ... \n", + "24193 RFC3 GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO... ... \n", + "24194 RFC3 GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO... ... \n", + "24195 - - ... \n", + "24196 ANKRD17 GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO... ... \n", + "24197 - - ... \n", + "\n", + " KEGG_ko KEGG_Pathway \\\n", + "0 - - \n", + "1 - - \n", + "2 - - \n", + "3 ko:K12811 ko03040,map03040 \n", + "4 ko:K12811 ko03040,map03040 \n", + "... ... ... \n", + "24193 ko:K10756 ko03030,ko03420,ko03430,map03030,map03420,map0... \n", + "24194 ko:K10756 ko03030,ko03420,ko03430,map03030,map03420,map0... \n", + "24195 - - \n", + "24196 ko:K16726 - \n", + "24197 - - \n", + "\n", + " KEGG_Module KEGG_Reaction KEGG_rclass \\\n", + "0 - - - \n", + "1 - - - \n", + "2 - - - \n", + "3 - - - \n", + "4 - - - \n", + "... ... ... ... \n", + "24193 M00289,M00295 - - \n", + "24194 M00289,M00295 - - \n", + "24195 - - - \n", + "24196 - - - \n", + "24197 - - - \n", + "\n", + " BRITE KEGG_TC CAZy \\\n", + "0 - - - \n", + "1 - - - \n", + "2 - - - \n", + "3 ko00000,ko00001,ko01000,ko03041 - - \n", + "4 ko00000,ko00001,ko01000,ko03041 - - \n", + "... ... ... ... 
\n", + "24193 ko00000,ko00001,ko00002,ko03032,ko03036,ko03400 - - \n", + "24194 ko00000,ko00001,ko00002,ko03032,ko03036,ko03400 - - \n", + "24195 - - - \n", + "24196 ko00000,ko03036 - - \n", + "24197 - - - \n", + "\n", + " BiGG_Reaction PFAMs \n", + "0 - CRAL_TRIO,Motile_Sperm \n", + "1 - CRAL_TRIO,Motile_Sperm \n", + "2 - CRAL_TRIO,Motile_Sperm \n", + "3 - DEAD,Helicase_C \n", + "4 - DEAD,Helicase_C \n", + "... ... ... \n", + "24193 - DNA_pol3_delta2,Rep_fac_C \n", + "24194 - DNA_pol3_delta2,Rep_fac_C \n", + "24195 - FAD_binding_3 \n", + "24196 - Ank_2,Ank_4,KH_1 \n", + "24197 - - \n", + "\n", + "[24198 rows x 21 columns]" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eggnog[\"gene_id\"] = eggnog[\"#query\"].str.split(\"_\").str[:2].str.join(\"_\")\n", + "eggnog[['gene_id', 'protein_id']] = eggnog['#query'].str.split('_')., 1, expand=True\n", + "eggnog[\"#query\"].str.split(\"_\").str[:2].str.join(\"_\")" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "f3756118-c34c-4e6d-8954-c94473601758", + "metadata": {}, + "outputs": [], + "source": [ + "eggnog[\"gene_id\"] = eggnog[\"#query\"].str.split(\"_\").str[:2].str.join(\"_\")\n", + "eggnog[\"protein_id\"] = eggnog[\"#query\"].str.split(\".\").str[1]\n", + "eggnog.drop(['#query',\n", + " 'seed_ortholog',\n", + " 'EC',\n", + " 'KEGG_ko',\n", + " 'KEGG_Pathway',\n", + " 'KEGG_Module',\n", + " 'KEGG_Reaction',\n", + " 'KEGG_rclass',\n", + " 'BRITE',\n", + " 'KEGG_TC',\n", + " 'CAZy',\n", + " 'BiGG_Reaction'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "18507e2d-7055-44d7-ac36-0d439c05e34c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th 
{\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>evalue</th>\n", + " <th>score</th>\n", + " <th>eggNOG_OGs</th>\n", + " <th>max_annot_lvl</th>\n", + " <th>COG_category</th>\n", + " <th>Description</th>\n", + " <th>Preferred_name</th>\n", + " <th>GOs</th>\n", + " <th>PFAMs</th>\n", + " <th>gene_id</th>\n", + " <th>protein_id</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>6.690000e-72</td>\n", + " <td>242.0</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", + " <td>c100000_g1</td>\n", + " <td>41809</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1.640000e-13</td>\n", + " <td>77.8</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", + " <td>c100000_g1</td>\n", + " <td>41814</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>3.350000e-33</td>\n", + " <td>135.0</td>\n", + " <td>COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>I</td>\n", + " <td>CRAL/TRIO domain</td>\n", + " <td>MOSPD2</td>\n", + " <td>-</td>\n", + " <td>CRAL_TRIO,Motile_Sperm</td>\n", + " <td>c100000_g2</td>\n", + " <td>41818</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>5.880000e-48</td>\n", + " <td>176.0</td>\n", + " <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>A</td>\n", + " <td>RNA secondary structure unwinding</td>\n", + " <td>DDX46</td>\n", + " 
<td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n", + " <td>DEAD,Helicase_C</td>\n", + " <td>c100001_g1</td>\n", + " <td>41826</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1.220000e-305</td>\n", + " <td>868.0</td>\n", + " <td>COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>A</td>\n", + " <td>RNA secondary structure unwinding</td>\n", + " <td>DDX46</td>\n", + " <td>GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO...</td>\n", + " <td>DEAD,Helicase_C</td>\n", + " <td>c100001_g2</td>\n", + " <td>41829</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24193</th>\n", + " <td>3.090000e-120</td>\n", + " <td>352.0</td>\n", + " <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>L</td>\n", + " <td>protein-DNA loading ATPase activity</td>\n", + " <td>RFC3</td>\n", + " <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n", + " <td>DNA_pol3_delta2,Rep_fac_C</td>\n", + " <td>c99995_g1</td>\n", + " <td>41796</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24194</th>\n", + " <td>1.570000e-71</td>\n", + " <td>226.0</td>\n", + " <td>COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>L</td>\n", + " <td>protein-DNA loading ATPase activity</td>\n", + " <td>RFC3</td>\n", + " <td>GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO...</td>\n", + " <td>DNA_pol3_delta2,Rep_fac_C</td>\n", + " <td>c99995_g2</td>\n", + " <td>41797</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24195</th>\n", + " <td>1.150000e-205</td>\n", + " <td>583.0</td>\n", + " <td>COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>C</td>\n", 
+ " <td>FAD binding domain</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>FAD_binding_3</td>\n", + " <td>c99997_g2</td>\n", + " <td>41801</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24196</th>\n", + " <td>2.240000e-15</td>\n", + " <td>89.7</td>\n", + " <td>COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>T</td>\n", + " <td>positive regulation of MDA-5 signaling pathway</td>\n", + " <td>ANKRD17</td>\n", + " <td>GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO...</td>\n", + " <td>Ank_2,Ank_4,KH_1</td>\n", + " <td>c99998_g1</td>\n", + " <td>41804</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24197</th>\n", + " <td>5.030000e-29</td>\n", + " <td>111.0</td>\n", + " <td>2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|...</td>\n", + " <td>33208|Metazoa</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>-</td>\n", + " <td>c99999_g1</td>\n", + " <td>41806</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>24198 rows × 11 columns</p>\n", + "</div>" + ], + "text/plain": [ + " evalue score \\\n", + "0 6.690000e-72 242.0 \n", + "1 1.640000e-13 77.8 \n", + "2 3.350000e-33 135.0 \n", + "3 5.880000e-48 176.0 \n", + "4 1.220000e-305 868.0 \n", + "... ... ... \n", + "24193 3.090000e-120 352.0 \n", + "24194 1.570000e-71 226.0 \n", + "24195 1.150000e-205 583.0 \n", + "24196 2.240000e-15 89.7 \n", + "24197 5.030000e-29 111.0 \n", + "\n", + " eggNOG_OGs max_annot_lvl \\\n", + "0 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "1 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "2 COG5066@1|root,KOG1471@1|root,KOG0439@2759|Euk... 33208|Metazoa \n", + "3 COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33... 33208|Metazoa \n", + "4 COG0513@1|root,KOG0334@2759|Eukaryota,38VUQ@33... 33208|Metazoa \n", + "... ... ... \n", + "24193 COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33... 
33208|Metazoa \n", + "24194 COG0470@1|root,KOG2035@2759|Eukaryota,38FUD@33... 33208|Metazoa \n", + "24195 COG0644@1|root,2QW6Y@2759|Eukaryota,39YRX@3315... 33208|Metazoa \n", + "24196 COG0666@1|root,KOG4369@1|root,KOG0504@2759|Euk... 33208|Metazoa \n", + "24197 2DI7Z@1|root,2S5Y2@2759|Eukaryota,3A72E@33154|... 33208|Metazoa \n", + "\n", + " COG_category Description \\\n", + "0 I CRAL/TRIO domain \n", + "1 I CRAL/TRIO domain \n", + "2 I CRAL/TRIO domain \n", + "3 A RNA secondary structure unwinding \n", + "4 A RNA secondary structure unwinding \n", + "... ... ... \n", + "24193 L protein-DNA loading ATPase activity \n", + "24194 L protein-DNA loading ATPase activity \n", + "24195 C FAD binding domain \n", + "24196 T positive regulation of MDA-5 signaling pathway \n", + "24197 - - \n", + "\n", + " Preferred_name GOs \\\n", + "0 MOSPD2 - \n", + "1 MOSPD2 - \n", + "2 MOSPD2 - \n", + "3 DDX46 GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO... \n", + "4 DDX46 GO:0000375,GO:0000377,GO:0000398,GO:0001650,GO... \n", + "... ... ... \n", + "24193 RFC3 GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO... \n", + "24194 RFC3 GO:0000723,GO:0000731,GO:0000819,GO:0003674,GO... \n", + "24195 - - \n", + "24196 ANKRD17 GO:0000785,GO:0001568,GO:0001654,GO:0001745,GO... \n", + "24197 - - \n", + "\n", + " PFAMs gene_id protein_id \n", + "0 CRAL_TRIO,Motile_Sperm c100000_g1 41809 \n", + "1 CRAL_TRIO,Motile_Sperm c100000_g1 41814 \n", + "2 CRAL_TRIO,Motile_Sperm c100000_g2 41818 \n", + "3 DEAD,Helicase_C c100001_g1 41826 \n", + "4 DEAD,Helicase_C c100001_g2 41829 \n", + "... ... ... ... 
\n", + "24193 DNA_pol3_delta2,Rep_fac_C c99995_g1 41796 \n", + "24194 DNA_pol3_delta2,Rep_fac_C c99995_g2 41797 \n", + "24195 FAD_binding_3 c99997_g2 41801 \n", + "24196 Ank_2,Ank_4,KH_1 c99998_g1 41804 \n", + "24197 - c99999_g1 41806 \n", + "\n", + "[24198 rows x 11 columns]" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eggnog" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "60c99abb-679f-43a9-af6f-0fb37159f68a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['evalue',\n", + " 'seq_eggnog_score',\n", + " 'seq_eggnogOGs',\n", + " 'seq_max_annot_lvl',\n", + " 'seq_COG_category',\n", + " 'seq_Description',\n", + " 'seq_Preferred_name',\n", + " 'seq_GOs',\n", + " 'seq_PFAMs',\n", + " 'gene_id',\n", + " 'protein_id']" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(eggnog.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "06223fb1-1492-4723-b621-3d9a13b0b3fe", + "metadata": {}, + "outputs": [], + "source": [ + "eggnog.rename(columns = {'e value':'seq_e value',\n", + " 'score':'seq_eggnog_score', 'eggNOG_OGs':'seq_eggnogOGs', 'max_annot_lvl':'seq_max_annot_lvl','COG_category':'seq_COG_category',\n", + " 'Description':'seq_Description', 'Preferred_name':'seq_Preferred_name', 'GOs':'seq_GOs', 'PFAMs':'seq_PFAMs'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "10d606c3-1bc9-4c78-81c7-279d607b8e90", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " 
<thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>fs_query</th>\n", + " <th>fs_target</th>\n", + " <th>seq. id.</th>\n", + " <th>alignment length</th>\n", + " <th>fs_e value</th>\n", + " <th>fs_bit score</th>\n", + " <th>fs_target_uniprot</th>\n", + " <th>fs_target_eggnog_evalue</th>\n", + " <th>fs_target_eggnog_score</th>\n", + " <th>fs_target_eggnogOGs</th>\n", + " <th>...</th>\n", + " <th>fs_target_Entry name</th>\n", + " <th>fs_target_Gene names</th>\n", + " <th>fs_target_Function [CC]</th>\n", + " <th>fs_target_Taxonomic lineage (PHYLUM)</th>\n", + " <th>plddt</th>\n", + " <th>MSA size</th>\n", + " <th>query length</th>\n", + " <th>gene name</th>\n", + " <th>protein_id</th>\n", + " <th>gene_id</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1442503</th>\n", + " <td>41612</td>\n", + " <td>AF-Q6NYT3-F1</td>\n", + " <td>0.173</td>\n", + " <td>156</td>\n", + " <td>1.632</td>\n", + " <td>90</td>\n", + " <td>Q6NYT3</td>\n", + " <td>1.470000e-217</td>\n", + " <td>600.0</td>\n", + " <td>28N8T@1|root,2QUU4@2759|Eukaryota,39TGP@33154|...</td>\n", + " <td>...</td>\n", + " <td>IER5L_DANRE</td>\n", + " <td>ier5l si:ch211-208h16.10 zgc:77455</td>\n", + " <td>NaN</td>\n", + " <td>Chordata</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442600</th>\n", + " <td>41612</td>\n", + " <td>AF-Q8CCI5-F1</td>\n", + " <td>0.141</td>\n", + " <td>155</td>\n", + " <td>6.330</td>\n", + " <td>68</td>\n", + " <td>Q8CCI5</td>\n", + " <td>8.230000e-132</td>\n", + " <td>377.0</td>\n", + " <td>KOG4477@1|root,KOG4477@2759|Eukaryota,39UAE@33...</td>\n", + " <td>...</td>\n", + " <td>RYBP_MOUSE</td>\n", + " <td>Rybp Dedaf</td>\n", + " <td>FUNCTION: Component of a Polycomb group (PcG) ...</td>\n", + " <td>Chordata</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " 
<td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442521</th>\n", + " <td>41612</td>\n", + " <td>AF-I1MQL7-F1</td>\n", + " <td>0.154</td>\n", + " <td>226</td>\n", + " <td>10.360</td>\n", + " <td>60</td>\n", + " <td>I1MQL7</td>\n", + " <td>0.000000e+00</td>\n", + " <td>947.0</td>\n", + " <td>KOG0724@1|root,KOG0724@2759|Eukaryota,37Q80@33...</td>\n", + " <td>...</td>\n", + " <td>I1MQL7_SOYBN</td>\n", + " <td>778089 GLYMA_16G217700</td>\n", + " <td>NaN</td>\n", + " <td>Streptophyta</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442614</th>\n", + " <td>41612</td>\n", + " <td>AF-Q9XUQ0-F1</td>\n", + " <td>0.140</td>\n", + " <td>235</td>\n", + " <td>11.020</td>\n", + " <td>59</td>\n", + " <td>Q9XUQ0</td>\n", + " <td>0.000000e+00</td>\n", + " <td>961.0</td>\n", + " <td>2ASSX@1|root,2RZQJ@2759|Eukaryota,39UW6@33154|...</td>\n", + " <td>...</td>\n", + " <td>Q9XUQ0_CAEEL</td>\n", + " <td>pqn-67 CELE_T16G1.1 T16G1.1</td>\n", + " <td>NaN</td>\n", + " <td>Nematoda (roundworms)</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442500</th>\n", + " <td>41612</td>\n", + " <td>AF-C1H5I7-F1</td>\n", + " <td>0.119</td>\n", + " <td>151</td>\n", + " <td>11.020</td>\n", + " <td>59</td>\n", + " <td>C1H5I7</td>\n", + " <td>0.000000e+00</td>\n", + " <td>1177.0</td>\n", + " <td>2CNBF@1|root,2QUZZ@2759|Eukaryota,39NHE@33154|...</td>\n", + " <td>...</td>\n", + " <td>C1H5I7_PARBA</td>\n", + " <td>PAAG_06028</td>\n", + " <td>NaN</td>\n", + " <td>Ascomycota</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " 
<td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442707</th>\n", + " <td>41612</td>\n", + " <td>AF-Q0J235-F1</td>\n", + " <td>0.107</td>\n", + " <td>140</td>\n", + " <td>481.100</td>\n", + " <td>-17</td>\n", + " <td>Q0J235</td>\n", + " <td>4.250000e-263</td>\n", + " <td>734.0</td>\n", + " <td>28J99@1|root,2QQZR@2759|Eukaryota,37SB3@33090|...</td>\n", + " <td>...</td>\n", + " <td>ROLL9_ORYSJ</td>\n", + " <td>RL9 SLL1 Os09g0395300 LOC_Os09g23200 B1040D06.24</td>\n", + " <td>FUNCTION: Probable transcription factor that r...</td>\n", + " <td>Streptophyta</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442619</th>\n", + " <td>41612</td>\n", + " <td>AF-P54258-F1</td>\n", + " <td>0.160</td>\n", + " <td>150</td>\n", + " <td>481.100</td>\n", + " <td>-17</td>\n", + " <td>P54258</td>\n", + " <td>0.000000e+00</td>\n", + " <td>1649.0</td>\n", + " <td>KOG2133@1|root,KOG2133@2759|Eukaryota,39T1J@33...</td>\n", + " <td>...</td>\n", + " <td>ATN1_RAT</td>\n", + " <td>Atn1 Drpla</td>\n", + " <td>FUNCTION: Transcriptional corepressor. 
Recruit...</td>\n", + " <td>Chordata</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442625</th>\n", + " <td>41612</td>\n", + " <td>AF-Q8IM56-F1</td>\n", + " <td>0.121</td>\n", + " <td>148</td>\n", + " <td>481.100</td>\n", + " <td>-19</td>\n", + " <td>Q8IM56</td>\n", + " <td>0.000000e+00</td>\n", + " <td>2878.0</td>\n", + " <td>2CMI8@1|root,2QQEE@2759|Eukaryota,3YC5G@5794|A...</td>\n", + " <td>...</td>\n", + " <td>Q8IM56_PLAF7</td>\n", + " <td>PF3D7_1403800</td>\n", + " <td>NaN</td>\n", + " <td>Apicomplexa</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442455</th>\n", + " <td>41612</td>\n", + " <td>AF-P13611-F8</td>\n", + " <td>0.192</td>\n", + " <td>151</td>\n", + " <td>481.100</td>\n", + " <td>-21</td>\n", + " <td>P13611</td>\n", + " <td>0.000000e+00</td>\n", + " <td>6472.0</td>\n", + " <td>28IZN@1|root,2QRBE@2759|Eukaryota,38FU8@33154|...</td>\n", + " <td>...</td>\n", + " <td>CSPG2_HUMAN</td>\n", + " <td>VCAN CSPG2</td>\n", + " <td>FUNCTION: May play a role in intercellular sig...</td>\n", + " <td>Chordata</td>\n", + " <td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1442557</th>\n", + " <td>41612</td>\n", + " <td>AF-Q96KW2-F1</td>\n", + " <td>0.105</td>\n", + " <td>247</td>\n", + " <td>481.100</td>\n", + " <td>-26</td>\n", + " <td>Q96KW2</td>\n", + " <td>0.000000e+00</td>\n", + " <td>1989.0</td>\n", + " <td>28YNY@1|root,2RWWZ@2759|Eukaryota,3AER2@33154|...</td>\n", + " <td>...</td>\n", + " <td>P12L2_HUMAN</td>\n", + " <td>POM121L2 POM121L</td>\n", + " <td>NaN</td>\n", + " <td>Chordata</td>\n", + " 
<td>63.615046</td>\n", + " <td>747.0</td>\n", + " <td>218</td>\n", + " <td>c99854_g1_i2_m.41041</td>\n", + " <td>41041</td>\n", + " <td>c99854_g1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>265 rows × 26 columns</p>\n", + "</div>" + ], + "text/plain": [ + " fs_query fs_target seq. id. alignment length fs_e value \\\n", + "1442503 41612 AF-Q6NYT3-F1 0.173 156 1.632 \n", + "1442600 41612 AF-Q8CCI5-F1 0.141 155 6.330 \n", + "1442521 41612 AF-I1MQL7-F1 0.154 226 10.360 \n", + "1442614 41612 AF-Q9XUQ0-F1 0.140 235 11.020 \n", + "1442500 41612 AF-C1H5I7-F1 0.119 151 11.020 \n", + "... ... ... ... ... ... \n", + "1442707 41612 AF-Q0J235-F1 0.107 140 481.100 \n", + "1442619 41612 AF-P54258-F1 0.160 150 481.100 \n", + "1442625 41612 AF-Q8IM56-F1 0.121 148 481.100 \n", + "1442455 41612 AF-P13611-F8 0.192 151 481.100 \n", + "1442557 41612 AF-Q96KW2-F1 0.105 247 481.100 \n", + "\n", + " fs_bit score fs_target_uniprot fs_target_eggnog_evalue \\\n", + "1442503 90 Q6NYT3 1.470000e-217 \n", + "1442600 68 Q8CCI5 8.230000e-132 \n", + "1442521 60 I1MQL7 0.000000e+00 \n", + "1442614 59 Q9XUQ0 0.000000e+00 \n", + "1442500 59 C1H5I7 0.000000e+00 \n", + "... ... ... ... \n", + "1442707 -17 Q0J235 4.250000e-263 \n", + "1442619 -17 P54258 0.000000e+00 \n", + "1442625 -19 Q8IM56 0.000000e+00 \n", + "1442455 -21 P13611 0.000000e+00 \n", + "1442557 -26 Q96KW2 0.000000e+00 \n", + "\n", + " fs_target_eggnog_score \\\n", + "1442503 600.0 \n", + "1442600 377.0 \n", + "1442521 947.0 \n", + "1442614 961.0 \n", + "1442500 1177.0 \n", + "... ... \n", + "1442707 734.0 \n", + "1442619 1649.0 \n", + "1442625 2878.0 \n", + "1442455 6472.0 \n", + "1442557 1989.0 \n", + "\n", + " fs_target_eggnogOGs ... \\\n", + "1442503 28N8T@1|root,2QUU4@2759|Eukaryota,39TGP@33154|... ... \n", + "1442600 KOG4477@1|root,KOG4477@2759|Eukaryota,39UAE@33... ... \n", + "1442521 KOG0724@1|root,KOG0724@2759|Eukaryota,37Q80@33... ... \n", + "1442614 2ASSX@1|root,2RZQJ@2759|Eukaryota,39UW6@33154|... ... 
\n", + "1442500 2CNBF@1|root,2QUZZ@2759|Eukaryota,39NHE@33154|... ... \n", + "... ... ... \n", + "1442707 28J99@1|root,2QQZR@2759|Eukaryota,37SB3@33090|... ... \n", + "1442619 KOG2133@1|root,KOG2133@2759|Eukaryota,39T1J@33... ... \n", + "1442625 2CMI8@1|root,2QQEE@2759|Eukaryota,3YC5G@5794|A... ... \n", + "1442455 28IZN@1|root,2QRBE@2759|Eukaryota,38FU8@33154|... ... \n", + "1442557 28YNY@1|root,2RWWZ@2759|Eukaryota,3AER2@33154|... ... \n", + "\n", + " fs_target_Entry name \\\n", + "1442503 IER5L_DANRE \n", + "1442600 RYBP_MOUSE \n", + "1442521 I1MQL7_SOYBN \n", + "1442614 Q9XUQ0_CAEEL \n", + "1442500 C1H5I7_PARBA \n", + "... ... \n", + "1442707 ROLL9_ORYSJ \n", + "1442619 ATN1_RAT \n", + "1442625 Q8IM56_PLAF7 \n", + "1442455 CSPG2_HUMAN \n", + "1442557 P12L2_HUMAN \n", + "\n", + " fs_target_Gene names \\\n", + "1442503 ier5l si:ch211-208h16.10 zgc:77455 \n", + "1442600 Rybp Dedaf \n", + "1442521 778089 GLYMA_16G217700 \n", + "1442614 pqn-67 CELE_T16G1.1 T16G1.1 \n", + "1442500 PAAG_06028 \n", + "... ... \n", + "1442707 RL9 SLL1 Os09g0395300 LOC_Os09g23200 B1040D06.24 \n", + "1442619 Atn1 Drpla \n", + "1442625 PF3D7_1403800 \n", + "1442455 VCAN CSPG2 \n", + "1442557 POM121L2 POM121L \n", + "\n", + " fs_target_Function [CC] \\\n", + "1442503 NaN \n", + "1442600 FUNCTION: Component of a Polycomb group (PcG) ... \n", + "1442521 NaN \n", + "1442614 NaN \n", + "1442500 NaN \n", + "... ... \n", + "1442707 FUNCTION: Probable transcription factor that r... \n", + "1442619 FUNCTION: Transcriptional corepressor. Recruit... \n", + "1442625 NaN \n", + "1442455 FUNCTION: May play a role in intercellular sig... \n", + "1442557 NaN \n", + "\n", + " fs_target_Taxonomic lineage (PHYLUM) plddt MSA size query length \\\n", + "1442503 Chordata 63.615046 747.0 218 \n", + "1442600 Chordata 63.615046 747.0 218 \n", + "1442521 Streptophyta 63.615046 747.0 218 \n", + "1442614 Nematoda (roundworms) 63.615046 747.0 218 \n", + "1442500 Ascomycota 63.615046 747.0 218 \n", + "... ... ... ... 
... \n", + "1442707 Streptophyta 63.615046 747.0 218 \n", + "1442619 Chordata 63.615046 747.0 218 \n", + "1442625 Apicomplexa 63.615046 747.0 218 \n", + "1442455 Chordata 63.615046 747.0 218 \n", + "1442557 Chordata 63.615046 747.0 218 \n", + "\n", + " gene name protein_id gene_id \n", + "1442503 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442600 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442521 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442614 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442500 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "... ... ... ... \n", + "1442707 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442619 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442625 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442455 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "1442557 c99854_g1_i2_m.41041 41041 c99854_g1 \n", + "\n", + "[265 rows x 26 columns]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "afdb_res[afdb_res['gene_id'].str.contains(\"c99854_g1\")].sort_values(by=\"fs_bit score\", ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "2c216345-b501-419c-b3a4-2a71894a5899", + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'gene_id'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_77/3048031996.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mafdb_res\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mafdb_merge\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meggnog\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mon\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"gene_id\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36mmerge\u001b[0;34m(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0mvalidate\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 106\u001b[0m ) -> DataFrame:\n\u001b[0;32m--> 107\u001b[0;31m op = _MergeOperation(\n\u001b[0m\u001b[1;32m 108\u001b[0m \u001b[0mleft\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0mright\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[1;32m 698\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mright_join_keys\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 699\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin_names\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 700\u001b[0;31m ) = self._get_merge_keys()\n\u001b[0m\u001b[1;32m 701\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0;31m# validate the merge keys dtypes. 
We may need to coerce\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36m_get_merge_keys\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1103\u001b[0m \u001b[0mright_keys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1104\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlk\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1105\u001b[0;31m \u001b[0mleft_keys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mleft\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_label_or_level_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1106\u001b[0m \u001b[0mjoin_names\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1107\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_get_label_or_level_values\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1774\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maxes\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_level_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1775\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1776\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1777\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1778\u001b[0m \u001b[0;31m# Check for duplicates\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'gene_id'" + ] + } + ], + "source": [ + "afdb_res = pd.merge(afdb_merge, eggnog, on=\"gene_id\")" ] }, {