From 5276fe26b4d8ef6dc73b647adf7b4553a77b8621 Mon Sep 17 00:00:00 2001 From: Nikolaos Papadopoulos <npapadop@embl.de> Date: Mon, 1 Aug 2022 09:32:51 +0200 Subject: [PATCH] now manual workaround for PDB ID mapping --- analysis/process_pdb.ipynb | 659 ++++++++++++++++++++++ scripts/results_processing/io_read_pdb.py | 83 --- scripts/results_processing/io_read_pdb.sh | 13 - 3 files changed, 659 insertions(+), 96 deletions(-) create mode 100644 analysis/process_pdb.ipynb delete mode 100644 scripts/results_processing/io_read_pdb.py delete mode 100644 scripts/results_processing/io_read_pdb.sh diff --git a/analysis/process_pdb.ipynb b/analysis/process_pdb.ipynb new file mode 100644 index 0000000..c8b8d0f --- /dev/null +++ b/analysis/process_pdb.ipynb @@ -0,0 +1,659 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "5b92f263-711c-4cdf-8b11-5d1defa4a0d7", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "import re\n", + "import time\n", + "import json\n", + "import zlib\n", + "from xml.etree import ElementTree\n", + "from urllib.parse import urlparse, parse_qs, urlencode\n", + "import requests\n", + "from requests.adapters import HTTPAdapter, Retry\n", + "\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a4e75e96-4f37-46a9-af3e-44914e6114d3", + "metadata": {}, + "outputs": [], + "source": [ + "def read_foldseek(result_path):\n", + " df = pd.read_csv(result_path, sep=\"\\s+\", header=None)\n", + " df.columns = [\"query\", \"target\", \"seq. id.\", \"alignment length\", \"no. mismatches\",\n", + " \"no. gap open\", \"query start\", \"query end\", \"target start\", \"target end\",\n", + " \"e value\", \"bit score\"]\n", + " df[\"query\"] = df[\"query\"].str.split(\"_\").str[0]\n", + " df[\"corrected bit score\"] = df[\"bit score\"] / df[\"alignment length\"]\n", + " if \"pdb\" in result_path:\n", + " df[\"target\"] = df[\"target\"].str.split(\".\").str[0]\n", + " else:\n", + " df[\"target\"] = df[\"target\"].str.split(\"-\").str[:3].str.join(\"-\")\n", + " return df\n", + "\n", + "def get_aligned_plddt(df, plddt, name_dict):\n", + " aligned_plddt = [0.] * len(df)\n", + " for e in tqdm(df.iterrows()):\n", + " index, row = e\n", + " query = row['query']\n", + " isoform = name_dict[query]\n", + " protein = plddt[isoform]\n", + " start = row['query start'] - 1\n", + " end = row['query end'] - 1\n", + " aligned_plddt[index] = np.mean(protein[start:end])\n", + " return aligned_plddt" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d8fda205-90f8-4941-b77e-39f3e10d6ab2", + "metadata": {}, + "outputs": [], + "source": [ + "# FOLDSEEK scores\n", + "fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'\n", + "# per-residue pLDDT score\n", + "plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')\n", + "\n", + "sequence_info = pd.read_csv(\"/g/arendt/npapadop/repos/coffe/data/sequence_info.csv\")\n", + "query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a8f277d7-94f6-4316-859b-6162c639ee73", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1470483it [25:34, 958.34it/s] \n" + ] + } + ], + "source": [ + "pdb = read_foldseek(fs_pdb)\n", + "pdb[\"query\"] = pdb[\"query\"].values.astype(int)\n", + "\n", + "pdb['aligned_plddt'] = get_aligned_plddt(pdb, plddt, query_to_isoform)" + ] + }, + { + "cell_type": "markdown", + "id": "0ddf8b37-4f85-4076-be68-32be64e02df3", + "metadata": {}, + "source": [ + "Write out the unique PDB IDs in a file and submit it to the UniProt ID mapper:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0b6e404b-a754-4779-a7fa-a6ea683c7abb", + "metadata": {}, + "outputs": [], + "source": [ + "pd.Series(pdb['target'].unique()).to_csv('../data/pdb_ids.csv', header=False, index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "49ed1f9c-8225-46a8-9abe-d6fb568cb175", + "metadata": {}, + "source": [ + "(retrieve result link)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a23cc67a-1d9b-4c78-bcec-be7bd965318f", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://rest.uniprot.org/idmapping/results/8f35e821d58602a990b260ae64cf77fb1b1f23d4?compressed=true&fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength&format=tsv&size=500\"" + ] + }, + { + "cell_type": "markdown", + "id": "3e9c9787-5021-4e3e-96e7-8cc58d6820e1", + "metadata": {}, + "source": [ + "Use the UniProt API functions to retrieve the mapping results, since the download link seems to be broken?" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2048ee0c-7761-4635-9d0b-c5ce69f53384", + "metadata": {}, + "outputs": [], + "source": [ + "POLLING_INTERVAL = 3\n", + "\n", + "API_URL = \"https://rest.uniprot.org\"\n", + "\n", + "\n", + "retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])\n", + "session = requests.Session()\n", + "session.mount(\"https://\", HTTPAdapter(max_retries=retries))\n", + "\n", + "\n", + "def get_next_link(headers):\n", + " re_next_link = re.compile(r'<(.+)>; rel=\"next\"')\n", + " if \"Link\" in headers:\n", + " match = re_next_link.match(headers[\"Link\"])\n", + " if match:\n", + " return match.group(1)\n", + "\n", + "\n", + "def get_batch(batch_response, file_format, compressed):\n", + " batch_url = get_next_link(batch_response.headers)\n", + " while batch_url:\n", + " batch_response = session.get(batch_url)\n", + " batch_response.raise_for_status()\n", + " yield decode_results(batch_response, file_format, compressed)\n", + " batch_url = get_next_link(batch_response.headers)\n", + "\n", + "\n", + "def combine_batches(all_results, batch_results, file_format):\n", + " if file_format == \"json\":\n", + " for key in (\"results\", \"failedIds\"):\n", + " if key in batch_results and batch_results[key]:\n", + " all_results[key] += batch_results[key]\n", + " elif file_format == \"tsv\":\n", + " return all_results + batch_results[1:]\n", + " else:\n", + " return all_results + batch_results\n", + " return all_results\n", + "\n", + "\n", + "def decode_results(response, file_format, compressed):\n", + " if compressed:\n", + " decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)\n", + " if file_format == \"json\":\n", + " j = json.loads(decompressed.decode(\"utf-8\"))\n", + " return j\n", + " elif file_format == \"tsv\":\n", + " return [line for line in decompressed.decode(\"utf-8\").split(\"\\n\") if line]\n", + " elif file_format == \"xlsx\":\n", + " return [decompressed]\n", + " elif file_format == \"xml\":\n", + " return [decompressed.decode(\"utf-8\")]\n", + " else:\n", + " return decompressed.decode(\"utf-8\")\n", + " elif file_format == \"json\":\n", + " return response.json()\n", + " elif file_format == \"tsv\":\n", + " return [line for line in response.text.split(\"\\n\") if line]\n", + " elif file_format == \"xlsx\":\n", + " return [response.content]\n", + " elif file_format == \"xml\":\n", + " return [response.text]\n", + " return response.text\n", + "\n", + "\n", + "def get_xml_namespace(element):\n", + " m = re.match(r\"\\{(.*)\\}\", element.tag)\n", + " return m.groups()[0] if m else \"\"\n", + "\n", + "\n", + "def merge_xml_results(xml_results):\n", + " merged_root = ElementTree.fromstring(xml_results[0])\n", + " for result in xml_results[1:]:\n", + " root = ElementTree.fromstring(result)\n", + " for child in root.findall(\"{http://uniprot.org/uniprot}entry\"):\n", + " merged_root.insert(-1, child)\n", + " ElementTree.register_namespace(\"\", get_xml_namespace(merged_root[0]))\n", + " return ElementTree.tostring(merged_root, encoding=\"utf-8\", xml_declaration=True)\n", + "\n", + "\n", + "def print_progress_batches(batch_index, size, total):\n", + " n_fetched = min((batch_index + 1) * size, total)\n", + " print(f\"Fetched: {n_fetched} / {total}\")\n", + "\n", + "\n", + "def get_id_mapping_results_search(url):\n", + " parsed = urlparse(url)\n", + " query = parse_qs(parsed.query)\n", + " file_format = query[\"format\"][0] if \"format\" in query else \"json\"\n", + " if \"size\" in query:\n", + " size = int(query[\"size\"][0])\n", + " else:\n", + " size = 500\n", + " query[\"size\"] = size\n", + " compressed = (\n", + " query[\"compressed\"][0].lower() == \"true\" if \"compressed\" in query else False\n", + " )\n", + " parsed = parsed._replace(query=urlencode(query, doseq=True))\n", + " url = parsed.geturl()\n", + " request = session.get(url)\n", + " request.raise_for_status()\n", + " results = decode_results(request, file_format, compressed)\n", + " total = int(request.headers[\"x-total-results\"])\n", + " print_progress_batches(0, size, total)\n", + " for i, batch in enumerate(get_batch(request, file_format, compressed), 1):\n", + " results = combine_batches(results, batch, file_format)\n", + " print_progress_batches(i, size, total)\n", + " if file_format == \"xml\":\n", + " return merge_xml_results(results)\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9d46cd87-8f41-4d2e-a36c-1f06e86315b6", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fetched: 500 / 144380\n", + "Fetched: 1000 / 144380\n", + "Fetched: 1500 / 144380\n", + "Fetched: 2000 / 144380\n", + "Fetched: 2500 / 144380\n", + "Fetched: 3000 / 144380\n", + "Fetched: 3500 / 144380\n", + "Fetched: 4000 / 144380\n", + "Fetched: 4500 / 144380\n", + "Fetched: 5000 / 144380\n", + "Fetched: 5500 / 144380\n", + "Fetched: 6000 / 144380\n", + "Fetched: 6500 / 144380\n", + "Fetched: 7000 / 144380\n", + "Fetched: 7500 / 144380\n", + "Fetched: 8000 / 144380\n", + "Fetched: 8500 / 144380\n", + "Fetched: 9000 / 144380\n", + "Fetched: 9500 / 144380\n", + "Fetched: 10000 / 144380\n", + "Fetched: 10500 / 144380\n", + "Fetched: 11000 / 144380\n", + "Fetched: 11500 / 144380\n", + "Fetched: 12000 / 144380\n", + "Fetched: 12500 / 144380\n", + "Fetched: 13000 / 144380\n", + "Fetched: 13500 / 144380\n", + "Fetched: 14000 / 144380\n", + "Fetched: 14500 / 144380\n", + "Fetched: 15000 / 144380\n", + "Fetched: 15500 / 144380\n", + "Fetched: 16000 / 144380\n", + "Fetched: 16500 / 144380\n", + "Fetched: 17000 / 144380\n", + "Fetched: 17500 / 144380\n", + "Fetched: 18000 / 144380\n", + "Fetched: 18500 / 144380\n", + "Fetched: 19000 / 144380\n", + "Fetched: 19500 / 144380\n", + "Fetched: 20000 / 144380\n", + "Fetched: 20500 / 144380\n", + "Fetched: 21000 / 144380\n", + "Fetched: 21500 / 144380\n", + "Fetched: 22000 / 144380\n", + "Fetched: 22500 / 144380\n", + "Fetched: 23000 / 144380\n", + "Fetched: 23500 / 144380\n", + "Fetched: 24000 / 144380\n", + "Fetched: 24500 / 144380\n", + "Fetched: 25000 / 144380\n", + "Fetched: 25500 / 144380\n", + "Fetched: 26000 / 144380\n", + "Fetched: 26500 / 144380\n", + "Fetched: 27000 / 144380\n", + "Fetched: 27500 / 144380\n", + "Fetched: 28000 / 144380\n", + "Fetched: 28500 / 144380\n", + "Fetched: 29000 / 144380\n", + "Fetched: 29500 / 144380\n", + "Fetched: 30000 / 144380\n", + "Fetched: 30500 / 144380\n", + "Fetched: 31000 / 144380\n", + "Fetched: 31500 / 144380\n", + "Fetched: 32000 / 144380\n", + "Fetched: 32500 / 144380\n", + "Fetched: 33000 / 144380\n", + "Fetched: 33500 / 144380\n", + "Fetched: 34000 / 144380\n", + "Fetched: 34500 / 144380\n", + "Fetched: 35000 / 144380\n", + "Fetched: 35500 / 144380\n", + "Fetched: 36000 / 144380\n", + "Fetched: 36500 / 144380\n", + "Fetched: 37000 / 144380\n", + "Fetched: 37500 / 144380\n", + "Fetched: 38000 / 144380\n", + "Fetched: 38500 / 144380\n", + "Fetched: 39000 / 144380\n", + "Fetched: 39500 / 144380\n", + "Fetched: 40000 / 144380\n", + "Fetched: 40500 / 144380\n", + "Fetched: 41000 / 144380\n", + "Fetched: 41500 / 144380\n", + "Fetched: 42000 / 144380\n", + "Fetched: 42500 / 144380\n", + "Fetched: 43000 / 144380\n", + "Fetched: 43500 / 144380\n", + "Fetched: 44000 / 144380\n", + "Fetched: 44500 / 144380\n", + "Fetched: 45000 / 144380\n", + "Fetched: 45500 / 144380\n", + "Fetched: 46000 / 144380\n", + "Fetched: 46500 / 144380\n", + "Fetched: 47000 / 144380\n", + "Fetched: 47500 / 144380\n", + "Fetched: 48000 / 144380\n", + "Fetched: 48500 / 144380\n", + "Fetched: 49000 / 144380\n", + "Fetched: 49500 / 144380\n", + "Fetched: 50000 / 144380\n", + "Fetched: 50500 / 144380\n", + "Fetched: 51000 / 144380\n", + "Fetched: 51500 / 144380\n", + "Fetched: 52000 / 144380\n", + "Fetched: 52500 / 144380\n", + "Fetched: 53000 / 144380\n", + "Fetched: 53500 / 144380\n", + "Fetched: 54000 / 144380\n", + "Fetched: 54500 / 144380\n", + "Fetched: 55000 / 144380\n", + "Fetched: 55500 / 144380\n", + "Fetched: 56000 / 144380\n", + "Fetched: 56500 / 144380\n", + "Fetched: 57000 / 144380\n", + "Fetched: 57500 / 144380\n", + "Fetched: 58000 / 144380\n", + "Fetched: 58500 / 144380\n", + "Fetched: 59000 / 144380\n", + "Fetched: 59500 / 144380\n", + "Fetched: 60000 / 144380\n", + "Fetched: 60500 / 144380\n", + "Fetched: 61000 / 144380\n", + "Fetched: 61500 / 144380\n", + "Fetched: 62000 / 144380\n", + "Fetched: 62500 / 144380\n", + "Fetched: 63000 / 144380\n", + "Fetched: 63500 / 144380\n", + "Fetched: 64000 / 144380\n", + "Fetched: 64500 / 144380\n", + "Fetched: 65000 / 144380\n", + "Fetched: 65500 / 144380\n", + "Fetched: 66000 / 144380\n", + "Fetched: 66500 / 144380\n", + "Fetched: 67000 / 144380\n", + "Fetched: 67500 / 144380\n", + "Fetched: 68000 / 144380\n", + "Fetched: 68500 / 144380\n", + "Fetched: 69000 / 144380\n", + "Fetched: 69500 / 144380\n", + "Fetched: 70000 / 144380\n", + "Fetched: 70500 / 144380\n", + "Fetched: 71000 / 144380\n", + "Fetched: 71500 / 144380\n", + "Fetched: 72000 / 144380\n", + "Fetched: 72500 / 144380\n", + "Fetched: 73000 / 144380\n", + "Fetched: 73500 / 144380\n", + "Fetched: 74000 / 144380\n", + "Fetched: 74500 / 144380\n", + "Fetched: 75000 / 144380\n", + "Fetched: 75500 / 144380\n", + "Fetched: 76000 / 144380\n", + "Fetched: 76500 / 144380\n", + "Fetched: 77000 / 144380\n", + "Fetched: 77500 / 144380\n", + "Fetched: 78000 / 144380\n", + "Fetched: 78500 / 144380\n", + "Fetched: 79000 / 144380\n", + "Fetched: 79500 / 144380\n", + "Fetched: 80000 / 144380\n", + "Fetched: 80500 / 144380\n", + "Fetched: 81000 / 144380\n", + "Fetched: 81500 / 144380\n", + "Fetched: 82000 / 144380\n", + "Fetched: 82500 / 144380\n", + "Fetched: 83000 / 144380\n", + "Fetched: 83500 / 144380\n", + "Fetched: 84000 / 144380\n", + "Fetched: 84500 / 144380\n", + "Fetched: 85000 / 144380\n", + "Fetched: 85500 / 144380\n", + "Fetched: 86000 / 144380\n", + "Fetched: 86500 / 144380\n", + "Fetched: 87000 / 144380\n", + "Fetched: 87500 / 144380\n", + "Fetched: 88000 / 144380\n", + "Fetched: 88500 / 144380\n", + "Fetched: 89000 / 144380\n", + "Fetched: 89500 / 144380\n", + "Fetched: 90000 / 144380\n", + "Fetched: 90500 / 144380\n", + "Fetched: 91000 / 144380\n", + "Fetched: 91500 / 144380\n", + "Fetched: 92000 / 144380\n", + "Fetched: 92500 / 144380\n", + "Fetched: 93000 / 144380\n", + "Fetched: 93500 / 144380\n", + "Fetched: 94000 / 144380\n", + "Fetched: 94500 / 144380\n", + "Fetched: 95000 / 144380\n", + "Fetched: 95500 / 144380\n", + "Fetched: 96000 / 144380\n", + "Fetched: 96500 / 144380\n", + "Fetched: 97000 / 144380\n", + "Fetched: 97500 / 144380\n", + "Fetched: 98000 / 144380\n", + "Fetched: 98500 / 144380\n", + "Fetched: 99000 / 144380\n", + "Fetched: 99500 / 144380\n", + "Fetched: 100000 / 144380\n", + "Fetched: 100500 / 144380\n", + "Fetched: 101000 / 144380\n", + "Fetched: 101500 / 144380\n", + "Fetched: 102000 / 144380\n", + "Fetched: 102500 / 144380\n", + "Fetched: 103000 / 144380\n", + "Fetched: 103500 / 144380\n", + "Fetched: 104000 / 144380\n", + "Fetched: 104500 / 144380\n", + "Fetched: 105000 / 144380\n", + "Fetched: 105500 / 144380\n", + "Fetched: 106000 / 144380\n", + "Fetched: 106500 / 144380\n", + "Fetched: 107000 / 144380\n", + "Fetched: 107500 / 144380\n", + "Fetched: 108000 / 144380\n", + "Fetched: 108500 / 144380\n", + "Fetched: 109000 / 144380\n", + "Fetched: 109500 / 144380\n", + "Fetched: 110000 / 144380\n", + "Fetched: 110500 / 144380\n", + "Fetched: 111000 / 144380\n", + "Fetched: 111500 / 144380\n", + "Fetched: 112000 / 144380\n", + "Fetched: 112500 / 144380\n", + "Fetched: 113000 / 144380\n", + "Fetched: 113500 / 144380\n", + "Fetched: 114000 / 144380\n", + "Fetched: 114500 / 144380\n", + "Fetched: 115000 / 144380\n", + "Fetched: 115500 / 144380\n", + "Fetched: 116000 / 144380\n", + "Fetched: 116500 / 144380\n", + "Fetched: 117000 / 144380\n", + "Fetched: 117500 / 144380\n", + "Fetched: 118000 / 144380\n", + "Fetched: 118500 / 144380\n", + "Fetched: 119000 / 144380\n", + "Fetched: 119500 / 144380\n", + "Fetched: 120000 / 144380\n", + "Fetched: 120500 / 144380\n", + "Fetched: 121000 / 144380\n", + "Fetched: 121500 / 144380\n", + "Fetched: 122000 / 144380\n", + "Fetched: 122500 / 144380\n", + "Fetched: 123000 / 144380\n", + "Fetched: 123500 / 144380\n", + "Fetched: 124000 / 144380\n", + "Fetched: 124500 / 144380\n", + "Fetched: 125000 / 144380\n", + "Fetched: 125500 / 144380\n", + "Fetched: 126000 / 144380\n", + "Fetched: 126500 / 144380\n", + "Fetched: 127000 / 144380\n", + "Fetched: 127500 / 144380\n", + "Fetched: 128000 / 144380\n", + "Fetched: 128500 / 144380\n", + "Fetched: 129000 / 144380\n", + "Fetched: 129500 / 144380\n", + "Fetched: 130000 / 144380\n", + "Fetched: 130500 / 144380\n", + "Fetched: 131000 / 144380\n", + "Fetched: 131500 / 144380\n", + "Fetched: 132000 / 144380\n", + "Fetched: 132500 / 144380\n", + "Fetched: 133000 / 144380\n", + "Fetched: 133500 / 144380\n", + "Fetched: 134000 / 144380\n", + "Fetched: 134500 / 144380\n", + "Fetched: 135000 / 144380\n", + "Fetched: 135500 / 144380\n", + "Fetched: 136000 / 144380\n", + "Fetched: 136500 / 144380\n", + "Fetched: 137000 / 144380\n", + "Fetched: 137500 / 144380\n", + "Fetched: 138000 / 144380\n", + "Fetched: 138500 / 144380\n", + "Fetched: 139000 / 144380\n", + "Fetched: 139500 / 144380\n", + "Fetched: 140000 / 144380\n", + "Fetched: 140500 / 144380\n", + "Fetched: 141000 / 144380\n", + "Fetched: 141500 / 144380\n", + "Fetched: 142000 / 144380\n", + "Fetched: 142500 / 144380\n", + "Fetched: 143000 / 144380\n", + "Fetched: 143500 / 144380\n", + "Fetched: 144000 / 144380\n", + "Fetched: 144380 / 144380\n" + ] + } + ], + "source": [ + "results = get_id_mapping_results_search(url)" + ] + }, + { + "cell_type": "markdown", + "id": "7ae2cf71-0cde-4595-9c27-3be502c7de2b", + "metadata": {}, + "source": [ + "Parse result json into a dict and that into a data frame" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bcbc223f-8615-41f8-a2cc-409ec1999d32", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 144380/144380 [00:00<00:00, 948289.08it/s]\n" + ] + } + ], + "source": [ + "pdb_to_uniprot = {}\n", + "for r in tqdm(results[1:]):\n", + " pdb_id, uniprot_id = r.split('\\t')\n", + " pdb_to_uniprot[pdb_id] = uniprot_id" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4232b8fd-228c-4d9b-9945-86511bbec8b1", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_to_uniprot = pd.DataFrame.from_dict(pdb_to_uniprot, orient='index')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "906c2884-13a5-44b6-948b-3dc52590c271", + "metadata": {}, + "outputs": [], + "source": [ + "pdb_to_uniprot.columns = ['uniprot']" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1d10a20d-b3b4-4179-9499-8238a1c62236", + "metadata": {}, + "outputs": [], + "source": [ + "pdb = pdb.join(pdb_to_uniprot, on='target')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d180bb5b-ba34-441d-a0c1-f259609c978e", + "metadata": {}, + "outputs": [], + "source": [ + "pdb.to_parquet('/g/arendt/npapadop/repos/coffe/data/pdb_tmp.parquet')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/results_processing/io_read_pdb.py b/scripts/results_processing/io_read_pdb.py deleted file mode 100644 index f1c8e3b..0000000 --- a/scripts/results_processing/io_read_pdb.py +++ /dev/null @@ -1,83 +0,0 @@ -import os -import json -from tqdm import tqdm -from urllib import parse, request - -import numpy as np -import pandas as pd - -def read_foldseek(result_path): - df = pd.read_csv(result_path, sep="\s+", header=None) - df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches", - "no. gap open", "query start", "query end", "target start", "target end", - "e value", "bit score"] - df["query"] = df["query"].str.split("_").str[0] - df["corrected bit score"] = df["bit score"] / df["alignment length"] - if "pdb" in result_path: - df["target"] = df["target"].str.split(".").str[0] - else: - df["target"] = df["target"].str.split("-").str[:3].str.join("-") - return df - -def get_aligned_plddt(df, plddt, name_dict): - aligned_plddt = [0.] * len(df) - for e in tqdm(df.iterrows()): - index, row = e - query = row['query'] - isoform = name_dict[query] - protein = plddt[isoform] - start = row['query start'] - 1 - end = row['query end'] - 1 - aligned_plddt[index] = np.mean(protein[start:end]) - return aligned_plddt - -def get_from_uniprot(df, column, uniprot_from="ACC", uniprot_to="EGGNOG_ID", request_size=40000): - ids = df[column][~df[column].isnull()] - url = 'https://rest.uniprot.org/' - - params = { - 'from': uniprot_from, - 'to': uniprot_to, - 'format': 'tab', - 'query': " ".join(ids.unique()) - } - - data = parse.urlencode(params) - data = data.encode('utf-8') - req = request.Request(url, data) - with request.urlopen(req) as f: - response = f.read() - return response - -def create_id_map(response, id_in, id_out): - map_from = [] - map_to = [] - for row in tqdm(response.decode("utf-8").split("\n")[:-1]): - col1, col2 = row.split("\t") - map_from.append(col1) - map_to.append(col2) - - res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]}) - res = pd.DataFrame(res.groupby(id_in)[id_out].apply(",".join)) -# res.set_index(id_in, inplace=True) - return res - -def enrich_from_uniprot(df, column_from, column_to, uniprot_from="PDB_ID", uniprot_to="ACC"): - response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to) - df_map = create_id_map(response, column_from, column_to) - return df.join(df_map, on=column_from) - -# FOLDSEEK scores -fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv' -# per-residue pLDDT score -plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz') - -sequence_info = pd.read_csv("/g/arendt/npapadop/repos/coffe/data/sequence_info.csv") -query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform'] - -pdb = read_foldseek(fs_pdb) -pdb["query"] = pdb["query"].values.astype(int) - -pdb['aligned_plddt'] = get_aligned_plddt(pdb, plddt, query_to_isoform) -pdb = enrich_from_uniprot(pdb, "target", "uniprot", uniprot_from="PDB_ID", uniprot_to="ACC") -pdb.to_parquet('/g/arendt/npapadop/repos/coffe/data/pdb_tmp.parquet') \ No newline at end of file diff --git a/scripts/results_processing/io_read_pdb.sh b/scripts/results_processing/io_read_pdb.sh deleted file mode 100644 index 800685e..0000000 --- a/scripts/results_processing/io_read_pdb.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -ex -#SBATCH -J coffe_read_pdb -#SBATCH -t 5:00:00 -#SBATCH -c 1 -#SBATCH --mem=16G -#SBATCH -o /g/arendt/npapadop/cluster/io_read_pdb.out -#SBATCH -e /g/arendt/npapadop/cluster/io_read_pdb.err - -module load Anaconda3 -source ~/.bash_profile -conda activate /g/arendt/npapadop/repos/condas/CoFFE - -python /g/arendt/npapadop/repos/coffe/scripts/results_processing/io_read_pdb.py \ No newline at end of file -- GitLab