From 5276fe26b4d8ef6dc73b647adf7b4553a77b8621 Mon Sep 17 00:00:00 2001
From: Nikolaos Papadopoulos <npapadop@embl.de>
Date: Mon, 1 Aug 2022 09:32:51 +0200
Subject: [PATCH] now manual workaround for PDB ID mapping

---
 analysis/process_pdb.ipynb                | 659 ++++++++++++++++++++++
 scripts/results_processing/io_read_pdb.py |  83 ---
 scripts/results_processing/io_read_pdb.sh |  13 -
 3 files changed, 659 insertions(+), 96 deletions(-)
 create mode 100644 analysis/process_pdb.ipynb
 delete mode 100644 scripts/results_processing/io_read_pdb.py
 delete mode 100644 scripts/results_processing/io_read_pdb.sh

diff --git a/analysis/process_pdb.ipynb b/analysis/process_pdb.ipynb
new file mode 100644
index 0000000..c8b8d0f
--- /dev/null
+++ b/analysis/process_pdb.ipynb
@@ -0,0 +1,659 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "5b92f263-711c-4cdf-8b11-5d1defa4a0d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tqdm import tqdm\n",
+    "import re\n",
+    "import time\n",
+    "import json\n",
+    "import zlib\n",
+    "from xml.etree import ElementTree\n",
+    "from urllib.parse import urlparse, parse_qs, urlencode\n",
+    "import requests\n",
+    "from requests.adapters import HTTPAdapter, Retry\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a4e75e96-4f37-46a9-af3e-44914e6114d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_foldseek(result_path):\n",
+    "    df = pd.read_csv(result_path, sep=\"\\s+\", header=None)\n",
+    "    df.columns = [\"query\", \"target\", \"seq. id.\", \"alignment length\", \"no. mismatches\",\n",
+    "                   \"no. gap open\", \"query start\", \"query end\", \"target start\", \"target end\",\n",
+    "                   \"e value\", \"bit score\"]\n",
+    "    df[\"query\"] = df[\"query\"].str.split(\"_\").str[0]\n",
+    "    df[\"corrected bit score\"] = df[\"bit score\"] / df[\"alignment length\"]\n",
+    "    if \"pdb\" in result_path:\n",
+    "        df[\"target\"] = df[\"target\"].str.split(\".\").str[0]\n",
+    "    else:\n",
+    "        df[\"target\"] = df[\"target\"].str.split(\"-\").str[:3].str.join(\"-\")\n",
+    "    return df\n",
+    "\n",
+    "def get_aligned_plddt(df, plddt, name_dict):\n",
+    "    aligned_plddt = [0.] * len(df)\n",
+    "    for e in tqdm(df.iterrows()):\n",
+    "        index, row = e\n",
+    "        query = row['query']\n",
+    "        isoform = name_dict[query]\n",
+    "        protein = plddt[isoform]\n",
+    "        start = row['query start'] - 1\n",
+    "        end = row['query end'] - 1\n",
+    "        aligned_plddt[index] = np.mean(protein[start:end])\n",
+    "    return aligned_plddt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d8fda205-90f8-4941-b77e-39f3e10d6ab2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# FOLDSEEK scores\n",
+    "fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'\n",
+    "# per-residue pLDDT score\n",
+    "plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')\n",
+    "\n",
+    "sequence_info = pd.read_csv(\"/g/arendt/npapadop/repos/coffe/data/sequence_info.csv\")\n",
+    "query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a8f277d7-94f6-4316-859b-6162c639ee73",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "1470483it [25:34, 958.34it/s] \n"
+     ]
+    }
+   ],
+   "source": [
+    "pdb = read_foldseek(fs_pdb)\n",
+    "pdb[\"query\"] = pdb[\"query\"].values.astype(int)\n",
+    "\n",
+    "pdb['aligned_plddt'] = get_aligned_plddt(pdb, plddt, query_to_isoform)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0ddf8b37-4f85-4076-be68-32be64e02df3",
+   "metadata": {},
+   "source": [
+    "Write out the unique PDB IDs in a file and submit it to the UniProt ID mapper:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "0b6e404b-a754-4779-a7fa-a6ea683c7abb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.Series(pdb['target'].unique()).to_csv('../data/pdb_ids.csv', header=False, index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "49ed1f9c-8225-46a8-9abe-d6fb568cb175",
+   "metadata": {},
+   "source": [
+    "(retrieve result link)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a23cc67a-1d9b-4c78-bcec-be7bd965318f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = \"https://rest.uniprot.org/idmapping/results/8f35e821d58602a990b260ae64cf77fb1b1f23d4?compressed=true&fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength&format=tsv&size=500\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3e9c9787-5021-4e3e-96e7-8cc58d6820e1",
+   "metadata": {},
+   "source": [
+    "Use the UniProt API functions to retrieve the mapping results, since the download link seems to be broken?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "2048ee0c-7761-4635-9d0b-c5ce69f53384",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "POLLING_INTERVAL = 3\n",
+    "\n",
+    "API_URL = \"https://rest.uniprot.org\"\n",
+    "\n",
+    "\n",
+    "retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])\n",
+    "session = requests.Session()\n",
+    "session.mount(\"https://\", HTTPAdapter(max_retries=retries))\n",
+    "\n",
+    "\n",
+    "def get_next_link(headers):\n",
+    "    re_next_link = re.compile(r'<(.+)>; rel=\"next\"')\n",
+    "    if \"Link\" in headers:\n",
+    "        match = re_next_link.match(headers[\"Link\"])\n",
+    "        if match:\n",
+    "            return match.group(1)\n",
+    "\n",
+    "\n",
+    "def get_batch(batch_response, file_format, compressed):\n",
+    "    batch_url = get_next_link(batch_response.headers)\n",
+    "    while batch_url:\n",
+    "        batch_response = session.get(batch_url)\n",
+    "        batch_response.raise_for_status()\n",
+    "        yield decode_results(batch_response, file_format, compressed)\n",
+    "        batch_url = get_next_link(batch_response.headers)\n",
+    "\n",
+    "\n",
+    "def combine_batches(all_results, batch_results, file_format):\n",
+    "    if file_format == \"json\":\n",
+    "        for key in (\"results\", \"failedIds\"):\n",
+    "            if key in batch_results and batch_results[key]:\n",
+    "                all_results[key] += batch_results[key]\n",
+    "    elif file_format == \"tsv\":\n",
+    "        return all_results + batch_results[1:]\n",
+    "    else:\n",
+    "        return all_results + batch_results\n",
+    "    return all_results\n",
+    "\n",
+    "\n",
+    "def decode_results(response, file_format, compressed):\n",
+    "    if compressed:\n",
+    "        decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)\n",
+    "        if file_format == \"json\":\n",
+    "            j = json.loads(decompressed.decode(\"utf-8\"))\n",
+    "            return j\n",
+    "        elif file_format == \"tsv\":\n",
+    "            return [line for line in decompressed.decode(\"utf-8\").split(\"\\n\") if line]\n",
+    "        elif file_format == \"xlsx\":\n",
+    "            return [decompressed]\n",
+    "        elif file_format == \"xml\":\n",
+    "            return [decompressed.decode(\"utf-8\")]\n",
+    "        else:\n",
+    "            return decompressed.decode(\"utf-8\")\n",
+    "    elif file_format == \"json\":\n",
+    "        return response.json()\n",
+    "    elif file_format == \"tsv\":\n",
+    "        return [line for line in response.text.split(\"\\n\") if line]\n",
+    "    elif file_format == \"xlsx\":\n",
+    "        return [response.content]\n",
+    "    elif file_format == \"xml\":\n",
+    "        return [response.text]\n",
+    "    return response.text\n",
+    "\n",
+    "\n",
+    "def get_xml_namespace(element):\n",
+    "    m = re.match(r\"\\{(.*)\\}\", element.tag)\n",
+    "    return m.groups()[0] if m else \"\"\n",
+    "\n",
+    "\n",
+    "def merge_xml_results(xml_results):\n",
+    "    merged_root = ElementTree.fromstring(xml_results[0])\n",
+    "    for result in xml_results[1:]:\n",
+    "        root = ElementTree.fromstring(result)\n",
+    "        for child in root.findall(\"{http://uniprot.org/uniprot}entry\"):\n",
+    "            merged_root.insert(-1, child)\n",
+    "    ElementTree.register_namespace(\"\", get_xml_namespace(merged_root[0]))\n",
+    "    return ElementTree.tostring(merged_root, encoding=\"utf-8\", xml_declaration=True)\n",
+    "\n",
+    "\n",
+    "def print_progress_batches(batch_index, size, total):\n",
+    "    n_fetched = min((batch_index + 1) * size, total)\n",
+    "    print(f\"Fetched: {n_fetched} / {total}\")\n",
+    "\n",
+    "\n",
+    "def get_id_mapping_results_search(url):\n",
+    "    parsed = urlparse(url)\n",
+    "    query = parse_qs(parsed.query)\n",
+    "    file_format = query[\"format\"][0] if \"format\" in query else \"json\"\n",
+    "    if \"size\" in query:\n",
+    "        size = int(query[\"size\"][0])\n",
+    "    else:\n",
+    "        size = 500\n",
+    "        query[\"size\"] = size\n",
+    "    compressed = (\n",
+    "        query[\"compressed\"][0].lower() == \"true\" if \"compressed\" in query else False\n",
+    "    )\n",
+    "    parsed = parsed._replace(query=urlencode(query, doseq=True))\n",
+    "    url = parsed.geturl()\n",
+    "    request = session.get(url)\n",
+    "    request.raise_for_status()\n",
+    "    results = decode_results(request, file_format, compressed)\n",
+    "    total = int(request.headers[\"x-total-results\"])\n",
+    "    print_progress_batches(0, size, total)\n",
+    "    for i, batch in enumerate(get_batch(request, file_format, compressed), 1):\n",
+    "        results = combine_batches(results, batch, file_format)\n",
+    "        print_progress_batches(i, size, total)\n",
+    "    if file_format == \"xml\":\n",
+    "        return merge_xml_results(results)\n",
+    "    return results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "9d46cd87-8f41-4d2e-a36c-1f06e86315b6",
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fetched: 500 / 144380\n",
+      "Fetched: 1000 / 144380\n",
+      "Fetched: 1500 / 144380\n",
+      "Fetched: 2000 / 144380\n",
+      "Fetched: 2500 / 144380\n",
+      "Fetched: 3000 / 144380\n",
+      "Fetched: 3500 / 144380\n",
+      "Fetched: 4000 / 144380\n",
+      "Fetched: 4500 / 144380\n",
+      "Fetched: 5000 / 144380\n",
+      "Fetched: 5500 / 144380\n",
+      "Fetched: 6000 / 144380\n",
+      "Fetched: 6500 / 144380\n",
+      "Fetched: 7000 / 144380\n",
+      "Fetched: 7500 / 144380\n",
+      "Fetched: 8000 / 144380\n",
+      "Fetched: 8500 / 144380\n",
+      "Fetched: 9000 / 144380\n",
+      "Fetched: 9500 / 144380\n",
+      "Fetched: 10000 / 144380\n",
+      "Fetched: 10500 / 144380\n",
+      "Fetched: 11000 / 144380\n",
+      "Fetched: 11500 / 144380\n",
+      "Fetched: 12000 / 144380\n",
+      "Fetched: 12500 / 144380\n",
+      "Fetched: 13000 / 144380\n",
+      "Fetched: 13500 / 144380\n",
+      "Fetched: 14000 / 144380\n",
+      "Fetched: 14500 / 144380\n",
+      "Fetched: 15000 / 144380\n",
+      "Fetched: 15500 / 144380\n",
+      "Fetched: 16000 / 144380\n",
+      "Fetched: 16500 / 144380\n",
+      "Fetched: 17000 / 144380\n",
+      "Fetched: 17500 / 144380\n",
+      "Fetched: 18000 / 144380\n",
+      "Fetched: 18500 / 144380\n",
+      "Fetched: 19000 / 144380\n",
+      "Fetched: 19500 / 144380\n",
+      "Fetched: 20000 / 144380\n",
+      "Fetched: 20500 / 144380\n",
+      "Fetched: 21000 / 144380\n",
+      "Fetched: 21500 / 144380\n",
+      "Fetched: 22000 / 144380\n",
+      "Fetched: 22500 / 144380\n",
+      "Fetched: 23000 / 144380\n",
+      "Fetched: 23500 / 144380\n",
+      "Fetched: 24000 / 144380\n",
+      "Fetched: 24500 / 144380\n",
+      "Fetched: 25000 / 144380\n",
+      "Fetched: 25500 / 144380\n",
+      "Fetched: 26000 / 144380\n",
+      "Fetched: 26500 / 144380\n",
+      "Fetched: 27000 / 144380\n",
+      "Fetched: 27500 / 144380\n",
+      "Fetched: 28000 / 144380\n",
+      "Fetched: 28500 / 144380\n",
+      "Fetched: 29000 / 144380\n",
+      "Fetched: 29500 / 144380\n",
+      "Fetched: 30000 / 144380\n",
+      "Fetched: 30500 / 144380\n",
+      "Fetched: 31000 / 144380\n",
+      "Fetched: 31500 / 144380\n",
+      "Fetched: 32000 / 144380\n",
+      "Fetched: 32500 / 144380\n",
+      "Fetched: 33000 / 144380\n",
+      "Fetched: 33500 / 144380\n",
+      "Fetched: 34000 / 144380\n",
+      "Fetched: 34500 / 144380\n",
+      "Fetched: 35000 / 144380\n",
+      "Fetched: 35500 / 144380\n",
+      "Fetched: 36000 / 144380\n",
+      "Fetched: 36500 / 144380\n",
+      "Fetched: 37000 / 144380\n",
+      "Fetched: 37500 / 144380\n",
+      "Fetched: 38000 / 144380\n",
+      "Fetched: 38500 / 144380\n",
+      "Fetched: 39000 / 144380\n",
+      "Fetched: 39500 / 144380\n",
+      "Fetched: 40000 / 144380\n",
+      "Fetched: 40500 / 144380\n",
+      "Fetched: 41000 / 144380\n",
+      "Fetched: 41500 / 144380\n",
+      "Fetched: 42000 / 144380\n",
+      "Fetched: 42500 / 144380\n",
+      "Fetched: 43000 / 144380\n",
+      "Fetched: 43500 / 144380\n",
+      "Fetched: 44000 / 144380\n",
+      "Fetched: 44500 / 144380\n",
+      "Fetched: 45000 / 144380\n",
+      "Fetched: 45500 / 144380\n",
+      "Fetched: 46000 / 144380\n",
+      "Fetched: 46500 / 144380\n",
+      "Fetched: 47000 / 144380\n",
+      "Fetched: 47500 / 144380\n",
+      "Fetched: 48000 / 144380\n",
+      "Fetched: 48500 / 144380\n",
+      "Fetched: 49000 / 144380\n",
+      "Fetched: 49500 / 144380\n",
+      "Fetched: 50000 / 144380\n",
+      "Fetched: 50500 / 144380\n",
+      "Fetched: 51000 / 144380\n",
+      "Fetched: 51500 / 144380\n",
+      "Fetched: 52000 / 144380\n",
+      "Fetched: 52500 / 144380\n",
+      "Fetched: 53000 / 144380\n",
+      "Fetched: 53500 / 144380\n",
+      "Fetched: 54000 / 144380\n",
+      "Fetched: 54500 / 144380\n",
+      "Fetched: 55000 / 144380\n",
+      "Fetched: 55500 / 144380\n",
+      "Fetched: 56000 / 144380\n",
+      "Fetched: 56500 / 144380\n",
+      "Fetched: 57000 / 144380\n",
+      "Fetched: 57500 / 144380\n",
+      "Fetched: 58000 / 144380\n",
+      "Fetched: 58500 / 144380\n",
+      "Fetched: 59000 / 144380\n",
+      "Fetched: 59500 / 144380\n",
+      "Fetched: 60000 / 144380\n",
+      "Fetched: 60500 / 144380\n",
+      "Fetched: 61000 / 144380\n",
+      "Fetched: 61500 / 144380\n",
+      "Fetched: 62000 / 144380\n",
+      "Fetched: 62500 / 144380\n",
+      "Fetched: 63000 / 144380\n",
+      "Fetched: 63500 / 144380\n",
+      "Fetched: 64000 / 144380\n",
+      "Fetched: 64500 / 144380\n",
+      "Fetched: 65000 / 144380\n",
+      "Fetched: 65500 / 144380\n",
+      "Fetched: 66000 / 144380\n",
+      "Fetched: 66500 / 144380\n",
+      "Fetched: 67000 / 144380\n",
+      "Fetched: 67500 / 144380\n",
+      "Fetched: 68000 / 144380\n",
+      "Fetched: 68500 / 144380\n",
+      "Fetched: 69000 / 144380\n",
+      "Fetched: 69500 / 144380\n",
+      "Fetched: 70000 / 144380\n",
+      "Fetched: 70500 / 144380\n",
+      "Fetched: 71000 / 144380\n",
+      "Fetched: 71500 / 144380\n",
+      "Fetched: 72000 / 144380\n",
+      "Fetched: 72500 / 144380\n",
+      "Fetched: 73000 / 144380\n",
+      "Fetched: 73500 / 144380\n",
+      "Fetched: 74000 / 144380\n",
+      "Fetched: 74500 / 144380\n",
+      "Fetched: 75000 / 144380\n",
+      "Fetched: 75500 / 144380\n",
+      "Fetched: 76000 / 144380\n",
+      "Fetched: 76500 / 144380\n",
+      "Fetched: 77000 / 144380\n",
+      "Fetched: 77500 / 144380\n",
+      "Fetched: 78000 / 144380\n",
+      "Fetched: 78500 / 144380\n",
+      "Fetched: 79000 / 144380\n",
+      "Fetched: 79500 / 144380\n",
+      "Fetched: 80000 / 144380\n",
+      "Fetched: 80500 / 144380\n",
+      "Fetched: 81000 / 144380\n",
+      "Fetched: 81500 / 144380\n",
+      "Fetched: 82000 / 144380\n",
+      "Fetched: 82500 / 144380\n",
+      "Fetched: 83000 / 144380\n",
+      "Fetched: 83500 / 144380\n",
+      "Fetched: 84000 / 144380\n",
+      "Fetched: 84500 / 144380\n",
+      "Fetched: 85000 / 144380\n",
+      "Fetched: 85500 / 144380\n",
+      "Fetched: 86000 / 144380\n",
+      "Fetched: 86500 / 144380\n",
+      "Fetched: 87000 / 144380\n",
+      "Fetched: 87500 / 144380\n",
+      "Fetched: 88000 / 144380\n",
+      "Fetched: 88500 / 144380\n",
+      "Fetched: 89000 / 144380\n",
+      "Fetched: 89500 / 144380\n",
+      "Fetched: 90000 / 144380\n",
+      "Fetched: 90500 / 144380\n",
+      "Fetched: 91000 / 144380\n",
+      "Fetched: 91500 / 144380\n",
+      "Fetched: 92000 / 144380\n",
+      "Fetched: 92500 / 144380\n",
+      "Fetched: 93000 / 144380\n",
+      "Fetched: 93500 / 144380\n",
+      "Fetched: 94000 / 144380\n",
+      "Fetched: 94500 / 144380\n",
+      "Fetched: 95000 / 144380\n",
+      "Fetched: 95500 / 144380\n",
+      "Fetched: 96000 / 144380\n",
+      "Fetched: 96500 / 144380\n",
+      "Fetched: 97000 / 144380\n",
+      "Fetched: 97500 / 144380\n",
+      "Fetched: 98000 / 144380\n",
+      "Fetched: 98500 / 144380\n",
+      "Fetched: 99000 / 144380\n",
+      "Fetched: 99500 / 144380\n",
+      "Fetched: 100000 / 144380\n",
+      "Fetched: 100500 / 144380\n",
+      "Fetched: 101000 / 144380\n",
+      "Fetched: 101500 / 144380\n",
+      "Fetched: 102000 / 144380\n",
+      "Fetched: 102500 / 144380\n",
+      "Fetched: 103000 / 144380\n",
+      "Fetched: 103500 / 144380\n",
+      "Fetched: 104000 / 144380\n",
+      "Fetched: 104500 / 144380\n",
+      "Fetched: 105000 / 144380\n",
+      "Fetched: 105500 / 144380\n",
+      "Fetched: 106000 / 144380\n",
+      "Fetched: 106500 / 144380\n",
+      "Fetched: 107000 / 144380\n",
+      "Fetched: 107500 / 144380\n",
+      "Fetched: 108000 / 144380\n",
+      "Fetched: 108500 / 144380\n",
+      "Fetched: 109000 / 144380\n",
+      "Fetched: 109500 / 144380\n",
+      "Fetched: 110000 / 144380\n",
+      "Fetched: 110500 / 144380\n",
+      "Fetched: 111000 / 144380\n",
+      "Fetched: 111500 / 144380\n",
+      "Fetched: 112000 / 144380\n",
+      "Fetched: 112500 / 144380\n",
+      "Fetched: 113000 / 144380\n",
+      "Fetched: 113500 / 144380\n",
+      "Fetched: 114000 / 144380\n",
+      "Fetched: 114500 / 144380\n",
+      "Fetched: 115000 / 144380\n",
+      "Fetched: 115500 / 144380\n",
+      "Fetched: 116000 / 144380\n",
+      "Fetched: 116500 / 144380\n",
+      "Fetched: 117000 / 144380\n",
+      "Fetched: 117500 / 144380\n",
+      "Fetched: 118000 / 144380\n",
+      "Fetched: 118500 / 144380\n",
+      "Fetched: 119000 / 144380\n",
+      "Fetched: 119500 / 144380\n",
+      "Fetched: 120000 / 144380\n",
+      "Fetched: 120500 / 144380\n",
+      "Fetched: 121000 / 144380\n",
+      "Fetched: 121500 / 144380\n",
+      "Fetched: 122000 / 144380\n",
+      "Fetched: 122500 / 144380\n",
+      "Fetched: 123000 / 144380\n",
+      "Fetched: 123500 / 144380\n",
+      "Fetched: 124000 / 144380\n",
+      "Fetched: 124500 / 144380\n",
+      "Fetched: 125000 / 144380\n",
+      "Fetched: 125500 / 144380\n",
+      "Fetched: 126000 / 144380\n",
+      "Fetched: 126500 / 144380\n",
+      "Fetched: 127000 / 144380\n",
+      "Fetched: 127500 / 144380\n",
+      "Fetched: 128000 / 144380\n",
+      "Fetched: 128500 / 144380\n",
+      "Fetched: 129000 / 144380\n",
+      "Fetched: 129500 / 144380\n",
+      "Fetched: 130000 / 144380\n",
+      "Fetched: 130500 / 144380\n",
+      "Fetched: 131000 / 144380\n",
+      "Fetched: 131500 / 144380\n",
+      "Fetched: 132000 / 144380\n",
+      "Fetched: 132500 / 144380\n",
+      "Fetched: 133000 / 144380\n",
+      "Fetched: 133500 / 144380\n",
+      "Fetched: 134000 / 144380\n",
+      "Fetched: 134500 / 144380\n",
+      "Fetched: 135000 / 144380\n",
+      "Fetched: 135500 / 144380\n",
+      "Fetched: 136000 / 144380\n",
+      "Fetched: 136500 / 144380\n",
+      "Fetched: 137000 / 144380\n",
+      "Fetched: 137500 / 144380\n",
+      "Fetched: 138000 / 144380\n",
+      "Fetched: 138500 / 144380\n",
+      "Fetched: 139000 / 144380\n",
+      "Fetched: 139500 / 144380\n",
+      "Fetched: 140000 / 144380\n",
+      "Fetched: 140500 / 144380\n",
+      "Fetched: 141000 / 144380\n",
+      "Fetched: 141500 / 144380\n",
+      "Fetched: 142000 / 144380\n",
+      "Fetched: 142500 / 144380\n",
+      "Fetched: 143000 / 144380\n",
+      "Fetched: 143500 / 144380\n",
+      "Fetched: 144000 / 144380\n",
+      "Fetched: 144380 / 144380\n"
+     ]
+    }
+   ],
+   "source": [
+    "results = get_id_mapping_results_search(url)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7ae2cf71-0cde-4595-9c27-3be502c7de2b",
+   "metadata": {},
+   "source": [
+    "Parse result json into a dict and that into a data frame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "bcbc223f-8615-41f8-a2cc-409ec1999d32",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 144380/144380 [00:00<00:00, 948289.08it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "pdb_to_uniprot = {}\n",
+    "for r in tqdm(results[1:]):\n",
+    "    pdb_id, uniprot_id = r.split('\\t')\n",
+    "    pdb_to_uniprot[pdb_id] = uniprot_id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "4232b8fd-228c-4d9b-9945-86511bbec8b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pdb_to_uniprot = pd.DataFrame.from_dict(pdb_to_uniprot, orient='index')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "906c2884-13a5-44b6-948b-3dc52590c271",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pdb_to_uniprot.columns = ['uniprot']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "1d10a20d-b3b4-4179-9499-8238a1c62236",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pdb = pdb.join(pdb_to_uniprot, on='target')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "d180bb5b-ba34-441d-a0c1-f259609c978e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pdb.to_parquet('/g/arendt/npapadop/repos/coffe/data/pdb_tmp.parquet')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/scripts/results_processing/io_read_pdb.py b/scripts/results_processing/io_read_pdb.py
deleted file mode 100644
index f1c8e3b..0000000
--- a/scripts/results_processing/io_read_pdb.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import os
-import json
-from tqdm import tqdm
-from urllib import parse, request
-
-import numpy as np
-import pandas as pd
-
-def read_foldseek(result_path):
-    df = pd.read_csv(result_path, sep="\s+", header=None)
-    df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
-                   "no. gap open", "query start", "query end", "target start", "target end",
-                   "e value", "bit score"]
-    df["query"] = df["query"].str.split("_").str[0]
-    df["corrected bit score"] = df["bit score"] / df["alignment length"]
-    if "pdb" in result_path:
-        df["target"] = df["target"].str.split(".").str[0]
-    else:
-        df["target"] = df["target"].str.split("-").str[:3].str.join("-")
-    return df
-
-def get_aligned_plddt(df, plddt, name_dict):
-    aligned_plddt = [0.] * len(df)
-    for e in tqdm(df.iterrows()):
-        index, row = e
-        query = row['query']
-        isoform = name_dict[query]
-        protein = plddt[isoform]
-        start = row['query start'] - 1
-        end = row['query end'] - 1
-        aligned_plddt[index] = np.mean(protein[start:end])
-    return aligned_plddt
-
-def get_from_uniprot(df, column, uniprot_from="ACC", uniprot_to="EGGNOG_ID", request_size=40000):
-    ids = df[column][~df[column].isnull()]
-    url = 'https://rest.uniprot.org/'
-
-    params = {
-    'from': uniprot_from,
-    'to': uniprot_to,
-    'format': 'tab',
-    'query': " ".join(ids.unique())
-    }
-
-    data = parse.urlencode(params)
-    data = data.encode('utf-8')
-    req = request.Request(url, data)
-    with request.urlopen(req) as f:
-        response = f.read()
-    return response
-
-def create_id_map(response, id_in, id_out):
-    map_from = []
-    map_to = []
-    for row in tqdm(response.decode("utf-8").split("\n")[:-1]):
-        col1, col2 = row.split("\t")
-        map_from.append(col1)
-        map_to.append(col2)
-
-    res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})
-    res = pd.DataFrame(res.groupby(id_in)[id_out].apply(",".join))
-#     res.set_index(id_in, inplace=True)
-    return res
-
-def enrich_from_uniprot(df, column_from, column_to, uniprot_from="PDB_ID", uniprot_to="ACC"):
-    response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)
-    df_map = create_id_map(response, column_from, column_to)
-    return df.join(df_map, on=column_from)
-
-# FOLDSEEK scores
-fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'
-# per-residue pLDDT score
-plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')
-
-sequence_info = pd.read_csv("/g/arendt/npapadop/repos/coffe/data/sequence_info.csv")
-query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']
-
-pdb = read_foldseek(fs_pdb)
-pdb["query"] = pdb["query"].values.astype(int)
-
-pdb['aligned_plddt'] = get_aligned_plddt(pdb, plddt, query_to_isoform)
-pdb = enrich_from_uniprot(pdb, "target", "uniprot", uniprot_from="PDB_ID", uniprot_to="ACC")
-pdb.to_parquet('/g/arendt/npapadop/repos/coffe/data/pdb_tmp.parquet')
\ No newline at end of file
diff --git a/scripts/results_processing/io_read_pdb.sh b/scripts/results_processing/io_read_pdb.sh
deleted file mode 100644
index 800685e..0000000
--- a/scripts/results_processing/io_read_pdb.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash -ex
-#SBATCH -J coffe_read_pdb
-#SBATCH -t 5:00:00
-#SBATCH -c 1
-#SBATCH --mem=16G
-#SBATCH -o /g/arendt/npapadop/cluster/io_read_pdb.out
-#SBATCH -e /g/arendt/npapadop/cluster/io_read_pdb.err
-
-module load Anaconda3
-source ~/.bash_profile
-conda activate /g/arendt/npapadop/repos/condas/CoFFE
-
-python /g/arendt/npapadop/repos/coffe/scripts/results_processing/io_read_pdb.py
\ No newline at end of file
-- 
GitLab