Skip to content
Snippets Groups Projects
Commit 5276fe26 authored by Nikolaos Papadopoulos's avatar Nikolaos Papadopoulos
Browse files

now manual workaround for PDB ID mapping

parent 8dd17e2e
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id:5b92f263-711c-4cdf-8b11-5d1defa4a0d7 tags:
``` python
from tqdm import tqdm
import re
import time
import json
import zlib
from xml.etree import ElementTree
from urllib.parse import urlparse, parse_qs, urlencode
import requests
from requests.adapters import HTTPAdapter, Retry
import numpy as np
import pandas as pd
```
%% Cell type:code id:a4e75e96-4f37-46a9-af3e-44914e6114d3 tags:
``` python
def read_foldseek(result_path):
    """Load a Foldseek tabular result file into a tidy DataFrame.

    Parameters
    ----------
    result_path : str
        Path to a whitespace-separated Foldseek result file in the default
        12-column output format. If the path contains "pdb" the targets are
        treated as PDB entries, otherwise as AlphaFold DB models.

    Returns
    -------
    pd.DataFrame
        One row per alignment, with the query truncated to its numeric ID,
        a length-normalised bit score, and the target reduced to a bare
        database identifier.
    """
    # Raw string: "\s+" in a plain literal is an invalid escape sequence
    # (DeprecationWarning); the regex itself is unchanged.
    df = pd.read_csv(result_path, sep=r"\s+", header=None)
    df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
                  "no. gap open", "query start", "query end", "target start", "target end",
                  "e value", "bit score"]
    # Queries look like "<id>_<isoform>"; keep only the numeric part.
    df["query"] = df["query"].str.split("_").str[0]
    # Normalise the bit score by alignment length so hits are comparable.
    df["corrected bit score"] = df["bit score"] / df["alignment length"]
    if "pdb" in result_path:
        # PDB targets look like "1abc.pdb" -> keep the bare PDB ID.
        df["target"] = df["target"].str.split(".").str[0]
    else:
        # AFDB targets look like "AF-<acc>-F1-model_vN" -> keep "AF-<acc>-F1".
        df["target"] = df["target"].str.split("-").str[:3].str.join("-")
    return df
def get_aligned_plddt(df, plddt, name_dict):
    """Mean per-residue pLDDT over the aligned span of each query.

    Parameters
    ----------
    df : pd.DataFrame
        Foldseek results with "query start"/"query end" columns, which are
        1-based inclusive coordinates (Foldseek tabular convention).
    plddt : mapping
        Maps isoform name -> per-residue pLDDT array.
    name_dict : mapping
        Maps query ID -> isoform name.

    Returns
    -------
    list of float
        Mean pLDDT of the aligned region, one entry per row of `df`.
    """
    aligned_plddt = [0.] * len(df)
    for index, row in tqdm(df.iterrows(), total=len(df)):
        isoform = name_dict[row['query']]
        protein = plddt[isoform]
        # Convert 1-based inclusive coordinates to a Python slice: shift the
        # start down by one but keep the end as-is, so the last aligned
        # residue is included (the previous "end - 1" silently dropped it).
        start = row['query start'] - 1
        end = row['query end']
        aligned_plddt[index] = np.mean(protein[start:end])
    return aligned_plddt
```
%% Cell type:code id:d8fda205-90f8-4941-b77e-39f3e10d6ab2 tags:
``` python
# FOLDSEEK scores
fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'
# per-residue pLDDT score
# (npz archive keyed by isoform name; each entry is a per-residue array)
plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')
# Per-query sequence metadata with at least 'query' and 'isoform' columns.
sequence_info = pd.read_csv("/g/arendt/npapadop/repos/coffe/data/sequence_info.csv")
# query ID -> isoform name lookup for indexing into the pLDDT archive.
query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']
```
%% Cell type:code id:a8f277d7-94f6-4316-859b-6162c639ee73 tags:
``` python
# Load the Foldseek-vs-PDB hits and attach the mean pLDDT of each aligned span.
pdb = read_foldseek(fs_pdb)
# Queries were parsed as strings by read_foldseek; cast for the dict lookup.
pdb["query"] = pdb["query"].values.astype(int)
pdb['aligned_plddt'] = get_aligned_plddt(pdb, plddt, query_to_isoform)
```
%% Output
1470483it [25:34, 958.34it/s]
%% Cell type:markdown id:0ddf8b37-4f85-4076-be68-32be64e02df3 tags:
Write out the unique PDB IDs in a file and submit it to the UniProt ID mapper:
%% Cell type:code id:0b6e404b-a754-4779-a7fa-a6ea683c7abb tags:
``` python
# One PDB ID per line, no header/index — input file for the UniProt ID mapper.
pd.Series(pdb['target'].unique()).to_csv('../data/pdb_ids.csv', header=False, index=False)
```
%% Cell type:markdown id:49ed1f9c-8225-46a8-9abe-d6fb568cb175 tags:
(retrieve result link)
%% Cell type:code id:a23cc67a-1d9b-4c78-bcec-be7bd965318f tags:
``` python
# Result URL of a finished UniProt ID-mapping job (PDB ID -> accession),
# requesting gzip-compressed TSV in pages of 500 rows.
url = "https://rest.uniprot.org/idmapping/results/8f35e821d58602a990b260ae64cf77fb1b1f23d4?compressed=true&fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength&format=tsv&size=500"
```
%% Cell type:markdown id:3e9c9787-5021-4e3e-96e7-8cc58d6820e1 tags:
Use the UniProt API helper functions to retrieve the mapping results, since the download link appears to be broken.
%% Cell type:code id:2048ee0c-7761-4635-9d0b-c5ce69f53384 tags:
``` python
# Polling interval (seconds) between job-status checks; unused in the code
# shown here — presumably carried over from the full UniProt example client.
POLLING_INTERVAL = 3
# Base URL of the UniProt REST API (unused below; requests use full URLs).
API_URL = "https://rest.uniprot.org"
# Shared session that retries transient 5xx errors with exponential backoff.
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))
def get_next_link(headers):
    """Return the URL of the next results page from the Link header, or None."""
    if "Link" not in headers:
        return None
    # UniProt paginates with an RFC-5988 Link header: <url>; rel="next"
    found = re.compile(r'<(.+)>; rel="next"').match(headers["Link"])
    return found.group(1) if found else None
def get_batch(batch_response, file_format, compressed):
    """Yield successive decoded result pages by following "next" links."""
    next_url = get_next_link(batch_response.headers)
    while next_url:
        page = session.get(next_url)
        page.raise_for_status()
        yield decode_results(page, file_format, compressed)
        next_url = get_next_link(page.headers)
def combine_batches(all_results, batch_results, file_format):
    """Merge one decoded batch into the accumulated results and return them."""
    if file_format == "json":
        # Extend the accumulated lists in place for both result keys.
        for key in ("results", "failedIds"):
            extra = batch_results.get(key)
            if extra:
                all_results[key] += extra
        return all_results
    if file_format == "tsv":
        # Every batch repeats the TSV header; keep it only from the first.
        return all_results + batch_results[1:]
    return all_results + batch_results
def decode_results(response, file_format, compressed):
    """Decode an HTTP response into the shape expected for `file_format`.

    Returns a dict for json, a list of non-empty lines for tsv, a
    single-element list for xlsx/xml, and plain text otherwise.
    """
    if compressed:
        # UniProt serves gzip-wrapped payloads; 16 + MAX_WBITS selects gzip.
        raw = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
        if file_format == "json":
            return json.loads(raw.decode("utf-8"))
        if file_format == "tsv":
            return [entry for entry in raw.decode("utf-8").split("\n") if entry]
        if file_format == "xlsx":
            return [raw]
        if file_format == "xml":
            return [raw.decode("utf-8")]
        return raw.decode("utf-8")
    if file_format == "json":
        return response.json()
    if file_format == "tsv":
        return [entry for entry in response.text.split("\n") if entry]
    if file_format == "xlsx":
        return [response.content]
    if file_format == "xml":
        return [response.text]
    return response.text
def get_xml_namespace(element):
    """Return the XML namespace of `element`'s tag, or "" if it has none."""
    # ElementTree encodes namespaces as a "{uri}tag" prefix.
    found = re.match(r"\{(.*)\}", element.tag)
    return found.group(1) if found else ""
def merge_xml_results(xml_results):
    """Fold a list of UniProt XML documents into one serialized document."""
    merged = ElementTree.fromstring(xml_results[0])
    entry_tag = "{http://uniprot.org/uniprot}entry"
    for document in xml_results[1:]:
        for entry in ElementTree.fromstring(document).findall(entry_tag):
            # Insert before the last child so any trailing element stays last.
            merged.insert(-1, entry)
    ElementTree.register_namespace("", get_xml_namespace(merged[0]))
    return ElementTree.tostring(merged, encoding="utf-8", xml_declaration=True)
def print_progress_batches(batch_index, size, total):
    """Report cumulative download progress after fetching batch `batch_index`."""
    # Clamp to `total` so the final (partial) batch doesn't overshoot.
    fetched_so_far = min((batch_index + 1) * size, total)
    print(f"Fetched: {fetched_so_far} / {total}")
def get_id_mapping_results_search(url):
    """Download every page of a UniProt ID-mapping result and combine them.

    The format, page size and compression flag are taken from the URL's own
    query string; pagination follows the server's Link headers. Returns the
    combined results in the shape produced by decode_results (bytes for xml).
    """
    parts = urlparse(url)
    query = parse_qs(parts.query)
    file_format = query["format"][0] if "format" in query else "json"
    size = int(query["size"][0]) if "size" in query else 500
    query["size"] = size
    compressed = (
        query["compressed"][0].lower() == "true" if "compressed" in query else False
    )
    # Re-serialise the (possibly updated) query string back into the URL.
    first_page = session.get(parts._replace(query=urlencode(query, doseq=True)).geturl())
    first_page.raise_for_status()
    results = decode_results(first_page, file_format, compressed)
    total = int(first_page.headers["x-total-results"])
    print_progress_batches(0, size, total)
    for batch_index, batch in enumerate(get_batch(first_page, file_format, compressed), 1):
        results = combine_batches(results, batch, file_format)
        print_progress_batches(batch_index, size, total)
    if file_format == "xml":
        return merge_xml_results(results)
    return results
```
%% Cell type:code id:9d46cd87-8f41-4d2e-a36c-1f06e86315b6 tags:
``` python
# Fetch all pages of the ID-mapping job; with the format=tsv URL above this
# yields a list of TSV lines, header first.
results = get_id_mapping_results_search(url)
```
%% Output
Fetched: 500 / 144380
Fetched: 1000 / 144380
Fetched: 1500 / 144380
Fetched: 2000 / 144380
Fetched: 2500 / 144380
Fetched: 3000 / 144380
Fetched: 3500 / 144380
Fetched: 4000 / 144380
Fetched: 4500 / 144380
Fetched: 5000 / 144380
Fetched: 5500 / 144380
Fetched: 6000 / 144380
Fetched: 6500 / 144380
Fetched: 7000 / 144380
Fetched: 7500 / 144380
Fetched: 8000 / 144380
Fetched: 8500 / 144380
Fetched: 9000 / 144380
Fetched: 9500 / 144380
Fetched: 10000 / 144380
Fetched: 10500 / 144380
Fetched: 11000 / 144380
Fetched: 11500 / 144380
Fetched: 12000 / 144380
Fetched: 12500 / 144380
Fetched: 13000 / 144380
Fetched: 13500 / 144380
Fetched: 14000 / 144380
Fetched: 14500 / 144380
Fetched: 15000 / 144380
Fetched: 15500 / 144380
Fetched: 16000 / 144380
Fetched: 16500 / 144380
Fetched: 17000 / 144380
Fetched: 17500 / 144380
Fetched: 18000 / 144380
Fetched: 18500 / 144380
Fetched: 19000 / 144380
Fetched: 19500 / 144380
Fetched: 20000 / 144380
Fetched: 20500 / 144380
Fetched: 21000 / 144380
Fetched: 21500 / 144380
Fetched: 22000 / 144380
Fetched: 22500 / 144380
Fetched: 23000 / 144380
Fetched: 23500 / 144380
Fetched: 24000 / 144380
Fetched: 24500 / 144380
Fetched: 25000 / 144380
Fetched: 25500 / 144380
Fetched: 26000 / 144380
Fetched: 26500 / 144380
Fetched: 27000 / 144380
Fetched: 27500 / 144380
Fetched: 28000 / 144380
Fetched: 28500 / 144380
Fetched: 29000 / 144380
Fetched: 29500 / 144380
Fetched: 30000 / 144380
Fetched: 30500 / 144380
Fetched: 31000 / 144380
Fetched: 31500 / 144380
Fetched: 32000 / 144380
Fetched: 32500 / 144380
Fetched: 33000 / 144380
Fetched: 33500 / 144380
Fetched: 34000 / 144380
Fetched: 34500 / 144380
Fetched: 35000 / 144380
Fetched: 35500 / 144380
Fetched: 36000 / 144380
Fetched: 36500 / 144380
Fetched: 37000 / 144380
Fetched: 37500 / 144380
Fetched: 38000 / 144380
Fetched: 38500 / 144380
Fetched: 39000 / 144380
Fetched: 39500 / 144380
Fetched: 40000 / 144380
Fetched: 40500 / 144380
Fetched: 41000 / 144380
Fetched: 41500 / 144380
Fetched: 42000 / 144380
Fetched: 42500 / 144380
Fetched: 43000 / 144380
Fetched: 43500 / 144380
Fetched: 44000 / 144380
Fetched: 44500 / 144380
Fetched: 45000 / 144380
Fetched: 45500 / 144380
Fetched: 46000 / 144380
Fetched: 46500 / 144380
Fetched: 47000 / 144380
Fetched: 47500 / 144380
Fetched: 48000 / 144380
Fetched: 48500 / 144380
Fetched: 49000 / 144380
Fetched: 49500 / 144380
Fetched: 50000 / 144380
Fetched: 50500 / 144380
Fetched: 51000 / 144380
Fetched: 51500 / 144380
Fetched: 52000 / 144380
Fetched: 52500 / 144380
Fetched: 53000 / 144380
Fetched: 53500 / 144380
Fetched: 54000 / 144380
Fetched: 54500 / 144380
Fetched: 55000 / 144380
Fetched: 55500 / 144380
Fetched: 56000 / 144380
Fetched: 56500 / 144380
Fetched: 57000 / 144380
Fetched: 57500 / 144380
Fetched: 58000 / 144380
Fetched: 58500 / 144380
Fetched: 59000 / 144380
Fetched: 59500 / 144380
Fetched: 60000 / 144380
Fetched: 60500 / 144380
Fetched: 61000 / 144380
Fetched: 61500 / 144380
Fetched: 62000 / 144380
Fetched: 62500 / 144380
Fetched: 63000 / 144380
Fetched: 63500 / 144380
Fetched: 64000 / 144380
Fetched: 64500 / 144380
Fetched: 65000 / 144380
Fetched: 65500 / 144380
Fetched: 66000 / 144380
Fetched: 66500 / 144380
Fetched: 67000 / 144380
Fetched: 67500 / 144380
Fetched: 68000 / 144380
Fetched: 68500 / 144380
Fetched: 69000 / 144380
Fetched: 69500 / 144380
Fetched: 70000 / 144380
Fetched: 70500 / 144380
Fetched: 71000 / 144380
Fetched: 71500 / 144380
Fetched: 72000 / 144380
Fetched: 72500 / 144380
Fetched: 73000 / 144380
Fetched: 73500 / 144380
Fetched: 74000 / 144380
Fetched: 74500 / 144380
Fetched: 75000 / 144380
Fetched: 75500 / 144380
Fetched: 76000 / 144380
Fetched: 76500 / 144380
Fetched: 77000 / 144380
Fetched: 77500 / 144380
Fetched: 78000 / 144380
Fetched: 78500 / 144380
Fetched: 79000 / 144380
Fetched: 79500 / 144380
Fetched: 80000 / 144380
Fetched: 80500 / 144380
Fetched: 81000 / 144380
Fetched: 81500 / 144380
Fetched: 82000 / 144380
Fetched: 82500 / 144380
Fetched: 83000 / 144380
Fetched: 83500 / 144380
Fetched: 84000 / 144380
Fetched: 84500 / 144380
Fetched: 85000 / 144380
Fetched: 85500 / 144380
Fetched: 86000 / 144380
Fetched: 86500 / 144380
Fetched: 87000 / 144380
Fetched: 87500 / 144380
Fetched: 88000 / 144380
Fetched: 88500 / 144380
Fetched: 89000 / 144380
Fetched: 89500 / 144380
Fetched: 90000 / 144380
Fetched: 90500 / 144380
Fetched: 91000 / 144380
Fetched: 91500 / 144380
Fetched: 92000 / 144380
Fetched: 92500 / 144380
Fetched: 93000 / 144380
Fetched: 93500 / 144380
Fetched: 94000 / 144380
Fetched: 94500 / 144380
Fetched: 95000 / 144380
Fetched: 95500 / 144380
Fetched: 96000 / 144380
Fetched: 96500 / 144380
Fetched: 97000 / 144380
Fetched: 97500 / 144380
Fetched: 98000 / 144380
Fetched: 98500 / 144380
Fetched: 99000 / 144380
Fetched: 99500 / 144380
Fetched: 100000 / 144380
Fetched: 100500 / 144380
Fetched: 101000 / 144380
Fetched: 101500 / 144380
Fetched: 102000 / 144380
Fetched: 102500 / 144380
Fetched: 103000 / 144380
Fetched: 103500 / 144380
Fetched: 104000 / 144380
Fetched: 104500 / 144380
Fetched: 105000 / 144380
Fetched: 105500 / 144380
Fetched: 106000 / 144380
Fetched: 106500 / 144380
Fetched: 107000 / 144380
Fetched: 107500 / 144380
Fetched: 108000 / 144380
Fetched: 108500 / 144380
Fetched: 109000 / 144380
Fetched: 109500 / 144380
Fetched: 110000 / 144380
Fetched: 110500 / 144380
Fetched: 111000 / 144380
Fetched: 111500 / 144380
Fetched: 112000 / 144380
Fetched: 112500 / 144380
Fetched: 113000 / 144380
Fetched: 113500 / 144380
Fetched: 114000 / 144380
Fetched: 114500 / 144380
Fetched: 115000 / 144380
Fetched: 115500 / 144380
Fetched: 116000 / 144380
Fetched: 116500 / 144380
Fetched: 117000 / 144380
Fetched: 117500 / 144380
Fetched: 118000 / 144380
Fetched: 118500 / 144380
Fetched: 119000 / 144380
Fetched: 119500 / 144380
Fetched: 120000 / 144380
Fetched: 120500 / 144380
Fetched: 121000 / 144380
Fetched: 121500 / 144380
Fetched: 122000 / 144380
Fetched: 122500 / 144380
Fetched: 123000 / 144380
Fetched: 123500 / 144380
Fetched: 124000 / 144380
Fetched: 124500 / 144380
Fetched: 125000 / 144380
Fetched: 125500 / 144380
Fetched: 126000 / 144380
Fetched: 126500 / 144380
Fetched: 127000 / 144380
Fetched: 127500 / 144380
Fetched: 128000 / 144380
Fetched: 128500 / 144380
Fetched: 129000 / 144380
Fetched: 129500 / 144380
Fetched: 130000 / 144380
Fetched: 130500 / 144380
Fetched: 131000 / 144380
Fetched: 131500 / 144380
Fetched: 132000 / 144380
Fetched: 132500 / 144380
Fetched: 133000 / 144380
Fetched: 133500 / 144380
Fetched: 134000 / 144380
Fetched: 134500 / 144380
Fetched: 135000 / 144380
Fetched: 135500 / 144380
Fetched: 136000 / 144380
Fetched: 136500 / 144380
Fetched: 137000 / 144380
Fetched: 137500 / 144380
Fetched: 138000 / 144380
Fetched: 138500 / 144380
Fetched: 139000 / 144380
Fetched: 139500 / 144380
Fetched: 140000 / 144380
Fetched: 140500 / 144380
Fetched: 141000 / 144380
Fetched: 141500 / 144380
Fetched: 142000 / 144380
Fetched: 142500 / 144380
Fetched: 143000 / 144380
Fetched: 143500 / 144380
Fetched: 144000 / 144380
Fetched: 144380 / 144380
%% Cell type:markdown id:7ae2cf71-0cde-4595-9c27-3be502c7de2b tags:
Parse result json into a dict and that into a data frame
%% Cell type:code id:bcbc223f-8615-41f8-a2cc-409ec1999d32 tags:
``` python
# Build a PDB ID -> UniProt accession dict from the TSV lines (header skipped).
pairs = (line.split('\t') for line in tqdm(results[1:]))
pdb_to_uniprot = {pdb_id: uniprot_id for pdb_id, uniprot_id in pairs}
```
%% Output
100%|██████████| 144380/144380 [00:00<00:00, 948289.08it/s]
%% Cell type:code id:4232b8fd-228c-4d9b-9945-86511bbec8b1 tags:
``` python
# Convert the mapping dict to a one-column DataFrame indexed by PDB ID.
pdb_to_uniprot = pd.DataFrame.from_dict(pdb_to_uniprot, orient='index')
```
%% Cell type:code id:906c2884-13a5-44b6-948b-3dc52590c271 tags:
``` python
# Name the single column so the join below yields a 'uniprot' column.
pdb_to_uniprot.columns = ['uniprot']
```
%% Cell type:code id:1d10a20d-b3b4-4179-9499-8238a1c62236 tags:
``` python
# Left-join UniProt accessions onto the Foldseek hits via the PDB target ID.
pdb = pdb.join(pdb_to_uniprot, on='target')
```
%% Cell type:code id:d180bb5b-ba34-441d-a0c1-f259609c978e tags:
``` python
# Persist the enriched table for downstream analysis.
pdb.to_parquet('/g/arendt/npapadop/repos/coffe/data/pdb_tmp.parquet')
```
import os
import json
from tqdm import tqdm
from urllib import parse, request
import numpy as np
import pandas as pd
def read_foldseek(result_path):
    """Load a Foldseek tabular result file into a tidy DataFrame.

    Parameters
    ----------
    result_path : str
        Path to a whitespace-separated Foldseek result file in the default
        12-column output format. If the path contains "pdb" the targets are
        treated as PDB entries, otherwise as AlphaFold DB models.

    Returns
    -------
    pd.DataFrame
        One row per alignment, with the query truncated to its numeric ID,
        a length-normalised bit score, and the target reduced to a bare
        database identifier.
    """
    # Raw string: "\s+" in a plain literal is an invalid escape sequence
    # (DeprecationWarning); the regex itself is unchanged.
    df = pd.read_csv(result_path, sep=r"\s+", header=None)
    df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
                  "no. gap open", "query start", "query end", "target start", "target end",
                  "e value", "bit score"]
    # Queries look like "<id>_<isoform>"; keep only the numeric part.
    df["query"] = df["query"].str.split("_").str[0]
    # Normalise the bit score by alignment length so hits are comparable.
    df["corrected bit score"] = df["bit score"] / df["alignment length"]
    if "pdb" in result_path:
        # PDB targets look like "1abc.pdb" -> keep the bare PDB ID.
        df["target"] = df["target"].str.split(".").str[0]
    else:
        # AFDB targets look like "AF-<acc>-F1-model_vN" -> keep "AF-<acc>-F1".
        df["target"] = df["target"].str.split("-").str[:3].str.join("-")
    return df
def get_aligned_plddt(df, plddt, name_dict):
    """Mean per-residue pLDDT over the aligned span of each query.

    Parameters
    ----------
    df : pd.DataFrame
        Foldseek results with "query start"/"query end" columns, which are
        1-based inclusive coordinates (Foldseek tabular convention).
    plddt : mapping
        Maps isoform name -> per-residue pLDDT array.
    name_dict : mapping
        Maps query ID -> isoform name.

    Returns
    -------
    list of float
        Mean pLDDT of the aligned region, one entry per row of `df`.
    """
    aligned_plddt = [0.] * len(df)
    for index, row in tqdm(df.iterrows(), total=len(df)):
        isoform = name_dict[row['query']]
        protein = plddt[isoform]
        # Convert 1-based inclusive coordinates to a Python slice: shift the
        # start down by one but keep the end as-is, so the last aligned
        # residue is included (the previous "end - 1" silently dropped it).
        start = row['query start'] - 1
        end = row['query end']
        aligned_plddt[index] = np.mean(protein[start:end])
    return aligned_plddt
def get_from_uniprot(df, column, uniprot_from="ACC", uniprot_to="EGGNOG_ID", request_size=40000):
    """POST the unique non-null IDs of df[column] to UniProt's mapping service.

    Parameters:
        df: DataFrame holding the IDs to map.
        column: name of the column containing the source IDs.
        uniprot_from / uniprot_to: mapping database codes in the legacy
            uploadlists style (e.g. "ACC", "PDB_ID", "EGGNOG_ID").
        request_size: unused — TODO(review): either chunk the query by this
            size or remove the parameter.

    Returns:
        The raw response body as bytes.
    """
    ids = df[column][~df[column].isnull()]
    # NOTE(review): these from/to/format=tab parameters belong to the retired
    # www.uniprot.org/uploadlists/ endpoint; posting them to the bare REST
    # root likely fails — consistent with this commit's "manual workaround".
    url = 'https://rest.uniprot.org/'
    params = {
        'from': uniprot_from,
        'to': uniprot_to,
        'format': 'tab',
        # Space-separated list of unique IDs, as the legacy API expected.
        'query': " ".join(ids.unique())
    }
    data = parse.urlencode(params)
    data = data.encode('utf-8')  # POST body must be bytes
    req = request.Request(url, data)
    with request.urlopen(req) as f:
        response = f.read()
    return response
def create_id_map(response, id_in, id_out):
    """Parse a two-column mapping response (bytes) into an indexed DataFrame.

    The first row is treated as a header and discarded; one-to-many mappings
    are collapsed into a single comma-joined string per source ID.
    """
    sources, targets = [], []
    for line in tqdm(response.decode("utf-8").split("\n")[:-1]):
        src, dst = line.split("\t")
        sources.append(src)
        targets.append(dst)
    # Drop the header row, then group duplicates into comma-joined strings.
    mapping = pd.DataFrame({id_in: sources[1:], id_out: targets[1:]})
    return pd.DataFrame(mapping.groupby(id_in)[id_out].apply(",".join))
def enrich_from_uniprot(df, column_from, column_to, uniprot_from="PDB_ID", uniprot_to="ACC"):
    """Join a UniProt ID mapping for df[column_from] onto df as `column_to`."""
    raw = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)
    mapping = create_id_map(raw, column_from, column_to)
    return df.join(mapping, on=column_from)
# --- inputs ---
# FOLDSEEK scores
fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'
# per-residue pLDDT score
plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')
sequence_info = pd.read_csv("/g/arendt/npapadop/repos/coffe/data/sequence_info.csv")
# query ID -> isoform name lookup for indexing into the pLDDT archive.
query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']
# --- pipeline: load hits, score alignments, map IDs, persist ---
pdb = read_foldseek(fs_pdb)
pdb["query"] = pdb["query"].values.astype(int)
pdb['aligned_plddt'] = get_aligned_plddt(pdb, plddt, query_to_isoform)
# Attach UniProt accessions via the (legacy-style) UniProt mapping call.
pdb = enrich_from_uniprot(pdb, "target", "uniprot", uniprot_from="PDB_ID", uniprot_to="ACC")
pdb.to_parquet('/g/arendt/npapadop/repos/coffe/data/pdb_tmp.parquet')
\ No newline at end of file
#!/bin/bash -ex
#SBATCH -J coffe_read_pdb
#SBATCH -t 5:00:00
#SBATCH -c 1
#SBATCH --mem=16G
#SBATCH -o /g/arendt/npapadop/cluster/io_read_pdb.out
#SBATCH -e /g/arendt/npapadop/cluster/io_read_pdb.err
# SLURM job: run the PDB Foldseek post-processing script in the CoFFE env.
# -e/-x on the shebang: abort on first error and echo commands for the log.
module load Anaconda3
source ~/.bash_profile
conda activate /g/arendt/npapadop/repos/condas/CoFFE
python /g/arendt/npapadop/repos/coffe/scripts/results_processing/io_read_pdb.py
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment