Commit 5276fe26 authored by Nikolaos Papadopoulos

now manual workaround for PDB ID mapping

parent 8dd17e2e
scripts/results_processing/io_read_pdb.py
import os
import json
from tqdm import tqdm
from urllib import parse, request
import numpy as np
import pandas as pd
def read_foldseek(result_path):
    """Read Foldseek tabular output (default BLAST-like columns) into a DataFrame."""
    df = pd.read_csv(result_path, sep=r"\s+", header=None)
    df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
                  "no. gap open", "query start", "query end", "target start", "target end",
                  "e value", "bit score"]
    # keep only the numeric query ID (drop the suffix after "_") and length-normalise the bit score
    df["query"] = df["query"].str.split("_").str[0]
    df["corrected bit score"] = df["bit score"] / df["alignment length"]
    if "pdb" in result_path:
        # PDB hits: keep only the entry ID (text before the first ".")
        df["target"] = df["target"].str.split(".").str[0]
    else:
        # AlphaFold DB hits: keep the first three dash-separated fields of the model name
        df["target"] = df["target"].str.split("-").str[:3].str.join("-")
    return df
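# Illustrative check of the target clean-up above (the input strings are made up,
# they only mirror the shape the code expects):
#   "1abc.cif.gz".split(".")[0]                       -> "1abc"
#   "-".join("AF-P12345-F1-model_v2".split("-")[:3])  -> "AF-P12345-F1"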
def get_aligned_plddt(df, plddt, name_dict):
    """Mean per-residue pLDDT of each query over its aligned region."""
    aligned_plddt = [0.] * len(df)
    for index, row in tqdm(df.iterrows(), total=len(df)):
        query = row['query']
        isoform = name_dict[query]
        protein = plddt[isoform]
        # Foldseek coordinates are 1-based and inclusive; convert to a 0-based slice
        start = row['query start'] - 1
        end = row['query end']
        aligned_plddt[index] = np.mean(protein[start:end])
    return aligned_plddt
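# Example of the coordinate convention (hypothetical values): an alignment reported
# as query start 5, query end 9 covers residues 5..9 inclusive, so the slice above
# becomes protein[4:9] and averages five pLDDT values.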
def get_from_uniprot(df, column, uniprot_from="ACC", uniprot_to="EGGNOG_ID", request_size=40000):
    """POST the unique, non-null IDs of `column` to the UniProt ID-mapping service (legacy-style parameters)."""
    ids = df[column][~df[column].isnull()]
    url = 'https://rest.uniprot.org/'
    params = {
        'from': uniprot_from,
        'to': uniprot_to,
        'format': 'tab',
        'query': " ".join(ids.unique())
    }
    data = parse.urlencode(params).encode('utf-8')
    req = request.Request(url, data)
    with request.urlopen(req) as f:
        response = f.read()
    return response
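# The single-POST call above still uses legacy-style parameters; the current UniProt
# ID-mapping service is asynchronous (submit a job, poll, then fetch results).
# A minimal sketch, assuming the documented idmapping endpoints; polling and error
# handling are simplified and the function name is ours, not part of this commit:
def map_ids_uniprot_rest(ids, id_from="PDB", id_to="UniProtKB", poll_seconds=5):
    import time
    run_data = parse.urlencode({"from": id_from, "to": id_to, "ids": ",".join(ids)}).encode("utf-8")
    with request.urlopen(request.Request("https://rest.uniprot.org/idmapping/run", run_data)) as f:
        job_id = json.loads(f.read())["jobId"]
    # poll until the job no longer reports a running status
    while True:
        with request.urlopen(f"https://rest.uniprot.org/idmapping/status/{job_id}") as f:
            status = json.loads(f.read())
        if status.get("jobStatus") in (None, "FINISHED"):
            break
        time.sleep(poll_seconds)
    # stream the finished mapping as tab-separated text
    with request.urlopen(f"https://rest.uniprot.org/idmapping/stream/{job_id}?format=tsv") as f:
        return f.read().decode("utf-8")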
def create_id_map(response, id_in, id_out):
    """Parse a two-column tab-separated mapping response into a DataFrame indexed by `id_in`."""
    map_from = []
    map_to = []
    for row in tqdm(response.decode("utf-8").split("\n")[:-1]):
        col1, col2 = row.split("\t")
        map_from.append(col1)
        map_to.append(col2)
    # skip the header row; collapse one-to-many mappings into comma-separated strings
    res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})
    res = pd.DataFrame(res.groupby(id_in)[id_out].apply(",".join))
    return res
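# Shape of the result (hypothetical IDs): a response of
#   b"From\tTo\n6XYZ\tP11111\n6XYZ\tP22222\n"
# becomes a one-row DataFrame indexed by "6XYZ" whose id_out column holds
# "P11111,P22222", ready to be joined onto the hits table.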
def enrich_from_uniprot(df, column_from, column_to, uniprot_from="PDB_ID", uniprot_to="ACC"):
    """Map the IDs in `column_from` through UniProt and join the mapping onto `df` as `column_to`."""
    response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)
    df_map = create_id_map(response, column_from, column_to)
    return df.join(df_map, on=column_from)
# Foldseek hits against the PDB
fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'
# per-residue pLDDT scores, one array per isoform
plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')
# map query IDs to the isoform names used as keys in the pLDDT archive
sequence_info = pd.read_csv("/g/arendt/npapadop/repos/coffe/data/sequence_info.csv")
query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']

pdb = read_foldseek(fs_pdb)
pdb["query"] = pdb["query"].astype(int)
pdb['aligned_plddt'] = get_aligned_plddt(pdb, plddt, query_to_isoform)
# attach UniProt accessions to the PDB targets and store the enriched table
pdb = enrich_from_uniprot(pdb, "target", "uniprot", uniprot_from="PDB_ID", uniprot_to="ACC")
pdb.to_parquet('/g/arendt/npapadop/repos/coffe/data/pdb_tmp.parquet')
#!/bin/bash -ex
#SBATCH -J coffe_read_pdb
#SBATCH -t 5:00:00
#SBATCH -c 1
#SBATCH --mem=16G
#SBATCH -o /g/arendt/npapadop/cluster/io_read_pdb.out
#SBATCH -e /g/arendt/npapadop/cluster/io_read_pdb.err
module load Anaconda3
source ~/.bash_profile
conda activate /g/arendt/npapadop/repos/condas/CoFFE
python /g/arendt/npapadop/repos/coffe/scripts/results_processing/io_read_pdb.py
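# To run this step on the cluster the script is submitted with sbatch,
# e.g. (file name is an assumption, not recorded in this commit): sbatch io_read_pdb.sh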