-
Nikolaos Papadopoulos authoredNikolaos Papadopoulos authored
io_read_swp.py 3.35 KiB
import os
import json
from tqdm import tqdm
import numpy as np
import pandas as pd
def read_foldseek(result_path):
df = pd.read_csv(result_path, sep="\s+", header=None)
df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
"no. gap open", "query start", "query end", "target start", "target end",
"e value", "bit score"]
df["query"] = df["query"].str.split("_").str[0]
df["corrected bit score"] = df["bit score"] / df["alignment length"]
if "pdb" in result_path:
df["target"] = df["target"].str.split(".").str[0]
else:
df["target"] = df["target"].str.split("-").str[:3].str.join("-")
return df
def get_aligned_plddt(df, plddt, name_dict):
aligned_plddt = [0.] * len(df)
for e in tqdm(df.iterrows()):
index, row = e
query = row['query']
isoform = name_dict[query]
protein = plddt[isoform]
start = row['query start'] - 1
end = row['query end'] - 1
aligned_plddt[index] = np.mean(protein[start:end])
return aligned_plddt
def get_from_uniprot(df, column, uniprot_from="ACC", uniprot_to="EGGNOG_ID", request_size=40000):
ids = df[column][~df[column].isnull()]
url = 'https://www.uniprot.org/uploadlists/'
params = {
'from': uniprot_from,
'to': uniprot_to,
'format': 'tab',
'query': " ".join(ids.unique())
}
data = urllib.parse.urlencode(params)
data = data.encode('utf-8')
req = urllib.request.Request(url, data)
with urllib.request.urlopen(req) as f:
response = f.read()
return response
def create_id_map(response, id_in, id_out):
map_from = []
map_to = []
for row in tqdm(response.decode("utf-8").split("\n")[:-1]):
col1, col2 = row.split("\t")
map_from.append(col1)
map_to.append(col2)
res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})
res = pd.DataFrame(res.groupby(id_in)[id_out].apply(",".join))
# res.set_index(id_in, inplace=True)
return res
def enrich_from_uniprot(df, column_from, column_to, uniprot_from="PDB_ID", uniprot_to="ACC"):
response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)
df_map = create_id_map(response, column_from, column_to)
return df.join(df_map, on=column_from)
# MSA location
msas = '/g/arendt/npapadop/repos/coffe/data/msa/'
# FOLDSEEK scores
fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'
fs_afdb = '/g/arendt/npapadop/repos/coffe/data/afdb_score.tsv'
fs_swp = '/g/arendt/npapadop/repos/coffe/data/swissprot_score.tsv'
# AlphaFold predictions
structure_list = '/g/arendt/npapadop/repos/coffe/data/named_models/'
metadata = '/g/arendt/npapadop/repos/coffe/data/named_metadata/'
# input fasta
all_peptides = '/g/arendt/npapadop/repos/coffe/data/Spongilla_lacustris_translated_proteome.fasta'
plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')
sequence_info = pd.read_csv("/g/arendt/npapadop/repos/coffe/data/sequence_info.csv")
query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']
swp = read_foldseek(fs_swp)
swp["query"] = swp["query"].values.astype(int)
swp['aligned_plddt'] = get_aligned_plddt(swp, plddt, query_to_isoform)
swp["uniprot"] = swp["target"].str.split("-").str[1]
swp_best.to_parquet('/g/arendt/npapadop/repos/coffe/data/swp_tmp.parquet')