io_read_swp.py

import os
import json
from tqdm import tqdm

import numpy as np
import pandas as pd

def read_foldseek(result_path):
    df = pd.read_csv(result_path, sep="\s+", header=None)
    df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
                   "no. gap open", "query start", "query end", "target start", "target end",
                   "e value", "bit score"]
    df["query"] = df["query"].str.split("_").str[0]
    df["corrected bit score"] = df["bit score"] / df["alignment length"]
    if "pdb" in result_path:
        df["target"] = df["target"].str.split(".").str[0]
    else:
        df["target"] = df["target"].str.split("-").str[:3].str.join("-")
    return df

def get_aligned_plddt(df, plddt, name_dict):
    aligned_plddt = [0.] * len(df)
    for e in tqdm(df.iterrows()):
        index, row = e
        query = row['query']
        isoform = name_dict[query]
        protein = plddt[isoform]
        start = row['query start'] - 1
        end = row['query end'] - 1
        aligned_plddt[index] = np.mean(protein[start:end])
    return aligned_plddt

def get_from_uniprot(df, column, uniprot_from="ACC", uniprot_to="EGGNOG_ID", request_size=40000):
    ids = df[column][~df[column].isnull()]
    url = 'https://www.uniprot.org/uploadlists/'

    params = {
    'from': uniprot_from,
    'to': uniprot_to,
    'format': 'tab',
    'query': " ".join(ids.unique())
    }

    data = urllib.parse.urlencode(params)
    data = data.encode('utf-8')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as f:
        response = f.read()
    return response

def create_id_map(response, id_in, id_out):
    map_from = []
    map_to = []
    for row in tqdm(response.decode("utf-8").split("\n")[:-1]):
        col1, col2 = row.split("\t")
        map_from.append(col1)
        map_to.append(col2)

    res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})
    res = pd.DataFrame(res.groupby(id_in)[id_out].apply(",".join))
#     res.set_index(id_in, inplace=True)
    return res

def enrich_from_uniprot(df, column_from, column_to, uniprot_from="PDB_ID", uniprot_to="ACC"):
    response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)
    df_map = create_id_map(response, column_from, column_to)
    return df.join(df_map, on=column_from)

# MSA location
msas = '/g/arendt/npapadop/repos/coffe/data/msa/'
# FOLDSEEK scores
fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'
fs_afdb = '/g/arendt/npapadop/repos/coffe/data/afdb_score.tsv'
fs_swp = '/g/arendt/npapadop/repos/coffe/data/swissprot_score.tsv'
# AlphaFold predictions
structure_list = '/g/arendt/npapadop/repos/coffe/data/named_models/'
metadata = '/g/arendt/npapadop/repos/coffe/data/named_metadata/'
# input fasta
all_peptides = '/g/arendt/npapadop/repos/coffe/data/Spongilla_lacustris_translated_proteome.fasta'

plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')

sequence_info = pd.read_csv("/g/arendt/npapadop/repos/coffe/data/sequence_info.csv")
query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']

swp = read_foldseek(fs_swp)
swp["query"] = swp["query"].values.astype(int)

swp['aligned_plddt'] = get_aligned_plddt(swp, plddt, query_to_isoform)
swp["uniprot"] = swp["target"].str.split("-").str[1]
swp_best.to_parquet('/g/arendt/npapadop/repos/coffe/data/swp_tmp.parquet')