Skip to content
Snippets Groups Projects
Commit bc9cc294 authored by Niko Papadopoulos's avatar Niko Papadopoulos
Browse files

renaming revision notebooks; added yeast/arabidopsis-human comparison

parent 97404076
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:56159bf4-7e59-4416-a5ed-8054179cd39d tags:
How good is protein structure at finding conserved function? To find out, we tried to compare enzyme function between distantly related species, to see how often function was correctly annotated by structural similarity. In particular, we were interested in cases where orthology via sequence similarity was no longer detectable but the annotated protein function was still similar. Enzymes are an interesting test case, since they have an easily accessible function description (EC number).
The detailed strategy I am going to pursue here:
1. take the proteome of a species distantly related to human (yeast, arabidopsis)
2. foldseek against Alphafold/proteomes
3. keep only significant human hits
4. remove query/target pairs that clearly share evolutionary history (same root orthogroup/same most specific orthogroup)
5. for all remaining cases: only keep enzymes (valid EC is available)
6. calculate how many digits of the EC overlap
7. plot the results
```
# download Alphafold proteomes:
> foldseek databases Alphafold/Proteome proteomes tmp/
# download and untar yeast protein structures, then:
> foldseek createdb UP000002311_559292_YEAST_v4/*.pdb.gz yeast
> foldseek search yeast proteomes annot_yeast tmp
> foldseek convertalis yeast proteomes annot_yeast yeast.m8
```
(same for _Arabidopsis_).
%% Cell type:code id:d46b79e8-e6a3-4503-89a4-687527ce3256 tags:
``` python
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
```
%% Cell type:code id:11891bb5-7525-46ac-b944-49464fa4e5dd tags:
``` python
def keep_term(row, term="root"):
"""
A function to isolate an EggNOG orthogroup by term.
Expects a comma-separated string where the target orthogroup
contains the term.
"""
x = np.array(row.split(','))
keep = np.zeros(len(x), dtype=bool)
for i, og in enumerate(x):
keep[i] = term in og
# print(x, keep)
try:
return x[keep][0]
except IndexError:
return None
```
%% Cell type:code id:8d4c0fd8-6c3c-4e8f-8e3d-36b4abc2a40a tags:
``` python
def compare_EC(x, slot1="query_EC", slot2="target_EC"):
query = [np.array(ec.split(".")) for ec in x[slot1].split(",")]
target = [np.array(ec.split(".")) for ec in x[slot2].split(",")]
max_agreement = 0
for q in query:
for t in target:
tmp = np.sum(q == t)
if tmp > max_agreement:
max_agreement = tmp
return max_agreement
```
%% Cell type:code id:7fa6da37-7faf-404a-87cd-0d690a9a4ea6 tags:
``` python
def same_structure_same_EC(query_annot, query_foldseek, target_annot, lowest_relevant_OG="Opisthokonta"):
in_target = query_foldseek["target"].isin(target_annot["id"])
best = query_foldseek[in_target].sort_values("e value").drop_duplicates("query")
informative_columns = ["eggNOG_OGs", "Preferred_name", "GOs", "EC"]
best["corrected bit score"] = best["bit score"] / best["alignment length"]
best = best[["query", "target", "corrected bit score"]]
query = query_annot.set_index("id").loc[best["query"]][informative_columns]
query.columns = "query_" + query.columns
target = target_annot.set_index("id").loc[best["target"]][informative_columns]
target.columns = "target_" + target.columns
best = best.reset_index(drop=True).join(target.reset_index(drop=True))
best = best.join(query, on="query")
best['query_root'] = best['query_eggNOG_OGs'].apply(keep_term, term="root")
best['target_root'] = best['target_eggNOG_OGs'].apply(keep_term, term="root")
best['query_op'] = best['query_eggNOG_OGs'].apply(keep_term, term=lowest_relevant_OG)
best['target_op'] = best['target_eggNOG_OGs'].apply(keep_term, term=lowest_relevant_OG)
best["homolog"] = best["query_root"] == best["target_root"]
best["ortholog"] = best["query_op"] == best["target_op"]
no_homology_whatsoever = ~best["homolog"] & ~best["ortholog"]
unrelated = best[no_homology_whatsoever].reset_index(drop=True).copy()
unrelated["target_EC"].replace("-", None, inplace=True)
unrelated["query_EC"].replace("-", None, inplace=True)
target_has_EC = ~unrelated["target_EC"].isnull()
query_has_EC = ~unrelated["query_EC"].isnull()
keep = target_has_EC & query_has_EC
unrelated_with_EC = unrelated[keep].reset_index(drop=True).copy()
unrelated_with_EC["ec overlap"] = unrelated_with_EC.apply(compare_EC, axis=1)
fig, ax = plt.subplots()
b = sns.boxplot(data=unrelated_with_EC, y='corrected bit score', x='ec overlap', ax=ax, fliersize=0, whis=[5, 95])
counts = unrelated_with_EC.groupby(['ec overlap']).apply(len)
for i, c in enumerate(counts.values):
ax.text(i-0.1, 4., str(c), fontsize='x-large')
ax.set_ylim(0, 5);
```
%% Cell type:code id:22f00eeb-722e-4820-a797-18e76e50de82 tags:
``` python
yeast_annot = pd.read_csv("../data/eggnog/S_cerevisiae_annotations.tsv", sep="\t", skiprows=4, skipfooter=3)
yeast_annot["max_annot_lvl"].unique()
yeast_annot["id"] = yeast_annot["#query"].str.split("|").str[1]
yeast_foldseek = pd.read_csv("/Users/npapadop/Documents/data/foldseek/yeast.m8", sep="\t", header=None)
yeast_foldseek.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
"no. gap open", "query start", "query end", "target start", "target end",
"e value", "bit score"]
yeast_foldseek["query"] = yeast_foldseek["query"].str.split("-").str[1]
yeast_foldseek["target"] = yeast_foldseek["target"].str.split("-").str[1]
```
%% Output
/var/folders/md/d6lwwbv97xb6g6ddypntnprh0000gp/T/ipykernel_488/2997960917.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
yeast_annot = pd.read_csv("../data/eggnog/S_cerevisiae_annotations.tsv", sep="\t", skiprows=4, skipfooter=3)
%% Cell type:code id:d8ad897d-3c5e-4ab5-b79d-c0ca22af20fd tags:
``` python
a = yeast_foldseek["query"].unique()
b = yeast_annot["id"].unique()
c = np.intersect1d(a, b)
len(a), len(b), len(c)
```
%% Output
(5994, 5537, 5520)
%% Cell type:markdown id:4437ebdf-5878-43ce-8e7a-f21c77bc6277 tags:
we don't want to get into quibbly territory, so we'll proceed with the IDs that are in both lists and will not wonder why they don't match to 100%.
%% Cell type:code id:f57728f2-f505-4195-9df1-a606ec65f40d tags:
``` python
keep = yeast_annot["id"].isin(c)
yeast_annot = yeast_annot[keep]
keep = yeast_foldseek["query"].isin(c)
yeast_foldseek = yeast_foldseek[keep]
```
%% Cell type:code id:d4a421da-3d5d-47f3-90eb-f77441d929d1 tags:
``` python
human_annot = pd.read_csv("../data/eggnog/H_sapiens_annotations.tsv", sep="\t", skiprows=4, skipfooter=3)
human_annot["id"] = human_annot["#query"].str.split("|").str[1]
```
%% Output
/var/folders/md/d6lwwbv97xb6g6ddypntnprh0000gp/T/ipykernel_488/2273340934.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
human_annot = pd.read_csv("../data/eggnog/H_sapiens_annotations.tsv", sep="\t", skiprows=4, skipfooter=3)
%% Cell type:code id:f7f4835c-03b8-4117-b494-387c570e5c12 tags:
``` python
same_structure_same_EC(yeast_annot, yeast_foldseek, human_annot)
```
%% Output
%% Cell type:code id:ff056da6-2246-41c0-85cd-01eda6589dda tags:
``` python
arabidopsis_annot = pd.read_csv("../data/eggnog/A_thaliana_annotations.tsv", sep="\t", skiprows=4, skipfooter=3)
arabidopsis_annot["max_annot_lvl"].unique()
arabidopsis_annot["id"] = arabidopsis_annot["#query"].str.split("|").str[1]
arabidopsis_foldseek = pd.read_csv("/Users/npapadop/Documents/data/foldseek/arabidopsis.m8", sep="\t", header=None)
arabidopsis_foldseek.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
"no. gap open", "query start", "query end", "target start", "target end",
"e value", "bit score"]
arabidopsis_foldseek["query"] = arabidopsis_foldseek["query"].str.split("-").str[1]
arabidopsis_foldseek["target"] = arabidopsis_foldseek["target"].str.split("-").str[1]
```
%% Output
/var/folders/md/d6lwwbv97xb6g6ddypntnprh0000gp/T/ipykernel_488/2842444695.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
arabidopsis_annot = pd.read_csv("../data/eggnog/A_thaliana_annotations.tsv", sep="\t", skiprows=4, skipfooter=3)
%% Cell type:code id:cfd46099-627e-4b12-aee3-5979914eaff7 tags:
``` python
a = arabidopsis_foldseek["query"].unique()
b = arabidopsis_annot["id"].unique()
c = np.intersect1d(a, b)
len(a), len(b), len(c)
```
%% Output
(27298, 25145, 25071)
%% Cell type:markdown id:21080634-fbcd-4e68-9dbf-e1589e3dfe41 tags:
we don't want to get into quibbly territory, so we'll proceed with the IDs that are in both lists and will not wonder why they don't match to 100%.
%% Cell type:code id:007206af-b083-45d1-9132-5d36b6d3f8aa tags:
``` python
keep = arabidopsis_annot["id"].isin(c)
arabidopsis_annot = arabidopsis_annot[keep]
keep = arabidopsis_foldseek["query"].isin(c)
arabidopsis_foldseek = arabidopsis_foldseek[keep]
```
%% Cell type:code id:a5fbe0b2-1545-410a-9416-053cab1514cc tags:
``` python
same_structure_same_EC(arabidopsis_annot, arabidopsis_foldseek, human_annot, lowest_relevant_OG="Eukaryota")
```
%% Output
%% Cell type:code id:48d39bb0-ed6d-4aea-9235-374a37592fbb tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment