data/pfam/
data/db_config.txt
test/
*.pyc
*.DS_Store
*__pycache__
*.remote-sync.json
conda/
GECCO.egg-info/
dist/
build/
*.log
![](gecco.png)
# Hi, I'm GECCO!
## Requirements
* [Python](https://www.python.org/downloads/) 3.6 or higher
* [HMMER](http://hmmer.org/) v3.2 or higher
* [Prodigal](https://github.com/hyattpd/Prodigal) v2.6.3 or higher
Both HMMER and Prodigal can be installed through [conda](https://anaconda.org/). They have to be available on your `PATH` before running GECCO.
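For example, a minimal sketch assuming you use the [Bioconda](https://bioconda.github.io/) channel (any other channel that ships both tools works just as well):
```bash
conda install -c bioconda hmmer prodigal
# quick check that both tools are found on your PATH
hmmsearch -h | head -n 2
prodigal -v
```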
## Installation
To install GECCO, just run
```bash
git clone https://git.embl.de/fleck/GECCO.git
cd GECCO
python setup.py install
```
This should install all Python dependencies and download the Pfam database. It should also add `gecco.py` to your `PATH`. If that did not work for some reason, you can do it manually by running
```bash
export PATH=$(pwd):$PATH
```
If you're on a remote server without root access, it is best to install GECCO inside a conda environment; otherwise pip may not be able to install the Python dependencies.
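A minimal sketch of such a setup (the environment name and Python version are only examples):
```bash
conda create -n gecco python=3.6
conda activate gecco
# then install HMMER/Prodigal and GECCO as described above
```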
## Running GECCO
Once `gecco.py` is available in your `PATH`, you can run it from anywhere by giving it a FASTA or GenBank file with the genome you want to analyze, as well as an output directory.
```bash
python gecco.py -g some_genome.fna -o some_output_dir
```
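GECCO can also start directly from a protein FASTA file (skipping the Prodigal step) via `-p/--proteins`, and the number of CPUs can be set with `-t`. For example (file names are placeholders):
```bash
python gecco.py -p some_proteins.faa -o some_output_dir -t 8
```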
For more info, you could check the wiki... if there was one. So far, you're out of luck because I haven't gotten around to writing one yet ;)
#!/usr/bin/env python
#########################################################################################
# GECCO - GEne Cluster prediction with COnditional random fields
#
# MAIN SCRIPT
#
# Author: Jonas Simon Fleck (jonas.simon.fleck@gmail.com)
#########################################################################################
import argparse
import logging
import multiprocessing
import os
import pickle
import subprocess
import sys
import warnings
import numpy as np
import pandas as pd
from Bio import SeqIO
from gecco.crf import ClusterCRF
from gecco.hmmer import HMMER
from gecco.interface import main_interface
from gecco.knn import ClusterKNN
from gecco.orf import ORFFinder
from gecco.refine import ClusterRefiner
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# CONST
SCRIPT_DIR = os.path.abspath(os.path.dirname(os.path.abspath(sys.argv[0])))
PFAM = open(os.path.join(SCRIPT_DIR, "data/db_config.txt")).readlines()[0].strip()
MODEL = os.path.join(SCRIPT_DIR, "data/model/feat_v8_param_v2.crf.model")
TRAINING_MATRIX = os.path.join(SCRIPT_DIR, "data/knn/domain_composition.tsv")
LABELS = os.path.join(SCRIPT_DIR, "data/knn/type_labels.tsv")
# MAIN
if __name__ == "__main__":
# PARAMS
args = main_interface()
# Make out directory
out_dir = args.out
if not os.path.exists(out_dir):
os.makedirs(out_dir)
# Set up logging
logging.basicConfig(
level = logging.INFO,
format = "%(asctime)s [%(levelname)s] %(message)s",
handlers = [
logging.FileHandler(os.path.join(out_dir, "gecco.log")),
logging.StreamHandler()
])
logging.info(f"GECCO is running with these parameters:\n{args.__dict__}")
e_filter = min(1, args.e_filter)
threads = args.threads
if not threads:
threads = multiprocessing.cpu_count()
# PRODIGAL
# If input is a genome, run prodigal to extract ORFs/proteins
if args.GENOME:
genome = args.GENOME
base = ".".join(os.path.basename(genome).split(".")[:-1])
# PRODIGAL
logging.info("Predicting ORFs with PRODIGAL.")
prodigal_out = os.path.join(out_dir, "prodigal/")
if not os.path.exists(prodigal_out):
os.makedirs(prodigal_out)
# Extract ORFs from genome
prodigal = ORFFinder(genome, prodigal_out, method="prodigal")
orf_file = prodigal.run()
prodigal = True
# If input is a fasta with proteins, treat it as ORFs in given order
else:
orf_file = args.PROTEINS
base = ".".join(os.path.basename(orf_file).split(".")[:-1])
prodigal = False
# HMMER
logging.info("Running Pfam domain annotation.")
hmmer_out = os.path.join(out_dir, "hmmer/")
if not os.path.exists(hmmer_out):
os.makedirs(hmmer_out)
# Run PFAM HMM DB over ORFs to annotate with Pfam domains
hmmer = HMMER(orf_file, hmmer_out, hmms=PFAM, prodigal=prodigal)
pfam_df = hmmer.run()
# Filter i-Evalue
pfam_df = pfam_df[pfam_df["i_Evalue"] < e_filter]
# Reformat pfam IDs
pfam_df = pfam_df.assign(
pfam = pfam_df["pfam"].str.replace(r"(PF\d+)\.\d+", lambda m: m.group(1))
)
# Write feature table to file
feat_out = os.path.join(out_dir, base + ".features.tsv")
pfam_df.to_csv(feat_out, sep="\t", index=False)
# CRF
logging.info("Prediction of cluster probabilities with the CRF model.")
# Load model from file
with open(MODEL, "rb") as f:
crf = pickle.load(f)
# If extracted from a genome, split the input dataframe into sequences
if prodigal:
pfam_df = [seq for _, seq in pfam_df.groupby("sequence_id")]
else:
pfam_df = [pfam_df]
pfam_df = crf.predict_marginals(data=pfam_df)
# Write predictions to file
pred_out = os.path.join(out_dir, base + ".pred.tsv")
pfam_df.to_csv(pred_out, sep="\t", index=False)
# REFINE
logging.info("Extracting clusters.")
refiner = ClusterRefiner(threshold=args.thresh)
clusters = []
for sid, subdf in pfam_df.groupby("sequence_id"):
if len(subdf["protein_id"].unique()) < 5:
logging.warning(
f"Skipping sequence {sid} because it is too short (< 5 ORFs).")
continue
found_clusters = refiner.find_clusters(
subdf,
method = args.post,
prefix = sid
)
if found_clusters:
clusters += found_clusters
del pfam_df
if not clusters:
logging.warning("Unfortunately, no clusters were found. Exiting now.")
sys.exit()
# KNN
logging.info("Prediction of BGC types.")
# Reformat training matrix
train_df = pd.read_csv(TRAINING_MATRIX, sep="\t", encoding="utf-8")
train_comp = train_df.iloc[:,1:].values
id_array = train_df["BGC_id"].values
pfam_array = train_df.columns.values[1:]
# Reformat type labels
types_df = pd.read_csv(LABELS, sep="\t", encoding="utf-8")
types_array = types_df["cluster_type"].values
subtypes_array = types_df["subtype"].values
# Calculate domain composition for all new found clusters
new_comp = np.array(
[c.domain_composition(all_possible=pfam_array) for c in clusters]
)
# Initiate kNN and predict types
knn = ClusterKNN(metric=args.dist, n_neighbors=args.k)
knn_pred = knn.fit_predict(train_comp, new_comp, y=types_array)
# WRITE RESULTS
logging.info("Writing final results to file.")
# Write predicted cluster coordinates to file
cluster_out = os.path.join(out_dir, base + ".clusters.tsv")
with open(cluster_out, "wt") as f:
for c, t in zip(clusters, knn_pred):
c.type = t[0]
c.type_prob = t[1]
c.write_to_file(f, long=True)
# Write predicted cluster sequences to file
for c in clusters:
prots = c.prot_ids
cid = c.name
prot_list = []
proteins = SeqIO.parse(orf_file, "fasta")
for p in proteins:
if p.id in prots:
p.description = cid + " # " + p.description
prot_list.append(p)
with open(os.path.join(out_dir, cid + ".proteins.faa"), "wt") as fout:
SeqIO.write(prot_list, fout, "fasta")
logging.info("DONE.\n")
import numpy as np
from gecco.preprocessing import flatten
class Protein(object):
"""Definition of a Protein"""
def __init__(self, seq_id, start, end, name, p=0.0, domains=[], weights=None):
self.seq_id = seq_id
self.start = min(start, end)
self.end = max(start, end)
self.name = np.array(name)
self.domains = np.array(domains)
if weights is not None:
self.weights = np.array(weights)
else:
self.weights = np.ones(len(domains))
self.probs = np.array(p)
def is_potential_cluster(self, thresh=0.3):
return self.probs.mean() > thresh
class BGC(object):
"""A biosynthetic gene cluster with multiple proteins"""
def __init__(self, proteins=None, name=None, bgc_type=None, type_prob=None,
seq_id=None, start=None, end=None, domains=None, weights=None):
self.proteins = proteins
if proteins:
self.seq_id = self.proteins[0].seq_id
self.prot_ids = np.array([p.name for p in proteins])
self.start = min([p.start for p in proteins])
self.end = max([p.end for p in proteins])
self.domains = np.array([p.domains for p in proteins])
self.weights = np.array([p.weights for p in proteins])
self.probs = np.array([p.probs for p in proteins])
else:
self.seq_id = seq_id
self.start = start
self.end = end
self.domains = domains
self.weights = weights
self.name = name if name else seq_id
self.type = bgc_type
self.type_prob = type_prob
def is_valid(self, criterion="antismash", thresh=0.6):
if criterion == "antismash":
# These are the default options only
return self._antismash_check(
n_biopfams = 5,
p_thresh = 0.6,
n_cds = 5
)
if criterion == "gecco":
return self._gecco_check()
def write_to_file(self, handle, long=False):
if self.proteins:
p_mean = np.hstack(self.probs).mean()
p_max = np.hstack(self.probs).max()
if not long:
row = [self.seq_id, self.name, self.start, self.end, p_mean, p_max,
self.type, self.type_prob]
else:
prot = ",".join(np.hstack(self.prot_ids))
pfam = ",".join(np.hstack(self.domains))
row = [self.seq_id, self.name, self.start, self.end, p_mean, p_max,
self.type, self.type_prob, prot, pfam]
else:
row = [self.seq_id, self.name, self.type]
row = map(str, row)
handle.write("\t".join(row) + "\n")
def domain_composition(self, all_possible=None):
"""Computes weighted domain composition with respect to all_possible.
"""
doms = np.hstack(self.domains)
w = np.hstack(self.weights)
if all_possible is None:
all_possible = np.unique(doms)
comp_arr = np.zeros(len(all_possible))
for i in range(len(all_possible)):
n = list(doms).count(all_possible[i])
if n > 0:
weight = w[doms == all_possible[i]].mean()
else:
weight = 0
comp_arr[i] = n * weight
comp_arr = comp_arr / comp_arr.sum()
return np.nan_to_num(comp_arr, copy=False)
def domain_counts(self, all_possible=None):
"""Computes domain counts with respect to all_possible.
"""
doms = np.hstack(self.domains)
if all_possible is None:
all_possible = np.unique(doms)
comp_arr = np.zeros(len(all_possible))
for i in range(len(all_possible)):
n = list(doms).count(all_possible[i])
comp_arr[i] = n
return comp_arr
def _antismash_check(self, n_biopfams=5, p_thresh=0.6, n_cds=5):
"""
Checks for cluster validity using antiSMASH-like criteria:
1) mean cluster probability >= p_thresh
2) at least n_biopfams of the cluster's Pfam domains are in the bio_pfams set
3) at least n_cds proteins (CDS) in the cluster
"""
bio_pfams = {
"PF00109", "PF02801", "PF08659", "PF00378", "PF08541",
"PF08545", "PF02803", "PF00108", "PF02706", "PF03364", "PF08990", "PF00501",
"PF00668", "PF08415", "PF00975", "PF03061", "PF00432", "PF00494", "PF03936",
"PF01397", "PF00432", "PF04275", "PF00348", "PF02401", "PF04551", "PF00368",
"PF00534", "PF00535", "PF02922", "PF01041","PF00128", "PF00908","PF02719", "PF04321", "PF01943", "PF02806", "PF02350", "PF02397", "PF04932","PF01075",
"PF00953","PF01050", "PF03033", "PF01501", "PF05159", "PF04101", "PF02563",
"PF08437", "PF02585", "PF01721", "PF02052", "PF02674","PF03515", "PF04369",
"PF08109", "PF08129", "PF09221", "PF09683", "PF10439", "PF11420", "PF11632",
"PF11758", "PF12173","PF04738", "PF04737", "PF04604", "PF05147", "PF08109",
"PF08129", "PF08130", "PF00155", "PF00202", "PF00702", "PF06339","PF04183",
"PF10331", "PF03756", "PF00106", "PF01370", "PF00107", "PF08240", "PF00441",
"PF02770", "PF02771", "PF08028","PF01408", "PF02894", "PF00984", "PF00725",
"PF03720", "PF03721", "PF07993", "PF02737", "PF00903", "PF00037", "PF04055",
"PF00171", "PF00067", "PF01266", "PF01118", "PF02668", "PF00248", "PF01494",
"PF01593", "PF03992", "PF00355", "PF01243","PF00384", "PF01488", "PF00857",
"PF04879", "PF08241", "PF08242", "PF00698", "PF00483", "PF00561", "PF00583",
"PF01636","PF01039", "PF00288", "PF00289", "PF02786", "PF01757", "PF02785",
"PF02409", "PF01553", "PF02348", "PF00891", "PF01596","PF04820", "PF02522",
"PF08484", "PF08421"
}
bio_crit = len(set(np.hstack(self.domains)) & bio_pfams) >= n_biopfams
p_crit = np.mean([p.mean() for p in self.probs]) >= p_thresh
cds_crit = len(np.hstack(self.prot_ids)) >= n_cds
return (bio_crit and p_crit and cds_crit)
def _gecco_check(self):
return True
import math
import random
import numbers
import warnings
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import PredefinedSplit
from sklearn_crfsuite import CRF
from itertools import zip_longest
from gecco.cross_validation import LotoSplit, n_folds, n_folds_partial, StratifiedSplit
from gecco.preprocessing import flatten, truncate
# CLASS
class ClusterCRF(object):
"""
ClusterCRF is a wrapper around CRFSuite and enables prediction and cross-validation
for dataframes. This is handy for sequence prediction in e.g. annotated genomic
sequences.
Initiated with the columns defining the respective properties in the dataframe:
Y_col: column with class labels
feature_cols: column(s) with categorical features
weight_cols: column(s) with 'weights' for categorical features. These are applied
locally and don't correspond to the actual weights the model learns. You can
read up on how this is used on the CRFSuite website.
group_col: in case of feature_type = 'group', this defines the grouping column
feature_type: defines how features should be extracted:
single: features are extracted on a domain (row) level
overlap: features are extracted in overlapping windows
group: features are extracted in groupings determined by a column in the data
frame. This is most useful when dealing with proteins, but can handle
arbitrary grouping levels
algorithm: the optimization algorithm for the model
(again, check the CRFSuite website)
overlap: in case of feature_type='overlap', by how much the windows should
overlap
weights_prefix: prefix for writing transition and state feature weights after
each fitting
**kwargs: other parameters you want to pass to the CRF model.
"""
def __init__(self, Y_col=None,
feature_cols=[], weight_cols=[], group_col="protein_id", feature_type="single",
algorithm="lbsgf", overlap=2, weights_prefix=None, **kwargs):
self.Y_col = Y_col
self.features = feature_cols
self.weights = weight_cols
self.groups = group_col
self.feature_type = feature_type
self.overlap = overlap
self.weights_prefix = weights_prefix
self.alg = algorithm
self.model = CRF(
algorithm = self.alg,
all_possible_transitions = True,
all_possible_states = True,
**kwargs)
def fit(self, X=None, Y=None, data=None):
"""
Fits the model.
If X and Y are defined, it fits the model on the corresponding vectors.
If data is defined, it takes a dataframe to extract features and labels for
fitting
"""
if (X is not None) and (Y is not None):
self.model.fit(X, Y)
elif data is not None:
samples = [self._extract_features(s) for s in data]
X = np.array([x for x, _ in samples])
Y = np.array([y for _, y in samples])
self.model.fit(X, Y)
if self.weights_prefix:
# Random integer to avoid overwriting previous weight files.
# Hacky, but a 1/10000 collision chance is enough for now
rnd = random.randint(1, 10000)
prfx = f"{self.weights_prefix}.{rnd:05}"
self.save_weights(prfx)
def predict_marginals(self, data=None, X=None):
"""
Predicts marginals for your data.
If X (a feature vector) is defined, it outputs a probability vector.
If data (a dataframe) is defined, it outputs the same dataframe with the
probability vector concatenated to it as the column p_pred.
"""
if X is not None:
return self.model.predict_marginals(X)
elif data is not None:
# Extract features to dict (CRFSuite format)
samples = [self._extract_features(s, X_only=True) for s in data]
X = np.array([x for x, _ in samples])
marginal_probs = self.model.predict_marginals(X)
# Extract cluster (1) probs
cluster_probs = []
for sample in marginal_probs:
try:
sample_probs = np.array([d["1"] for d in [_ for _ in sample]])
except KeyError:
warnings.warn(
"Cluster probabilities of test set were found to be zero. This indicates that there might be something wrong with your input data.", Warning
)
sample_probs = np.zeros(len(sample))
cluster_probs.append(sample_probs)
# Merge the probability vector with the input dataframe. This is tricky when
# dealing with protein features because the vector length does not match the
# number of rows, so we merge by protein_id.
# --> HOWEVER: this requires the protein IDs to be unique among all samples
if self.feature_type == "group":
result_df = [self._merge(df, p_pred=p)
for df, p in zip(data, cluster_probs)]
result_df = pd.concat(result_df)
else:
result_df = pd.concat(data)
result_df = result_df.assign(p_pred=np.concatenate(cluster_probs))
return result_df
def cv(self, data, k=10, threads=1, trunc=None, strat_col=None):
"""Runs stratified k-fold CV using k splits and a stratification column"""
if strat_col:
types = [s[strat_col].values[0].split(",") for s in data]
cv_split = StratifiedSplit(types, n_splits=k)
else:
folds = n_folds(len(data), n=k)
cv_split = PredefinedSplit(folds)
# Run one job per split and collect the results in a list
results = Parallel(n_jobs=threads)(
delayed(self._single_fold_cv)(data, train_idx, test_idx, trunc=trunc)
for train_idx, test_idx in cv_split.split())
return results
def loto_cv(self, data, type_col, threads=1, trunc=None):
"""Runs LOTO CV based on a column defining the type of the sample (type_col)"""
types = [s[type_col].values[0].split(",") for s in data]
cv_split = LotoSplit(types)
# Run one job per split and collect the results in a list
results = Parallel(n_jobs=threads)(
delayed(self._single_fold_cv)(data, train_idx, test_idx,
round_id=typ, trunc=trunc)
for train_idx, test_idx, typ in cv_split.split())
return results
def _single_fold_cv(self, data, train_idx, test_idx, round_id=None,
trunc=None):
"""Performs a single CV round with the given train_idx and test_idx
"""
train_data = [data[i].reset_index() for i in train_idx]
if trunc:
# Truncate training set from both sides to desired length
train_data = [truncate(df, trunc, Y_col=self.Y_col, grouping=self.groups)
for df in train_data]
# Extract features to CRFSuite format
train_samples = [self._extract_features(s) for s in train_data]
X_train = np.array([x for x, _ in train_samples])
Y_train = np.array([y for _, y in train_samples])
test_data = [data[i].reset_index() for i in test_idx]
test_samples = [self._extract_features(s) for s in test_data]
X_test = np.array([x for x, _ in test_samples])
Y_test = np.array([y for _, y in test_samples])
self.fit(X=X_train, Y=Y_train)
# Extract cluster (1) probabilities from marginals
marginal_probs = self.model.predict_marginals(X_test)
cluster_probs = []
for sample in marginal_probs:
try:
sample_probs = np.array([d["1"] for d in [_ for _ in sample]])
except KeyError:
warnings.warn(
"Cluster probabilities of test set were found to be zero. This indicates that there might be something wrong with your input data.", Warning
)
sample_probs = np.zeros(len(sample))
cluster_probs.append(sample_probs)
# Merge the probability vector with the input dataframe. This is tricky when
# dealing with protein features because the vector length does not match the
# number of rows, so we merge by protein_id.
# --> HOWEVER: this requires the protein IDs to be unique within each sample
if self.feature_type == "group":
result_df = [self._merge(df.assign(cv_round=round_id), p_pred=p)
for df, p in zip(test_data, cluster_probs)]
result_df = pd.concat(result_df)
else:
result_df = (pd.concat(test_data)
.assign(
p_pred = np.concatenate(cluster_probs),
cv_round = round_id
)
)
return result_df
def _extract_features(self, sample, X_only=False):
"""
Chooses extraction function based on feature type.
single: Features are extracted on a domain (row) level
overlap: Features are extracted in overlapping windows
group: Features are extracted in groupings determined by a column in the data
frame. This is most useful when dealing with proteins, but can handle arbitrary
grouping levels.
"""
# Little hacky this, but... meh...
if X_only:
Y_col = None
else:
Y_col = self.Y_col
if self.feature_type == "single":
return self._extract_single_features(sample, Y_col)
if self.feature_type == "overlap":
return self._extract_overlapping_features(sample, Y_col)
if self.feature_type == "group":
return self._extract_protein_features(sample, Y_col)
def _merge(self, df, **cols):
unidf = pd.DataFrame(cols)
unidf[self.groups] = df[self.groups].unique()
return df.merge(unidf)
def _extract_single_features(self, table, Y_col=None):
"""
Prepares class labels Y and features from a table
given
table: a table with pfam domains and class labels
Y_col: the column encoding the class labels (e.g. 1 (BGC) and 0 (non-BGC))
if Y_col == None, only X is returned --> for prediction cases
"""
X = []
for _, row in table.iterrows():
feat_dict = dict()
feat_dict = self._make_feature_dict(row, feat_dict)
X.append(feat_dict)
if Y_col:
Y = np.array(table[Y_col].astype(str))
return X, Y
else:
return X, None
def _extract_protein_features(self, table, Y_col=None):
"""
Extracts features on protein level
given
table: a table with pfam domains and class labels
Y_col: the column containing the class labels (e.g. 1 (BGC) and 0 (non-BGC))
if Y_col == None, only X is returned --> for prediction cases
"""
X = []
Y = []
for prot, tbl in table.groupby(self.groups, sort=False):
feat_dict = dict()
for _, row in tbl.iterrows():
feat_dict = self._make_feature_dict(row, feat_dict)
X.append(feat_dict)
if Y_col:
Y.append(str(tbl[Y_col].values[0]))
if Y_col:
return X, Y
else:
return X, None
def _extract_overlapping_features(self, table, Y_col=None):
"""
Prepares class labels Y and features from a table
given
table: a table with pfam domains and class labels
Y_col: the column containing the class labels (e.g. 1 (BGC) and 0 (non-BGC))
if Y_col == None, only X is returned --> for prediction cases
"""
X = []
for idx, _ in table.iterrows():
wind = table.iloc[max(0, idx - self.overlap) : idx + self.overlap + 1]
feat_dict = dict()
for _, row in wind.iterrows():
feat_dict = self._make_feature_dict(row, feat_dict)
X.append(feat_dict)
if Y_col:
Y = np.array(table[Y_col].astype(str))
return X, Y
else:
return X, None
def _make_feature_dict(self, row, feat_dict=None):
"""
Constructs a dict with key:value pairs from
row: input row or dict
self.features: either name of feature or name of column in row/dict
self.weights: either numerical weight or name of column in row/dict
"""
feat_dict = feat_dict if feat_dict is not None else dict()
for f, w in zip(self.features, self.weights):
if isinstance(w, numbers.Number):
key, val = row[f], w
else:
try:
key, val = row[f], row[w]
except KeyError:
key, val = f, row[w]
if key in feat_dict.keys():
feat_dict[key] = max(val, feat_dict[key])
else:
feat_dict[key] = val
return feat_dict
def save_weights(self, fileprefix):
with open(fileprefix + ".trans.tsv", "wt") as f:
f.write("from\tto\tweight\n")
for (label_from, label_to), weight in self.model.transition_features_.items():
f.write(f"{label_from}\t{label_to}\t{weight}\n")
with open(fileprefix + ".state.tsv", "wt") as f:
f.write("attr\tlabel\tweight\n")
for (attr, label), weight in self.model.state_features_.items():
f.write(f"{attr}\t{label}\t{weight}\n")
import numpy as np
import pandas as pd
import multiprocessing
from sklearn.model_selection import PredefinedSplit, check_cv, StratifiedKFold
from gecco.preprocessing import flatten
# CLASS
class LotoSplit(object):
import os
import subprocess
import pandas as pd
class HMMER(object):
"""Searches a HMM library against protein sequences."""
def __init__(self, fasta, out_dir, hmms, prodigal=True):
self.fasta = fasta
self.prodigal = prodigal
if not self.prodigal:
self.protein_order = self._get_protein_order()
self.out_dir = out_dir
self.hmms = hmms
self._check_hmmer()
def run(self):
"""Runs HMMER. Converts output to tsv."""
base = ".".join(os.path.basename(self.fasta).split(".")[:-1])
dom_out = os.path.join(self.out_dir, base + ".hmmer.dom")
cmd = ["hmmsearch", "--domtblout",
dom_out, self.hmms, self.fasta]
std_out = os.path.join(self.out_dir, base + ".hmmer.out")
err_out = os.path.join(self.out_dir, base + ".hmmer.err")
# Run HMMER
with open(std_out, "wt") as out, open(err_out, "wt") as err:
subprocess.run(cmd, stdout=out, stderr=err)
# Convert to TSV
tsv_out = os.path.join(self.out_dir, base + ".hmmer.tsv")
self._to_tsv(dom_out, tsv_out)
# Sort table properly
out_df = pd.read_csv(tsv_out, sep = "\t")
out_df = (out_df.sort_values(["sequence_id", "start", "domain_start"])
.reset_index(drop=True))
return out_df
def _check_hmmer(self):
"""Checks wether hmmsearch is available. Raises error if not."""
try:
subprocess.run(["hmmsearch"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
except FileNotFoundError:
raise OSError("HMMER does not seem to be installed. Please install it and re-run GECCO.")
def _to_tsv(self, dom_file, out_file):
"""Converts HMMER --domtblout output to regular TSV"""
header = ["sequence_id", "protein_id", "start", "end", "strand", "pfam",
"i_Evalue", "domain_start", "domain_end"]
with open(dom_file, "rt") as f:
with open(out_file, "wt") as fout:
fout.write("\t".join(header) + "\n")
for l in f:
if not l.startswith("#"):
l = l.split()
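# domtblout columns (0-based): l[0] target/protein name, l[3]/l[4] query HMM
# name/accession, l[12] i-Evalue, l[17:19] alignment coordinates; with Prodigal
# input, the description fields l[23], l[25] and l[27] hold the gene start, end
# and strand parsed from the Prodigal FASTA header.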
if self.prodigal:
sid = "_".join(l[0].split("_")[:-1])
pid = l[0]
start = str(min(int(l[23]), int(l[25])))
end = str(max(int(l[23]), int(l[25])))
strand = "+" if l[27] == "1" else "-"
else:
sid = "NA"
pid = l[0]
start = str(self.protein_order[pid])
end = str(self.protein_order[pid])
strand = "NA"
acc = l[4]
if not acc:
acc = l[3]
row = [sid, pid, start, end, strand, acc, l[12]] + l[17:19]
fout.write("\t".join(row) + "\n")
def _get_protein_order(self):
pos_dict = dict()
i = 0
with open(self.fasta, "rt") as f:
for line in f:
if line.startswith(">"):
pid = line[1:].split()[0]
pos_dict[pid] = i
i += 1
return pos_dict
import sys
import argparse
# FUNC
def main_interface():
parser = argparse.ArgumentParser(description="Predicts biosynthetic gene clusters from a genome FASTA/Genbank file.")
parser.add_argument("-g", "--genome",
dest="GENOME",
type=str,
metavar="<f>",
help="A genome FASTA/Genbank file as input.")
parser.add_argument("-p", "--proteins",
dest="PROTEINS",
type=str,
metavar="<f>",
help="A protein FASTA file as input.")
parser.add_argument("-o", "--output-dir",
dest="out",
type=str,
default="./",
metavar="<d>",
help="Output directory [./].")
parser.add_argument("-t", "--cpus", "--threads",
dest="threads",
type=int,
metavar="<int>",
help="Number of CPUs for multithreading [auto].")
parser.add_argument("--e-filter", "-e",
dest="e_filter",
type=float,
default="1e-5",
metavar="<e_filter>",
help="E-value cutoff for pfam domains to be included [1e-5].")
parser.add_argument("--thresh", "-m",
dest="thresh",
type=float,
default="0.4",
metavar="<float>",
help="Probability threshold for cluster detection. Default depends on the chosen postprocessing method [0.4 (gecco)/0.6 (antismash).")
parser.add_argument("-k", "--neighbors",
dest="k",
type=int,
default="5",
metavar="<int>",
help="Numer of neighbors for kNN type prediction [5].")
parser.add_argument("-d", "--distance-metric",
dest="dist",
type=str,
default="jsd",
metavar="<jsd/tanimoto>",
help="Distance metric for kNN type prediction [jsd].")
parser.add_argument("--postproc",
dest="post",
type=str,
default="gecco",
metavar="<gecco/antismash>",
help="Type of method for cluster extraction [gecco].")
parser.add_argument("--verbose",
dest="verbose",
action="store_true",
help="Turn on verbose mode.")
args = parser.parse_args()
return args
def scripts_interface():
parser = argparse.ArgumentParser(description="Generic interface for all scripts which use the ClusterCRF (gecco_[cv/loto/train/predict/refine].py)")
parser.add_argument("DATA",
type=str,
metavar="<DATA>",
help="Pfam table.")
parser.add_argument("-o", "--output-basename",
dest="out",
type=str,
default="CRF",
metavar="<basename>",
help="Basename for the output.")
parser.add_argument("-m", "--model",
dest="model",
type=str,
metavar="<model>",
help="Model for predictions.")
parser.add_argument("-t", "--threads",
dest="threads",
type=int,
metavar="<threads>",
help="Number of CPUs.")
parser.add_argument("-e", "--evalue",
dest="e_filter",
type=float,
default="1e-5",
help="E-value threshold for the data set.")
parser.add_argument("-y", "--y-col",
dest="y",
type=str,
default="BGC",
help="Column with class labels.")
parser.add_argument("-w", "--weight-col",
dest="w",
type=str,
nargs="+",
default=["1"],
help="Column to be used as local weights on features.")
parser.add_argument("-f", "--feature-col",
dest="feat",
type=str,
nargs="+",
default=["domain"],
help="Column to be used as features.")
parser.add_argument("-s", "--split-col",
dest="split_col",
default="sequence_id",
type=str,
help="Column to be used for splitting in to samples, i.e. different sequences.")
parser.add_argument("-g", "--group-col",
dest="group_col",
default="protein_id",
type=str,
help="Column to be used for grouping features if feature_type is 'group'.")
parser.add_argument("-p", "--threshold",
dest="thresh",
default="0.6",
type=float,
help="Probability threshold for clusters prediction.")
parser.add_argument("--sort-cols",
dest="sort_cols",
default=["genome_id", "start", "domain_start"],
nargs="+",
type=str,
help="Columns to be used for sorting the data.")
parser.add_argument("--strat-col",
dest="strat_col",
default="BGC_type",
type=str,
help="Columns to be used for stratifying the samples (BGC types).")
parser.add_argument("--feature-type",
dest="feature_type",
type=str,
default="group",
help="How features should be extracted. 'Single', 'overlap' or on some grouping level ('group').")
parser.add_argument("--truncate",
dest="truncate",
type=int,
help="Training set will be truncated to this length.")
parser.add_argument("--overlap",
dest="overlap",
type=int,
default="2",
help="If overlapping features: How much overlap.")
parser.add_argument("--no-shuffle",
dest="shuffle",
action="store_false",
help="Switch to turn of shuffling of the data before doing CV.")
parser.add_argument("--folds",
dest="splits",
default="10",
type=int,
help="Number of folds for CV.")
parser.add_argument("--distance-metric",
dest="dist",
default="jsd",
type=str,
help="Dinstance metric for kNN.")
parser.add_argument("-k", "--neighbors",
dest="k",
type=int,
default="5",
help="Numer of neighbors for kNN type prediction.")
parser.add_argument("--min-orfs",
dest="orfs",
default="5",
type=int,
help="Minimum number of ORFs required for a sequence to be considered.")
parser.add_argument("--postproc",
dest="post",
default="gecco",
type=str,
help="Method for extracting clusters.")
parser.add_argument("--C1",
dest="C1",
default="0.15",
type=float,
help="Parameter for L1 regularization.")
parser.add_argument("--C2",
dest="C2",
default="0.15",
type=float,
help="Parameter for L2 regularization.")
args = parser.parse_args()
return args
def annot_interface():
parser = argparse.ArgumentParser(description="Generic interface for the annotation script (gecco_annotate.py).")
parser.add_argument("-g", "--genome",
dest="GENOME",
type=str,
metavar="<f>",
help="A genome FASTA file.")
parser.add_argument("-p", "--proteins",
dest="PROTEINS",
type=str,
metavar="<f>",
help="A protein FASTA file.")
parser.add_argument("-o", "--output-dir",
dest="out",
type=str,
default="./",
metavar="<d>",
help="The output directory.")
parser.add_argument("-d", "--database", "--db",
dest="DB",
type=str,
metavar="<model>",
help="HMM database for annotation.")
parser.add_argument("--e-filter", "-e",
dest="e_filter",
type=float,
default="1e-5",
metavar="<e_filter>",
help="E-value cutoff for pfam domains to be included [1e-5].")
args = parser.parse_args()
return args
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE, MDS
from sklearn.neighbors import KNeighborsClassifier
from gecco.utils import jsd_pairwise, tanimoto_pairwise
class ClusterKNN(object):
"""
Predicts cluster types based on a kNN classifier.
Essentially a wrapper around sklearn's KNeighborsClassifier
(MDS and t-SNE maybe later)
"""
def __init__(self, metric="jsd", **kwargs):
self.metric = metric
if self.metric == "jsd":
self.dist = jsd_pairwise
# Doesn't work, really
if self.metric == "tanimoto":
self.dist = tanimoto_pairwise
self.knn = KNeighborsClassifier(
metric = self.dist,
**kwargs
)
def fit_predict(self, train_matrix, new_matrix, y):
self.knn.fit(train_matrix, y=y)
pred = self.knn.predict(new_matrix)
proba = [max(p) for p in self.knn.predict_proba(new_matrix)]
return list(zip(pred, proba))
import os
import subprocess
class ORFFinder(object):
"""Finds ORFs given a FASTA file and writes the results to out_dir"""
def __init__(self, fasta, out_dir, method="prodigal",
out_formats=["genes", "coords"],
other_args=[]):
self.fasta = fasta
self.base = ".".join(os.path.basename(fasta).split(".")[:-1])
self.out_dir = out_dir
self.method = method
self._check_method()
self.out_formats = out_formats
self.other_args = other_args
def run(self):
"""Find ORFs"""
cmd = self._make_commandline()
log_out = os.path.join(self.out_dir, self.base + f".{self.method}.log")
with open(log_out, "wt") as log:
subprocess.run(cmd, stdout=log, stderr=subprocess.STDOUT)