......@@ -23,6 +23,10 @@ variables:
after_script:
- python -m coverage combine
- python -m coverage report
- python ci/gitlab/after_script.hmmfilter.py
artifacts:
paths:
- ci/artifacts
# --- Stages -------------------------------------------------------------------
......
......@@ -3,4 +3,6 @@ include LICENSE
include static/gecco.png
include gecco/_version.txt
recursive-include gecco/data *.tsv *.tsv.gz *.model *.md5 *.ini
recursive-include gecco/crf *.pkl *.pkl.md5
recursive-include gecco/hmmer *.ini
recursive-include gecco/knn *.tsv *.npz
"""Filter the embedded HMM libraries down to the profiles the CRF model uses.

Run in CI after the test suite: for every embedded HMM library, keep only
the profiles that mention a domain the trained ``ClusterCRF`` knows about,
and write the filtered library to ``ci/artifacts`` as a gzipped artifact.
"""
import gzip
import re
import os
import io
import tqdm

from gecco.crf import ClusterCRF
from gecco.hmmer import embedded_hmms

# Create the artifact folder
os.makedirs(os.path.join("ci", "artifacts"), exist_ok=True)

# Load the internal CRF model and compile a regex that matches the domains
# known to the CRF (i.e. useful domains for us to annotate with).
# `re.escape` each name so metacharacters (e.g. the dot in "PTHR12345.1")
# are matched literally instead of acting as wildcards.
crf = ClusterCRF.trained()
rx = re.compile("|".join(map(re.escape, crf.model.attributes_)).encode("utf-8"))

# Filter each embedded HMM library, profile by profile: a profile in HMMER
# text format is a block of lines terminated by a "//" line.
for hmm in embedded_hmms():
    output = os.path.join("ci", "artifacts", "{}.hmm.gz".format(hmm.id))
    with gzip.open(hmm.path, "rb") as src, gzip.open(output, "wb") as dst:
        profile = []
        for line in src:
            profile.append(line)
            if line == b"//\n":
                # end of profile: keep it only if any of its lines mentions
                # a domain the CRF model was trained on
                if any(rx.search(l) is not None for l in profile):
                    dst.writelines(profile)
                profile.clear()
        # defensive: flush a trailing profile missing its "//" terminator
        # (well-formed HMMER files end every profile with "//", so this
        # should normally be a no-op)
        if profile and any(rx.search(l) is not None for l in profile):
            dst.writelines(profile)
......@@ -20,7 +20,7 @@ pip install -U coverage
mkdir -p ci/cache
mkdir -p build/lib/gecco/data/hmms
if [ "$CI" == "true" ]; then
if [ "$CI_SERVER" == "true" ]; then
QUIET="-q"
else
QUIET=""
......
......@@ -169,7 +169,7 @@ class Run(Command): # noqa: D101
genes = crf.predict_probabilities(genes)
self.logger.debug("Extracting feature table")
feats_df = pandas.concat([g.to_feature_table() for g in genes])
feats_df = pandas.concat([g.to_feature_table() for g in genes], sort=False)
pred_out = os.path.join(out_dir, f"{base}.features.tsv")
self.logger.debug("Writing feature table to {!r}", pred_out)
......
......@@ -257,7 +257,7 @@ class ClusterCRF(object):
# Group genes by sequence id
seqs = itertools.groupby(genes, key=operator.attrgetter("source.id"))
# Build one feature table for sequence group
data = [pandas.concat([g.to_feature_table() for g in s]) for _, s in seqs]
data = [pandas.concat([g.to_feature_table() for g in s], sort=False) for _, s in seqs]
# Predict marginals using the feature table
probs = self.predict_marginals(data, jobs=jobs)
# Assign results to the input gene sequence: each domain has a row in
......
No preview for this file type
d2fdaf65e3c4519ce2ab8c0c210eb207
\ No newline at end of file
711c8252c7427da4411a33fcb37ec363
\ No newline at end of file
[hmm]
id = Panther
version = 15.0
url = ftp://ftp.pantherdb.org/panther_library/15.0/PANTHER15.0_ascii.tgz
relabel_with = s/(PTHR\d+)\..*/\1/
This diff is collapsed.
This diff is collapsed.
......@@ -125,8 +125,6 @@ class Gene:
the source sequence.
protein (`~gecco.model.Protein`): The protein translated from this
gene.
probability (`float`, optional): The probability with which this
protein is part of a biosynthetic gene cluster.
"""
......@@ -143,11 +141,18 @@ class Gene:
return self.protein.id
@property
def probability(self) -> Optional[float]:
domains = self.protein.domains
if not any(d.probability is not None for d in domains):
return None
return sum(domain.probability for domain in domains) / len(domains) # type: ignore
def average_probability(self) -> Optional[float]:
    """`Optional[float]`: The average of the domain probabilities of being
    biosynthetic, or `None` when no domain has a probability assigned.
    """
    p = [d.probability for d in self.protein.domains if d.probability is not None]
    return sum(p) / len(p) if p else None
@property
def maximum_probability(self) -> Optional[float]:
    """`Optional[float]`: The highest of the domain probabilities of being
    biosynthetic, or `None` when no domain has a probability assigned.
    """
    p = [d.probability for d in self.protein.domains if d.probability is not None]
    return max(p) if p else None
def to_feature_table(self) -> pandas.DataFrame:
"""Convert this gene to a feature table listing domain annotations.
......@@ -247,15 +252,15 @@ class Cluster:
def average_probability(self) -> Optional[float]:
"""`float`: The average of proteins probability of being biosynthetic.
"""
p = [g.probability for g in self.genes if g.probability is not None]
p = [g.average_probability for g in self.genes if g.average_probability is not None]
return sum(p) / len(p) if p else None
@property
def maximum_probability(self) -> float:
def maximum_probability(self) -> Optional[float]:
"""`float`: The highest of proteins probability of being biosynthetic.
"""
p = [g.probability for g in self.genes if g.probability is not None]
return max(p)
p = [g.maximum_probability for g in self.genes if g.maximum_probability is not None]
return max(p) if p else None
# ---
......
......@@ -56,8 +56,8 @@ class GeneGrouper:
self.threshold = threshold
def __call__(self, gene: Gene) -> bool: # noqa: D102
if gene.probability is not None:
self.in_cluster = gene.probability > self.threshold
if gene.average_probability is not None:
self.in_cluster = gene.average_probability > self.threshold
return self.in_cluster
......@@ -87,7 +87,8 @@ class ClusterRefiner:
validity. See `gecco.bgc.BGC.is_valid` documentation for
allowed values and expected behaviours.
n_cds (`int`): The minimum number of CDS a gene cluster must
contain to be considered valid.
contain to be considered valid. If ``criterion`` is ``gecco``,
then this is the minimum number of **annotated** CDS.
n_biopfams (`int`): The minimum number of biosynthetic Pfam
domains a gene cluster must contain to be considered valid
(*only when the criterion is* ``antismash``).
......@@ -114,13 +115,15 @@ class ClusterRefiner:
respect to the postprocessing criterion given at initialisation.
"""
return filter(self._validate_cluster, self._iter_clusters(genes))
unfiltered_clusters = map(self._trim_cluster, self._iter_clusters(genes))
return filter(self._validate_cluster, unfiltered_clusters)
def _validate_cluster(self, cluster: Cluster) -> bool:
"""Check a cluster validity depending on the postprocessing criterion.
"""
if self.criterion == "gecco":
cds_crit = len(cluster.genes) >= self.n_cds
annotated = [ g for g in cluster.genes if g.protein.domains ]
cds_crit = len(annotated) >= self.n_cds
return cds_crit
elif self.criterion == "antismash":
domains = {d.name for gene in cluster.genes for d in gene.protein.domains}
......@@ -131,6 +134,15 @@ class ClusterRefiner:
else:
raise ValueError(f"unknown BGC filtering criterion: {self.criterion}")
def _trim_cluster(self, cluster: Cluster) -> Cluster:
"""Remove unannotated proteins from the cluster edges.
"""
while not cluster.genes[0].protein.domains:
cluster.genes.pop(0)
while not cluster.genes[-1].protein.domains:
cluster.genes.pop()
return cluster
def _iter_clusters(
self,
genes: List[Gene],
......
......@@ -127,8 +127,6 @@ class update_model(setuptools.Command):
self.announce(msg, level=2)
def run(self):
import gecco.data
# Copy the file to the new in-source location and compute its hash.
hasher = hashlib.md5()
self.info("Copying the trained CRF model to the in-source location")
......@@ -177,8 +175,15 @@ class build_py(_build_py):
def download(self, output, options):
base = "https://github.com/althonos/GECCO/releases/download/v{version}/{id}.hmm.gz"
url = base.format(id=options["id"], version=self.distribution.get_version())
self.announce("fetching {}".format(url), level=2)
with ResponseProgressBar(urllib.request.urlopen(url), desc=os.path.basename(output)) as src:
# attempt to use the GitHub releases URL, otherwise fallback to official URL
try:
self.announce("fetching {}".format(url), level=2)
response = urllib.request.urlopen(url)
except urllib.error.HTTPError:
self.announce("using fallback {}".format(options["url"]), level=2)
response = urllib.request.urlopen(options["url"])
# download the HMM
with ResponseProgressBar(response, desc=os.path.basename(output)) as src:
with open(output, "wb") as dst:
shutil.copyfileobj(src, dst)
......
......@@ -123,7 +123,7 @@ class TestRun(TestCommand, unittest.TestCase):
mock.patch("gecco.crf.ClusterCRF.predict_probabilities", new=_predict_probabilities)
)
stack.enter_context(
mock.patch("gecco.cli.commands.run.ClusterKNN.trained", new=ClusterKNN)
mock.patch("gecco.cli.commands.run.ClusterKNN.trained", new=lambda model_path, metric: ClusterKNN(metric=metric))
)
stack.enter_context(
mock.patch("gecco.cli.commands.run.ClusterKNN.predict_types", new=_fit_predict)
......