Commits (3)
......@@ -5,7 +5,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.7...master
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.8...master
## [v0.8.8] - 2022-02-21
[v0.8.8]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.7...v0.8.8
### Fixed
- `ClusterRefiner` filtering method for edge genes not working as intended.
- `gecco run` and `gecco annotate` commands crashing on missing input files instead of nicely rendering the error.
## [v0.8.7] - 2022-02-18
[v0.8.7]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.6...v0.8.7
......
......@@ -10,4 +10,4 @@ See Also:
__author__ = "Martin Larralde"
__license__ = "GPLv3"
__version__ = "0.8.7"
__version__ = "0.8.8"
......@@ -123,23 +123,21 @@ class Annotate(Command): # noqa: D101
def _load_sequences(self):
from Bio import SeqIO
# guess format or use the one given in CLI
if self.format is not None:
format = self.format
self.info("Using", "user-provided sequence format", repr(format), level=2)
else:
self.info("Detecting", "sequence format from file contents", level=2)
format = guess_sequences_format(self.genome)
self.success("Detected", "format of input as", repr(format), level=2)
# get filesize and unit
input_size = os.stat(self.genome).st_size
total, scale, unit = ProgressReader.scale_size(input_size)
task = self.progress.add_task("Loading sequences", total=total, unit=unit, precision=".1f")
# load sequences
self.info("Loading", "sequences from genomic file", repr(self.genome), level=1)
try:
# guess format or use the one given in CLI
if self.format is not None:
format = self.format
self.info("Using", "user-provided sequence format", repr(format), level=2)
else:
self.info("Detecting", "sequence format from file contents", level=2)
format = guess_sequences_format(self.genome)
self.success("Detected", "format of input as", repr(format), level=2)
# get filesize and unit
input_size = os.stat(self.genome).st_size
total, scale, unit = ProgressReader.scale_size(input_size)
task = self.progress.add_task("Loading sequences", total=total, unit=unit, precision=".1f")
# load sequences
self.info("Loading", "sequences from genomic file", repr(self.genome), level=1)
with ProgressReader(open(self.genome, "rb"), self.progress, task, scale) as f:
sequences = list(SeqIO.parse(io.TextIOWrapper(f), format))
except FileNotFoundError as err:
......
......@@ -133,17 +133,21 @@ class ClusterRefiner:
"""Check a cluster validity depending on the postprocessing criterion.
"""
if self.criterion == "gecco":
# check that the number of cluster genes that are outside of
# the edge region is above the number of required CDS
annotated = [g for g in cluster.genes if g.protein.domains]
cds_crit = len(annotated) >= self.n_cds
# extract IDs of annotated genes that are too close to the edge
if self.edge_distance > 0:
annotated_ids = [g.id for g in seq if g.protein.domains]
edge_genes = set(annotated_ids[:self.edge_distance]).union(annotated_ids[-self.edge_distance:])
else:
edge_genes = set()
# check that the number of cluster genes that are outside of
# the edge region is above the number of required CDS
annotated = [ g for g in cluster.genes if g.protein.domains and g.id not in edge_genes ]
cds_crit = len(annotated) >= self.n_cds
return cds_crit
# NOTE (@althonos): it is needed for compatibility with the post-processed results
# that we filter on any number of cluster genes, but it would be
# better to filter on the number of annotated cluster genes instead.
edge_crit = len(set(g.id for g in cluster.genes).difference(edge_genes)) >= self.n_cds
return cds_crit and edge_crit
elif self.criterion == "antismash":
domains = {d.name for gene in cluster.genes for d in gene.protein.domains}
p_crit = numpy.mean([g.average_probability for g in cluster.genes]) >= self.average_threshold
......