Commits (8)
......@@ -5,16 +5,46 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.1.0...master
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.2.1...master
## [v0.2.1] - 2020-07-23
[v0.2.1]: https://git.embl.de/grp-zeller/GECCO/compare/v0.2.0...v0.2.1
### Fixed
- Various potential crashes in `ClusterRefiner` code.
### Removed
- Unneeded feature dictionary filtering in `ClusterCRF` for models with
Fisher Exact Test feature selection.
## [v0.2.0] - 2020-07-23
[v0.2.0]: https://git.embl.de/grp-zeller/GECCO/compare/v0.1.1...v0.2.0
### Fixed
- `pandas` warning about unsorted columns in `gecco run`.
### Removed
- `Gene.probability` property, replaced by `Gene.maximum_probability` and
`Gene.average_probability` properties to be explicit.
### Changed
- Internal model now uses `Pfam` and `Tigrfam` with the top 35% features
selected with Fisher's Exact Test.
- `ClusterRefiner` now removes genes on `Cluster` edges if they do not
contain any domain annotation.
## [v0.1.1] - 2020-07-22
[v0.1.1]: https://git.embl.de/grp-zeller/GECCO/compare/v0.1.0...v0.1.1
### Added
- `ClusterCRF.predict_probabilities` to annotate a list of `Gene`.
### Changed
- BGC probability is now stored at the `Domain` level instead of at the `Gene`
level, independently of the feature extraction level used by the CRF.
- `ClusterKNN` will use the model path provided to `gecco run` if any.
### Docs
- Added this changelog file to document changes in the code.
- Added documentation to the `gecco` submodules that were missing it.
- Included the `CHANGELOG.md` file in the generated docs.
## [v0.1.0] - 2020-07-17
[v0.1.0]: https://git.embl.de/grp-zeller/GECCO/compare/v0.0.1...v0.1.0
Initial release.
## [v0.0.1] - 2018-08-13
[v0.0.1]: https://git.embl.de/grp-zeller/GECCO/compare/37afb97...v0.0.1
Proof-of-concept.
......@@ -118,6 +118,7 @@ class Run(Command): # noqa: D101
self.logger.info("Loading sequences from genome file {!r}", genome)
sequences = SeqIO.parse(genome, guess_sequences_format(genome))
self.logger.debug("Extracting genes from input sequences")
orf_finder = PyrodigalFinder(metagenome=True)
genes = list(
itertools.chain.from_iterable(map(orf_finder.find_genes, sequences))
......@@ -165,7 +166,7 @@ class Run(Command): # noqa: D101
self.logger.debug("Loading trained CRF model")
crf = ClusterCRF.trained(self.args["--model"])
self.logger.debug("Predicting BGC probabilies")
self.logger.debug("Predicting BGC probabilities")
genes = crf.predict_probabilities(genes)
self.logger.debug("Extracting feature table")
......
......@@ -289,11 +289,6 @@ class ClusterCRF(object):
# Convert data to `CRFSuite` format
X, _ = self._extract_features(data, X_only=True, jobs=jobs)
# Remove non-significant features from the extracted bunch
if self.significant_features:
sf = set(itertools.chain(*self.significant_features.values()))
X = [[{k: row[k] for k in row.keys() & sf} for row in x] for x in X]
# Extract cluster (1) probabilities from predicted marginals
marginal_probs = self.model.predict_marginals(X)
cluster_probs = [
......
......@@ -148,7 +148,7 @@ class Gene:
return sum(p) / len(p) if p else None
@property
def maximum_probability(self) -> float:
def maximum_probability(self) -> Optional[float]:
"""`float`: The highest of domain probabilities of being biosynthetic.
"""
p = [d.probability for d in self.protein.domains if d.probability is not None]
......@@ -179,6 +179,7 @@ class Gene:
1 - domain.i_evalue,
domain.start,
domain.end,
domain.probability,
)
for domain in self.protein.domains
],
......@@ -194,6 +195,7 @@ class Gene:
"rev_i_Evalue",
"domain_start",
"domain_end",
"bgc_probability"
],
)
......
......@@ -127,7 +127,7 @@ class ClusterRefiner:
return cds_crit
elif self.criterion == "antismash":
domains = {d.name for gene in cluster.genes for d in gene.protein.domains}
p_crit = numpy.mean([g.probability for g in cluster.genes]) >= self.average_threshold
p_crit = numpy.mean([g.average_probability for g in cluster.genes]) >= self.average_threshold
bio_crit = len(domains & BIO_PFAMS) >= self.n_biopfams
cds_crit = len(cluster.genes) >= self.n_cds
return p_crit and bio_crit and cds_crit
......@@ -137,9 +137,9 @@ class ClusterRefiner:
def _trim_cluster(self, cluster: Cluster) -> Cluster:
"""Remove unannotated proteins from the cluster edges.
"""
while not cluster.genes[0].protein.domains:
while cluster.genes and not cluster.genes[0].protein.domains:
cluster.genes.pop(0)
while not cluster.genes[-1].protein.domains:
while cluster.genes and not cluster.genes[-1].protein.domains:
cluster.genes.pop()
return cluster
......