Commits (4)
......@@ -5,7 +5,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.5.5...master
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.6.0...master
## [v0.6.0] - 2021-02-28
[v0.6.0]: https://git.embl.de/grp-zeller/GECCO/compare/v0.5.5...v0.6.0
### Changed
- Updated internal model with a cleaned-up version of the MIBiG-2.0
Pfam-33.1/Tigrfam-15.0 embedding.
- Updated internal InterPro catalog.
### Fixed
- Features of the same protein not being grouped together in `gecco cv`
and `gecco train` when the provided feature table had rows that were
not sorted by protein ID.
## [v0.5.5] - 2021-02-28
[v0.5.5]: https://git.embl.de/grp-zeller/GECCO/compare/v0.5.4...v0.5.5
......
No preview for this file type
5e6a3d158fd23276c8582764f62ec9e7
\ No newline at end of file
e9e64fd5f2e2156c1df0b55039a87997
\ No newline at end of file
"""Data layer classes storing information needed for BGC detection.
"""
import collections
import csv
import datetime
import enum
......@@ -467,8 +468,17 @@ class FeatureTable(Dumpable, Sized):
built for each gene of size ``gene.end``, so that each gene can still
be converted to a `~Bio.SeqRecord.SeqRecord` if needed.
"""
for _, group in itertools.groupby(self, key=operator.attrgetter("protein_id")):
rows = list(group)
# group rows by protein/gene ID
protein_indices = collections.defaultdict(list)
for i, protein_id in enumerate(self.protein_id):
protein_indices[protein_id].append(i)
# yield genes in order
for protein_id in sorted(protein_indices):
rows = [self[i] for i in protein_indices[protein_id]]
assert all(x.sequence_id == rows[0].sequence_id for x in rows)
assert all(x.protein_id == rows[0].protein_id for x in rows)
assert all(x.start == rows[0].start for x in rows)
assert all(x.end == rows[0].end for x in rows)
source = SeqRecord(id=rows[0].sequence_id, seq=_UnknownSeq())
strand = Strand.Coding if rows[0].strand == "+" else Strand.Reverse
protein = Protein(rows[0].protein_id, seq=None)
......
This diff is collapsed.
......@@ -97,10 +97,10 @@ class update_model(setuptools.Command):
sig.write(hasher.hexdigest())
# Update the domain composition table
self.info("Copying the KNN training data to the in-source location")
self.info("Copying the RF training data to the in-source location")
for filename in ["compositions.npz", "domains.tsv", "types.tsv"]:
src = os.path.join(self.model, filename)
dst = os.path.join("gecco", "knn", filename)
dst = os.path.join("gecco", "types", filename)
shutil.copy(src=src, dst=dst)
# Update the interpro entries
......