Commits (4)
......@@ -5,7 +5,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [v0.6.0] - 2021-02-28
### Changed
- Updated internal model with a cleaned-up version of the MIBiG-2.0
Pfam-33.1/Tigrfam-15.0 embedding.
- Updated internal InterPro catalog.
### Fixed
- Features not being grouped together in `gecco cv` and `gecco train`
when provided with a feature table where rows were not sorted by
protein IDs.
## [v0.5.5] - 2021-02-28
No preview for this file type
\ No newline at end of file
\ No newline at end of file
"""Data layer classes storing information needed for BGC detection.
import collections
import csv
import datetime
import enum
......@@ -467,8 +468,17 @@ class FeatureTable(Dumpable, Sized):
built for each gene of size ``gene.end``, so that each gene can still
be converted to a `~Bio.SeqRecord.SeqRecord` if needed.
for _, group in itertools.groupby(self, key=operator.attrgetter("protein_id")):
rows = list(group)
# group rows by protein/gene ID
protein_indices = collections.defaultdict(list)
for i, protein_id in enumerate(self.protein_id):
# yield genes in order
for protein_id in sorted(protein_indices):
rows = [self[i] for i in protein_indices[protein_id]]
assert all(x.sequence_id == rows[0].sequence_id for x in rows)
assert all(x.protein_id == rows[0].protein_id for x in rows)
assert all(x.start == rows[0].start for x in rows)
assert all(x.end == rows[0].end for x in rows)
source = SeqRecord(id=rows[0].sequence_id, seq=_UnknownSeq())
strand = Strand.Coding if rows[0].strand == "+" else Strand.Reverse
protein = Protein(rows[0].protein_id, seq=None)
This diff is collapsed.
......@@ -97,10 +97,10 @@ class update_model(setuptools.Command):
# Update the domain composition table"Copying the KNN training data to the in-source location")"Copying the RF training data to the in-source location")
for filename in ["compositions.npz", "domains.tsv", "types.tsv"]:
src = os.path.join(self.model, filename)
dst = os.path.join("gecco", "knn", filename)
dst = os.path.join("gecco", "types", filename)
shutil.copy(src=src, dst=dst)
# Update the interpro entries