Commits (12)
......@@ -5,10 +5,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.5.2...master
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.5.3...master
## [v0.5.3] - 2021-02-21
[v0.5.3]: https://git.embl.de/grp-zeller/GECCO/compare/v0.5.2...v0.5.3
### Fixed
- Coordinates of genes in output GenBank files.
- Potential issue with the number of CPUs in `PyHMMER.run`.
### Changed
- Bump required `pyrodigal` version to `v0.4.2` to fix buffer overflow.
## [v0.5.2] - 2021-01-29
[v0.5.1]: https://git.embl.de/grp-zeller/GECCO/compare/v0.5.1...v0.5.2
[v0.5.2]: https://git.embl.de/grp-zeller/GECCO/compare/v0.5.1...v0.5.2
### Added
- Support for downloading HMM files directly from GitHub releases assets.
- Validation of filtered HMMs with MD5 checksum.
......@@ -16,7 +24,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Invalid coordinates of protein domains in GenBank output files.
- `gecco.interpro` module not being added to wheel distribution.
### Changed
- Bump required `pyhmmer` version to`v0.2.1`.
- Bump required `pyhmmer` version to `v0.2.1`.
## [v0.5.1] - 2021-01-15
[v0.5.1]: https://git.embl.de/grp-zeller/GECCO/compare/v0.5.0...v0.5.1
......
......@@ -2,14 +2,6 @@ p {
text-align: justify;
}
/* a.reference strong {
font-weight: bold;
font-size: 90%;
color: #c7254e;
box-sizing: border-box;
font-family: Menlo,Monaco,Consolas,"Courier New",monospace;
} */
.field-list a.reference {
font-weight: bold;
font-style: italic;
......@@ -22,3 +14,24 @@ p {
.class dd {
margin-left: 2%;
}
span.navbar-version {
margin-left: -10px;
font-style: italic;
}
a.navbar-brand {
font-weight: bold;
padding-top: 8px;
padding-bottom: 5px;
}
.navbar .nav > li > a {
color: white;
text-transform: none;
font-size: 14px;
}
.navbar-brand img {
margin-right: 10px;
}
......@@ -5,5 +5,5 @@ BGC Detection
.. automodule:: gecco.crf
.. autoclass:: ClusterCRF
.. autoclass:: ClusterCRF(object)
:members:
......@@ -7,5 +7,14 @@ Domain Annotation
.. autofunction:: embedded_hmms
.. autoclass:: HMMER
.. autoclass:: HMM(object)
:members:
.. autoclass:: DomainAnnotator(object)
:members:
.. autoclass:: HMMER(DomainAnnotator)
:members:
.. autoclass:: PyHMMER(DomainAnnotator)
:members:
......@@ -7,6 +7,7 @@ API Reference
model
orf
hmmer
interpro
crf
refine
types
......@@ -20,11 +21,14 @@ Data Model
.. autosummary::
:nosignatures:
ProductType
Strand
Domain
Protein
Gene
Cluster
ClusterTable
FeatureTable
ORF Extraction
......@@ -48,6 +52,7 @@ Domain Annotation
:nosignatures:
HMMER
PyHMMER
BGC Detection
-------------
......@@ -79,4 +84,17 @@ Type Prediction
.. autosummary::
:nosignatures:
TypeBinarizer
TypeClassifier
InterPro Metadata
-----------------
.. currentmodule:: gecco.interpro
.. autosummary::
:nosignatures:
InterPro
InterProEntry
InterPro metadata
=================
.. currentmodule:: gecco.interpro
.. automodule:: gecco.interpro
.. autoclass:: InterPro(object)
:members:
.. autoclass:: InterProEntry(object)
:members:
......@@ -5,10 +5,33 @@ Data Model
.. automodule:: gecco.model
.. autoclass:: Domain
Python Layer
------------
.. autoclass:: Protein
.. autoclass:: ProductType(enum.IntFlag)
:members:
.. autoclass:: Gene
.. autoclass:: Strand(enum.IntEnum)
:members:
.. autoclass:: Cluster
.. autoclass:: Domain(object)
:members:
.. autoclass:: Protein(object)
:members:
.. autoclass:: Gene(object)
:members:
.. autoclass:: Cluster(object)
:members:
Report Tables
-------------
.. autoclass:: ClusterTable(collections.abc.Sized)
:members:
.. autoclass:: FeatureTable(collections.abc.Sized)
:members:
......@@ -5,8 +5,8 @@ ORF Extraction
.. automodule:: gecco.orf
.. autoclass:: ORFFinder
.. autoclass:: ORFFinder(object)
:members:
.. autoclass:: PyrodigalFinder
.. autoclass:: PyrodigalFinder(ORFFinder)
:members:
......@@ -5,5 +5,5 @@ BGC Extraction
.. automodule:: gecco.refine
.. autoclass:: ClusterRefiner
.. autoclass:: ClusterRefiner(object)
:members:
......@@ -4,5 +4,9 @@ Type Prediction
.. currentmodule:: gecco.types
.. automodule:: gecco.types
.. autoclass:: TypeClassifier
.. autoclass:: TypeBinarizer(sklearn.preprocessing.MultiLabelBinarizer)
:members:
.. autoclass:: TypeClassifier(object)
:members:
......@@ -44,7 +44,7 @@ author = gecco.__author__
# The parsed semantic version
semver = semantic_version.Version.coerce(gecco.__version__)
# The short X.Y.Z version
version = "{v.major}.{v.minor}.{v.patch}".format(v=semver)
version = "v{v.major}.{v.minor}.{v.patch}".format(v=semver)
# The full version, including alpha/beta/rc tags
release = str(semver)
......@@ -122,8 +122,7 @@ html_theme_options = {
"navbar_pagenav": False,
# A list of tuples containing pages or urls to link to.
"navbar_links": [
("GitHub", _parser.get("metadata", "home-page").strip(), True),
("CI", project_urls["Builds"])
("Repository", _parser.get("metadata", "home-page").strip(), True),
],
# + [
# (k, v, True)
......@@ -165,6 +164,9 @@ html_logo = os.path.join("_static", "img", "gecco.png")
# Windows-style icon file (.ico), which is 16x16 or 32x32 pixels large.
html_favicon = os.path.join("_static", "img", "gecco.ico")
# Hide the `source` button in the navbar.
html_show_sourcelink = False
# -- Options for imgmath extension -------------------------------------------
imgmath_image_format = "svg"
......@@ -196,6 +198,7 @@ intersphinx_mapping = {
"numpy": ("https://docs.scipy.org/doc/numpy/", None),
"statsmodels": ("https://tedboy.github.io/statsmodels_doc/", None),
"biopython": ("https://biopython.org/docs/1.77/api/", None),
"pyhmmer": ("https://pyhmmer.readthedocs.io/en/latest/", None),
}
# -- Options for todo extension ----------------------------------------------
......
......@@ -43,7 +43,12 @@ from .cv import LeaveOneGroupOut
from .select import fisher_significance
__all__ = ["ClusterCRF"]
class ClusterCRF(object):
"""A wrapper for `sklearn_crfsuite.CRF` to work with the GECCO data model.
"""
@classmethod
def trained(cls, model_path: Optional[str] = None) -> "ClusterCRF":
......
"""Compatibility wrapper for HMMER binaries and output.
"""
import abc
import collections
import configparser
import contextlib
......@@ -14,6 +15,7 @@ import typing
from typing import Callable, Dict, Optional, Iterable, Iterator, List, Mapping, Type, Sequence
import pkg_resources
import pyhmmer
from Bio import SeqIO
from .._meta import requires
......@@ -27,6 +29,9 @@ if typing.TYPE_CHECKING:
_T = typing.TypeVar("_T", bound="DomainRow")
__all__ = ["DomainRow", "HMM", "HMMER", "PyHMMER", "embedded_hmms"]
class DomainRow(typing.NamedTuple):
"""A single row in a domain table created by ``hmmsearch``.
......@@ -100,12 +105,10 @@ class HMM(typing.NamedTuple):
return regex.sub(after, domain)
class HMMER(BinaryRunner):
"""A wrapper for HMMER that scans a HMM library against protein sequences.
class DomainAnnotator(metaclass=abc.ABCMeta):
"""An abstract class for annotating genes with protein domains.
"""
BINARY = "hmmsearch"
def __init__(self, hmm: HMM, cpus: Optional[int] = None) -> None:
"""Prepare a new HMMER annotation handler with the given ``hmm``.
......@@ -119,14 +122,29 @@ class HMMER(BinaryRunner):
self.hmm = hmm
self.cpus = cpus
@abc.abstractmethod
def run(self, genes: Iterable[Gene]) -> List[Gene]:
"""Run HMMER on proteins of ``genes`` and update them with domains.
"""Run annotation on proteins of ``genes`` and update their domains.
Arguments:
genes (iterable of `~gecco.model.Gene`): An iterable that yields
genes to annotate with ``self.hmm``.
"""
return NotImplemented
class HMMER(DomainAnnotator):
"""A wrapper for HMMER that uses the ``hmmsearch`` binary.
"""
BINARY = "hmmsearch"
def __init__(self, hmm: HMM, cpus: Optional[int] = None) -> None:
DomainAnnotator.__init__(self, hmm, cpus)
BinaryRunner.__init__(self)
def run(self, genes: Iterable[Gene]) -> List[Gene]:
# collect genes and build an index of genes by protein id
gene_index = collections.OrderedDict([(gene.id, gene) for gene in genes])
......@@ -176,13 +194,10 @@ class HMMER(BinaryRunner):
return list(gene_index.values())
class PyHMMER(object):
def __init__(self, hmm: HMM, cpus: Optional[int] = None) -> None:
self.hmm = hmm
self.cpus = cpus
class PyHMMER(DomainAnnotator):
"""A domain annotator that uses `pyhmmer.hmmer.hmmsearch`.
"""
@requires("pyhmmer")
def run(self, genes: Iterable[Gene], callback: Optional[Callable[..., None]] = None) -> List[Gene]:
# collect genes and build an index of genes by protein id
gene_index = collections.OrderedDict([(gene.id, gene) for gene in genes])
......@@ -199,7 +214,8 @@ class PyHMMER(object):
# Run the search in-process with `pyhmmer.hmmsearch` (no `hmmsearch` subprocess is spawned here)
with pyhmmer.plan7.HMMFile(self.hmm.path) as hmm_file:
hmms_hits = pyhmmer.hmmsearch(hmm_file, esl_sqs, cpus=self.cpus, callback=callback)
cpus = 0 if self.cpus is None else self.cpus
hmms_hits = pyhmmer.hmmsearch(hmm_file, esl_sqs, cpus=cpus, callback=callback)
# Load InterPro metadata for the annotation
interpro = InterPro.load()
......@@ -232,7 +248,6 @@ class PyHMMER(object):
return list(gene_index.values())
def embedded_hmms() -> Iterator[HMM]:
"""Iterate over the embedded HMMs that are shipped with GECCO.
"""
......
......@@ -9,6 +9,9 @@ from typing import Dict, List, Optional
import pkg_resources
__all__ = ["InterProEntry", "InterPro"]
@dataclass
class InterProEntry:
"""A single domain entry in the InterPro database.
......
......@@ -25,6 +25,18 @@ from ._base import Dumpable
from ._meta import requires
__all__ = [
"ProductType",
"Strand",
"Domain",
"Protein",
"Gene",
"Cluster",
"FeatureTable",
"ClusterTable"
]
# fmt: off
class ProductType(enum.IntFlag):
"""A flag to declare the type of product synthesized by a gene cluster.
......@@ -207,7 +219,7 @@ class Gene:
"""
# NB(@althonos): we use inclusive 1-based ranges in the data model
# but Biopython expects 0-based ranges with exclusive ends
loc = FeatureLocation(start=self.start-1, end=self.end, strand=int(self.strand))
loc = FeatureLocation(start=self.start, end=self.end+1, strand=int(self.strand))
qualifiers = dict(self.qualifiers)
qualifiers.setdefault("locus_tag", self.protein.id)
qualifiers.setdefault("translation", str(self.protein.seq))
......@@ -360,6 +372,7 @@ class Cluster:
for gene in self.genes:
# write gene as a /cds GenBank record
cds = gene.to_seq_feature()
cds.location += -self.start
bgc.features.append(cds)
# write domains as /misc_feature annotations
for domain in gene.protein.domains:
......
......@@ -24,6 +24,9 @@ if typing.TYPE_CHECKING:
from Bio.SeqRecord import SeqRecord
__all__ = ["ORFFinder", "PyrodigalFinder"]
class ORFFinder(metaclass=abc.ABCMeta):
"""An abstract base class to provide a generic ORF finder.
"""
......
......@@ -14,6 +14,9 @@ from Bio.SeqRecord import SeqRecord
from .model import Cluster, Domain, Gene, Protein, Strand
__all__ = ["BIO_PFAMS", "GeneGrouper", "ClusterRefiner"]
# fmt: off
# `set` of `str`: A set of domains from Pfam considered 'biosynthetic' by AntiSMASH.
BIO_PFAMS = frozenset({
......
......@@ -17,7 +17,12 @@ import sklearn.preprocessing
from ..model import ProductType, Cluster
__all__ = ["TypeBinarizer", "TypeClassifier"]
class TypeBinarizer(sklearn.preprocessing.MultiLabelBinarizer):
"""A `MultiLabelBinarizer` working with `ProductType` instances.
"""
def __init__(self):
self.classes_ = sorted(x for x in ProductType.__members__.values() if x)
......@@ -39,6 +44,8 @@ class TypeBinarizer(sklearn.preprocessing.MultiLabelBinarizer):
class TypeClassifier(object):
"""A wrapper to predict the type of a `~gecco.model.Cluster`.
"""
@classmethod
def trained(cls, model_path: Optional[str] = None) -> "TypeClassifier":
......
......@@ -47,7 +47,7 @@ install_requires =
docopt ~=0.6.2
numpy ~=1.16
pyhmmer ~=0.2.1
pyrodigal ~=0.4.1
pyrodigal ~=0.4.2
scikit-learn ~=0.24.0
scipy ~=1.4
sklearn-crfsuite ~=0.3.6
......