......@@ -5,7 +5,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.0...master
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.1...master
## [v0.8.1] - 2021-07-29
[v0.8.1]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.0...v0.8.1
### Changed
- `gecco run` now filters out unneeded features before annotating, making it easier to analyze the results of a run with a custom `--model`.
### Fixed
- `gecco` reporting about using Pfam `v33.1` while actually using `v34.0` because of an outdated field in `gecco/hmmer/Pfam.ini`.
### Added
- Missing documentation for the `strand` attribute of `gecco.model.Gene`.
## [v0.8.0] - 2021-07-03
[v0.8.0]: https://git.embl.de/grp-zeller/GECCO/compare/v0.7.0...v0.8.0
......
......@@ -10,4 +10,4 @@ See Also:
__author__ = "Martin Larralde"
__license__ = "GPLv3"
__version__ = "0.8.0"
__version__ = "0.8.1"
......@@ -123,6 +123,17 @@ class OrderedPoolWrapper:
return list(map(operator.itemgetter(1), results))
class UniversalContainer(object):
"""A container that contains everything.
"""
def __repr__(self):
return f"{self.__class__.__name__}()"
def __contains__(self, item: object) -> bool:
return True
@contextlib.contextmanager
def patch_locale(name: str):
"""Create a context manager to locally change the locale in use.
......
......@@ -142,7 +142,7 @@ class Annotate(Command): # noqa: D101
sequences = list(SeqIO.parse(io.TextIOWrapper(f), format))
except FileNotFoundError as err:
self.error("Could not find input file:", repr(self.genome))
raise CommandExit(e.errno) from err
raise CommandExit(err.errno) from err
except ValueError as err:
self.error("Failed to load sequences:", err)
raise CommandExit(getattr(err, "errno", 1)) from err
......@@ -165,7 +165,7 @@ class Annotate(Command): # noqa: D101
return list(orf_finder.find_genes(sequences, progress=callback))
def _annotate_domains(self, genes):
def _annotate_domains(self, genes, whitelist=None):
from ...hmmer import PyHMMER, embedded_hmms
self.info("Running", "HMMER domain annotation", level=1)
......@@ -177,7 +177,7 @@ class Annotate(Command): # noqa: D101
task = self.progress.add_task(description=f"{hmm.id} v{hmm.version}", total=hmm.size, unit="domains", precision="")
callback = lambda h, t: self.progress.update(task, advance=1)
self.info("Starting", f"annotation with [bold blue]{hmm.id} v{hmm.version}[/]", level=2)
features = PyHMMER(hmm, self.jobs).run(genes, progress=callback)
features = PyHMMER(hmm, self.jobs, whitelist).run(genes, progress=callback)
self.success("Finished", f"annotation with [bold blue]{hmm.id} v{hmm.version}[/]", level=2)
self.progress.update(task_id=task, visible=False)
......
......@@ -25,6 +25,11 @@ from .annotate import Annotate
from .._utils import patch_showwarnings
from ...model import ProductType
try:
import importlib.resources as importlib_resources
except ImportError:
import importlib_resources
class Run(Annotate): # noqa: D101
......@@ -136,6 +141,27 @@ class Run(Annotate): # noqa: D101
self.warn("Output folder contains files that will be overwritten")
break
def _load_model_domains(self) -> typing.Set[str]:
if self.model is None:
self.info("Loading", "features from internal model", level=2)
resource_context = importlib_resources.path("gecco.types", "domains.tsv")
domains_file = resource_context.__enter__()
else:
self.info("Loading", "domain whitelist from", repr(self.model), level=2)
resource_context = contextlib.nullcontext()
domains_file = os.path.join(self.model, "domains.tsv")
try:
with open(domains_file) as f:
domains = set(filter(None, map(str.strip, f)))
except FileNotFoundError as err:
self.error("Could not find model domains:", repr(domains_file))
raise CommandExit(e.errno) from err
else:
self.success("Found", len(domains), "selected features", level=2)
return domains
finally:
resource_context.__exit__(None, None, None)
def _predict_probabilities(self, genes):
from ...crf import ClusterCRF
......@@ -288,8 +314,11 @@ class Run(Annotate): # noqa: D101
else:
self.warn("No genes were found")
return 0
# use a whitelist for domain annotation, so that we only annotate
# with features that are useful for the CRF
whitelist = self._load_model_domains()
# annotate domains and predict probabilities
genes = self._annotate_domains(genes)
genes = self._annotate_domains(genes, whitelist=whitelist)
genes = self._predict_probabilities(genes)
self._write_feature_table(genes)
# extract clusters from probability vector
......
[hmm]
id = Pfam
version = 33.1
version = 34.0
size = 2683
url = ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam34.0/Pfam-A.hmm.gz
relabel_with = s/(PF\d+).\d+/\1/
......
......@@ -15,13 +15,13 @@ import re
import subprocess
import tempfile
import typing
from typing import Callable, Dict, Optional, Iterable, Iterator, List, Mapping, Type, Sequence
from typing import Callable, Container, Dict, Optional, Iterable, Iterator, List, Mapping, Type, Sequence
import pyhmmer
from Bio import SeqIO
from pyhmmer.hmmer import hmmsearch
from .._meta import requires
from .._meta import requires, UniversalContainer
from ..model import Gene, Domain
from ..interpro import InterPro
......@@ -62,18 +62,22 @@ class DomainAnnotator(metaclass=abc.ABCMeta):
"""An abstract class for annotating genes with protein domains.
"""
def __init__(self, hmm: HMM, cpus: Optional[int] = None) -> None:
def __init__(self, hmm: HMM, cpus: Optional[int] = None, whitelist: Optional[Container[str]] = None) -> None:
"""Prepare a new HMMER annotation handler with the given ``hmms``.
Arguments:
hmm (str): The path to the file containing the HMMs.
cpus (int, optional): The number of CPUs to allocate for the
``hmmsearch`` command. Give ``None`` to use the default.
whitelist (container of str): If given, a container containing
the accessions of the individual HMMs to annotate with. If
`None` is given, annotate with the entire file.
"""
super().__init__()
self.hmm = hmm
self.cpus = cpus
self.whitelist = UniversalContainer() if whitelist is None else whitelist
@abc.abstractmethod
def run(self, genes: Iterable[Gene]) -> List[Gene]:
......@@ -114,11 +118,17 @@ class PyHMMER(DomainAnnotator):
file = ctx.enter_context(open(self.hmm.path, "rb"))
if self.hmm.path.endswith(".gz"):
file = ctx.enter_context(gzip.GzipFile(fileobj=file))
# Run search pipeline using the HMM
# Only retain the HMMs which are in the whitelist
hmm_file = ctx.enter_context(pyhmmer.plan7.HMMFile(file))
profiles = (
hmm
for hmm in hmm_file
if self.hmm.relabel(hmm.accession.decode()) in self.whitelist
)
# Run search pipeline using the filtered HMMs
cpus = 0 if self.cpus is None else self.cpus
hmms_hits = hmmsearch(
hmm_file,
profiles,
esl_sqs,
cpus=cpus,
callback=progress,
......
......@@ -191,6 +191,7 @@ class Gene:
the source sequence, independent of the strandedness.
end (`int`): The index of the rightmost nucleotide of the gene within
the source sequence.
strand (`~gecco.model.Strand`): The strand where the gene is located.
protein (`~gecco.model.Protein`): The protein translated from this
gene.
qualifiers (`dict`, optional): A dictionary of feature qualifiers that
......@@ -558,7 +559,7 @@ class FeatureTable(Dumpable, Sized):
frame[column] = getattr(self, column)
return frame
def __iadd__(self, rhs: "FeatureTable") -> "FeatureTable":
def __iadd__(self, rhs: "FeatureTable") -> "FeatureTable": # noqa: D105
if not isinstance(rhs, FeatureTable):
return NotImplemented
for col in self.__annotations__:
......@@ -722,7 +723,7 @@ class ClusterTable(Dumpable, Sized):
table.domains.append(sorted(domains))
return table
def __iadd__(self, rhs: "ClusterTable") -> "ClusterTable":
def __iadd__(self, rhs: "ClusterTable") -> "ClusterTable": # noqa: D105
if not isinstance(rhs, FeatureTable):
return NotImplemented
for col in self.__annotations__:
......
This diff is collapsed.
>sp|P20108|PRDX3_MOUSE Thioredoxin-dependent peroxide reductase, mitochondrial OS=Mus musculus OX=10090 GN=Prdx3 PE=1 SV=1
MAAAAGRLLWSSVARHASAISRSISASTVLRPVASRRTCLTDILWSASAQGKSAFSTSSS
FHTPAVTQHAPYFKGTAVVNGEFKELSLDDFKGKYLVLFFYPLDFTFVCPTEIVAFSDKA
NEFHDVNCEVVAVSVDSHFSHLAWINTPRKNGGLGHMNITLLSDITKQISRDYGVLLESA
GIALRGLFIIDPNGVVKHLSVNDLPVGRSVEETLRLVKAFQFVETHGEVCPANWTPESPT
IKPSPTASKEYFEKVHQ
>sp|Q52658|SCA4_RICCN Antigenic heat-stable 120 kDa protein OS=Rickettsia conorii (strain ATCC VR-613 / Malish 7) OX=272944 GN=sca4 PE=4 SV=1
MSKDGNLDTSEFDPLANKEYTEEQKQTLEQEQKEFLSQTTTPALEADDGFIVTSASFAQS
TPSMSALSGNISPDSQTSDPITKAVRETIIQPQKDNLIEQILKDLAALTDRDLAEQKRKE
IEEEKEKDKTLSTFFGNPANREFIDKALENPELKKKLESIEIAGYKNVHNTFSAASGYPG
GFKPVQWENHVSANDLRATVVKNDAGDELCTLNETTVKTKPFTLAKQDGTQVQISSYREI
DFPIKLDKADGSMHLSMVALKADGTKPSKDKAVYFTAHYEEGPNGKPQLKEISSPKPLKF
AGTGDDAIAYIEHGGEIYTLAVTRGKYKEMMKEVELNQGQSVDLSQAEDIIIGQGQSKEQ
PLITPQQTTSSSVEPPQYKQQVPPITPTNQPLQPETSQMPQSQQVNPNLLNTATALSGSM
QDLLNYVNAGLTKAIDSNKQIDLIKEAATAILNNEKSDIAEKQANIIALAENTVNNKNLK
PDAKVTGVNAVLETIKNDQNTPNLEKSKMLEATVAIVLNSENLEPKQKQQMLEKAVDVGL
SLKDDASRAAAIDGIKDVVIKSNLSPEDKMLIAVGDKVNVSELSNAEKQKLLGSVLKKGV
EAQVLSPAQQQLMQQHLYKIMAEQTKKDTIKKVNDILFDPLSNTELKTTNIQAITSNVLD
GPATAEVKGEIIQAITNTIAGSSLEAQDKAAIIKGVGETIATHSDTSLSLPNKALIMASA
EKGIAESQTNLPDRELMTKGLVDGIYEGKGGPEITKAVSSGIDNSNINDSEKEALKKAKD
AASEAALDRDTQNLTEGFKGQNIEEHKPHDDIYNKAREVINAVNPVIEALEKSKEPVVSA
EERIVQETSSILNNISKLAVEKVNNFRAMLSPNGNLKTLEEKKEEAIKKVDELVKAFGTK
SSTEEQQSFIKTNLIDDKTLSKEVRLQTIDKLLQEQKRSEAIENPSVKTEDVRVVSGKSK
LKPISKDNPDIEKAKMVVGRDRVNIKGNIKIMGALMNARDIIQSENLNKSTPIKRESSPP
QR
>sp|P42656|RAD24_SCHPO Checkpoint signal transducer rad24 OS=Schizosaccharomyces pombe (strain 972 / ATCC 24843) OX=284812 GN=rad24 PE=1 SV=2
MSTTSREDAVYLAKLAEQAERYEGMVENMKSVASTDQELTVEERNLLSVAYKNVIGARRA
SWRIVSSIEQKEESKGNTAQVELIKEYRQKIEQELDTICQDILTVLEKHLIPNAASAESK
VFYYKMKGDYYRYLAEFAVGEKRQHSADQSLEGYKAASEIATAELAPTHPIRLGLALNFS
VFYYEILNSPDRACYLAKQAFDEAISELDSLSEESYKDSTLIMQLLRDNLTLWTSDAEYS
AAAAGGNTEGAQENAPSNAPEGEAEPKADA
"""Test `gecco.hmmer` module.
"""
import copy
import os
import unittest
from unittest import mock
import Bio.SeqIO
from gecco.model import Strand, Protein, Gene
from gecco.hmmer import PyHMMER, HMM
class TestPyHMMER(unittest.TestCase):
@classmethod
def setUpClass(cls):
folder = os.path.dirname(os.path.abspath(__file__))
cls.hmm = HMM(
id="Pfam",
version="vX.Y",
url="http://example.com",
path=os.path.join(folder, "data", "minipfam.hmm"),
size=10,
relabel_with=r"s/(PF\d+).\d+/\1/",
)
cls.records = list(
Bio.SeqIO.parse(os.path.join(folder, "data", "proteins.faa"),
"fasta",
))
cls.proteins = [
Protein(record.id, record.seq)
for record in cls.records
]
cls.genes = [
Gene(record, 1, len(prot.seq)*3+1, Strand.Coding, prot)
for record, prot in zip(cls.records, cls.proteins)
]
def test_whitelist(self):
# run normally, we should find 3 genes with domains
pyhmmer = PyHMMER(self.hmm, 1)
genes = pyhmmer.run(copy.deepcopy(self.genes))
self.assertEqual(sum(1 for gene in genes if gene.protein.domains), 3)
# whitelist only the first HMM, so only the first domain should be annotated
pyhmmer = PyHMMER(self.hmm, 1, whitelist={"PF10417"})
self.assertEqual(pyhmmer.whitelist, {"PF10417"})
genes = pyhmmer.run(copy.deepcopy(self.genes))
self.assertEqual(sum(1 for gene in genes if gene.protein.domains), 1)