......@@ -109,6 +109,3 @@ dmypy.json
# End of https://www.gitignore.io/api/python
......@@ -5,7 +5,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.2.1...master
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.2.2...master
## [v0.2.2] - 2020-07-31
[v0.2.2]: https://git.embl.de/grp-zeller/GECCO/compare/v0.2.1...v0.2.2
### Changed
- `Domain` and `Gene` can now carry qualifiers that are used when they
are translated to a sequence feature.
### Added
- InterPro names, accessions, and HMMER e-value for each annotated domain
in GenBank output files.
## [v0.2.1] - 2020-07-23
[v0.2.1]: https://git.embl.de/grp-zeller/GECCO/compare/v0.2.0...v0.2.1
import contextlib
import gzip
import re
import os
......@@ -6,19 +7,31 @@ import io
import tqdm
from gecco.crf import ClusterCRF
from gecco.hmmer import embedded_hmms
from gecco.interpro import InterPro
# Create the artifact folder
os.makedirs(os.path.join("ci", "artifacts"), exist_ok=True)
# Load InterPro to know how many entries we have to process
interpro = InterPro.load()
# Load the internal CRF model and compile a regex that matches the domains
# known to the CRF (i.e. useful domains for us to annotate with)
crf = ClusterCRF.trained()
rx = re.compile("|".join(crf.model.attributes_).encode("utf-8"))
# Filter the hmms
for hmm in embedded_hmms():
input_ = os.path.join("ci", "artifacts", "{}.hmm.gz".format(hmm.id))
with gzip.open(hmm.path, "rb") as src, gzip.open(input_, "wb") as dst:
out = os.path.join("ci", "artifacts", "{}.hmm.gz".format(hmm.id))
size = sum(1 for e in interpro.entries if e.source_database.upper().startswith(mm.id.upper()))
pbar = tqdm.tqdm(desc=hmm.id, total=size, unit="B", unit_scale=True, unit_divisor=1024)
with contextlib.ExitStack() as ctx:
pbar = ctx.enter_context(pbar)
src = ctx.enter_context(gzip.open(hmm.path, "rb"))
dst = ctx.enter_context(gzip.open(out, "wb"))
blocklines = []
for line in src:
......@@ -26,3 +39,4 @@ for hmm in embedded_hmms():
if any(rx.search(line) is not None for line in blocklines):
......@@ -13,7 +13,7 @@ if [ ! -x "$(command -v hmmsearch)" ]; then
log Installing Python dependencies with pip
pip install -U coverage
pip install -U coverage tqdm
# --- Install data dependencies ----------------------------------------------
......@@ -20,11 +20,10 @@ from Bio import SeqIO
from ._base import Command
from .._utils import guess_sequences_format
from ...crf import ClusterCRF
from ...hmmer import HMMER, embedded_hmms
from ...hmmer import HMMER, HMM, embedded_hmms
from ...knn import ClusterKNN
from ...orf import PyrodigalFinder
from ...refine import ClusterRefiner
from ...model import Hmm
class Run(Command): # noqa: D101
......@@ -129,7 +128,7 @@ class Run(Command): # noqa: D101
self.logger.info("Running domain annotation")
# Run all HMMs over ORFs to annotate with protein domains
def annotate(hmm: Hmm) -> "pandas.DataFrame":
def annotate(hmm: HMM) -> "pandas.DataFrame":
"Starting annotation with HMM {} v{}", hmm.id, hmm.version
......@@ -211,7 +210,7 @@ class Run(Command): # noqa: D101
for cluster in clusters:
gbk_out = os.path.join(out_dir, f"{cluster.id}.gbk")
self.logger.debug("Writing cluster {} to {!r}", cluster.id, gbk_out)
SeqIO.write(cluster.to_record(), gbk_out, "genbank")
SeqIO.write(cluster.to_seq_record(), gbk_out, "genbank")
# Exit gracefully
self.logger.info("Successfully found {} clusters!", len(clusters))
......@@ -7,6 +7,7 @@ import csv
import errno
import glob
import os
import re
import subprocess
import tempfile
import typing
......@@ -17,7 +18,8 @@ import pkg_resources
from Bio import SeqIO
from .._base import BinaryRunner
from ..model import Gene, Domain, Hmm
from ..model import Gene, Domain
from ..interpro import InterPro
if typing.TYPE_CHECKING:
from Bio.SeqRecord import SeqRecord
......@@ -74,13 +76,36 @@ class DomainRow(typing.NamedTuple):
return cls(**values)
class HMM(typing.NamedTuple):
"""A Hidden Markov Model library to use with `~gecco.hmmer.HMMER`.
id: str
version: str
url: str
path: str
relabel_with: Optional[str] = None
def relabel(self, domain: str) -> str:
"""Rename a domain obtained by this HMM to the right accession.
This method can be used with HMM libraries that have separate HMMs
for the same domain, such as Pfam.
if self.relabel_with is None:
return domain
before, after = re.match("^s/(.*)/(.*)/$", self.relabel_with).groups() # type: ignore
regex = re.compile(before)
return regex.sub(after, domain)
class HMMER(BinaryRunner):
"""A wrapper for HMMER that scans a HMM library against protein sequences.
BINARY = "hmmsearch"
def __init__(self, hmm: "Hmm", cpus: Optional[int] = None) -> None:
def __init__(self, hmm: HMM, cpus: Optional[int] = None) -> None:
"""Prepare a new HMMER annotation handler with the given ``hmms``.
......@@ -109,7 +134,7 @@ class HMMER(BinaryRunner):
doms_tmp = tempfile.NamedTemporaryFile(prefix="hmmer", suffix=".dom", mode="rt")
# write protein sequences
protein_records = (g.protein.to_record() for g in genes)
protein_records = (g.protein.to_seq_record() for g in genes)
SeqIO.write(protein_records, seqs_tmp.name, "fasta")
# Prepare the command line arguments
......@@ -121,24 +146,38 @@ class HMMER(BinaryRunner):
subprocess.run(cmd, stdout=subprocess.DEVNULL).check_returncode()
# Load InterPro metadata for the annotation
interpro = InterPro.load()
# Read the domain table
lines = filter(lambda line: not line.startswith("#"), doms_tmp)
rows = map(DomainRow.from_line, lines)
# update protein domains
for row in rows:
gene = gene_index[row.target_name]
name = self.hmm.relabel(row.query_accession or row.query_name)
domain = Domain(name, row.env_from, row.env_to, self.hmm.id, row.i_evalue)
# extract domain from the domain table row
accession = self.hmm.relabel(row.query_accession or row.query_name)
domain = Domain(accession, row.env_from, row.env_to, self.hmm.id, row.i_evalue)
# add additional qualifiers with available metadata
domain.qualifiers["inference"] = ["protein motif"]
domain.qualifiers["note"] = ["e-value: {}".format(row.i_evalue)]
domain.qualifiers["db_xref"] = ["{}:{}".format(self.hmm.id.upper(), accession)]
# add additional qualifiers using the InterPro entry list
entry = interpro.by_accession[accession]
domain.qualifiers["function"] = [interpro.by_accession[accession].name]
if entry.integrated is not None:
# add the domain to the protein domains of the right gene
# return the updated list of genes that was given in argument
return genes
def embedded_hmms() -> Iterator[Hmm]:
def embedded_hmms() -> Iterator[HMM]:
"""Iterate over the embedded HMMs that are shipped with GECCO.
for ini in glob.glob(pkg_resources.resource_filename(__name__, "*.ini")):
cfg = configparser.ConfigParser()
yield Hmm(path=ini.replace(".ini", ".hmm.gz"), **dict(cfg.items("hmm")))
yield HMM(path=ini.replace(".ini", ".hmm.gz"), **dict(cfg.items("hmm")))
"""Simple data classes to expose embedded InterPro data.
import gzip
import json
from dataclasses import dataclass
from typing import Dict, List, Optional
import pkg_resources
class InterProEntry:
accession: str
go_terms: Dict[str, Dict[str, str]]
integrated: Optional[str]
member_databases: Dict[str, Dict[str, str]]
name: str
source_database: str
type: str
class InterPro:
entries: List[InterProEntry]
def __init__(self, entries: List[InterProEntry]):
self.entries = entries
self.by_accession = { entry.accession:entry for entry in entries }
def load(cls):
with pkg_resources.resource_stream(__name__, "interpro.json.gz") as f:
data = json.load(gzip.open(f, mode="rt"))
entries = [ InterProEntry(**entry["metadata"]) for entry in data ]
return cls(entries)
......@@ -5,7 +5,7 @@ import csv
import enum
import re
import typing
from typing import Iterable, List, Optional, Sequence
from typing import Dict, Iterable, List, Optional, Sequence
from dataclasses import dataclass
import Bio.Alphabet
......@@ -18,29 +18,6 @@ from Bio.SeqRecord import SeqRecord
from . import __version__
class Hmm(typing.NamedTuple):
"""A Hidden Markov Model library to use with `~gecco.hmmer.HMMER`.
id: str
version: str
url: str
path: str
relabel_with: Optional[str] = None
def relabel(self, domain: str) -> str:
"""Rename a domain obtained by this HMM to the right accession.
This method can be used with HMM libraries that have separate HMMs
for the same domain, such as Pfam.
if self.relabel_with is None:
return domain
before, after = re.match("^s/(.*)/(.*)/$", self.relabel_with).groups() # type: ignore
regex = re.compile(before)
return regex.sub(after, domain)
class Strand(enum.IntEnum):
"""A flag to declare on which DNA strand a gene is located.
......@@ -71,6 +48,9 @@ class Domain:
that measures how reliable the domain annotation is.
probability (`float`, optional): The probability that this domain
is part of a BGC, or `None` if no prediction has been made yet.
qualifiers (`dict`, optional): A dictionary of feature qualifiers that
is added to the `~Bio.SeqFeature.SeqFeature` built from this
......@@ -78,8 +58,36 @@ class Domain:
start: int
end: int
hmm: str
i_evalue: float = 0.0
probability: Optional[float] = None
i_evalue: float
probability: Optional[float]
qualifiers: Dict[str, object]
def __init__(
self, name: str, start: int, end: int, hmm: str, i_evalue: float = 0.0,
probability: Optional[float] = None, qualifiers: Optional[Dict[str, object]] = None
): # noqa: D107
self.name = name
self.start = start
self.end = end
self.hmm = hmm
self.i_evalue = i_evalue
self.probability = probability
self.qualifiers = qualifiers or dict()
def to_seq_feature(self, protein_coordinates=False) -> SeqFeature:
"""Convert the domain to a single feature.
protein_coordinates (`bool`): Set to `True` for the feature
coordinates to be given in amino-acids, or to `False` in
stride = 1 if protein_coordinates else 3
loc = FeatureLocation(0, (self.end - self.start)*stride)
qualifiers = self.qualifiers.copy()
qualifiers.setdefault("standard_name", self.name)
return SeqFeature(location=loc, type="misc_feature", qualifiers=qualifiers)
......@@ -105,7 +113,7 @@ class Protein:
self.seq = seq
self.domains = domains or list()
def to_record(self) -> SeqRecord:
def to_seq_record(self) -> SeqRecord:
"""Convert the protein to a single record.
# FIXME: add domains
......@@ -125,6 +133,9 @@ class Gene:
the source sequence.
protein (`~gecco.model.Protein`): The protein translated from this
qualifiers (`dict`, optional): A dictionary of feature qualifiers that
is added to the `~Bio.SeqFeature.SeqFeature` built from this
......@@ -133,6 +144,15 @@ class Gene:
end: int
strand: Strand
protein: Protein
qualifiers: Dict[str, object]
def __init__(self, source: SeqRecord, start: int, end: int, strand: Strand, protein: Protein, qualifiers: Optional[Dict[str, object]] = None): # noqa: D107
self.source = source
self.start = start
self.end = end
self.strand = strand
self.protein = protein
self.qualifiers = qualifiers or dict()
def id(self) -> str:
......@@ -154,6 +174,8 @@ class Gene:
p = [d.probability for d in self.protein.domains if d.probability is not None]
return max(p) if p else None
# ---
def to_feature_table(self) -> pandas.DataFrame:
"""Convert this gene to a feature table listing domain annotations.
......@@ -199,6 +221,18 @@ class Gene:
def to_seq_feature(self) -> SeqFeature:
"""Convert the gene to a single feature.
# NB(@althonos): we use inclusive 1-based ranges in the data model
# but Biopython expects 0-based ranges with exclusive ends
end = self.end - self.start + 1
loc = FeatureLocation(start=0, end=end, strand=int(self.strand))
qualifiers = self.qualifiers.copy()
qualifiers.setdefault("locus_tag", self.protein.id)
qualifiers.setdefault("translation", str(self.protein.seq))
return SeqFeature(location=loc, type="cds", qualifiers=qualifiers)
class Cluster:
......@@ -294,7 +328,7 @@ class Cluster:
# ---
def to_record(self) -> SeqRecord:
def to_seq_record(self) -> SeqRecord:
"""Convert the cluster to a single record.
Annotations of the source sequence are kept intact if they don't
......@@ -303,7 +337,9 @@ class Cluster:
bgc = self.source[self.start : self.end]
# NB(@althonos): we use inclusive 1-based ranges in the data model
# but slicing expects 0-based ranges with exclusive ends
bgc = self.source[self.start - 1 : self.end]
bgc.id = bgc.name = self.id
bgc.seq.alphabet = Bio.Alphabet.generic_dna
......@@ -315,28 +351,17 @@ class Cluster:
# add proteins as CDS features
for gene in self.genes:
# translate gene location
start = gene.start - self.start - 1
end = gene.end - self.start
loc = FeatureLocation(start=start, end=end, strand=int(gene.strand))
# write protein as a /cds GenBank record
cds = SeqFeature(location=loc, type="cds")
cds.qualifiers["label"] = gene.id
cds.qualifiers["inference"] = [
"EXISTENCE:ab initio prediction:PRODIGAL:2.6"
cds.qualifiers["translation"] = str(gene.protein.seq)
# write gene as a /cds GenBank record
cds = gene.to_seq_feature()
cds.location += gene.start - self.start
# write domains as /misc_feature annotations
# # write domains as /misc_feature annotations
for domain in gene.protein.domains:
dom_start = start + domain.start * 3
dom_end = start + domain.end * 3
loc = FeatureLocation(dom_start, dom_end, strand=int(gene.strand))
misc = SeqFeature(location=loc, type="misc_feature")
misc.qualifiers["label"] = domain.name
misc = domain.to_seq_feature(protein_coordinates=False)
misc.location += cds.location.start
# return the complete BGC
return bgc
def to_cluster_table(self) -> pandas.DataFrame:
......@@ -74,4 +74,8 @@ class PyrodigalFinder(ORFFinder):
end=max(orf.begin, orf.end),
strand=Strand.Coding if orf.strand == 1 else Strand.Reverse,
"inference": ["ab initio prediction:Prodigal:2.6"],
"transl_table": orf.translation_table,
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import configparser
import csv
import glob
import gzip
import hashlib
import io
import json
import math
import os
import re
import shutil
import ssl
import sys
import tarfile
import urllib.request
from functools import partial
from xml.etree import ElementTree as etree
import setuptools
from setuptools.command.build_py import build_py as _build_py
from setuptools.command.sdist import sdist as _sdist
from tqdm import tqdm
except ImportError:
tqdm = None
from tqdm import tqdm
class ResponseProgressBar(object):
......@@ -30,24 +29,17 @@ class ResponseProgressBar(object):
def __init__(self, inner, tqdm=tqdm, **kwargs):
self.inner = inner
self.total = total = int(inner.headers['Content-Length'])
if tqdm is not None:
self.pbar = tqdm(
self.update = self.pbar.update
self.refresh = self.pbar.refresh
self.pbar = None
self.current = 0
self.next_p = 5
self.desc = kwargs.get("desc")
self.pbar = tqdm(
self.update = self.pbar.update
self.refresh = self.pbar.refresh
def __enter__(self):
return self
......@@ -57,8 +49,6 @@ class ResponseProgressBar(object):
self.inner.__exit__(exc_type, exc_value, traceback)
if self.pbar is not None:
self.pbar.__exit__(exc_type, exc_value, traceback)
return False
def read(self, n=None):
......@@ -66,26 +56,6 @@ class ResponseProgressBar(object):
return chunk
def update(self, n):
self.current = c = self.current + n
p = int(float(c) * 100.0 / self.total)
if p >= self.next_p:
self.next_p = p + 5
"{} : {:>{}}B / {}B [{: 3}%]".format(
def refresh(self):
class sdist(_sdist):
"""An extension to the `sdist` command that generates a `pyproject.toml`.
......@@ -151,28 +121,52 @@ class update_model(setuptools.Command):
class build_py(_build_py):
def _flush_progress_bar(pbar):
if not sys.stdout.isatty():
"""A hacked `build_py` command to download data before wheel creation.
def run(self):
def build_interpro_list(self):
path = os.path.join(self.build_lib, "gecco", "interpro", "interpro.json.gz")
if os.path.exists(path):
self.announce("getting Pfam entries from InterPro", level=2)
entries = self.download_interpro_entries("pfam")
self.announce("getting Tigrfam entries from InterPro", level=2)
with gzip.open(path, "wt") as dest:
json.dump(entries, dest)
def download_interpro_entries(self, db):
next = "https://www.ebi.ac.uk:443/interpro/api/entry/all/{}/?page_size=200".format(db)
entries = []
context = ssl._create_unverified_context()
with tqdm(desc=db, leave=False) as pbar:
while next:
with urllib.request.urlopen(next, context=context) as res:
payload = json.load(res)
pbar.total = payload["count"]
next = payload["next"]
return entries
def download_hmms(self):
for in_ in glob.glob(os.path.join("gecco", "hmmer", "*.ini")):
cfg = configparser.ConfigParser()
out = os.path.join(self.build_lib, in_.replace('.ini', '.hmm.gz'))
self.make_file([in_], out, self.download, [out, dict(cfg.items('hmm'))])
self.make_file([in_], out, self.download_hmm, [out, dict(cfg.items('hmm'))])
if os.path.exists(out):
def download(self, output, options):
def download_hmm(self, output, options):
base = "https://github.com/althonos/GECCO/releases/download/v{version}/{id}.hmm.gz"
url = base.format(id=options["id"], version=self.distribution.get_version())
# attempt to use the GitHub releases URL, otherwise fallback to official URL