Commits (10)
......@@ -17,9 +17,7 @@ variables:
before_script:
- ./ci/gitlab/test.before_script.sh
script:
- python setup.py bdist_wheel
- pip install -U dist/*.whl
- python -m coverage run -p -m unittest discover -vv
- ./ci/gitlab/test.script.sh
after_script:
- python -m coverage combine
- python -m coverage report
......
......@@ -5,7 +5,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.2.2...master
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.3.0...master
## [v0.3.0] - 2020-08-03
[v0.3.0]: https://git.embl.de/grp-zeller/GECCO/compare/v0.2.2...v0.3.0
### Changed
- Replaced Nearest-Neighbours classifier with Random Forest to perform type
prediction for candidate BGCs.
- `gecco.knn` module was renamed to implementation-agnostic name `gecco.types`.
### Fixed
- Extraction of domain composition taking a long time in `gecco train` command.
### Removed
- `--metric` argument to the `gecco run` CLI command.
## [v0.2.2] - 2020-07-31
[v0.2.2]: https://git.embl.de/grp-zeller/GECCO/compare/v0.2.1...v0.2.2
......
......@@ -24,8 +24,8 @@ rx = re.compile("|".join(crf.model.attributes_).encode("utf-8"))
# Filter the hmms
for hmm in embedded_hmms():
out = os.path.join("ci", "artifacts", "{}.hmm.gz".format(hmm.id))
size = sum(1 for e in interpro.entries if e.source_database.upper().startswith(mm.id.upper()))
pbar = tqdm.tqdm(desc=hmm.id, total=size, unit="B", unit_scale=True, unit_divisor=1024)
size = sum(1 for e in interpro.entries if e.source_database.upper().startswith(hmm.id.upper()))
pbar = tqdm.tqdm(desc=hmm.id, total=size)
with contextlib.ExitStack() as ctx:
pbar = ctx.enter_context(pbar)
......
......@@ -21,7 +21,7 @@ from ._base import Command
from .._utils import guess_sequences_format
from ...crf import ClusterCRF
from ...hmmer import HMMER, HMM, embedded_hmms
from ...knn import ClusterKNN
from ...types import TypeClassifier
from ...orf import PyrodigalFinder
from ...refine import ClusterRefiner
......@@ -61,12 +61,6 @@ class Run(Command): # noqa: D101
--postproc <method> the method to use for cluster validation
(antismash or gecco). [default: gecco]
Parameters - BGC Type Prediction:
-d <d>, --distance <d> the distance metric to use for kNN type
prediction. [default: jensenshannon]
-k <n>, --neighbors <n> the number of neighbors to use for
kNN type prediction [default: 5]
Parameters - Debug:
--model <model.crf> the path to an alternative CRF model
to use (obtained with `gecco train`).
......@@ -78,7 +72,6 @@ class Run(Command): # noqa: D101
return retcode
# Check value of numeric arguments
self.args["--neighbors"] = int(self.args["--neighbors"])
self.args["--cds"] = int(self.args["--cds"])
self.args["--e-filter"] = e_filter = float(self.args["--e-filter"])
if e_filter < 0 or e_filter > 1:
......@@ -192,10 +185,10 @@ class Run(Command): # noqa: D101
self.logger.warning("No gene clusters were found")
return 0
# --- KNN ------------------------------------------------------------
# --- TYPE CLASSIFICATION ---------------------------------------------
self.logger.info("Predicting BGC types")
knn = ClusterKNN.trained(self.args["--model"], metric=self.args["--distance"])
clusters = knn.predict_types(clusters)
classifier = TypeClassifier.trained(self.args["--model"])
clusters = classifier.predict_types(clusters)
# --- RESULTS --------------------------------------------------------
self.logger.info("Writing final result files to folder {!r}", out_dir)
......
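Taken together, the hunks above reduce the type-prediction step of `gecco run` to a two-line call; a minimal sketch, assuming `clusters` holds the `Cluster` objects returned by the refiner and `model_path` is the value of `--model` (or `None` to use the embedded model):

from gecco.types import TypeClassifier

classifier = TypeClassifier.trained(model_path)   # replaces ClusterKNN.trained(model_path, metric=...)
clusters = classifier.predict_types(clusters)     # fills in cluster.types / cluster.types_probabilities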
......@@ -226,10 +226,12 @@ class Train(Command): # noqa: D101
def domain_composition(table: pandas.DataFrame) -> numpy.ndarray:
is_bgc = table[self.args["--y-col"]].array == 1
names = table[self.args["--feature-cols"][0]].array[is_bgc]
unique_names = set(names)
weights = table[self.args["--weight-cols"][0]].array[is_bgc]
composition = numpy.zeros(len(all_possible))
for i, domain in enumerate(all_possible):
composition[i] = numpy.sum(weights[names == domain])
if domain in unique_names:
composition[i] = numpy.sum(weights[names == domain])
if pbar is not None:
pbar.update(1)
return composition / (composition.sum() or 1) # type: ignore
......
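The speed-up addressed by the "Fixed" changelog entry comes from skipping domains that never occur in the cluster: membership in the precomputed `set` is O(1), whereas `weights[names == domain]` scans the whole `names` array once per candidate domain (and `all_possible` presumably spans every domain known to the model). A self-contained sketch of the same pattern on hypothetical toy data:

import numpy

all_possible = ["PF00001", "PF00005", "PF13304"]   # hypothetical domain vocabulary
names = numpy.array(["PF00005", "PF00005"])        # domains observed in one BGC
weights = numpy.array([0.9, 0.7])                  # their associated weights

unique_names = set(names)                          # built once per cluster
composition = numpy.zeros(len(all_possible))
for i, domain in enumerate(all_possible):
    if domain in unique_names:                     # cheap lookup, skips the array scan
        composition[i] = numpy.sum(weights[names == domain])
composition /= composition.sum() or 1              # normalise, guarding against an all-zero row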
......@@ -166,7 +166,7 @@ class HMMER(BinaryRunner):
entry = interpro.by_accession[accession]
domain.qualifiers["function"] = [interpro.by_accession[accession].name]
if entry.integrated is not None:
domain.qualifiers["db_xref"].append("InterPro:{}".format(entry.integrated))
domain.qualifiers["db_xref"].append("InterPro:{}".format(entry.integrated)) # type: ignore
# add the domain to the protein domains of the right gene
gene_index[row.target_name].protein.domains.append(domain)
......
......@@ -29,7 +29,7 @@ class InterPro:
self.by_accession = { entry.accession:entry for entry in entries }
@classmethod
def load(cls):
def load(cls) -> "InterPro":
with pkg_resources.resource_stream(__name__, "interpro.json.gz") as f:
data = json.load(gzip.open(f, mode="rt"))
entries = [ InterProEntry(**entry["metadata"]) for entry in data ]
......
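For reference, the loader above is what both the HMM filtering script and `gecco.hmmer` rely on; a small usage sketch (the import path, database prefix, and accession are assumptions based on how the object is used elsewhere in this diff):

from gecco.interpro import InterPro   # assumed module path

interpro = InterPro.load()
# count bundled entries coming from Pfam, mirroring the filtering in the ci/ script above
n_pfam = sum(1 for e in interpro.entries if e.source_database.upper().startswith("PFAM"))
# look up a single entry by accession, as done in gecco.hmmer
entry = interpro.by_accession.get("PF00501")   # hypothetical accession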
"""Supervised classifier to predict the biosynthetic class of a cluster.
"""
import csv
import functools
import os
import typing
from typing import Callable, Dict, List, Iterable, Optional, Sequence, Tuple, Union
import numpy
import pandas
import pkg_resources
import sklearn.neighbors
import scipy.spatial.distance
from ..model import Cluster
if typing.TYPE_CHECKING:
import numpy
_S = typing.TypeVar("_S", bound=Sequence[Cluster])
_Metric = typing.Callable[["numpy.ndarray", "numpy.ndarray"], "numpy.ndarray"]
class ClusterKNN(object):
"""A kNN classifier to predict cluster types based on domain similarity.
Essentially a wrapper around `sklearn.neighbors.KNeighborsClassifier`
provided for convenience.
Attributes:
metric (distance): A distance metric used by the classifier. It takes
two probability vectors as arguments, and returns the distance
between these two vectors. By default, this is
`~scipy.spatial.distance.jensenshannon`, but another function such
as `~scipy.spatial.distance.mahalanobis` can be passed as argument
to the constructor if needed.
model (`sklearn.neighbors.KNeighborsClassifier`): The internal kNN
classifier used for the prediction.
"""
@classmethod
def trained(
cls,
model_path: Optional[str] = None,
metric: Union[str, "_Metric"] = "jensenshannon",
) -> "ClusterKNN":
"""Create a new `ClusterKNN` instance pre-trained with embedded data.
Arguments:
model_path (`str`, optional): The path to the model directory
obtained with the ``gecco train`` command. If `None` given,
use the embedded training data.
metric (`str` or `function`): The distance metric to use with the
classifier. Either given a metric name (such as
``jensenshannon``, the default) or a callable that takes
two vectors.
Returns:
`~gecco.knn.ClusterKNN`: A KNN model that can be used to perform
predictions without training first.
"""
if model_path is not None:
doms_path = os.path.join(model_path, "domains.tsv")
typs_path = os.path.join(model_path, "types.tsv")
comp_path = os.path.join(model_path, "compositions.npz")
else:
doms_path = pkg_resources.resource_filename(__name__, "domains.tsv")
typs_path = pkg_resources.resource_filename(__name__, "types.tsv")
comp_path = pkg_resources.resource_filename(__name__, "compositions.npz")
compositions = scipy.sparse.load_npz(comp_path).toarray()
domains = pandas.read_csv(doms_path, header=None, sep="\t")[0].array
types = pandas.read_csv(typs_path, header=None, sep="\t")[1].array
knn = cls(metric=metric)
knn.model.fit(compositions, y=types)
knn.model.attributes_ = domains
return knn
@classmethod
def _get_metric(cls, name: str) -> "_Metric":
if name == "jensenshannon":
return scipy.spatial.distance.jensenshannon # type: ignore
elif name == "tanimoto":
# NB(@althonos): Tanimoto distance seems to be mostly for boolean
# vectors, not probability vectors.
return lambda p, q: p * q / (p - q) ** 2 # type: ignore
else:
raise ValueError(f"unknown metric name: {name!r}")
def __init__(
self, metric: Union[str, "_Metric"] = "jensenshannon", **kwargs: object
) -> None:
"""Instantiate a new classifier.
Arguments:
metric (`str` or `function`): The distance metric to use with the
classifier. Either given a metric name (such as
``jensenshannon``, the default) or a callable that takes
two vectors.
Keyword Arguments:
Any additional keyword argument is passed as argument to the
internal `~sklearn.neighbors.KNeighborsClassifier` constructor.
Raises:
`ValueError`: When an unknown distance metric is given.
"""
if isinstance(metric, str):
self.metric = self._get_metric(metric)
else:
self.metric = metric
self.model = sklearn.neighbors.KNeighborsClassifier(
metric=self.metric, **kwargs
)
def predict_types(self, clusters: "_S") -> "_S":
"""Predict types for each of the given clusters.
"""
comps = [c.domain_composition(self.model.attributes_) for c in clusters]
probas = self.model.predict_proba(comps)
for cluster, proba in zip(typing.cast(Iterable[Cluster], clusters), probas):
imax = numpy.argmax(proba)
cluster.type = self.model.classes_[imax]
cluster.type_probability = proba[imax]
return clusters
......@@ -74,14 +74,14 @@ class Domain:
self.probability = probability
self.qualifiers = qualifiers or dict()
def to_seq_feature(self, protein_coordinates=False) -> SeqFeature:
def to_seq_feature(self, protein_coordinates: bool = False) -> SeqFeature:
"""Convert the domain to a single feature.
Arguments:
protein_coordinates (`bool`): Set to `True` for the feature
coordinates to be given in amino-acids, or to `False` in
nucleotides.
"""
stride = 1 if protein_coordinates else 3
loc = FeatureLocation(0, (self.end - self.start)*stride)
......@@ -242,29 +242,36 @@ class Cluster:
id (`str`): The identifier of the gene cluster.
genes (`list` of `~gecco.model.Gene`): A list of the genes belonging
to this gene cluster.
type (`str`): The putative type of product synthesized by this gene
cluster, according to similarity with existing clusters.
type_probability (`float`): The probability with which the BGC type
was identified.
types (`list` of `str`): The list of the putative types of product
synthesized by this gene cluster, according to similarity in
domain composition with curated clusters.
types_probabilities (`list` of `float`): The probability with which
each BGC type was identified (same dimension as the ``types``
attribute).
"""
id: str
genes: List[Gene]
type: str = "Other"
type_probability: float = 0.0
types: List[str]
types_probabilities: List[float]
def __init__(
self,
id: str,
genes: Optional[List[Gene]] = None,
type: str = "Unknown",
type_probability: float = 0.0,
types: Optional[List[str]] = None,
types_probabilities: Optional[List[float]] = None,
): # noqa: D107
self.id = id
self.genes = genes or list()
self.type = type
self.type_probability = type_probability
self.types = types or list()
self.types_probabilities = types_probabilities or list()
if len(self.types) != len(self.types_probabilities):
err = "type and type probability lists must have the same dimensions"
raise ValueError(err)
@property
def source(self) -> SeqRecord: # type: ignore
......@@ -381,8 +388,8 @@ class Cluster:
self.end,
self.average_probability,
self.maximum_probability,
self.type,
self.type_probability,
";".join(self.types),
";".join(map(str, self.types_probabilities)),
";".join([gene.id for gene in self.genes]),
";".join(
sorted(
......@@ -402,8 +409,8 @@ class Cluster:
"end",
"average_p",
"max_p",
"BGC_type",
"BGC_type_p",
"BGC_types",
"BGC_types_p",
"proteins",
"domains",
],
......
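In other words, a cluster can now carry several predicted types at once, and the TSV output joins them with ";". A minimal sketch, with made-up type names and probabilities:

from gecco.model import Cluster

cluster = Cluster(
    "BGC0001",                          # cluster identifier
    types=["Polyketide", "NRP"],        # hypothetical multi-label prediction
    types_probabilities=[0.8, 0.6],     # one probability per predicted type
)
# serialised as "Polyketide;NRP" and "0.8;0.6" in the BGC_types / BGC_types_p columns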
"""Supervised classifier to predict the biosynthetic type of a cluster.
"""
import csv
import functools
import os
import typing
from typing import Callable, Dict, List, Iterable, Optional, Sequence, Tuple, Union
import numpy
import pandas
import pkg_resources
import scipy.sparse
import sklearn.ensemble
import sklearn.preprocessing
if typing.TYPE_CHECKING:
from ..model import Cluster
_S = typing.TypeVar("_S", bound=Sequence[Cluster])
class TypeClassifier(object):
"""
"""
@classmethod
def trained(cls, model_path: Optional[str] = None) -> "TypeClassifier":
"""Create a new `TypeClassifier` pre-trained with embedded data.
Arguments:
model_path (`str`, optional): The path to the model directory
obtained with the ``gecco train`` command. If `None` given,
use the embedded training data.
Returns:
`~gecco.types.TypeClassifier`: A random forest model that can be
used to perform BGC type predictions without training first.
"""
if model_path is not None:
doms_path = os.path.join(model_path, "domains.tsv")
typs_path = os.path.join(model_path, "types.tsv")
comp_path = os.path.join(model_path, "compositions.npz")
else:
doms_path = pkg_resources.resource_filename(__name__, "domains.tsv")
typs_path = pkg_resources.resource_filename(__name__, "types.tsv")
comp_path = pkg_resources.resource_filename(__name__, "compositions.npz")
compositions = scipy.sparse.load_npz(comp_path)
domains = pandas.read_csv(doms_path, header=None, sep="\t")[0].array
types = pandas.read_csv(typs_path, header=None, sep="\t")[1].array
classifier = cls(random_state=0)
types_bin = classifier.binarizer.fit_transform([t.split(";") for t in types])
classifier.model.fit(compositions, y=types_bin)
classifier.model.attributes_ = domains
return classifier
def __init__(self, **kwargs: object) -> None:
"""Instantiate a new type classifier.
Keyword Arguments:
Any additional keyword argument is passed as argument to the
internal `~sklearn.ensemble.RandomForestClassifier` constructor.
"""
self.model = sklearn.ensemble.RandomForestClassifier(**kwargs)
self.binarizer = sklearn.preprocessing.MultiLabelBinarizer()
def predict_types(self, clusters: "_S") -> "_S":
"""Predict types for each of the given clusters.
"""
# extract domain compositions from input clusters
comps = numpy.array([c.domain_composition(self.model.attributes_) for c in clusters])
# only take 'positive' probabilities for each class
probas = numpy.array(self.model.predict_proba(comps))[:, :, 1].transpose()
preds = self.binarizer.inverse_transform(probas > 0.5)
# annotate the input clusters
results = zip(typing.cast(Iterable["Cluster"], clusters), probas, preds)
for cluster, proba, pred in results:
cluster.types = list(pred)
cluster.types_probabilities = list(proba[proba > 0.5])
return clusters
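The shape juggling in `predict_types` deserves a note: for a multi-label target, `RandomForestClassifier.predict_proba` returns one (n_samples, 2) probability array per type, so the code stacks them, keeps only the "positive" column, thresholds at 0.5, and lets the `MultiLabelBinarizer` map the binary mask back to type names. A standalone sketch of the same mechanics, with made-up type names and compositions:

import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

binarizer = MultiLabelBinarizer()
y = binarizer.fit_transform([["Polyketide"], ["NRP", "Polyketide"], ["Terpene"]])
X = numpy.array([[1.0, 0.0], [0.6, 0.4], [0.0, 1.0]])               # toy domain compositions

model = RandomForestClassifier(random_state=0).fit(X, y)
probas = numpy.array(model.predict_proba(X))[:, :, 1].transpose()   # (n_samples, n_types)
preds = binarizer.inverse_transform(probas > 0.5)                    # e.g. [("Polyketide",), ...]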
......@@ -47,8 +47,6 @@ install_requires =
tqdm ~=4.41
verboselogs ~=1.7
[bdist_wheel]
universal = true
[options.packages.find]
exclude = tests
......@@ -66,6 +64,9 @@ gecco.cli.commands =
console-scripts =
gecco = gecco.cli:main
[bdist_wheel]
universal = true
[coverage:report]
include = gecco/*
show_missing = true
......
......@@ -22,7 +22,7 @@ from gecco.cli.commands.run import Run
from gecco.cli.commands.train import Train
# from gecco.cli.commands.tune import Tune
from gecco.model import Domain, Gene, Protein, Strand
from gecco.knn import ClusterKNN
from gecco.types import TypeClassifier
DATADIR = os.path.realpath(os.path.join(__file__, "..", "data"))
......@@ -123,10 +123,10 @@ class TestRun(TestCommand, unittest.TestCase):
mock.patch("gecco.crf.ClusterCRF.predict_probabilities", new=_predict_probabilities)
)
stack.enter_context(
mock.patch("gecco.cli.commands.run.ClusterKNN.trained", new=lambda model_path, metric: ClusterKNN(metric=metric))
mock.patch("gecco.cli.commands.run.TypeClassifier.trained", new=lambda model: TypeClassifier())
)
stack.enter_context(
mock.patch("gecco.cli.commands.run.ClusterKNN.predict_types", new=_fit_predict)
mock.patch("gecco.cli.commands.run.TypeClassifier.predict_types", new=_fit_predict)
)
argv = ["-vv", "--traceback", "run", "--genome", sequence, "--output", self.tmpdir]
main(argv, stream=io.StringIO())
......