Commits (24)
......@@ -5,7 +5,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.10...master
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1-alpha1...master
## [v0.9.1-alpha1] - 2022-03-20
[v0.9.1-alpha1]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.10...v0.9.1-alpha1
Candidate release with support for a sliding window in the CRF prediction algorithm.
## [v0.8.10] - 2022-02-23
[v0.8.10]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.9...v0.8.10
......@@ -68,12 +72,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### Changed
- Default probability threshold for segmentation to 0.3 (from 0.4).
## [v0.9.0] - 2021-08-10 - **YANKED**
[v0.9.0]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.2...v0.9.0
### Changed
- Retrain internal model using `--select=0.35` instead of `--select=0.25` like before.
- Change default *p-value* filter from 1e-9 to 1e-5 to detect more features.
## [v0.8.2] - 2021-07-31
[v0.8.2]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.1...v0.8.2
### Fixed
......
<?xml version='1.0' encoding='utf-8'?>
<tool id="gecco" name="GECCO" version="0.8.9" python_template_version="3.5">
<tool id="gecco" name="GECCO" version="0.8.10" python_template_version="3.5">
<description>is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).</description>
<requirements>
<requirement type="package" version="0.8.9">gecco</requirement>
<requirement type="package" version="0.8.10">gecco</requirement>
</requirements>
<version_command>gecco --version</version_command>
<command detect_errors="aggressive"><![CDATA[
......
......@@ -10,4 +10,4 @@ See Also:
__author__ = "Martin Larralde"
__license__ = "GPLv3"
__version__ = "0.8.10"
__version__ = "0.9.1-alpha1"
......@@ -9,7 +9,7 @@ import locale
import operator
import typing
from multiprocessing.pool import Pool
from typing import Any, Callable, Iterable, List, Tuple, Optional, Type
from typing import Any, Callable, Iterable, Iterator, List, Tuple, Optional, Type
if typing.TYPE_CHECKING:
from types import TracebackType
......@@ -76,53 +76,6 @@ class requires:
return newfunc
class OrderedPoolWrapper:
"""A `Pool` wrapper that returns results in the order they were given.
"""
class _OrderedFunc:
def __init__(self, inner: Callable[["_A"], "_R"], star: bool = False) -> None:
self.inner = inner
self.star = star
def __call__(self, args: Tuple[int, "_A"]) -> Tuple[int, "_R"]:
i, other = args
if self.star:
return i, self.inner(*other) # type: ignore
else:
return i, self.inner(other)
def __init__(self, inner: Pool) -> None:
self.inner = inner
def __enter__(self) -> "OrderedPoolWrapper":
self.inner.__enter__()
return self
def __exit__(
self,
exc_type: Optional[Type[BaseException]],
exc_value: Optional[BaseException],
traceback: Optional["TracebackType"],
) -> Optional[bool]:
return self.inner.__exit__(exc_type, exc_value, traceback)
def map(self, func: Callable[["_A"], "_R"], it: Iterable["_A"]) -> List["_R"]:
wrapped_it = enumerate(it)
wrapped_func = self._OrderedFunc(func)
results = self.inner.map(wrapped_func, wrapped_it)
results.sort(key=operator.itemgetter(0))
return list(map(operator.itemgetter(1), results))
def starmap(self, func: Callable[..., "_R"], it: Iterable[Iterable[Any]]) -> List["_R"]:
wrapped_it = enumerate(it)
wrapped_func = self._OrderedFunc(func, star=True)
results = self.inner.map(wrapped_func, wrapped_it)
results.sort(key=operator.itemgetter(0))
return list(map(operator.itemgetter(1), results))
class UniversalContainer(object):
"""A container that contains everything.
"""
......@@ -134,6 +87,17 @@ class UniversalContainer(object):
return True
def sliding_window(length: int, window: int, step: int) -> Iterator[slice]:
"""Iterate over a sequence of length `length` with a sliding window.
"""
if window <= 0:
raise ValueError("Window size must be strictly positive")
if step <= 0 or step > window:
raise ValueError("Window step must be strictly positive and under `window_size`")
for i in range(0, length + 1 - window, step):
yield slice(i, i+window)
@contextlib.contextmanager
def patch_locale(name: str):
"""Create a context manager to locally change the locale in use.
......
......@@ -125,11 +125,15 @@ class Command(metaclass=abc.ABCMeta):
message: Optional[str] = None,
hint: Optional[str] = None,
optional: bool = False,
default: _T = None
) -> _T:
_convert = (lambda x: x) if convert is None else convert
_check = (lambda x: True) if check is None else check
if optional and self.args[name] is None:
return None
if self.args[name] is None:
if optional:
return default
self.error(f"Missing value for argument [purple]{name}[/]")
raise InvalidArgument(self.args[name])
try:
value = _convert(self.args[name])
if not _check(value):
......
......@@ -44,16 +44,19 @@ class Annotate(Command): # noqa: D101
one of the sequences format supported
by Biopython.
Parameters - Output:
Parameters:
-f <fmt>, --format <fmt> the format of the input file, as a
Biopython format string. GECCO is able
to recognize FASTA and GenBank files
automatically if this is not given.
-j <jobs>, --jobs <jobs> the number of CPUs to use for
multithreading. Use 0 to use all of
the available CPUs. [default: 0]
Parameters - Output:
-o <out>, --output <out> the file where to write the feature
table. [default: features.tsv]
-j <jobs>, --jobs <jobs> the number of CPUs to use for
multithreading. Use 0 to use all of the
available CPUs. [default: 0]
Parameters - Gene Calling:
-M, --mask Enable unknown region masking to
......@@ -91,9 +94,9 @@ class Annotate(Command): # noqa: D101
optional=True,
)
self.jobs = self._check_flag("--jobs", int, lambda x: x >= 0, hint="positive or null integer")
self.format = self._check_flag("--format")
self.format = self._check_flag("--format", optional=True)
self.genome = self._check_flag("--genome")
self.hmm = self._check_flag("--hmm")
self.hmm = self._check_flag("--hmm", optional=True)
self.output = self._check_flag("--output")
self.mask = self._check_flag("--mask", bool)
except InvalidArgument:
......@@ -199,14 +202,14 @@ class Annotate(Command): # noqa: D101
def _filter_domains(self, genes):
# Filter i-evalue and p-value if required
if self.e_filter is not None:
self.info("Filtering", "domains with e-value under", self.e_filter, level=1)
self.info("Excluding", "domains with e-value over", self.e_filter, level=1)
key = lambda d: d.i_evalue < self.e_filter
genes = [
gene.with_protein(gene.protein.with_domains(filter(key, gene.protein.domains)))
for gene in genes
]
if self.p_filter is not None:
self.info("Filtering", "domains with p-value under", self.p_filter, level=1)
self.info("Excluding", "domains with p-value over", self.p_filter, level=1)
key = lambda d: d.pvalue < self.p_filter
genes = [
gene.with_protein(gene.protein.with_domains(filter(key, gene.protein.domains)))
......
......@@ -35,11 +35,14 @@ class Cv(Train): # noqa: D101
gecco cv loto --features <table>... --clusters <table> [options]
Arguments:
-f <data>, --features <table> a domain annotation table, used to
labeled as BGCs and non-BGCs.
-f <data>, --features <table> a domain annotation table, used
to train the CRF.
-c <data>, --clusters <table> a cluster annotation table, used
to stratify clusters by type in
LOTO mode.
-g <file>, --genes <file> a GFF file containing the
coordinates of the genes inside
the training sequence.
Parameters:
-o <out>, --output <out> the name of the output file where
......@@ -63,15 +66,17 @@ class Cv(Train): # noqa: D101
[default: 42]
Parameters - Training:
-W <N>, --window-size <N> the size of the sliding window for
CRF predictions. [default: 5]
--window-step <N> the step of the sliding window for
CRF predictions. [default: 1]
--c1 <C1> parameter for L1 regularisation.
[default: 0.15]
--c2 <C2> parameter for L2 regularisation.
[default: 0.15]
--feature-type <type> how features should be extracted
(single, overlap, or group).
[default: group]
--overlap <N> how much overlap to consider if
features overlap. [default: 2]
--feature-type <type> at which level features should be
extracted (protein or domain).
[default: protein]
--select <N> fraction of most significant features
to select from the training data.
--correction <method> the multiple test correction method
......
......@@ -94,7 +94,7 @@ class Run(Annotate): # noqa: D101
included if they are longer. A lower
number will increase the number of
false positives on small contigs.
[default: 10]
[default: 0]
Parameters - Debug:
--model <directory> the path to an alternative CRF model
......@@ -128,9 +128,9 @@ class Run(Annotate): # noqa: D101
self.jobs = self._check_flag("--jobs", int, lambda x: x >= 0, hint="positive or null integer")
self.postproc = self._check_flag("--postproc", str, lambda x: x in ("gecco", "antismash"), hint="'gecco' or 'antismash'")
self.edge_distance = self._check_flag("--edge-distance", int, lambda x: x >= 0, hint="positive or null integer")
self.format = self._check_flag("--format")
self.format = self._check_flag("--format", optional=True)
self.genome = self._check_flag("--genome")
self.model = self._check_flag("--model")
self.model = self._check_flag("--model", optional=True)
self.hmm = self._check_flag("--hmm")
self.output_dir = self._check_flag("--output-dir")
self.antismash_sideload = self._check_flag("--antismash-sideload", bool)
......@@ -281,6 +281,8 @@ class Run(Annotate): # noqa: D101
"e-filter": repr(self.e_filter),
"postproc": repr(self.postproc),
"threshold": repr(self.threshold),
"mask": repr(self.mask),
"edge-distance": repr(self.edge_distance),
}
}
}
......
......@@ -13,7 +13,7 @@ import pickle
import random
import signal
import typing
from typing import Any, Dict, Union, Optional, List, TextIO, Mapping
from typing import Any, Dict, Union, Optional, List, Iterator, TextIO, Mapping
from .._utils import in_context, patch_showwarnings, ProgressReader
from ._base import Command, CommandExit, InvalidArgument
......@@ -34,7 +34,7 @@ class Train(Command): # noqa: D101
gecco train - {cls.summary}
Usage:
gecco train --features <table>... --clusters <table> [options]
gecco train --features <table>... [options]
Arguments:
-f <data>, --features <table> a domain annotation table, used to
......@@ -42,14 +42,19 @@ class Train(Command): # noqa: D101
-c <data>, --clusters <table> a cluster annotation table, used to
extract the domain composition for
the type classifier.
-g <file>, --genes <file> a GFF file containing the
coordinates of the genes inside
the training sequence.
Parameters:
-o <out>, --output-dir <out> the directory to use for the model
files. [default: model]
-j <jobs>, --jobs <jobs> the number of CPUs to use for
multithreading. Use 0 to use all
the available CPUs. [default: 0]
Parameters - Output:
-o <out>, --output-dir <out> the directory to use for the model
files. [default: model]
Parameters - Domain Annotation:
-e <e>, --e-filter <e> the e-value cutoff for domains to
be included.
......@@ -64,15 +69,17 @@ class Train(Command): # noqa: D101
[default: 42]
Parameters - Training:
-W <N>, --window-size <N> the size of the sliding window for
CRF predictions. [default: 5]
--window-step <N> the step of the sliding window for
CRF predictions. [default: 1]
--c1 <C1> parameter for L1 regularisation.
[default: 0.15]
--c2 <C2> parameter for L2 regularisation.
[default: 0.15]
--feature-type <type> how features should be extracted
(single, overlap, or group).
[default: group]
--overlap <N> how much overlap to consider if
features overlap. [default: 2]
--feature-type <type> at which level features should be
extracted (protein or domain).
[default: protein]
--select <N> fraction of most significant features
to select from the training data.
--correction <method> the multiple test correction method
......@@ -89,14 +96,10 @@ class Train(Command): # noqa: D101
self.feature_type = self._check_flag(
"--feature-type",
str,
lambda x: x in {"single", "overlap", "group"},
hint="'single', 'overlap' or 'group'"
)
self.overlap = self._check_flag(
"--overlap",
int,
lambda x: x > 0,
hint="positive integer",
lambda x: x in {"protein", "domain"},
hint="'protein' or 'domain'",
default="protein",
optional=True,
)
self.c1 = self._check_flag("--c1", float, hint="real number")
self.c2 = self._check_flag("--c2", float, hint="real number")
......@@ -139,6 +142,19 @@ class Train(Command): # noqa: D101
self.output_dir = self._check_flag("--output-dir", str)
self.features = self._check_flag("--features", list)
self.clusters = self._check_flag("--clusters", str)
self.genes = self._check_flag("--genes", str)
self.window_size = self._check_flag(
"--window-size",
int,
lambda x: x > 0,
hint="positive integer",
)
self.window_step = self._check_flag(
"--window-step",
int,
lambda x: x > 0 and x <= self.window_size,
hint="positive integer smaller than `--window-size`",
)
except InvalidArgument:
raise CommandExit(1)
......@@ -169,6 +185,31 @@ class Train(Command): # noqa: D101
self.warn("Output folder contains files that will be overwritten")
break
def _load_genes(self) -> Iterator["Gene"]:
from Bio.SeqRecord import SeqRecord
from ...model import Gene, Protein, Strand, _UnknownSeq
try:
# get filesize and unit
input_size = os.stat(self.genes).st_size
total, scale, unit = ProgressReader.scale_size(input_size)
task = self.progress.add_task("Loading genes", total=total, unit=unit, precision=".1f")
#
self.info("Loading", "gene coordinates from file", repr(self.genes))
with ProgressReader(open(self.genes, "rb"), self.progress, task, scale) as gff_file:
for row in csv.reader(io.TextIOWrapper(gff_file), dialect="excel-tab"):
name, _, _, start, end, _, strand, *_ = row
yield Gene(
SeqRecord(_UnknownSeq(), id=name.rsplit("_", 1)[0]),
int(start),
int(end),
Strand.Coding if strand == "+" else Strand.Reverse,
Protein(name, _UnknownSeq()),
)
except OSError as err:
self.error("Fail to parse genes coordinates: {}", err)
raise CommandExit(err.errno) from err
def _load_features(self) -> "FeatureTable":
from ...model import FeatureTable
......@@ -190,28 +231,45 @@ class Train(Command): # noqa: D101
self.success("Loaded", "a total of", len(features), "features", level=1)
return features
def _convert_to_genes(self, features: "FeatureTable") -> List["Gene"]:
self.info("Converting", "features to genes")
gene_count = len(set(features.protein_id))
unit = "gene" if gene_count == 1 else "genes"
task = self.progress.add_task("Converting features", total=gene_count, unit=unit, precision="")
genes = list(self.progress.track(
features.to_genes(),
total=gene_count,
task_id=task
))
# filter domains out
Annotate._filter_domains(self, genes)
def _annotate_genes(self, genes: List["Gene"], features: "FeatureTable") -> List["Gene"]:
from ...model import Domain
# index genes by protein_id
gene_index = { gene.protein.id:gene for gene in genes }
if len(gene_index) < len(genes):
raise ValueError("Duplicate gene names in input genes")
# add domains from the feature table
unit = "row" if len(features) == 1 else "rows"
task = self.progress.add_task("Annotating genes", total=len(features), unit=unit, precision="")
for row in self.progress.track(features, total=len(features), task_id=task):
# get gene by ID and check consistency
gene = gene_index[row.protein_id]
if gene.source.id != row.sequence_id:
raise ValueError(f"Mismatched source sequence for {row.protein_id!r}: {gene.source.id!r} != {row.sequence_id!r}")
elif gene.end - gene.start != row.end - row.start:
raise ValueError(f"Mismatched gene length for {row.protein_id!r}: {gene.end - gene.start!r} != {row.end - row.start!r}")
elif gene.start != row.start:
raise ValueError(f"Mismatched gene start for {row.protein_id!r}: {gene.start!r} != {row.start!r}")
elif gene.end != row.end:
raise ValueError(f"Mismatched gene end for {row.protein_id!r}: {gene.end!r} != {row.end!r}")
elif gene.strand.sign != row.strand:
raise ValueError(f"Mismatched gene strand for {row.protein_id!r}: {gene.strand.sign!r} != {row.strand!r}")
elif gene.source.id != row.sequence_id:
raise ValueError(f"Mismatched sequence ID {row.protein_id!r}: {gene.source.id!r} != {row.sequence_id!r}")
# add the row domain to the gene
domain = Domain(
name=row.domain,
start=row.domain_start,
end=row.domain_end,
hmm=row.hmm,
i_evalue=row.i_evalue,
pvalue=row.pvalue,
)
gene.protein.domains.append(domain)
self.info("Sorting", "genes by genomic coordinates")
genes.sort(key=operator.attrgetter("source.id", "start", "end"))
self.info("Sorting", "domains by protein coordinates")
for gene in genes:
gene.protein.domains.sort(key=operator.attrgetter("start", "end"))
return genes
# return
return list(gene_index.values())
def _fit_model(self, genes: List["Gene"]) -> "ClusterCRF":
from ...crf import ClusterCRF
......@@ -219,11 +277,13 @@ class Train(Command): # noqa: D101
self.info("Creating", f"the CRF in [bold blue]{self.feature_type}[/] mode", level=1)
self.info("Using", f"hyperparameters C1={self.c1}, C2={self.c2}", level=1)
if self.select is not None:
self.info("Using", f"Fisher Exact Test significance threshold of {self.select}", level=1)
self.info("Selecting", f"features with Fisher Exact Test threshold of {self.select}", level=1)
self.info("Iterating", f"over features with a sliding window of size {self.window_size} and step {self.window_step}", level=1)
crf = ClusterCRF(
self.feature_type,
algorithm="lbfgs",
overlap=self.overlap,
window_size=self.window_size,
window_step=self.window_step,
c1=self.c1,
c2=self.c2,
)
......@@ -302,13 +362,9 @@ class Train(Command): # noqa: D101
cluster_row.start <= gene.start and gene.end <= cluster_row.end
for cluster_row in cluster_by_seq[seq_id]
):
gene = gene.with_protein(
gene.protein.with_domains([d.with_probability(1) for d in gene.protein.domains])
)
gene = gene.with_probability(1)
else:
gene = gene.with_protein(
gene.protein.with_domains([d.with_probability(0) for d in gene.protein.domains])
)
gene = gene.with_probability(0)
labelled_genes.append(gene)
self.progress.update(task_id=task, advance=1)
......@@ -377,9 +433,16 @@ class Train(Command): # noqa: D101
# attempt to create the output directory
self._make_output_directory()
# load features
genes = list(self._load_genes())
features = self._load_features()
genes = self._convert_to_genes(features)
del features
genes = self._annotate_genes(genes, features)
# Sort genes
self.info("Sorting", "genes by coordinates", level=2)
genes.sort(key=operator.attrgetter("source.id", "start", "end"))
for gene in genes:
gene.protein.domains.sort(key=operator.attrgetter("start", "end"))
# filter domains by p-value and/or e-value
genes = Annotate._filter_domains(self, genes)
# load clusters and label genes inside clusters
clusters = self._load_clusters()
genes = self._label_genes(genes, clusters)
......
......@@ -21,8 +21,11 @@ from typing import (
Dict,
FrozenSet,
Iterable,
Iterator,
List,
NamedTuple,
Optional,
Sequence,
Tuple,
Type,
Union,
......@@ -34,6 +37,7 @@ import sklearn_crfsuite
import sklearn.model_selection
import sklearn.preprocessing
from .._meta import sliding_window
from ..model import Gene
from . import features
from .cv import LeaveOneGroupOut
......@@ -93,20 +97,19 @@ class ClusterCRF(object):
self,
feature_type: str = "protein",
algorithm: str = "lbfgs",
overlap: int = 2,
window_size: int = 5,
window_step: int = 1,
**kwargs: Dict[str, object],
) -> None:
"""Create a new `ClusterCRF` instance.
Arguments:
feature_type (`str`): Defines how features should be extracted.
Should be either *domain*, *protein*, or *overlap*.
Should be either *domain* or *protein*.
algorithm (`str`): The optimization algorithm for the model. See
https://sklearn-crfsuite.readthedocs.io/en/latest/api.html
for available values.
overlap (`int`): In case of ``feature_type = "overlap"``, defines
the sliding window size to use. The resulting window width is
``2*overlap+1``.
window_size (`int`):
Any additional keyword argument is passed as-is to the internal
`~sklearn_crfsuite.CRF` constructor.
......@@ -116,49 +119,67 @@ class ClusterCRF(object):
`TypeError`: if one of the ``*_columns`` argument is not iterable.
"""
if feature_type not in {"single", "overlap", "group"}:
raise ValueError(f"unexpected feature type: {feature_type!r}")
if feature_type not in {"protein", "domain"}:
raise ValueError(f"invalid feature type: {feature_type!r}")
if window_size <= 0:
raise ValueError("Window size must be strictly positive")
if window_step <= 0 or window_step > window_size:
raise ValueError("Window step must be strictly positive and under `window_size`")
self.feature_type: str = feature_type
self.overlap: int = overlap
self.feature_type = feature_type
self.window_size = window_size
self.window_step = window_step
self.algorithm = algorithm
self.significance: Optional[Dict[str, float]] = None
self.significant_features: Optional[FrozenSet[str]] = None
self.model = sklearn_crfsuite.CRF(
algorithm=algorithm,
all_possible_transitions=True,
all_possible_states=True,
# all_possible_transitions=True,
# all_possible_states=True,
**kwargs,
)
def predict_probabilities(self, genes: Iterable[Gene], *, cpus: Optional[int] = None) -> List[Gene]:
"""Predict how likely each given gene is part of a gene cluster.
"""
_cpus = os.cpu_count() if not cpus else cpus
# group input genes by sequence
groups = itertools.groupby(genes, key=operator.attrgetter("source.id"))
seqs = [sorted(group, key=operator.attrgetter("start")) for _, group in groups]
# select the feature extraction method
if self.feature_type == "group":
extract = features.extract_features_group
annotate = features.annotate_probabilities_group
elif self.feature_type == "single":
extract = features.extract_features_single
annotate = features.annotate_probabilities_single
elif self.feature_type == "overlap":
raise NotImplementedError("todo: `features.extract_features_overlap`")
if self.feature_type == "protein":
extract_features = features.extract_features_protein
annotate_probabilities = features.annotate_probabilities_protein
elif self.feature_type == "domain":
extract_features = features.extract_features_domain
annotate_probabilities = features.annotate_probabilities_domain
else:
raise ValueError("invalid feature type")
raise ValueError(f"invalid feature type: {self.feature_type!r}")
# extract features and predict cluster probabilities
marginals = self.model.predict_marginals([list(extract(seq)) for seq in seqs])
# Annotate the genes with the predicted probabilities
annotated_seqs = [annotate(seq, m) for seq, m in zip(seqs, marginals)]
# sort genes by sequence id and domains inside genes by coordinate
genes = sorted(genes, key=operator.attrgetter("source.id"))
for gene in genes:
gene.protein.domains.sort(key=operator.attrgetter("start"))
# predict sequence by sequence
predicted = []
for _, group in itertools.groupby(genes, key=operator.attrgetter("source.id")):
# extract features
sequence: List[Gene] = sorted(group, key=operator.attrgetter("start"))
feats: List[Dict[str, bool]] = extract_features(sequence)
# ignore sequences too small with a warning
if len(feats) < self.window_size:
warnings.warn(
f"Contig {sequence[0].source.id!r} does not contain enough"
f" genes for sliding window of size {self.window_size}"
)
continue
# predict marginals over a sliding window, storing maximum probabilities
probabilities = numpy.zeros(len(sequence))
for win in sliding_window(len(feats), self.window_size, self.window_step):
marginals = [p['1'] for p in self.model.predict_marginals_single(feats[win])]
numpy.maximum(probabilities[win], marginals, out=probabilities[win])
# label genes with maximal probabilities
predicted.extend(annotate_probabilities(sequence, probabilities))
# return the genes that were passed as input but now having BGC
# probabilities set
return list(itertools.chain.from_iterable(annotated_seqs)) # type: ignore
return predicted
def fit(
self,
......@@ -171,24 +192,19 @@ class ClusterCRF(object):
) -> None:
_cpus = os.cpu_count() if not cpus else cpus
# select the feature extraction method
if self.feature_type == "group":
extract_features = features.extract_features_group
extract_labels = features.extract_labels_group
elif self.feature_type == "single":
extract_features = features.extract_features_single
extract_labels = features.extract_labels_single
elif self.feature_type == "overlap":
raise NotImplementedError("todo: `features.extract_features_overlap`")
if self.feature_type == "protein":
extract_features = features.extract_features_protein
extract_labels = features.extract_labels_protein
elif self.feature_type == "domain":
extract_features = features.extract_features_domain
extract_labels = features.extract_labels_domain
else:
raise ValueError("invalid feature type")
raise ValueError(f"invalid feature type: {self.feature_type!r}")
# group input genes by sequence
groups = itertools.groupby(genes, key=operator.attrgetter("source.id"))
seqs = [sorted(group, key=operator.attrgetter("start")) for _, group in groups]
# shuffle sequences
if shuffle:
random.shuffle(seqs)
# sort genes by sequence id and domains inside genes by coordinate
genes = sorted(genes, key=operator.attrgetter("source.id"))
for gene in genes:
gene.protein.domains.sort(key=operator.attrgetter("start"))
# perform feature selection
if select is not None:
......@@ -196,7 +212,7 @@ class ClusterCRF(object):
raise ValueError(f"invalid value for select: {select}")
# find most significant features
self.significance = sig = fisher_significance(
(g.protein for seq in seqs for g in seq),
(gene.protein for gene in genes),
correction_method=correction_method,
)
sorted_sig = sorted(sig, key=sig.get)[:int(select*len(sig))]
......@@ -209,24 +225,45 @@ class ClusterCRF(object):
UserWarning
)
# remove non significant domains
for i, seq in enumerate(seqs):
for j, gene in enumerate(seq):
seqs[i][j] = gene.with_protein(
gene.protein.with_domains([
domain for domain in gene.protein.domains
if domain.name in self.significant_features
])
)
# extract features and labels
X = [list(extract_features(seq)) for seq in seqs]
Y = [list(extract_labels(seq)) for seq in seqs]
genes = [
gene.with_protein(
gene.protein.with_domains([
domain for domain in gene.protein.domains
if domain.name in self.significant_features
])
)
for gene in genes
]
# group input genes by sequence
groups = itertools.groupby(genes, key=operator.attrgetter("source.id"))
sequences = [sorted(group, key=operator.attrgetter("start")) for _, group in groups]
# shuffle sequences if requested
if shuffle:
random.shuffle(sequences)
# build training instances
training_features, training_labels = [], []
for sequence in sequences:
# extract features and labels
feats: List[Dict[str, bool]] = extract_features(sequence)
labels: List[str] = extract_labels(sequence)
# check we have as many observations as we have labels
if len(feats) != len(labels):
raise ValueError("different number of features and labels found, something is wrong")
# check we have enough genes for the desired sliding window
if len(feats) < self.window_size:
raise ValueError(f"{sequence[0].source.id!r} has not enough observations ({len(feats)}) for requested window size ({self.window_size})")
# record every window in the sequence
for win in sliding_window(len(feats), self.window_size, self.window_step):
training_features.append(feats[win])
training_labels.append(labels[win])
# check labels
if all(y == "1" for y in Y):
if all(label == "1" for y in training_labels for label in y):
raise ValueError("only positives labels found, something is wrong.")
elif all(y == "0" for y in Y):
elif all(label == "0" for y in training_labels for label in y):
raise ValueError("only negative labels found, something is wrong.")
# fit the model
self.model.fit(X, Y)
self.model.fit(training_features, training_labels)
......@@ -10,76 +10,111 @@ if typing.TYPE_CHECKING:
_S = typing.TypeVar("_S", bound=Sequence[Gene])
def extract_features_group(sequence: Iterable[Gene]) -> List[Dict[str, float]]:
"""Extract features at the group/protein level from an iterable of genes.
def extract_features_protein(
sequence: Iterable[Gene], empty: bool = True
) -> List[Dict[str, bool]]:
"""Extract features at the gene level.
If several domains are part of the same gene, they are grouped at the
same position. Genes without domain annotation are represented with an
empty dictionary.
Arguments:
sequence (iterable of `~gecco.model.Gene`): The genes to extract the
features from.
empty (`bool`): Whether or not to skip empty genes. With `False`,
no empty feature dictionary will be emitted, and genes will be
skipped. With `True`, an empty feature dictionary will be emitted
for every gene without domain annotation.
"""
# FIXME: currently (v0.3.0 and later) we have to hide proteins missing
# domain annotations because the model has only been trained with proteins
# that had domains (since unannotated proteins do not appear in the input
# feature table used for training)
#
# Fixing this requires retraining the model so that it is aware of
# unannotated proteins in the training data, and learns to ignore them.
# When it is done, make sure to edit `postprocessing` as well.
return [
{d.name: 1 - d.i_evalue for d in g.protein.domains}
for g in sequence
if g.protein.domains
{domain.name: True for domain in gene.protein.domains}
for gene in sequence
if gene.protein.domains or empty
]
def extract_labels_group(sequence: Iterable[Gene]) -> List[str]:
"""Extract labels at the group/protein level from an iterable of genes.
"""
def extract_features_domain(
sequence: Iterable[Gene], empty: bool = True
) -> List[Dict[str, bool]]:
"""Extract features at the domain level."""
features = []
for gene in sequence:
if gene.protein.domains:
features.extend({domain.name: True} for domain in gene.protein.domains)
elif empty:
features.append({})
return features
def extract_labels_protein(sequence: Iterable[Gene], empty: bool = True) -> List["str"]:
"""Extract labels at the gene level for training."""
return [
str(int(g.average_probability > 0.5)) # type: ignore
for g in sequence
if g.protein.domains
"1" if gene.average_probability > 0.5 else "0"
for gene in sequence
if gene.protein.domains or empty
]
def annotate_probabilities_group(
sequence: "_S", marginals: List[Dict[str, float]]
) -> "_S":
"""Annotate genes with marginals obtained from a CRF at the protein level.
"""
probabilities = iter(marginals)
def extract_labels_domain(sequence: Iterable[Gene], empty: bool = True) -> List["str"]:
"""Extract labels at the domain level for training."""
labels = []
for gene in sequence:
if gene.protein.domains:
p = next(probabilities)["1"]
gene = gene.with_protein(
gene.protein.with_domains(
[d.with_probability(p) for d in gene.protein.domains]
)
labels.extend(
"1" if domain.probability > 0.5 else "0"
for domain in gene.protein.domains
)
yield gene
elif empty:
labels.append("1" if gene.average_probability > 0.5 else "0")
return labels
def annotate_probabilities_single(
sequence: "_S", marginals: List[Dict[str, float]]
) -> "_S":
"""Annotate genes with marginals obtained from a CRF at the domain level.
"""
# FIXME
domains = [domain for gene in sequence for domain in gene.protein.domains]
for domain, probability in itertools.zip_longest(domains, marginals):
assert domain is not None
assert probability is not None
domain.probability = probability["1"]
return sequence
def annotate_probabilities_protein(
sequence: List[Gene],
probabilities: List[float],
empty: bool = True,
) -> Iterable[Gene]:
"""Annotate genes with marginals obtained from a CRF at the protein level.
Arguments:
sequence (iterable of `~gecco.model.Gene`): The genes to annotate the
features from.
probabilities (iterable of `float`): The biosynthetic probabilities
that have been computed by the CRF.
empty (`bool`): Whether or not to empty genes where skipped. With
`False`, no empty feature dictionary have been emitted, so empty
genes have no probability. With `True`, empty genes have a
probability that can be extracted.
def extract_labels_single(sequence: Iterable[Gene]) -> List[str]:
"""Extract labels at the domain level from an iterable of genes.
"""
return [
str(int(d.probability > 0.5)) # type: ignore
for g in sequence
for d in g.protein.domains
]
genes = [gene for gene in sequence if gene.protein.domains or empty]
if len(genes) != len(probabilities):
raise ValueError("gene and probability lists don't have the same length")
for gene, probability in zip(genes, probabilities):
yield gene.with_probability(probability)
def extract_features_single(sequence: Iterable[Gene]) -> List[Dict[str, float]]:
"""Extract features at the domain level from an iterable of genes.
"""
return [{d.name: 1 - d.i_evalue} for g in sequence for d in g.protein.domains]
def annotate_probabilities_domain(
sequence: List[Gene],
probabilities: List[float],
empty: bool = True,
) -> Iterable[Gene]:
"""Annotate genes with marginals obtained from a CRF at the domain level."""
genes = iter(sequence)
probas = iter(probabilities)
for gene in genes:
if gene.protein.domains:
yield gene.with_protein(
gene.protein.with_domains(
[
domain.with_probability(p)
for domain, p in zip(gene.protein.domains, probas)
]
)
)
elif empty:
yield gene.with_probability(next(probas))
if next(probas, None) is not None:
raise ValueError("gene and probability lists don't have the same length")
No preview for this file type
39393c2a69c42b93305f9462cff1764a
\ No newline at end of file
a350f93db1c062d6162e7d7ac0f6b787
\ No newline at end of file
......@@ -73,7 +73,7 @@ def fisher_significance(
+-------------+-------------------------+-------------------------------+
Then, we run a Fisher Exact Test on this distribution, which gives us the
probability to observe the same table under the hypothesis of independece
probability to observe the same table under the hypothesis of independence
of the two variables.
Arguments:
......
[hmm]
id = Pfam
version = 34.0
size = 2683
url = ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam34.0/Pfam-A.hmm.gz
version = 35.0
size = 2766
url = ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam35.0/Pfam-A.hmm.gz
relabel_with = s/(PF\d+).\d+/\1/
md5 = e56170a43f91cc6afd41b696e045c107
md5 = 635284a9da9e84f920928d79cd0140fd
......@@ -3917,7 +3917,7 @@
{
"accession": "PF00452",
"go_terms": null,
"integrated": "IPR026298",
"integrated": "IPR046371",
"member_databases": null,
"name": "Apoptosis regulator proteins, Bcl-2 family",
"source_database": "pfam",
......@@ -6050,7 +6050,7 @@
{
"accession": "PF00700",
"go_terms": null,
"integrated": "IPR001029",
"integrated": "IPR046358",
"member_databases": null,
"name": "Bacterial flagellin C-terminal helical region",
"source_database": "pfam",
......@@ -7841,7 +7841,7 @@
{
"accession": "PF00907",
"go_terms": null,
"integrated": "IPR001699",
"integrated": "IPR046360",
"member_databases": null,
"name": "T-box",
"source_database": "pfam",
......@@ -20522,7 +20522,7 @@
{
"accession": "PF02458",
"go_terms": null,
"integrated": "IPR003480",
"integrated": null,
"member_databases": null,
"name": "Transferase family",
"source_database": "pfam",
......@@ -25526,7 +25526,7 @@
{
"accession": "PF03081",
"go_terms": null,
"integrated": "IPR004140",
"integrated": "IPR046364",
"member_databases": null,
"name": "Exo70 exocyst complex subunit",
"source_database": "pfam",
......@@ -32699,7 +32699,7 @@
{
"accession": "PF03932",
"go_terms": null,
"integrated": "IPR023648",
"integrated": "IPR005627",
"member_databases": null,
"name": "CutC family",
"source_database": "pfam",
......@@ -34049,7 +34049,7 @@
{
"accession": "PF04091",
"go_terms": null,
"integrated": "IPR007225",
"integrated": "IPR046361",
"member_databases": null,
"name": "Exocyst complex subunit Sec15-like ",
"source_database": "pfam",
......@@ -41843,7 +41843,7 @@
{
"accession": "PF05028",
"go_terms": null,
"integrated": "IPR007724",
"integrated": "IPR046372",
"member_databases": null,
"name": "Poly (ADP-ribose) glycohydrolase (PARG)",
"source_database": "pfam",
......@@ -53165,7 +53165,7 @@
{
"accession": "PF06409",
"go_terms": null,
"integrated": null,
"integrated": "IPR009443",
"member_databases": null,
"name": "Nuclear pore complex interacting protein (NPIP)",
"source_database": "pfam",
......@@ -83873,7 +83873,7 @@
{
"accession": "PF10073",
"go_terms": null,
"integrated": "IPR018753",
"integrated": "IPR046367",
"member_databases": null,
"name": "Uncharacterized protein conserved in bacteria (DUF2312)",
"source_database": "pfam",
......@@ -127253,7 +127253,7 @@
{
"accession": "PF15067",
"go_terms": null,
"integrated": "IPR029380",
"integrated": "IPR046365",
"member_databases": null,
"name": "FAM124 family",
"source_database": "pfam",
......@@ -140825,7 +140825,7 @@
{
"accession": "PF16617",
"go_terms": null,
"integrated": null,
"integrated": "IPR032140",
"member_databases": null,
"name": "Intersectin and clathrin adaptor AP2 binding region",
"source_database": "pfam",
......@@ -147629,7 +147629,7 @@
{
"accession": "PF17380",
"go_terms": null,
"integrated": "IPR035529",
"integrated": null,
"member_databases": null,
"name": "Family of unknown function (DUF5401)",
"source_database": "pfam",
......@@ -160364,7 +160364,7 @@
{
"accession": "PF18801",
"go_terms": null,
"integrated": "IPR040702",
"integrated": null,
"member_databases": null,
"name": "response regulator aspartate phosphatase H, N terminal",
"source_database": "pfam",
......@@ -210,6 +210,7 @@ class Gene:
strand: Strand
protein: Protein
qualifiers: Mapping[str, Union[str, List[str]]] = field(default_factory=dict)
_probability: Optional[float] = field(default_factory=lambda: None)
@property
def id(self) -> str:
......@@ -221,13 +222,21 @@ class Gene:
def average_probability(self) -> Optional[float]:
"""`float`: The average of domain probabilities of being biosynthetic.
"""
if self._probability is not None:
return self._probability
p = [d.probability for d in self.protein.domains if d.probability is not None]
return sum(p) / len(p) if p else None
@average_probability.setter
def average_probability(self, probability: Optional[float]):
self._probability = probability
@property
def maximum_probability(self) -> Optional[float]:
"""`float`: The highest of domain probabilities of being biosynthetic.
"""
if self._probability is not None:
return self._probability
p = [d.probability for d in self.protein.domains if d.probability is not None]
return max(p) if p else None
......@@ -250,6 +259,7 @@ class Gene:
return Gene(
self.source, self.start, self.end, self.strand, protein,
self.qualifiers.copy(),
_probability=self._probability,
)
def with_source(self, source: "SeqRecord") -> "Gene":
......@@ -258,6 +268,20 @@ class Gene:
return Gene(
source, self.start, self.end, self.strand, self.protein,
self.qualifiers.copy(),
_probability=self._probability,
)
def with_probability(self, probability: float) -> "Gene":
"""Copy the current gene and assign it a different probability.
"""
return Gene(
self.source, self.start, self.end, self.strand,
self.protein.with_domains([
domain.with_probability(probability)
for domain in self.protein.domains
]),
self.qualifiers.copy(),
_probability=probability
)
......
......@@ -81,6 +81,8 @@ class TypeClassifier(object):
with typs_file as typs_src:
types = [
ProductType.pack(ProductType.__members__[ty] for ty in raw.split(";"))
if raw.strip()
else ProductType.Unknown
for raw in (line.split("\t")[1].strip() for line in typs_src)
]
......
<
......@@ -68,6 +68,7 @@ PF00158
PF00160
PF00162
PF00163
PF00164
PF00165
PF00166
PF00168
......@@ -148,6 +149,7 @@ PF00324
PF00325
PF00326
PF00327
PF00328
PF00329
PF00330
PF00333
......@@ -159,6 +161,7 @@ PF00346
PF00347
PF00348
PF00350
PF00351
PF00353
PF00355
PF00356
......@@ -362,6 +365,7 @@ PF00862
PF00871
PF00873
PF00874
PF00875
PF00877
PF00881
PF00883
......@@ -426,6 +430,7 @@ PF01048
PF01050
PF01052
PF01053
PF01055
PF01058
PF01061
PF01066
......@@ -561,6 +566,7 @@ PF01427
PF01430
PF01431
PF01432
PF01433
PF01434
PF01435
PF01450
......@@ -574,6 +580,7 @@ PF01475
PF01476
PF01478
PF01479
PF01488
PF01493
PF01494
PF01502
......@@ -598,6 +605,7 @@ PF01547
PF01551
PF01555
PF01556
PF01557
PF01564
PF01565
PF01566
......@@ -629,6 +637,7 @@ PF01627
PF01628
PF01632
PF01633
PF01634
PF01636
PF01637
PF01641
......@@ -644,6 +653,7 @@ PF01663
PF01668
PF01670
PF01676
PF01678
PF01680
PF01687
PF01694
......@@ -698,6 +708,7 @@ PF01865
PF01869
PF01878
PF01882
PF01883
PF01890
PF01891
PF01895
......@@ -754,6 +765,7 @@ PF02108
PF02110
PF02120
PF02127
PF02129
PF02130
PF02132
PF02136
......@@ -768,6 +780,7 @@ PF02190
PF02195
PF02197
PF02203
PF02210
PF02223
PF02224
PF02230
......@@ -775,6 +788,7 @@ PF02237
PF02245
PF02254
PF02272
PF02277
PF02283
PF02291
PF02302
......@@ -788,7 +802,6 @@ PF02348
PF02353
PF02355
PF02361
PF02365
PF02367
PF02368
PF02373
......@@ -816,6 +829,7 @@ PF02458
PF02463
PF02464
PF02465
PF02469
PF02472
PF02475
PF02481
......@@ -828,6 +842,7 @@ PF02503
PF02504
PF02508
PF02514
PF02515
PF02518
PF02522
PF02525
......@@ -1126,6 +1141,7 @@ PF03704
PF03705
PF03706
PF03710
PF03711
PF03714
PF03717
PF03719
......@@ -1137,10 +1153,12 @@ PF03725
PF03726
PF03732
PF03734
PF03737
PF03739
PF03740
PF03741
PF03746
PF03747
PF03748
PF03755
PF03756
......@@ -1148,10 +1166,12 @@ PF03765
PF03772
PF03773
PF03780
PF03781
PF03786
PF03788
PF03793
PF03796
PF03797
PF03798
PF03799
PF03807
......@@ -1201,6 +1221,7 @@ PF04012
PF04014
PF04015
PF04016
PF04020
PF04023
PF04024
PF04025
......@@ -1237,6 +1258,7 @@ PF04172
PF04174
PF04183
PF04186
PF04191
PF04195
PF04198
PF04199
......@@ -1258,7 +1280,7 @@ PF04295
PF04296
PF04297
PF04298
PF04299
PF04307
PF04313
PF04316
PF04321
......@@ -1281,6 +1303,7 @@ PF04463
PF04464
PF04468
PF04471
PF04472
PF04479
PF04486
PF04488
......@@ -1304,6 +1327,7 @@ PF04657
PF04666
PF04672
PF04673
PF04679
PF04705
PF04715
PF04717
......@@ -1388,6 +1412,7 @@ PF05402
PF05423
PF05430
PF05433
PF05437
PF05491
PF05496
PF05504
......@@ -1433,15 +1458,18 @@ PF05952
PF05954
PF05958
PF05960
PF05973
PF05977
PF05978
PF06002
PF06013
PF06026
PF06050
PF06071
PF06080
PF06081
PF06103
PF06106
PF06114
PF06133
PF06144
......@@ -1481,7 +1509,6 @@ PF06580
PF06603
PF06609
PF06658
PF06682
PF06687
PF06689
PF06722
......@@ -1493,6 +1520,7 @@ PF06745
PF06750
PF06762