Commits (15)
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.1.0...master
## [v0.1.0] - 2020-07-17
[v0.1.0]: https://git.embl.de/grp-zeller/GECCO/compare/v0.0.1...v0.1.0
Initial release.
## [v0.0.1] - 2018-08-13
[v0.0.1]: https://git.embl.de/grp-zeller/GECCO/compare/37afb97...v0.0.1
Proof-of-concept.
......@@ -6,9 +6,11 @@ set -e
# --- Install software dependencies ------------------------------------------
log Installing executable dependencies with aptitude
apt update
apt install -y hmmer
if [ ! -x "$(command -v hmmsearch)" ]; then
log Installing executable dependencies with aptitude
apt update
apt install -y hmmer
fi
log Installing Python dependencies with pip
pip install -U coverage
......@@ -18,6 +20,12 @@ pip install -U coverage
mkdir -p ci/cache
mkdir -p build/lib/gecco/data/hmms
if [ "$CI" == "true" ]; then
QUIET="-q"
else
QUIET=""
fi
for ini_file in gecco/hmmer/*.ini; do
url=$(grep "url" $ini_file | cut -d'=' -f2 | sed 's/ //g')
hmm=$(grep "id" $ini_file | cut -d'=' -f2 | sed 's/ //g')
......@@ -29,12 +37,12 @@ for ini_file in gecco/hmmer/*.ini; do
if ! [ -e "$cache_file" ]; then
if [ "$hmm" = "Panther" ]; then
log Extracting $hmm v$version
wget "$url" -q -O- \
wget "$url" $QUIET -O- \
| tar xz --wildcards --no-wildcards-match-slash --no-anchored PTHR\*/hmmer.hmm -O \
| gzip > "$cache_file"
else
log Downloading $hmm v$version
wget "$url" -q -O "$cache_file"
wget "$url" $QUIET -O "$cache_file"
fi
else
log Using cached $hmm v$version
......
......@@ -18,3 +18,7 @@ p {
box-sizing: border-box;
font-family: Menlo,Monaco,Consolas,"Courier New",monospace;
}
.class dd {
margin-left: 5%;
}
// Build a per-page "API table of contents" in the sidebar navigation.
// Sphinx renders every documented object as a <dt> element carrying an
// `id` anchor and a `.sig-name` child with the object's name; this script
// collects those into a nested <ul id='apitoc'> under the main nav list.
//
// Fix: the original `else if` branch was never closed (unbalanced braces
// made the whole script a syntax error), and the identical TOC-building
// loop was duplicated in both branches; it is now a single local helper.
$(document).ready(function() {
    (function ($) {
        // Append one sidebar entry per element of `entries` (a jQuery
        // set of <dt> nodes) linking to that element's `id` anchor.
        function buildToc(entries) {
            $(".nav-list")
                .children()
                .filter("li")
                .append("<ul id='apitoc' class='nav nav-list'></ul>");
            entries.each(function( index ) {
                var html = (
                    "<li><a href='#"
                    + $( this ).attr("id")
                    + "'><code>"
                    + $( this ).find(".sig-name").text()
                    + "</code></a></li>"
                );
                $("#apitoc").append(html);
            });
        }
        if (window.location.href.match("/api/gecco.*") !== null) {
            // Skip the first <dt> (the module/class header itself).
            buildToc($( "dt" ).has( ".sig-name" ).slice(1));
        } else if (window.location.href.match("/api/warnings*") !== null) {
            // NOTE(review): "warnings*" is a regex meaning "warning" plus
            // zero-or-more "s" — confirm "/api/warnings.*" was not intended.
            buildToc($( "dt" ));
        }
    })(window.$jqTheme || window.jQuery);
});
API Reference
==============
.. currentmodule:: gecco
.. automodule:: gecco
.. autosummary::
:nosignatures:
:template: summary.rst
:toctree: api/
gecco.bgc.Protein
gecco.bgc.BGC
gecco.crf.ClusterCRF
gecco.hmmer.HMMER
gecco.knn.ClusterKNN
gecco.orf.ORFFinder
gecco.refine.ClusterRefiner
BGC Detection
=============
.. currentmodule:: gecco.crf
.. automodule:: gecco.crf
.. autoclass:: ClusterCRF
:members:
Type Prediction
===============
.. currentmodule:: gecco.hmmer
.. automodule:: gecco.hmmer
.. autofunction:: embedded_hmms
.. autoclass:: HMMER
:members:
API Reference
==============
.. toctree::
:hidden:
model
orf
hmmer
crf
refine
knn
Data Model
----------
.. currentmodule:: gecco.model
.. autosummary::
:nosignatures:
Hmm
Strand
Domain
Protein
Gene
Cluster
ORF Extraction
--------------
.. currentmodule:: gecco.orf
.. autosummary::
:nosignatures:
ORFFinder
PyrodigalFinder
Domain Annotation
-----------------
.. currentmodule:: gecco.hmmer
.. autosummary::
:nosignatures:
HMMER
BGC Detection
-------------
.. currentmodule:: gecco.crf
.. autosummary::
:nosignatures:
ClusterCRF
BGC Extraction
--------------
.. currentmodule:: gecco.refine
.. autosummary::
:nosignatures:
ClusterRefiner
Type Prediction
---------------
.. currentmodule:: gecco.knn
.. autosummary::
:nosignatures:
ClusterKNN
Type Prediction
===============
.. currentmodule:: gecco.knn
.. automodule:: gecco.knn
.. autoclass:: ClusterKNN
:members:
Data Model
==========
.. currentmodule:: gecco.model
.. automodule:: gecco.model
.. autoclass:: Hmm
.. autoclass:: Domain
.. autoclass:: Protein
.. autoclass:: Gene
.. autoclass:: Cluster
ORF Extraction
==============
.. currentmodule:: gecco.orf
.. automodule:: gecco.orf
.. autoclass:: ORFFinder
:members:
.. autoclass:: PyrodigalFinder
:members:
BGC Extraction
==============
.. currentmodule:: gecco.refine
.. automodule:: gecco.refine
.. autoclass:: ClusterRefiner
:members:
......@@ -32,14 +32,18 @@ def setup(app):
app.add_css_file("css/main.css")
app.add_js_file("js/apitoc.js")
app.add_js_file("js/example-admonition.js")
# Copy `CHANGELOG.md` from project directory
changelog_src = os.path.join(project_dir, "CHANGELOG.md")
changelog_dst = os.path.join(docssrc_dir, "changes.md")
shutil.copy(changelog_src, changelog_dst)
# -- Project information -----------------------------------------------------
import gecco
project = 'GECCO'
copyright = '2020, Jonas Simon Fleck, Martin Larralde'
author = 'Jonas Simon Fleck, Martin Larralde'
copyright = '2020, {}'.format(gecco.__author__)
author = gecco.__author__
# The parsed semantic version
semver = semantic_version.Version.coerce(gecco.__version__)
......@@ -108,7 +112,7 @@ html_theme_path = sphinx_bootstrap_theme.get_html_theme_path()
#
html_theme_options = {
# Bootswatch (http://bootswatch.com/) theme.
"bootswatch_theme": "flatly",
"bootswatch_theme": "sandstone",
# Choose Bootstrap version.
"bootstrap_version": "3",
# Tab name for entire site. (Default: "Site")
......@@ -181,7 +185,7 @@ napoleon_use_rtype = False
autoclass_content = "class"
autodoc_member_order = 'bysource'
autosummary_generate = ['api']
autosummary_generate = []
# -- Options for intersphinx extension ---------------------------------------
......
Welcome to GECCO's documentation!
=================================
GECCO
=====
*Biosynthetic Gene Cluster prediction with Conditional Random Fields.*
Documentation
-------------
.. rubric:: Guides
.. toctree::
:maxdepth: 1
Installation <install>
.. rubric:: Library
.. toctree::
:maxdepth: 2
API reference <api>
API reference <api/index>
Changelog <changes>
License
-------
GECCO is released under the
`GNU General Public License v3 <https://choosealicense.com/licenses/gpl-3.0/>`_
*or later*, and is fully open-source. The ``LICENSE`` file distributed with
the software contains the complete license text.
About
-----
GECCO is developed by the Zeller Team at the European Molecular Biology Laboratory
in Heidelberg. The following individuals contributed to the development of
GECCO:
- `Laura M. Carroll <https://github.com/lmc297>`_
- `Martin Larralde <https://github.com/althonos>`_
- `Jonas S. Fleck <https://github.com/astair>`_
Indices and tables
==================
------------------
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
- :ref:`genindex`
- :ref:`modindex`
- :ref:`search`
Installation
============
PyPi
^^^^
``GECCO`` is hosted on the EMBL Git server, but the easiest way to install it is
to download the latest release from its `PyPi repository <https://pypi.python.org/pypi/gecco>`_.
It will install all dependencies then install the ``gecco`` module:
.. code:: console
$ pip install --user gecco
.. Conda
.. ^^^^^
..
.. GECCO is also available as a `recipe <https://anaconda.org/bioconda/GECCO>`_
.. in the `bioconda <https://bioconda.github.io/>`_ channel. To install, simply
.. use the `conda` installer:
..
.. .. code:: console
..
.. $ conda install -c bioconda GECCO
..
Git + ``pip``
^^^^^^^^^^^^^
If, for any reason, you prefer to download the library from the git repository,
you can clone the repository and install the repository by running:
.. code:: console
$ pip install https://git.embl.de/grp-zeller/GECCO/-/archive/master/GECCO-master.zip
Keep in mind this will install the development version of the library, so not
everything may work as expected compared to a stable versioned release.
GitHub + ``setuptools``
^^^^^^^^^^^^^^^^^^^^^^^
If you do not have ``pip`` installed, you can do the following (after
having properly installed all the dependencies):
.. code:: console
$ git clone https://git.embl.de/grp-zeller/GECCO/
$ cd GECCO
   $ python setup.py install
......@@ -147,9 +147,9 @@ class Run(Command): # noqa: D101
"Filtering results with e-value under {}", self.args["--e-filter"]
)
for gene in genes:
gene.protein.domains = [
d for d in gene.protein.domains if d.i_evalue < self.args["--e-filter"]
]
key = lambda d: d.i_evalue < self.args["--e-filter"]
gene.protein.domains = list(filter(key, gene.protein.domains))
count = sum(1 for gene in genes for domain in gene.protein.domains)
self.logger.debug("Using remaining {} domains", count)
......@@ -162,31 +162,15 @@ class Run(Command): # noqa: D101
# --- CRF ------------------------------------------------------------
self.logger.info("Predicting cluster probabilities with the CRF model")
# Build the feature table from the list of annotated genes
self.logger.debug("Building feature table from gene list")
feats_df = pandas.concat([g.to_feature_table() for g in genes])
feats_df.sort_values(
by=["sequence_id", "start", "end", "domain_start"], inplace=True
)
# Load trained CRF model
self.logger.debug("Loading trained CRF model")
crf = ClusterCRF.trained(self.args["--model"])
# Split input dataframe into one group per input sequence
feats_df = crf.predict_marginals(
data=[g for _, g in feats_df.groupby("sequence_id")]
)
self.logger.debug("Predicting BGC probabilies")
genes = crf.predict_probabilities(genes)
# Assign probabilities to data classes
for gene in genes:
rows = feats_df[
(feats_df.protein_id == gene.id)
& (feats_df.sequence_id == gene.source.id)
]
gene.probability = rows.p_pred.mean() if len(rows) else None
self.logger.debug("Extracting feature table")
feats_df = pandas.concat([g.to_feature_table() for g in genes])
# Write predictions to file
pred_out = os.path.join(out_dir, f"{base}.features.tsv")
self.logger.debug("Writing feature table to {!r}", pred_out)
feats_df.to_csv(pred_out, sep="\t", index=False)
......@@ -210,7 +194,7 @@ class Run(Command): # noqa: D101
# --- KNN ------------------------------------------------------------
self.logger.info("Predicting BGC types")
knn = ClusterKNN.trained(metric=self.args["--distance"])
knn = ClusterKNN.trained(self.args["--model"], metric=self.args["--distance"])
clusters = knn.predict_types(clusters)
# --- RESULTS --------------------------------------------------------
......
......@@ -2,6 +2,7 @@
"""
import csv
import hashlib
import itertools
import logging
import multiprocessing.pool
......@@ -182,6 +183,13 @@ class Train(Command): # noqa: D101
with open(model_out, "wb") as f:
pickle.dump(crf, f, protocol=4)
self.logger.debug("Computing and saving model checksum")
hasher = hashlib.md5()
with open(model_out, "rb") as f:
hasher.update(f.read()) # FIXME: iterate on file blocks
with open(f"{model_out}.md5", "w") as f:
f.write(hasher.hexdigest())
self.logger.info("Writing transitions and state weights")
crf.save_weights(self.args["--output-dir"])
......
......@@ -36,6 +36,7 @@ import sklearn_crfsuite
import sklearn.model_selection
import sklearn.preprocessing
from ..model import Gene
from . import preprocessing
from .cv import LeaveOneGroupOut
from .select import fisher_significance
......@@ -245,6 +246,28 @@ class ClusterCRF(object):
X, Y = self._extract_features(data, jobs=jobs)
self.model.fit(X, Y)
# --- Prediction ---------------------------------------------------------
def predict_probabilities(
    self,
    genes: List[Gene],
    *,
    jobs: Optional[int] = None,
) -> List[Gene]:
    """Annotate the domains of ``genes`` with BGC probabilities.

    Builds one feature table per input sequence, runs the trained CRF
    through `predict_marginals`, and writes each predicted marginal
    back onto the corresponding `Domain` as its ``probability``.

    Arguments:
        genes: The genes to annotate. The list is mutated in place
            (``domain.probability`` is assigned) and returned.
        jobs: Number of parallel jobs forwarded to `predict_marginals`.

    Returns:
        The same ``genes`` list, with domain probabilities assigned.
    """
    # Group genes by sequence id.
    # NOTE(review): `itertools.groupby` only merges *consecutive* equal
    # keys, so `genes` must already be ordered by ``source.id`` — confirm
    # callers guarantee this, otherwise one sequence yields many groups.
    seqs = itertools.groupby(genes, key=operator.attrgetter("source.id"))
    # Build one feature table per sequence group.
    data = [pandas.concat([g.to_feature_table() for g in s]) for _, s in seqs]
    # Predict marginals using the feature table.
    probs = self.predict_marginals(data, jobs=jobs)
    # Assign results to the input gene sequence: each domain has a row in
    # the probability table so we can just zip together.
    # NOTE(review): this relies on `predict_marginals` returning rows in
    # exactly the same order as the domains are iterated here — verify.
    domains = itertools.chain.from_iterable(g.protein.domains for g in genes)
    for domain, row in zip(domains, probs.itertuples()):
        domain.probability = row.p_pred
    # Return the genes with annotated domains
    return genes
def predict_marginals(
self, data: Iterable[pandas.DataFrame], *, jobs: Optional[int] = None,
) -> pandas.DataFrame:
......