Commits (20)
......@@ -45,5 +45,5 @@ jobs:
- name: Install planemo
run: pip install -U planemo
- name: Deploy repository to Tool Shed
if: startsWith(github.ref, 'refs/tags/v')
run: planemo shed_update ./galaxy --owner althonos --name gecco -t toolshed -m ${{ github.event.head_commit.message }} --shed_key ${{ secrets.TOOL_SHED_API_KEY }}
run: planemo shed_update ./galaxy --owner althonos --name gecco -t toolshed -m '${{ github.event.head_commit.message }}' --shed_key '${{ secrets.TOOL_SHED_API_KEY }}'
......@@ -57,7 +57,8 @@ jobs:
upload:
environment: PyPI
runs-on: ubuntu-latest
name: Upload
name: Upload to PyPI
if: startsWith(github.ref, 'refs/tags/v')
needs:
- sdist
- wheel
......@@ -80,3 +81,46 @@ jobs:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}
skip_existing: false
changelog:
runs-on: ubuntu-latest
if: startsWith(github.ref, 'refs/tags/v')
name: Update GitHub release notes
needs: upload
steps:
- name: Checkout code
uses: actions/checkout@v1
- name: Release a Changelog
uses: rasmus-saks/release-a-changelog-action@v1.0.1
with:
github-token: '${{ secrets.GITHUB_TOKEN }}'
attach:
runs-on: ubuntu-latest
name: Attach artifacts to GitHub release
if: startsWith(github.ref, 'refs/tags/v')
needs: changelog
steps:
- name: Checkout code
uses: actions/checkout@v2
with:
submodules: true
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
- name: List project dependencies
run: python setup.py list_requirements
- name: Install project dependencies
run: python -m pip install -r requirements.txt
- name: Build new HMM artifacts
run: python setup.py build_data -f -r
- name: Compress Pfam HMM
run: gzip -c build/lib/gecco/hmmer/Pfam.h3m > Pfam.h3m.gz
- name: Upload HMM
uses: softprops/action-gh-release@v1
if: startsWith(github.ref, 'refs/tags/')
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
files: Pfam.h3m.gz
name: Release
on:
push:
tags:
- v*.*.*
jobs:
attach:
runs-on: ubuntu-latest
name: Attach HMM artifacts to GitHub releases page
steps:
- name: Checkout code
uses: actions/checkout@v2
with:
submodules: true
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
- name: List project dependencies
run: python setup.py list_requirements
- name: Install project dependencies
run: python -m pip install -r requirements.txt
- name: Build new HMM artifacts
run: python setup.py build_data -f -r
- name: Compress Pfam HMM
run: gzip -c build/lib/gecco/hmmer/Pfam.h3m > Pfam.h3m.gz
- name: Upload HMM
uses: softprops/action-gh-release@v1
if: startsWith(github.ref, 'refs/tags/')
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
files: Pfam.h3m.gz
chandler:
environment: GitHub Releases
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v1
- name: Set up Ruby 2.7
uses: actions/setup-ruby@v1
with:
ruby-version: 2.7
- name: Install chandler gem
run: gem install chandler
- name: Update releases messages
run: chandler push --github=${{ github.repository }} --changelog="CHANGELOG.md"
env:
CHANDLER_GITHUB_API_TOKEN: ${{ secrets.CHANDLER_GITHUB_API_TOKEN }}
......@@ -5,7 +5,21 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.5...master
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.6...master
## [v0.8.6] - 2022-02-17
[v0.8.6]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.5...v0.8.6
### Added
- CLI flag for enabling region masking for contigs processed by Prodigal.
- CLI flag for controlling region distance used for edge distance filtering.
### Changed
- `gecco.model.Gene` and `gecco.model.Protein` are now immutable data classes.
- Bump minimum `pyrodigal` version to `v0.6.4` to use region masking.
- Implement filtering for extracted clusters based on distance to the contig edge.
- Store InterPro metadata file uncompressed for version-control integration.
### Fixed
- Mark `BGC0000930` as `Terpene` in the type classifier data.
- Progress bar messages are now in consistent format.
## [v0.8.5] - 2021-11-21
[v0.8.5]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.4...v0.8.5
......
......@@ -45,14 +45,13 @@ $ conda install -c bioconda gecco
```
This will install GECCO, its dependencies, and the data needed to run
predictions. This requires around 100MB of data to be downloaded, so
it could take some time depending on your Internet connection. Once done, you
will have a ``gecco`` command available in your $PATH.
predictions. This requires around 40MB of data to be downloaded, so
it could take some time depending on your Internet connection. Once done,
you will have a ``gecco`` command available in your $PATH.
*Note that GECCO uses [HMMER3](http://hmmer.org/), which can only run
on PowerPC and recent x86-64 machines running a POSIX operating system.
Therefore, Linux and OSX are supported platforms, but GECCO will not be able
to run on Windows.*
Therefore, GECCO will work on Linux and OSX, but not on Windows.*
## 🧬 Running GECCO
......@@ -72,10 +71,30 @@ Additional parameters of interest are:
autodetect the number of CPUs on the machine using
[`os.cpu_count`](https://docs.python.org/3/library/os.html#os.cpu_count).
- `--cds`, controlling the minimum number of consecutive genes a BGC region
must have to be detected by GECCO (default is 3).
must have to be detected by GECCO. The default is *3*.
- `--threshold`, controlling the minimum probability for a gene to be
considered part of a BGC region. Using a lower number will increase the
number (and possibly length) of predictions, but reduce accuracy.
number (and possibly length) of predictions, but reduce accuracy. The
default of *0.3* was selected to optimize precision/recall on a test set
of 364 BGCs from [MIBiG 2.0](https://mibig.secondarymetabolites.org/).
## 🔎 Results
GECCO will create the following files:
- `{genome}.features.tsv`: The *features* file, containing the identified
proteins and domains in the input sequences, in tabular format.
- `{genome}.clusters.tsv`: If any were found, a *clusters* file, containing
the coordinates of the predicted clusters along their putative biosynthetic
type, in tabular format.
- `{genome}_cluster_{N}.gbk`: If any were found, a GenBank file per cluster,
containing the cluster sequence annotated with its member proteins and domains.
To get a more visual way of exploring the predictions, you
can open the GenBank files in a genome editing software like [UGENE](http://ugene.net/),
or you can load the results into an AntiSMASH report.
Check the [Integrations](https://gecco.embl.de/integrations.html#antismash) page of the
documentation for a step-by-step guide.
## 🔖 Reference
......
......@@ -78,8 +78,8 @@ Or with Conda, using the `bioconda` channel:
Predictions
^^^^^^^^^^^
GECCO works with DNA sequences, and loads them using Biopython, allowing it
to support a `large variety of formats <https://biopython.org/wiki/SeqIO>`_,
GECCO works with DNA sequences, and loads them using `Biopython <http://biopython.org/>`_,
allowing it to support a `large variety of formats <https://biopython.org/wiki/SeqIO>`_,
including the common FASTA and GenBank files.
Run a prediction on a FASTA file named ``sequence.fna`` and output the
......@@ -170,8 +170,8 @@ the software contains the complete license text.
About
-----
GECCO is developed by the `Zeller group <https://www.embl.de/research/units/scb/zeller/index.html>`_
at the European Molecular Biology Laboratory in Heidelberg. The following
GECCO is developed by the `Zeller Team <https://www.embl.de/research/units/scb/zeller/index.html>`_
at the `European Molecular Biology Laboratory <https://www.embl.de/>`_ in Heidelberg. The following
individuals contributed to the development of GECCO:
- `Laura M. Carroll <https://github.com/lmc297>`_
......
......@@ -3,19 +3,22 @@ Installation
Requirements
^^^^^^^^^^^^
------------
GECCO requires additional libraries that can be installed directly from PyPI,
the Python Package Index. Contrary to other tools in the field
(such as DeepBGC or AntiSMASH), it does not require any external binary.
Installing GECCO locally
------------------------
PyPI
^^^^
GECCO is hosted on the EMBL Git server, but the easiest way to install it is
to download the latest release from its `PyPi repository <https://pypi.python.org/pypi/gecco>`_.
It will install all dependencies then install the ``gecco-tool`` package:
The GECCO source is hosted on the EMBL Git server and mirrored on GitHub, but
the easiest way to install it is to download the latest release from
`PyPI <https://pypi.python.org/pypi/gecco>`_ with ``pip``.
.. code:: console
......@@ -27,36 +30,46 @@ Conda
GECCO is also available as a `recipe <https://anaconda.org/bioconda/GECCO>`_
in the `Bioconda <https://bioconda.github.io/>`_ channel. To install, simply
use the `conda` installer:
use the ``conda`` installer:
.. code:: console
$ conda install -c bioconda GECCO
Git + ``pip``
^^^^^^^^^^^^^
If, for any reason, you prefer to download the library from the git repository,
you can clone the repository and install the repository by running:
.. code:: console
$ pip install https://github.com/zellerlab/GECCO/archive/master.zip
Keep in mind this will install the development version of the library, so not
everything may work as expected compared to a stable versioned release.
GitHub + ``setuptools``
^^^^^^^^^^^^^^^^^^^^^^^
If you do not have ``pip`` installed, you can do the following (after
having properly installed all the dependencies):
.. code:: console
$ git clone https://github.com/zellerlab/GECCO/
$ cd GECCO
# python setup.py install
.. Git + ``pip``
.. ^^^^^^^^^^^^^
..
.. If, for any reason, you prefer to download the library from the git repository,
.. you can clone the repository and install the repository by running:
..
.. .. code:: console
..
.. $ pip install https://github.com/zellerlab/GECCO/archive/master.zip
..
..
.. Keep in mind this will install the development version of the library, so not
.. everything may work as expected compared to a stable versioned release.
..
..
.. GitHub + ``setuptools``
.. ^^^^^^^^^^^^^^^^^^^^^^^
..
.. If you do not have ``pip`` installed, you can do the following (after
.. having properly installed all the dependencies):
..
.. .. code:: console
..
.. $ git clone https://github.com/zellerlab/GECCO/
.. $ cd GECCO
.. # python setup.py install
Using GECCO in Galaxy
---------------------
GECCO is available as a Galaxy tool in the `Toolshed <https://toolshed.g2.bx.psu.edu/>`_.
It can be used directly on the `Galaxy Europe server <https://usegalaxy.eu/>`_. To
add it to your local Galaxy server, get in touch with the admin and ask them
to add the `Toolshed repository for GECCO <https://toolshed.g2.bx.psu.edu/view/althonos/gecco/88dc16b4f583>`_
to the server tools.
Training
========
Requirements
^^^^^^^^^^^^
In order to train GECCO, you will need to have it installed with the *train*
dependencies:
Software Requirements
---------------------
In order to train GECCO, you need to have it installed with the *train* optional dependencies.
This can be done with ``pip``:
.. code-block:: console
......@@ -13,77 +14,180 @@ dependencies:
This will install additional Python packages, such as `pandas <https://pandas.pydata.org/>`_
which is needed to process the feature tables, or `fisher <https://pypi.org/project/fisher>`_
which is used to select the most informative features.
which is used to select the most informative domains.
Domain database
---------------
GECCO needs HMM domains to use as features. Installing the ``gecco-tool`` package
will also install a subset of the Pfam database that can be used for making the
predictions. However, this subset should not be used for training, since a
different subset of domains may be selected with different training data.
You can get the latest version of Pfam (35.0 in December 2021) from the EMBL
FTP server:
.. code::
$ wget "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam35.0/Pfam-A.hmm.gz" -O Pfam35.hmm.gz
You are also free to get additional HMMs from other databases, such as
`TIGRFAM <https://www.jcvi.org/research/tigrfams>`_ or `PANTHER <http://www.pantherdb.org/panther/;jsessionid=D7BFDD605F98EC1159A5E0E77536FD76>`_,
or even to build your own HMMs, as long as they are in `HMMER <http://hmmer.org/>`_ format.
Training sequences
------------------
Regions in their genomic context
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Resources
^^^^^^^^^
The easiest case for training GECCO is when you have entire genomes with regions
of interest. In that case, you can directly use these sequences, and you will
only have to prepare a cluster table with the coordinates of each positive region.
Regions from separate datasets
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In the event you don't have the genomic context available for your regions of
interest, you will have to provide a "fake" context by embedding the positive
regions into contigs that don't contain any positive.
You are free to use sequences coming from anywhere as your *positive* regions.
GECCO was trained to detect Biosynthetic Gene Clusters, so the
`MIBiG <https://mibig.secondarymetabolites.org/>`_ database was used; you need
to get the FASTA file from MIBiG containing all BGC proteins. It can be found
`on the Download page of MIBiG <https://mibig.secondarymetabolites.org/download>`_.
`MIBiG <https://mibig.secondarymetabolites.org/>`_ database was used to get
the positives, with some additional filtering to remove redundancy and entries
with invalid annotations. For the negative regions, we used representative
genomes from the `proGenomes <https://progenomes.embl.de/>`_ database, and masked
known BGCs using `antiSMASH <https://antismash.secondarymetabolites.org/>`_.
You also need to have some *negative* regions, such as bacterial genomes, and
make sure they are free of *positives* (e.g., they must not contain any BGCs).
A common approach is to download a bunch of complete genomes from the
`ENA <https://www.ebi.ac.uk/ena/browser/home>`_ and to remove the regions where
positives are detected (e.g. removing BGCs found by `antiSMASH <https://antismash.secondarymetabolites.org/>`_).
Features
^^^^^^^^
Feature tables
--------------
GECCO does not train on sequences directly, but on feature tables. To obtain
a feature table from the sequences, the ``gecco annotate`` subcommand can be
used:
GECCO does not train on sequences directly, but on feature tables. You can build
the feature table yourself (see below for the expected format), but the easiest
way to obtain a feature table from the sequences is the ``gecco annotate`` subcommand.
To build a table from a collection of nucleotide sequences in ``sequences.fna``
and HMMs in ``Pfam35.hmm.gz``, use:
.. code-block:: console
$ gecco annotate --genome reference_genome.fna -o nobgc
$ gecco annotate --mibig mibig_prot_seqs_xx.faa -o bgc
$ gecco annotate --genome sequences.fna --hmm Pfam35.hmm.gz -o features.tsv
Use the ``--hmm`` flag with the path to a different HMM file to use other HMMs than
the internal ones from GECCO (a subset of Pfam v33.1 and Tigrfam v15.0). **Make
sure the ``--mibig`` input flag is used when annotating MIBiG sequences to ensure
the metadata are properly extracted from the sequence ID of each protein in the
input file.**
.. hint::
*This step takes a long time, count about 5 minutes to annotate 1M amino-acids
with Pfam alone.*
If you have more than one HMM file, you can add additional ``--hmm`` flags
so that all of them are used.
The feature table is a TSV file that looks like this, with one row per domain,
per protein, per sequence:
Embedding
^^^^^^^^^
============ ============== ===== ==== ====== ======= ====== ======== =========== ============ ==========
sequence_id protein_id start end strand domain hmm i_evalue pvalue domain_start domain_end
============ ============== ===== ==== ====== ======= ====== ======== =========== ============ ==========
AFPU01000001 AFPU01000001_1 3 2555 ``+`` PF01573 Pfam35 95.95209 0.00500 2 27
AFPU01000001 AFPU01000001_2 2610 4067 ``-`` PF17032 Pfam35 0.75971 3.961e-05 83 142
AFPU01000001 AFPU01000001_2 2610 4067 ``-`` PF13719 Pfam35 4.89304 0.000255 85 98
... ... ... ... ... ... ... ... ... ... ...
============ ============== ===== ==== ====== ======= ====== ======== =========== ============ ==========
When both the BGC and the non-BGC sequences have been annotated, merge them into
a continuous table to train the CRF on using the ``gecco embed`` subcommand:
.. hint::
.. code-block:: console
If this step takes too long, you can also split the file containing your
input sequences, process them independently in parallel, and combine the
result.
Cluster tables
--------------
The cluster table is used to provide additional information to GECCO: the location of
each positive region in the input data, and the type of each region (if it makes
sense). You need to build this table manually, but it should be quite straightforward.
.. hint::
If a region has more than one type, use ``;`` to separate the two types
in the type column. For instance, a Polyketide/NRP hybrid cluster can be
marked with the type ``Polyketide;NRP``.
$ gecco embed --bgc bgc/*.features.tsv --nobgc nobgc/*.features.tsv -o embedding.tsv
The cluster table is a TSV file that looks like this, with one row per region:
============ ============ ====== ====== ==========
sequence_id bgc_id start end type
============ ============ ====== ====== ==========
AFPU01000001 BGC0000001 806243 865563 Polyketide
MTON01000024 BGC0001910 129748 142173 Terpene
... ... ... ... ...
============ ============ ====== ====== ==========
Fitting
^^^^^^^
.. hint::
To train the model with default parameters, you must have built a feature table
and a cluster table similar to the ones created by GECCO after a prediction.
Once it's done, use the following command to fit the weights of the CRF, and to
train the BGC type classifier:
If the concept of "type" makes no sense for the regions you are trying to
detect, you can omit the ``type`` column entirely. This will effectively
mark all the regions from the training sequences as "Unknown".
Fitting the model
-----------------
Now that you have everything needed, it's time to train GECCO! Use the
following method to fit the CRF model and the type classifier:
.. code-block:: console
$ gecco train --features embedding.features.tsv --clusters embedding.clusters.tsv -o model
$ gecco -vv train --features features.tsv --clusters clusters.tsv -o model
GECCO will create a directory named ``model`` containing all the required files
to make predictions later on.
L1/L2 regularisation
^^^^^^^^^^^^^^^^^^^^
Use the ``--c1`` and ``--c2`` flags to control the weight for the L1 and L2
regularisation, respectively. The command line defaults to *0.15* and *0.15*;
however, for training GECCO, we disabled L2 regularisation and selected
a value of *0.4* for :math:`C_1` by optimizing on an external validation dataset.
Feature selection
^^^^^^^^^^^^^^^^^
GECCO supports selecting the most informative features from the training dataset
using a simple contingency testing for the presence/absence of each domain in
the regions of interest. Reducing the number of features helps the CRF model to
get better accuracy. It also greatly reduces the time needed to make predictions
by skipping the HMM annotation step for useless domains.
Use the ``--select`` flag to select a fraction of most informative features
before training to reduce the total feature set (for instance, use ``--select 0.3``
to select the 30% features with the lowest Fisher *p-value*).
The command will output a folder (named ``model`` here) containing all the data
files necessary for predicting probabilities with ``gecco run``:
.. code-block:: console
$ gecco train --features features.tsv --clusters clusters.tsv -o model --select 0.3
.. hint::
You will get a warning in case you select a *p-value* threshold that is still
too high, resulting in non-informative domains being included in the selected
features.
Predicting with the new model
-----------------------------
To make predictions with a model different from the one embedded in GECCO, you
will need the folder from a previous ``gecco train`` run, as well as the HMMs
used to build the feature tables in the first place.
.. code-block:: console
$ gecco run --model model ...
$ gecco run --model model --hmm Pfam35.hmm.gz --genome genome.fa -o ./predictions/
Congratulations, you trained GECCO with your own dataset, and successfully
used it to make predictions!
tool_test_output.html
tool_test_output.json
../CHANGELOG.md
\ No newline at end of file
<?xml version='1.0' encoding='utf-8'?>
<tool id="gecco" name="GECCO" version="0.8.5" python_template_version="3.5">
<description>GECCO (Gene Cluster prediction with Conditional Random Fields) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).</description>
<description>is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).</description>
<requirements>
<requirement type="package" version="0.8.5">gecco</requirement>
</requirements>
......@@ -14,9 +14,9 @@
#end if
ln -s '$input' input_tempfile.$file_extension &&
gecco -vv run
--format $input.ext
--genome input_tempfile.$file_extension
gecco -vv run
--format $input.ext
--genome input_tempfile.$file_extension
--postproc $postproc
--force-clusters-tsv
#if $cds:
......@@ -29,10 +29,10 @@
--antismash-sideload
#end if
&& mv input_tempfile.features.tsv $features
&& mv input_tempfile.clusters.tsv $clusters
&& mv input_tempfile.features.tsv '$features'
&& mv input_tempfile.clusters.tsv '$clusters'
#if $antismash_sideload
&& mv input_tempfile.sideload.json $sideload
&& mv input_tempfile.sideload.json '$sideload'
#end if
]]></command>
......@@ -62,7 +62,7 @@
<output name="features" file="features.tsv"/>
<output name="clusters" file="clusters.tsv"/>
<output_collection name="records" type="list">
<element name="BGC0001866.1_cluster_1" file="BGC0001866.1_cluster_1.gbk" ftype="genbank" lines_diff="2"/>
<element name="BGC0001866.1_cluster_1" file="BGC0001866.1_cluster_1.gbk" ftype="genbank" compare="diff" lines_diff="4"/>
</output_collection>
</test>
<test>
......@@ -72,52 +72,41 @@
<output name="clusters" file="clusters.tsv"/>
<output name="sideload" file="sideload.json"/>
<output_collection name="records" type="list">
<element name="BGC0001866.1_cluster_1" file="BGC0001866.1_cluster_1.gbk" ftype="genbank" lines_diff="2"/>
<element name="BGC0001866.1_cluster_1" file="BGC0001866.1_cluster_1.gbk" ftype="genbank" compare="diff" lines_diff="4"/>
</output_collection>
</test>
</tests>
<help>
<![CDATA[
<help><![CDATA[
**Overview**
Overview
--------
GECCO is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).
GECCO (Gene Cluster prediction with Conditional Random Fields) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).
It is developed in the Zeller group and is part of the suite of computational microbiome analysis tools hosted at EMBL.
**Input**
Input
-----
GECCO works with DNA sequences, and loads them using Biopython, allowing it to support a large variety of formats, including the common FASTA and GenBank files.
**Output**
Output
------
GECCO will create the following files once done (using the same prefix as the input file):
- features.tsv: The features file, containing the identified proteins and domains in the input sequences.
- clusters.tsv: If any were found, a clusters file, containing the coordinates of the predicted clusters, along their putative biosynthetic type.
- {sequence}_cluster_{N}.gbk: If any BGCs were found, a GenBank file per cluster, containing the cluster sequence annotated with its member proteins and domains.
- ``features.tsv``: The features file, containing the identified proteins and domains in the input sequences.
- ``clusters.tsv``: If any were found, a clusters file, containing the coordinates of the predicted clusters, along their putative biosynthetic type.
- ``{sequence}_cluster_{N}.gbk``: If any BGCs were found, a GenBank file per cluster, containing the cluster sequence annotated with its member proteins and domains.
**Contact**
Contact
-------
If you have any question about GECCO, if you run into any issue, or if you would like to make a feature request, please create an issue in the GitHub repository.
You can also directly contact Martin Larralde via email. If you want to contribute to GECCO, please have a look at the contribution guide first, and feel free to
open a pull request on the GitHub repository.
If you have any question about GECCO, if you run into any issue, or if you would like to make a feature request, please create an issue in the
`GitHub repository <https://github.com/zellerlab/gecco>`_. You can also directly contact `Martin Larralde via email <mailto:martin.larralde@embl.de>`_.
If you want to contribute to GECCO, please have a look at the contribution guide first, and feel free to open a pull request on the GitHub repository.
]]>
</help>
]]></help>
<citations>
<citation type="bibtex">
@article {Carroll2021.05.03.442509,
author = {Carroll, Laura M. and Larralde, Martin and Fleck, Jonas Simon and Ponnudurai, Ruby and Milanese, Alessio and Cappio, Elisa and Zeller, Georg},
title = {Accurate de novo identification of biosynthetic gene clusters with GECCO},
elocation-id = {2021.05.03.442509},
year = {2021},
doi = {10.1101/2021.05.03.442509},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Biosynthetic gene clusters (BGCs) are enticing targets for (meta)genomic mining efforts, as they may encode novel, specialized metabolites with potential uses in medicine and biotechnology. Here, we describe GECCO (GEne Cluster prediction with COnditional random fields; https://gecco.embl.de), a high-precision, scalable method for identifying novel BGCs in (meta)genomic data using conditional random fields (CRFs). Based on an extensive evaluation of de novo BGC prediction, we found GECCO to be more accurate and over 3x faster than a state-of-the-art deep learning approach. When applied to over 12,000 genomes, GECCO identified nearly twice as many BGCs compared to a rule-based approach, while achieving higher accuracy than other machine learning approaches. Introspection of the GECCO CRF revealed that its predictions rely on protein domains with both known and novel associations to secondary metabolism. The method developed here represents a scalable, interpretable machine learning approach, which can identify BGCs de novo with high precision.Competing Interest StatementThe authors have declared no competing interest.},
URL = {https://www.biorxiv.org/content/early/2021/05/04/2021.05.03.442509},
eprint = {https://www.biorxiv.org/content/early/2021/05/04/2021.05.03.442509.full.pdf},
journal = {bioRxiv}
}
</citation>
<citation type="doi">10.1101/2021.05.03.442509</citation>
</citations>
</tool>
......@@ -10,4 +10,4 @@ See Also:
__author__ = "Martin Larralde"
__license__ = "GPLv3"
__version__ = "0.8.5"
__version__ = "0.8.6"
......@@ -44,7 +44,7 @@ class Annotate(Command): # noqa: D101
one of the sequences format supported
by Biopython.
Parameters:
Parameters - Output:
-f <fmt>, --format <fmt> the format of the input file, as a
Biopython format string. GECCO is able
to recognize FASTA and GenBank files
......@@ -55,9 +55,16 @@ class Annotate(Command): # noqa: D101
multithreading. Use 0 to use all of the
available CPUs. [default: 0]
Parameters - Gene Calling:
-M, --mask Enable unknown region masking to
prevent genes from stretching across
unknown nucleotides.
Parameters - Domain Annotation:
-e <e>, --e-filter <e> the e-value cutoff for protein domains
to be included.
to be included. This is not stable
across versions, so consider using
a p-value filter instead.
-p <p>, --p-filter <p> the p-value cutoff for protein domains
to be included. [default: 1e-9]
......@@ -88,6 +95,7 @@ class Annotate(Command): # noqa: D101
self.genome = self._check_flag("--genome")
self.hmm = self._check_flag("--hmm")
self.output = self._check_flag("--output")
self.mask = self._check_flag("--mask", bool)
except InvalidArgument:
raise CommandExit(1)
......@@ -148,10 +156,10 @@ class Annotate(Command): # noqa: D101
from ...orf import PyrodigalFinder
self.info("Extracting", "genes from input sequences", level=1)
orf_finder = PyrodigalFinder(metagenome=True, cpus=self.jobs)
orf_finder = PyrodigalFinder(metagenome=True, mask=self.mask, cpus=self.jobs)
unit = "contigs" if len(sequences) > 1 else "contig"
task = self.progress.add_task(description="ORFs finding", total=len(sequences), unit=unit, precision="")
task = self.progress.add_task(description="Finding ORFs", total=len(sequences), unit=unit, precision="")
def callback(record, found, total):
self.success("Found", found, "genes in record", repr(record.id), level=2)
......@@ -166,9 +174,9 @@ class Annotate(Command): # noqa: D101
# Run all HMMs over ORFs to annotate with protein domains
hmms = list(self._custom_hmms() if self.hmm else embedded_hmms())
task = self.progress.add_task(description=f"HMM annotation", unit="HMMs", total=len(hmms), precision="")
task = self.progress.add_task(description=f"Annotating domains", unit="HMMs", total=len(hmms), precision="")
for hmm in self.progress.track(hmms, task_id=task, total=len(hmms)):
task = self.progress.add_task(description=f"{hmm.id} v{hmm.version}", total=hmm.size, unit="domains", precision="")
task = self.progress.add_task(description=f" {hmm.id} v{hmm.version}", total=hmm.size, unit="domains", precision="")
callback = lambda h, t: self.progress.update(task, advance=1)
self.info("Starting", f"annotation with [bold blue]{hmm.id} v{hmm.version}[/]", level=2)
PyHMMER(hmm, self.jobs, whitelist).run(genes, progress=callback)
......@@ -195,13 +203,17 @@ class Annotate(Command): # noqa: D101
if self.e_filter is not None:
self.info("Filtering", "domains with e-value under", self.e_filter, level=1)
key = lambda d: d.i_evalue < self.e_filter
for gene in genes:
gene.protein.domains = list(filter(key, gene.protein.domains))
genes = [
gene.with_protein(gene.protein.with_domains(filter(key, gene.protein.domains)))
for gene in genes
]
if self.p_filter is not None:
self.info("Filtering", "domains with p-value under", self.p_filter, level=1)
key = lambda d: d.pvalue < self.p_filter
for gene in genes:
gene.protein.domains = list(filter(key, gene.protein.domains))
genes = [
gene.with_protein(gene.protein.with_domains(filter(key, gene.protein.domains)))
for gene in genes
]
if self.p_filter is not None or self.e_filter is not None:
count = sum(1 for gene in genes for domain in gene.protein.domains)
self.info("Using", "remaining", count, "domains", level=1)
......
......@@ -90,7 +90,7 @@ class Convert(Command): # noqa: D101
# collect `*.clusters.tsv` files
cluster_files = glob.glob(os.path.join(self.input_dir, "*.clusters.tsv"))
unit = "table" if len(cluster_files) == 1 else "tables"
task = self.progress.add_task("Loading", total=len(cluster_files), unit=unit, precision="")
task = self.progress.add_task("Loading clusters", total=len(cluster_files), unit=unit, precision="")
# load the original coordinates from the `*.clusters.tsv` files
coordinates = {}
types = {}
......
......@@ -190,7 +190,7 @@ class Cv(Train): # noqa: D101
splits = self._kfold_splits(seqs)
# run CV
unit = "fold" if len(splits) == 1 else "folds"
task = self.progress.add_task(description="Cross-Validation", total=len(splits), unit=unit, precision="")
task = self.progress.add_task(description="Cross-validating", total=len(splits), unit=unit, precision="")
self.info("Performing cross-validation")
for i, (train_indices, test_indices) in enumerate(self.progress.track(splits, task_id=task)):
train_data = self._get_train_data(train_indices, seqs)
......
......@@ -66,14 +66,21 @@ class Run(Annotate): # noqa: D101
--force-clusters-tsv always write a ``clusters.tsv`` file
even when no clusters were found.
Parameters - Gene Calling:
-M, --mask Enable unknown region masking to
prevent genes from stretching across
unknown nucleotides.
Parameters - Domain Annotation:
-e <e>, --e-filter <e> the e-value cutoff for protein domains
to be included.
to be included. This is not stable
across versions, so consider using
a p-value filter instead.
-p <p>, --p-filter <p> the p-value cutoff for protein domains
to be included. [default: 1e-9]
Parameters - Cluster Detection:
-c, --cds <N> the minimum number of coding sequences a
-c <N>, --cds <N> the minimum number of coding sequences a
valid cluster must contain. [default: 3]
-m <m>, --threshold <m> the probability threshold for cluster
detection. Default depends on the
......@@ -81,6 +88,13 @@ class Run(Annotate): # noqa: D101
0.6 for antismash).
--postproc <method> the method to use for cluster validation
(antismash or gecco). [default: gecco]
-E <N>, --edge-distance <N> the minimum number of annotated genes
that must separate a cluster from the
edge. Edge clusters will still be
included if they are longer. A lower
number will increase the number of
false positives on small contigs.
[default: 10]
Parameters - Debug:
--model <directory> the path to an alternative CRF model
......@@ -113,6 +127,7 @@ class Run(Annotate): # noqa: D101
self.threshold = self._check_flag("--threshold", float, lambda x: 0 <= x <= 1, hint="number between 0 and 1")
self.jobs = self._check_flag("--jobs", int, lambda x: x >= 0, hint="positive or null integer")
self.postproc = self._check_flag("--postproc", str, lambda x: x in ("gecco", "antismash"), hint="'gecco' or 'antismash'")
self.edge_distance = self._check_flag("--edge-distance", int, lambda x: x >= 0, hint="positive or null integer")
self.format = self._check_flag("--format")
self.genome = self._check_flag("--genome")
self.model = self._check_flag("--model")
......@@ -120,6 +135,7 @@ class Run(Annotate): # noqa: D101
self.output_dir = self._check_flag("--output-dir")
self.antismash_sideload = self._check_flag("--antismash-sideload", bool)
self.force_clusters_tsv = self._check_flag("--force-clusters-tsv", bool)
self.mask = self._check_flag("--mask", bool)
except InvalidArgument:
raise CommandExit(1)
......@@ -173,7 +189,7 @@ class Run(Annotate): # noqa: D101
self.info("Predicting", "cluster probabilitites with the CRF model", level=1)
unit = "genes" if len(genes) > 1 else "gene"
task = self.progress.add_task("Prediction", total=len(genes), unit=unit, precision="")
task = self.progress.add_task("Predicting marginals", total=len(genes), unit=unit, precision="")
return list(crf.predict_probabilities(
self.progress.track(genes, task_id=task, total=len(genes)),
cpus=self.jobs
......@@ -192,11 +208,16 @@ class Run(Annotate): # noqa: D101
from ...refine import ClusterRefiner
self.info("Extracting", "predicted biosynthetic regions", level=1)
refiner = ClusterRefiner(self.threshold, self.postproc, self.cds)
refiner = ClusterRefiner(
threshold=self.threshold,
criterion=self.postproc,
n_cds=self.cds,
edge_distance=self.edge_distance
)
total = len({gene.source.id for gene in genes})
unit = "contigs" if total > 1 else "contig"
task = self.progress.add_task("Segmentation", total=total, unit=unit, precision="")
task = self.progress.add_task("Extracting clusters", total=total, unit=unit, precision="")
clusters = []
gene_groups = itertools.groupby(genes, lambda g: g.source.id)
......@@ -212,7 +233,7 @@ class Run(Annotate): # noqa: D101
self.info("Predicting", "BGC types", level=1)
unit = "cluster" if len(clusters) == 1 else "clusters"
task = self.progress.add_task("Type prediction", total=len(clusters), unit=unit, precision="")
task = self.progress.add_task("Predicting types", total=len(clusters), unit=unit, precision="")
clusters_new = []
classifier = TypeClassifier.trained(self.model)
......
......@@ -302,9 +302,13 @@ class Train(Command): # noqa: D101
cluster_row.start <= gene.start and gene.end <= cluster_row.end
for cluster_row in cluster_by_seq[seq_id]
):
gene.protein.domains = [d.with_probability(1) for d in gene.protein.domains]
gene = gene.with_protein(
gene.protein.with_domains([d.with_probability(1) for d in gene.protein.domains])
)
else:
gene.protein.domains = [d.with_probability(0) for d in gene.protein.domains]
gene = gene.with_protein(
gene.protein.with_domains([d.with_probability(0) for d in gene.protein.domains])
)
labelled_genes.append(gene)
self.progress.update(task_id=task, advance=1)
......
......@@ -16,7 +16,6 @@ import random
import textwrap
import typing
import warnings
from multiprocessing.pool import Pool
from typing import (
Callable,
Dict,
......@@ -36,7 +35,6 @@ import sklearn.model_selection
import sklearn.preprocessing
from ..model import Gene
from .._meta import OrderedPoolWrapper
from . import features
from .cv import LeaveOneGroupOut
from .select import fisher_significance
......@@ -96,7 +94,6 @@ class ClusterCRF(object):
feature_type: str = "protein",
algorithm: str = "lbfgs",
overlap: int = 2,
pool_factory: Union[Type[Pool], Callable[[Optional[int]], Pool]] = Pool,
**kwargs: Dict[str, object],
) -> None:
"""Create a new `ClusterCRF` instance.
......@@ -110,11 +107,6 @@ class ClusterCRF(object):
overlap (`int`): In case of ``feature_type = "overlap"``, defines
the sliding window size to use. The resulting window width is
``2*overlap+1``.
pool_factory (`multiprocessing.pool.Pool` subclass, or callable):
The factory to use to create a new pool instance for methods
that can perform operations in parallel. *It is called with
a single argument which is the number of workers to create,
or* `None` *to create a much workers as there are CPUs.*
Any additional keyword argument is passed as-is to the internal
`~sklearn_crfsuite.CRF` constructor.
......@@ -130,7 +122,6 @@ class ClusterCRF(object):
self.feature_type: str = feature_type
self.overlap: int = overlap
self.algorithm = algorithm
self.pool_factory = pool_factory
self.significance: Optional[Dict[str, float]] = None
self.significant_features: Optional[FrozenSet[str]] = None
self.model = sklearn_crfsuite.CRF(
......@@ -160,12 +151,10 @@ class ClusterCRF(object):
else:
raise ValueError("invalid feature type")
# proces each sequence / group in parallel
with OrderedPoolWrapper(self.pool_factory(_cpus)) as pool:
# extract features in parallel and predict cluster probabilities
marginals = self.model.predict_marginals(pool.map(extract, seqs))
# Annotate the genes with the predicted probabilities
annotated_seqs = pool.starmap(annotate, zip(seqs, marginals))
# extract features and predict cluster probabilities
marginals = self.model.predict_marginals([list(extract(seq)) for seq in seqs])
# Annotate the genes with the predicted probabilities
annotated_seqs = [annotate(seq, m) for seq, m in zip(seqs, marginals)]
# return the genes that were passed as input but now having BGC
# probabilities set
......@@ -222,17 +211,16 @@ class ClusterCRF(object):
# remove non significant domains
for i, seq in enumerate(seqs):
for j, gene in enumerate(seq):
seqs[i][j] = gene = copy.deepcopy(gene)
gene.protein.domains = [
domain for domain in gene.protein.domains
if domain.name in self.significant_features
]
# proces each sequence / group in parallel
with OrderedPoolWrapper(self.pool_factory(_cpus)) as pool:
# extract features in parallel and predict cluster probabilities
X = pool.map(extract_features, seqs)
Y = pool.map(extract_labels, seqs)
seqs[i][j] = gene.with_protein(
gene.protein.with_domains([
domain for domain in gene.protein.domains
if domain.name in self.significant_features
])
)
# extract features and labels
X = [list(extract_features(seq)) for seq in seqs]
Y = [list(extract_labels(seq)) for seq in seqs]
# check labels
if all(y == "1" for y in Y):
......
......@@ -21,7 +21,12 @@ def extract_features_group(sequence: Iterable[Gene]) -> List[Dict[str, float]]:
# Fixing this requires retraining the model so that it is aware of
# unannotated proteins in the training data, and learns to ignore them.
# When it is done, make sure to edit `postprocessing` as well.
return [{d.name: 1 - d.i_evalue for d in g.protein.domains } for g in sequence if g.protein.domains]
return [
{d.name: 1 - d.i_evalue for d in g.protein.domains}
for g in sequence
if g.protein.domains
]
def extract_labels_group(sequence: Iterable[Gene]) -> List[str]:
"""Extract labels at the group/protein level from an iterable of genes.
......@@ -33,28 +38,37 @@ def extract_labels_group(sequence: Iterable[Gene]) -> List[str]:
]
def annotate_probabilities_group(sequence: "_S", marginals: List[Dict[str, float]]) -> "_S":
def annotate_probabilities_group(
sequence: "_S", marginals: List[Dict[str, float]]
) -> "_S":
"""Annotate genes with marginals obtained from a CRF at the protein level.
"""
probabilities = iter(marginals)
for gene in sequence:
if not gene.protein.domains:
continue
p = next(probabilities)["1"]
gene.protein.domains = [ d.with_probability(p) for d in gene.protein.domains]
return sequence
if gene.protein.domains:
p = next(probabilities)["1"]
gene = gene.with_protein(
gene.protein.with_domains(
[d.with_probability(p) for d in gene.protein.domains]
)
)
yield gene
def annotate_probabilities_single(sequence: "_S", marginals: List[Dict[str, float]]) -> "_S":
def annotate_probabilities_single(
sequence: "_S", marginals: List[Dict[str, float]]
) -> "_S":
"""Annotate genes with marginals obtained from a CRF at the domain level.
"""
domains = [ domain for gene in sequence for domain in gene.protein.domains ]
# FIXME
domains = [domain for gene in sequence for domain in gene.protein.domains]
for domain, probability in itertools.zip_longest(domains, marginals):
assert domain is not None
assert probability is not None
domain.probability = probability["1"]
return sequence
def extract_labels_single(sequence: Iterable[Gene]) -> List[str]:
"""Extract labels at the domain level from an iterable of genes.
"""
......
......@@ -42,7 +42,6 @@ class InterPro:
@classmethod
def load(cls) -> "InterPro":
with importlib_resources.open_binary(__name__, "interpro.json.gz") as f:
data = json.load(gzip.open(f, mode="rt"))
entries = [ InterProEntry(**entry["metadata"]) for entry in data ]
with importlib_resources.open_binary(__name__, "interpro.json") as f:
entries = [ InterProEntry(**entry) for entry in json.load(f) ]
return cls(entries)