Commits (21)
......@@ -30,12 +30,37 @@ jobs:
name: dist
path: dist/*
wheel:
  runs-on: ubuntu-latest
  # Display name of the job; this job builds the wheel, not the sdist.
  name: Build wheel distribution
  steps:
    - name: Checkout code
      uses: actions/checkout@v2
      with:
        submodules: true
    - name: Set up Python 3.9
      uses: actions/setup-python@v2
      with:
        # Quoted so the version is always read as a string, never a float.
        python-version: "3.9"
    # `list_requirements` is a project-specific setup.py command; the next
    # step then installs from `requirements.txt`.
    - name: List project dependencies
      run: python setup.py list_requirements
    - name: Install project dependencies
      run: python -m pip install -r requirements.txt
    - name: Build wheel distribution
      run: python setup.py bdist_wheel
    # Uploaded under the artifact name `dist`.
    - name: Store built wheels
      uses: actions/upload-artifact@v2
      with:
        name: dist
        path: dist/*
upload:
environment: PyPI
runs-on: ubuntu-latest
name: Upload
needs:
- sdist
- wheel
steps:
- name: Checkout code
uses: actions/checkout@v1
......
......@@ -43,7 +43,7 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
# A step may only carry one `run` key: keep the single command that also
# installs numpy together with the CI tooling.
- name: Update CI dependencies
  run: python -m pip install -U pip coverage wheel numpy
- name: List project dependencies
run: python setup.py list_requirements
- name: Install project dependencies
......
......@@ -18,8 +18,10 @@ variables:
before_script:
- python -m pip install -U wheel coverage tqdm pyhmmer
script:
  # Generate requirements.txt from the project metadata, then install it.
  - python setup.py list_requirements -s
  - python -m pip install -U -r requirements.txt
  # Build the embedded data in place and produce a wheel in `dist/`.
  - python setup.py build_data --inplace bdist_wheel
  # Install the freshly built wheel only (`--no-index`), under the
  # `gecco-tool` distribution name, with the `train` extra.
  - python -m pip install --find-links=dist --no-index gecco-tool[train]
  - python -m coverage run -p -m unittest discover -vv
after_script:
- python -m coverage combine
......@@ -63,8 +65,8 @@ docs:
dependencies:
- test:python3.9
before_script:
  # Generate requirements.txt, then install the documentation and project
  # requirements in a single pip invocation.
  - python setup.py list_requirements -s
  - pip install -U -r docs/requirements.txt -r requirements.txt
script:
- sphinx-build -b html docs public
artifacts:
......@@ -125,50 +127,3 @@ deploy:codacy:
- pip install -U codacy-coverage
script:
- python -m codacy -r coverage.xml
# Deploy-stage job running `chandler push` for the zellerlab/GECCO GitHub
# repository, using CHANGELOG.md as the changelog file.
deploy:changelog:
  stage: deploy
  image: ruby
  before_script:
    - 'gem install chandler'
  script:
    - 'chandler push --github="zellerlab/GECCO" --changelog="CHANGELOG.md"'
# Tag-only deploy job: rebuilds the embedded data, then gzips every
# `gecco/hmmer/*.h3m` file and uploads it to the GitHub release of the tag.
deploy:releases:
  image: python:3.9
  stage: deploy
  only:
    - tags
  before_script:
    - python -m pip install -U tqdm pyhmmer
    # Fetch the `github-release` binary used for the upload below.
    - wget "https://github.com/github-release/github-release/releases/download/v0.10.0/linux-amd64-github-release.bz2" -O- | bunzip2 > ./github-release
    - chmod +x ./github-release
  script:
    - python setup.py build_data --inplace --rebuild
  # NOTE(review): the upload lives in `after_script` — confirm that running
  # it regardless of the outcome of `script` is intended.
  after_script:
    # Every expansion is quoted so that paths containing spaces or glob
    # characters are passed through unchanged.
    - |
      for hmm in gecco/hmmer/*.h3m; do
        gzip "$hmm"
        ./github-release upload --user zellerlab --repo GECCO --tag "$CI_COMMIT_TAG" --name "$(basename "$hmm").gz" --file "$hmm.gz"
      done
# Tag-only deploy job: builds the source distribution and validates it
# with `twine check`. The upload command is kept commented out.
deploy:sdist:
  stage: deploy
  image: python:3.9
  only: [tags]
  before_script:
    - 'python -m pip install -U wheel twine'
  script:
    - 'python setup.py sdist'
    - 'twine check dist/*.tar.gz'
    # - twine upload --repository testpypi --skip-existing dist/*.tar.gz
# Tag-only deploy job: validates the wheels found in `dist/` with
# `twine check`. The upload command is kept commented out.
# NOTE(review): the wheels presumably come from the artifacts of the
# `test:python3.9` job listed under `dependencies` — confirm.
deploy:wheel:
  stage: deploy
  image: python:3.9
  only: [tags]
  dependencies:
    - 'test:python3.9'
  before_script:
    - 'python -m pip install -U wheel twine'
  script:
    - 'twine check dist/*.whl'
    # - twine upload --repository testpypi dist/*.whl
......@@ -5,7 +5,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.6.2...master
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.6.3...master
## [v0.6.3] - 2021-05-10
[v0.6.3]: https://git.embl.de/grp-zeller/GECCO/compare/v0.6.2...v0.6.3
### Fixed
- HMMER annotation not properly handling inputs with multiple contigs.
- Some progress bar totals displaying as floats in the CLI.
### Changed
- `PyHMMER` now sets the `Z` and `domZ` values from the number of proteins given to the search pipeline.
- `gecco.cli` delegates imports to make CLI more responsive.
- `pkg_resources` has been replaced with `importlib.resources` and `importlib.metadata` where applicable.
- `multiprocessing.cpu_count` has been replaced with `os.cpu_count` where applicable.
## [v0.6.2] - 2021-05-04
[v0.6.2]: https://git.embl.de/grp-zeller/GECCO/compare/v0.6.1...v0.6.2
......
include README.md
include LICENSE
include static/gecco.png
include gecco/_version.txt
include gecco/py.typed
recursive-include gecco/crf *.pkl *.pkl.md5
......
<img align="right" width="180" height="180" src="static/gecco-square.png">
<img align="right" width="180" height="180" src="https://raw.githubusercontent.com/zellerlab/GECCO/v0.6.2/static/gecco-square.png">
# Hi, I'm GECCO!
......@@ -17,23 +17,36 @@ in genomic and metagenomic data using Conditional Random Fields (CRFs).
[![Changelog](https://img.shields.io/badge/keep%20a-changelog-8A0707.svg?maxAge=2678400&style=flat-square)](https://github.com/zellerlab/GECCO/blob/master/CHANGELOG.md)
[![Issues](https://img.shields.io/github/issues/zellerlab/GECCO.svg?style=flat-square&maxAge=600)](https://github.com/zellerlab/GECCO/issues)
[![Preprint](https://img.shields.io/badge/preprint-bioRxiv-darkblue?style=flat-square&maxAge=2678400)](https://www.biorxiv.org/content/10.1101/2021.05.03.442509v1)
[![PyPI](https://img.shields.io/pypi/v/gecco-tool.svg?style=flat-square&maxAge=3600)](https://pypi.python.org/pypi/gecco-tool)
[![Bioconda](https://img.shields.io/conda/vn/bioconda/gecco?style=flat-square&maxAge=3600)](https://anaconda.org/bioconda/gecco)
[![Versions](https://img.shields.io/pypi/pyversions/gecco-tool.svg?style=flat-square&maxAge=3600)](https://pypi.org/project/gecco-tool/#files)
[![Wheel](https://img.shields.io/pypi/wheel/gecco-tool?style=flat-square&maxAge=3600)](https://pypi.org/project/gecco-tool/#files)
## 🔧 Installing GECCO
GECCO is implemented in [Python](https://www.python.org/), and supports [all
versions](https://endoflife.date/python) from Python 3.6. It requires
additional libraries that can be installed directly from
PyPI, the Python Package Index.
[PyPI](https://pypi.org), the Python Package Index.
Use `pip` to install GECCO on your machine:
Use [`pip`](https://pip.pypa.io/en/stable/) to install GECCO on your
machine:
```console
$ pip install gecco-tool
```
If you'd rather use [Conda](https://conda.io), a package is available
in the [`bioconda`](https://bioconda.github.io/) channel. You can install
with:
```console
$ conda install -c bioconda gecco
```
This will install GECCO, its dependencies, and the data needed to run
predictions. This requires around 100MB of data to be downloaded, so
it could take some time depending on your connection. Once done, you will
have a ``gecco`` command available in your $PATH.
it could take some time depending on your Internet connection. Once done, you
will have a ``gecco`` command available in your $PATH.
*Note that GECCO uses [HMMER3](http://hmmer.org/), which can only run
on PowerPC and recent x86-64 machines running a POSIX operating system.
......@@ -63,7 +76,6 @@ Additional parameters of interest are:
considered part of a BGC region. Using a lower number will increase the
number (and possibly length) of predictions, but reduce accuracy.
<!-- ## 📖 Documentation -->
## 🔖 Reference
......
// Once the document is ready, rewrite the HTML of the last <p> directly
// under #gecco, dropping the space that follows each closing anchor tag.
$(document).ready(function () {
  // Prefer the jQuery instance exposed as `$jqTheme`, else the global one.
  const jq = window.$jqTheme || window.jQuery;
  const lastParagraph = jq("#gecco").children("p").last();
  lastParagraph.html(function (index, text) {
    return text.replaceAll("</a> ", "</a>");
  });
});
......@@ -31,6 +31,7 @@ def setup(app):
# Add custom stylesheet
app.add_css_file("css/main.css")
app.add_js_file("js/apitoc.js")
app.add_js_file("js/homepage.js")
app.add_js_file("js/example-admonition.js")
# -- Project information -----------------------------------------------------
......
......@@ -3,8 +3,7 @@ GECCO
*Biosynthetic Gene Cluster prediction with Conditional Random Fields.*
|GitLabCI| |Coverage| |License| |Source| |Issues| |Preprint|
|GitLabCI| |License| |Coverage| |Source| |Mirror| |Issues| |Preprint| |PyPI| |Bioconda| |Versions| |Wheel|
.. |GitLabCI| image:: https://img.shields.io/gitlab/pipeline/grp-zeller/GECCO/master?gitlab_url=https%3A%2F%2Fgit.embl.de&logo=gitlab&style=flat-square&maxAge=600
:target: https://git.embl.de/grp-zeller/GECCO/-/pipelines
......@@ -27,6 +26,18 @@ GECCO
.. |Preprint| image:: https://img.shields.io/badge/preprint-bioRxiv-darkblue?style=flat-square&maxAge=2678400&logo=arxiv
:target: https://www.biorxiv.org/content/10.1101/2021.05.03.442509v1
.. |PyPI| image:: https://img.shields.io/pypi/v/gecco-tool.svg?style=flat-square&maxAge=3600&logo=pypi
:target: https://pypi.python.org/pypi/gecco-tool
.. |Bioconda| image:: https://img.shields.io/conda/vn/bioconda/gecco?logo=anaconda&style=flat-square&maxAge=3600
:target: https://anaconda.org/bioconda/gecco
.. |Versions| image:: https://img.shields.io/pypi/pyversions/gecco-tool.svg?style=flat-square&maxAge=3600&logo=python
:target: https://pypi.org/project/gecco-tool/#files
.. |Wheel| image:: https://img.shields.io/pypi/wheel/gecco-tool?style=flat-square&maxAge=3600&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAABhGlDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV9TS0UqCnYQdchQnSxIFXHUKhShQqgVWnUwufQLmjQkKS6OgmvBwY/FqoOLs64OroIg+AHi5uak6CIl/i8ptIj14Lgf7+497t4BQr3MNKtrAtB020wl4mImuyoGXxFAEP0YRkxmljEnSUl0HF/38PH1LsqzOp/7c/SqOYsBPpF4lhmmTbxBPL1pG5z3icOsKKvE58TjJl2Q+JHrisdvnAsuCzwzbKZT88RhYrHQxkobs6KpEU8RR1RNp3wh47HKeYuzVq6y5j35C0M5fWWZ6zRHkMAiliBBhIIqSijDRpRWnRQLKdqPd/APuX6JXAq5SmDkWEAFGmTXD/4Hv7u18pMxLykUBwIvjvMxCgR3gUbNcb6PHadxAvifgSu95a/UgZlP0mstLXIE9G0DF9ctTdkDLneAwSdDNmVX8tMU8nng/Yy+KQsM3AI9a15vzX2cPgBp6ip5AxwcAmMFyl7v8O7u9t7+PdPs7wdys3KnxRVKKQAAAAZiS0dEAP8A/wD/oL2nkwAABH9JREFUWMPFV81LJEcUr5kBD3PIQSYIEiYE8qG5iAgbCLiwJz/CXpR4WEMQFxL2H9hbTsldWMSFHILgJZBLUGHvHnKJOEO3joGNOOM6OuOsTPdMV3V3fbyXQ6onNT1+9ChLHjR0V3XVe1Xv937vPYKIJMlj23aOMbYopVxHxF0hhEAt+n1XSrnu+/7iwcFBLum+t/4AAKNCiA0A8DGhAIAvpdwAgNH7GJD1PO8lAOBdBQDQ87yXiJi9Tk8KEUlczs7ORoaGhrYymczHpFcuhRB/IuLrTCZzSAghSqnRVCr1STqd/jKTybwXX6CU+rterz8eHh7+q2e3uEXNZvMh57wdP00QBDuc89mlpaXI6AHbtscsyxojhAwgIpmbm0txzmeDINiJr+ect5vN5sMbXVCtVkfiygGgQimdRkTiOM44pXRNCHEUVyCEOKKUrrmuO46IhDE2DQCVuBHVanXkOgOyUsrXsX13CoXCoOu6ed/3N5P63vf9zVarlS8UCoOI2HUbWke2xwANli7lYRhmEXFSCOFFg0qpBmNs9fT09AdD4U+U0lWlVMO4EQ8RJ4MgyMaN0Lr+MwAARk20A0BFWz+JiFQPU8bY883NzSwiEinlhLHnBCKSra2tLGPsubkGESf39vYGTXcAAEYhShCRCCE2TAsppdOu6+ajkwPAm3K5/CAGoI4BjUajC1yVSuUBALyJbsJ13bznedMxzGwgIiH7+/s5k2Q0gonp83q9PhtHb6PReBTNu667FJ+v1WpLJiYQkZjRAQC+bds5whhbjCF1ViPZdAkVQqxwzjsU6/v+98Yv30XjpVLpszAMN5RSXYByHGc8DMNZc4wxtkg0t0cIdefn51OU0jWt+BIAqOkdpdSK7/sLiHhojB9SSp8wxn6RUoIB2DoiXmi3rk1NTaUQ8a2hb50g4q5x+lcaE0carauVSuUjIcSrfmmYMfarZVk5z/NWI57QgP/d+G2XmFktDMMXmtWi76+jq3Vd91sAuDROt9NsNvOO4+QBYMc41bnv+4+jdXoPxH/pcyAIghfGgQWJkcQz27bH4uFloHvCiJQPDOM+jMaPj48/vy5aLMsak1I+M3WmSR8CAJ13KWXnPZ1OdzJaq9Ua6GfPxC5ot9vfAMBb0wWO4+Rd1+1xQRAEfbngRhCen5/n9XhfQin9rVQq3Q7CeBguLCx0haFSqisMAWAlDMOeMPQ870kQBD8LIa4Nw5mZmd4wvIqIHMcZjx9IKbXCOX/fuNqnVxGRbdufBkGQnIgSUvF
Xcaq9uLjoUHGr1Xoanz87O7uVii3Lyl2ZjBhjfSWjWq32yJwrl8tfxJORLmp6k9F16bhYLN6YjjnnPel4e3v7ynRcKBRuTsc6zBIXJJTS1ZOTE7Mg+ZExdmVBovfoKki0rmQlWbFYHHQc504lmb7FZCXZTUUpY6xTlHqe986K0kRl+fLycqcstyzr3mV5342JUqoFAH+8s8bExES73b53a6YBl71zc6qU+t+a067Hsqyu9pxzLsysFrXnjLFF27YTt+f/AKtN0SMRWK0jAAAAAElFTkSuQmCC
:target: https://pypi.org/project/gecco-tool/#files
Overview
--------
......@@ -52,7 +63,13 @@ GECCO with ``pip``:
.. code:: console
$ pip install https://github.com/zellerlab/GECCO/archive/master.zip
$ pip install gecco-tool
Or with Conda, using the ``bioconda`` channel:
.. code:: console
$ conda install -c bioconda gecco
Predictions
......
......@@ -10,37 +10,35 @@ the Python Package Index. Contrary to other tools in the field
(such as DeepBGC or AntiSMASH), it does not require any external binary.
.. PyPi
.. ^^^^
..
.. GECCO is hosted on the EMBL Git server, but the easiest way to install it is
.. to download the latest release from its `PyPi repository <https://pypi.python.org/pypi/gecco>`_.
.. It will install all dependencies then install the ``gecco`` module:
..
.. .. code:: console
..
.. $ pip install gecco
.. Conda
.. ^^^^^
..
.. GECCO is also available as a `recipe <https://anaconda.org/bioconda/GECCO>`_
.. in the `bioconda <https://bioconda.github.io/>`_ channel. To install, simply
.. use the `conda` installer:
..
.. .. code:: console
..
.. $ conda install -c bioconda GECCO
..
PyPI
^^^^
GECCO is hosted on the EMBL Git server, but the easiest way to install it is
to download the latest release from its `PyPI repository <https://pypi.python.org/pypi/gecco-tool>`_.
It will install all dependencies then install the ``gecco-tool`` package:
.. code:: console
$ pip install gecco-tool
Conda
^^^^^
GECCO is also available as a `recipe <https://anaconda.org/bioconda/GECCO>`_
in the `Bioconda <https://bioconda.github.io/>`_ channel. To install, simply
use the ``conda`` installer:
.. code:: console
$ conda install -c bioconda GECCO
Git + ``pip``
^^^^^^^^^^^^^
Until GECCO is released on PyPI, you can install it from the GitHub repository
directly with ``pip``:
.. If, for any reason, you prefer to download the library from the git repository,
.. you can clone the repository and install the repository by running:
If, for any reason, you prefer to download the library from the git repository,
you can clone the repository and install the repository by running:
.. code:: console
......@@ -59,6 +57,6 @@ having properly installed all the dependencies):
.. code:: console
$ git clone https://git.embl.de/grp-zeller/GECCO/
$ git clone https://github.com/zellerlab/GECCO/
$ cd GECCO
# python setup.py install
"""Biosynthetic Gene Cluster prediction with Conditional Random Fields.
See Also:
*Accurate de novo identification of biosynthetic gene clusters with GECCO*
Laura M. Carroll, Martin Larralde, Jonas Simon Fleck, Ruby Ponnudurai,
Alessio Milanese, Elisa Cappio, Georg Zeller. bioRxiv 2021.05.03.442509
`doi:10.1101/2021.05.03.442509 <https://doi.org/10.1101/2021.05.03.442509>`_
"""
__author__ = "Martin Larralde"
__license__ = "GPLv3"
# The version is a plain literal: nothing is read from `_version.txt`, so
# importing the package does not need `pkg_resources` at all.
__version__ = "0.6.3"
......@@ -8,8 +8,6 @@ import typing
import warnings
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Type, TextIO
import numpy
from .._meta import classproperty
if typing.TYPE_CHECKING:
......@@ -32,6 +30,7 @@ def patch_showwarnings(new_showwarning: "ShowWarning") -> Iterator[None]:
@contextlib.contextmanager
def numpy_error_context(
numpy,
*,
all: Optional[str] = None,
divide: Optional[str] = None,
......@@ -42,10 +41,11 @@ def numpy_error_context(
"""A context manager to modify the `numpy` error behaviour locally.
Example:
>>> with numpy_error_context(divide="ignore"):
>>> import numpy
>>> with numpy_error_context(numpy, divide="ignore"):
... numpy.log10(0)
-inf
>>> with numpy_error_context(divide="raise"):
>>> with numpy_error_context(numpy, divide="raise"):
... numpy.log10(0)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
......
......@@ -140,6 +140,18 @@ class Command(metaclass=abc.ABCMeta):
else:
return value
def _on_import_error(
    self,
    subcommand: str,
    e: ImportError
) -> None:
    """Report that a subcommand could not run because an import failed.

    Emits an error message naming the subcommand and the missing module,
    then prints the traceback of the import error on the console.

    Arguments:
        subcommand (str): The name of the subcommand being executed.
        e (ImportError): The exception raised by the failing import.

    """
    # imported locally, so `rich.traceback` is only loaded on this path
    import rich.traceback

    # `ImportError.name` is `None` when the exception was raised without
    # the `name` keyword; fall back to the exception message instead of
    # reporting a dependency literally called "None".
    missing = e.name if e.name is not None else str(e)
    self.error(f"The [bold blue]{subcommand}[/] subcommand requires optional dependency [bold blue]{missing}[/]")
    traceback = rich.traceback.Traceback.from_exception(type(e), e, e.__traceback__, extra_lines=0)
    self.console.print(traceback)
# -- Logging methods -----------------------------------------------------
def error(self, message, *args, level=0):
......
......@@ -11,36 +11,48 @@ from typing import Mapping, Optional, Type
import docopt
import operator
import pkg_resources
import rich.traceback
from ... import __version__
from ..._meta import classproperty
from .._utils import in_context, patch_showwarnings
from . import __name__ as __parent__
from ._base import Command, CommandExit, InvalidArgument
try:
import importlib.metadata as importlib_metadata
except ImportError:
import importlib_metadata
class Main(Command):
"""The *main* command launched before processing subcommands.
"""
_entry_points_cache = None
@classproperty
def _entry_points(cls):
    """The entry points registered under the subcommand group.

    The lookup is performed once, and the result is stored in
    ``_entry_points_cache`` for all later accesses.
    """
    if cls._entry_points_cache is None:
        entry_points = importlib_metadata.entry_points()
        if hasattr(entry_points, "select"):
            # newer `importlib.metadata` / `importlib_metadata` releases
            # expose a selection API and drop the `dict` interface
            group = entry_points.select(group=__parent__)
        else:
            # older releases return a mapping of group name to entries
            group = entry_points.get(__parent__, [])
        cls._entry_points_cache = list(group)
    return cls._entry_points_cache
@classmethod
def _get_subcommand_names(cls) -> Mapping[str, Type[Command]]:
    """Get the names of the registered subcommands without loading them."""
    # NOTE(review): a list of names is returned although the annotation
    # advertises a mapping — confirm which one callers rely on.
    return [cmd.name for cmd in cls._entry_points]
@classmethod
def _get_subcommands(cls) -> Mapping[str, Type[Command]]:
    """Load every registered subcommand that can be imported.

    Returns:
        dict: The loaded command classes, indexed by entry point name.
        Entry points that fail to load are left out of the mapping.

    """
    commands = {}
    for cmd in cls._entry_points:
        try:
            commands[cmd.name] = cmd.load()
        except Exception:
            # best effort: a subcommand that cannot be loaded is skipped
            # so that the remaining ones are still returned
            pass
    return commands
@classmethod
def _get_subcommand_by_name(cls, name: str) -> Optional[Type[Command]]:
    """Load the subcommand registered under the given name.

    Arguments:
        name (str): The entry point name of the subcommand.

    Returns:
        The loaded command class, or `None` if no entry point has that
        name. Errors raised while loading are not caught here.

    """
    for cmd in cls._entry_points:
        if cmd.name == name:
            return cmd.load()
    return None
......@@ -102,8 +114,8 @@ class Main(Command):
subcmd_name = self.args["<cmd>"]
try:
subcmd_cls = self._get_subcommand_by_name(subcmd_name)
except pkg_resources.DistributionNotFound as dnf:
self.error("The", repr(subcmd_name), "subcommand requires package", dnf.req)
except ImportError as err:
self._on_import_error(subcmd_name, err)
return 1
# exit if no known command was found
......@@ -144,7 +156,12 @@ class Main(Command):
except KeyboardInterrupt:
self.error("Interrupted")
return -signal.SIGINT
except ImportError as err:
self._on_import_error(subcmd_name, err)
return 1
except Exception as e:
import rich.traceback
self.error(
"An unexpected error occurred. Consider opening"
" a new issue on the bug tracker"
......
......@@ -15,20 +15,8 @@ import typing
import signal
from typing import Any, Dict, Union, Optional, List, TextIO, Mapping
import numpy
import pyhmmer
import rich.emoji
import rich.progress
from Bio import SeqIO
from ._base import Command, CommandExit, InvalidArgument
from .._utils import guess_sequences_format, in_context, patch_showwarnings
from ...crf import ClusterCRF
from ...hmmer import PyHMMER, HMM, embedded_hmms
from ...model import FeatureTable, ClusterTable, ProductType
from ...orf import PyrodigalFinder
from ...types import TypeClassifier
from ...refine import ClusterRefiner
class Annotate(Command): # noqa: D101
......@@ -82,6 +70,9 @@ class Annotate(Command): # noqa: D101
raise CommandExit(1)
def _custom_hmms(self):
import pyhmmer
from ...hmmer import HMM
for path in self.hmm:
base = os.path.basename(path)
if base.endswith(".gz"):
......@@ -101,6 +92,8 @@ class Annotate(Command): # noqa: D101
# ---
def _load_sequences(self):
from Bio import SeqIO
if self.format is not None:
format = self.format
self.info("Using", "user-provided sequence format", repr(format), level=2)
......@@ -123,6 +116,8 @@ class Annotate(Command): # noqa: D101
return sequences
def _extract_genes(self, sequences):
from ...orf import PyrodigalFinder
self.info("Extracting", "genes from input sequences", level=1)
orf_finder = PyrodigalFinder(metagenome=True, cpus=self.jobs)
......@@ -136,12 +131,14 @@ class Annotate(Command): # noqa: D101
return list(orf_finder.find_genes(sequences, progress=callback))
def _annotate_domains(self, genes):
from ...hmmer import PyHMMER, embedded_hmms
self.info("Running", "HMMER domain annotation", level=1)
# Run all HMMs over ORFs to annotate with protein domains
hmms = list(self._custom_hmms() if self.hmm else embedded_hmms())
task = self.progress.add_task(description=f"HMM annotation", unit="HMMs", total=len(hmms))
for hmm in self.progress.track(hmms, task_id=task):
for hmm in self.progress.track(hmms, task_id=task, total=len(hmms)):
task = self.progress.add_task(description=f"{hmm.id} v{hmm.version}", total=hmm.size, unit="domains")
callback = lambda h, t: self.progress.update(task, advance=1)
self.info("Starting", f"annotation with [bold blue]{hmm.id} v{hmm.version}[/]", level=2)
......@@ -171,6 +168,8 @@ class Annotate(Command): # noqa: D101
return genes
def _write_feature_table(self, genes):
from ...model import FeatureTable
self.info("Writing", "feature table to", repr(self.output), level=1)
with open(self.output, "w") as f:
FeatureTable.from_genes(genes).dump(f)
......
......@@ -12,14 +12,8 @@ import random
import typing
from typing import Any, Dict, Union, Optional, List, TextIO, Mapping
import rich.progress
import sklearn.model_selection
from ._base import Command, CommandExit, InvalidArgument
from .._utils import in_context, patch_showwarnings
from ...model import ClusterTable, FeatureTable, ProductType
from ...crf import ClusterCRF
from ...crf.cv import LeaveOneGroupOut
from .._utils import patch_showwarnings
class Cv(Command): # noqa: D101
......@@ -118,6 +112,8 @@ class Cv(Command): # noqa: D101
# --
def _load_features(self):
from ...model import FeatureTable
self.info("Loading", "features table from file", repr(self.features))
with open(self.features) as in_:
return FeatureTable.load(in_)
......@@ -150,6 +146,9 @@ class Cv(Command): # noqa: D101
return seqs
def _loto_splits(self, seqs):
from ...crf.cv import LeaveOneGroupOut
from ...model import ClusterTable, ProductType
self.info("Loading", "the clusters table")
with open(self.clusters) as in_:
table = ClusterTable.load(in_)
......@@ -170,6 +169,7 @@ class Cv(Command): # noqa: D101
return list(LeaveOneGroupOut().split(seqs, groups=groups))
def _kfold_splits(self, seqs):
    """Split the sequences with a `KFold` built from ``self.splits``.

    Returns:
        list: The materialized output of ``KFold.split`` on ``seqs``.

    """
    # imported locally, so scikit-learn is only loaded when this runs
    import sklearn.model_selection

    folds = sklearn.model_selection.KFold(self.splits)
    return list(folds.split(seqs))
def _get_train_data(self, train_indices, seqs):
......@@ -184,6 +184,8 @@ class Cv(Command): # noqa: D101
return test_data
def _fit_predict(self, train_data, test_data):
from ...crf import ClusterCRF
# fit and predict the CRF for the current fold
crf = ClusterCRF(
self.feature_type,
......@@ -196,6 +198,8 @@ class Cv(Command): # noqa: D101
return crf.predict_probabilities(test_data, cpus=self.jobs)
def _write_fold(self, fold, genes, append=False):
from ...model import FeatureTable
frame = FeatureTable.from_genes(genes).to_dataframe()
with open(self.output, "a" if append else "w") as out:
frame.assign(fold=fold).to_csv(out, header=not append, sep="\t", index=False)
......
......@@ -4,22 +4,17 @@
import contextlib
import csv
import itertools
import logging
import math
import multiprocessing.pool
import os
import pickle
import random
import signal
import typing
import warnings
import numpy
import pandas
from ._base import Command, InvalidArgument, CommandExit
from .._utils import numpy_error_context, in_context, patch_showwarnings
if typing.TYPE_CHECKING:
import pandas
class Embed(Command): # noqa: D101
......@@ -79,10 +74,14 @@ class Embed(Command): # noqa: D101
# ---
def _read_table(self, path: str) -> "pandas.DataFrame":
import pandas
self.info("Reading", "table from", repr(path), level=2)
return pandas.read_table(path, dtype={"domain": str})
def _read_no_bgc(self):
import pandas
self.info("Reading", "non-BGC features")
# Read the non-BGC table and assign the Y column to `0`
......@@ -101,6 +100,8 @@ class Embed(Command): # noqa: D101
]
def _read_bgc(self):
import pandas
self.info("Reading", "BGC features")
# Read the BGC table, assign the Y column to `1`
......@@ -115,6 +116,8 @@ class Embed(Command): # noqa: D101
return [s for _, s in bgc_df.groupby("sequence_id", sort=True)]
def _read_mapping(self):
import pandas
if self.mapping is not None:
mapping = pandas.read_table(self.mapping)
return { t.bgc_id:t.sequence_id for t in mapping.itertuples() }
......@@ -130,6 +133,9 @@ class Embed(Command): # noqa: D101
no_bgc: "pandas.DataFrame",
bgc: "pandas.DataFrame",
) -> "pandas.DataFrame":
import pandas
import numpy
by_prots = [s for _, s in no_bgc.groupby("protein_id", sort=False)]
# cut the input in half to insert the bgc in the middle
index_half = len(by_prots) // 2
......@@ -151,7 +157,7 @@ class Embed(Command): # noqa: D101
embed = embed.reset_index(drop=True)
embed = embed[embed["i_evalue"] < self.e_filter]
# add additional columns based on info from BGC and non-BGC
with numpy_error_context(divide="ignore"):
with numpy_error_context(numpy, divide="ignore"):
bgc_id = bgc["sequence_id"].values[0]
sequence_id = no_bgc["sequence_id"].apply(lambda x: x).values[0]
embed = embed.assign(sequence_id=sequence_id, BGC_id=bgc_id)
......@@ -160,6 +166,8 @@ class Embed(Command): # noqa: D101
return embed
def _make_embeddings(self, no_bgc_list, bgc_list, mapping):
import pandas
self.info("Embedding", len(bgc_list), "BGCs into", len(no_bgc_list), "contigs")
_jobs = os.cpu_count() if not self.jobs else self.jobs
......
......@@ -2,15 +2,7 @@
"""
import contextlib
import csv
import logging
import multiprocessing
import os
import pickle
import random
import sys
import textwrap
import typing
from typing import Any, Dict, Mapping, List, Optional, TextIO
import rich.console
......
......@@ -15,20 +15,12 @@ import typing
import signal
from typing import Any, Dict, Union, Optional, List, TextIO, Mapping
import numpy
import rich.emoji
import rich.progress
from Bio import SeqIO
from ._base import Command, CommandExit, InvalidArgument
from .annotate import Annotate
from .._utils import guess_sequences_format, in_context, patch_showwarnings
from ...crf import ClusterCRF
from ...hmmer import PyHMMER, HMM, embedded_hmms
from ...model import FeatureTable, ClusterTable, ProductType
from ...orf import PyrodigalFinder
from ...types import TypeClassifier
from ...refine import ClusterRefiner
from .._utils import patch_showwarnings
class Run(Annotate): # noqa: D101
......@@ -119,6 +111,8 @@ class Run(Annotate): # noqa: D101
break
def _predict_probabilities(self, genes):
from ...crf import ClusterCRF
if self.model is None:
self.info("Loading", "embedded CRF pre-trained model", level=1)
else:
......@@ -129,11 +123,13 @@ class Run(Annotate): # noqa: D101
unit = "genes" if len(genes) > 1 else "gene"
task = self.progress.add_task("Prediction", total=len(genes), unit=unit)
return list(crf.predict_probabilities(
self.progress.track(genes, task_id=task),
self.progress.track(genes, task_id=task, total=len(genes)),
cpus=self.jobs
))
def _write_feature_table(self, genes):
from ...model import FeatureTable
base, _ = os.path.splitext(os.path.basename(self.genome))
pred_out = os.path.join(self.output_dir, f"{base}.features.tsv")
self.info("Writing", "feature table to", repr(pred_out), level=1)
......@@ -141,6 +137,8 @@ class Run(Annotate): # noqa: D101
FeatureTable.from_genes(genes).dump(f)
def _extract_clusters(self, genes):
from ...refine import ClusterRefiner
self.info("Extracting", "predicted biosynthetic regions", level=1)
refiner = ClusterRefiner(self.threshold, self.postproc, self.cds)
......@@ -156,6 +154,9 @@ class Run(Annotate): # noqa: D101
return clusters
def _predict_types(self, clusters):
from ...model import ProductType
from ...types import TypeClassifier
self.info("Predicting", "BGC types", level=1)
unit = "cluster" if len(clusters) == 1 else "clusters"
......@@ -178,6 +179,8 @@ class Run(Annotate): # noqa: D101
return clusters_new
def _write_cluster_table(self, clusters):
from ...model import ClusterTable
base, _ = os.path.splitext(os.path.basename(self.genome))
cluster_out = os.path.join(self.output_dir, f"{base}.clusters.tsv")
self.info("Writing", "cluster table to", repr(cluster_out), level=1)
......@@ -185,6 +188,8 @@ class Run(Annotate): # noqa: D101
ClusterTable.from_clusters(clusters).dump(out)
def _write_clusters(self, clusters):
from Bio import SeqIO
for cluster in clusters:
gbk_out = os.path.join(self.output_dir, f"{cluster.id}.gbk")
self.info("Writing", f"cluster [bold blue]{cluster.id}[/] to", repr(gbk_out), level=1)
......