Commits (7)
......@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.3-post1...master
## [v0.8.4] - 2021-09-26
[v0.8.4]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.3-post1...v0.8.4
### Fixed
- `gecco convert gbk --format bigslice` failing to run because of outdated code ([#5](https://github.com/zellerlab/GECCO/issues/5)).
- `gecco convert gbk --format bigslice` not creating files with names conforming to BiG-SLiCE expected input.
### Changed
- Bump minimum `pyrodigal` version to `v0.6.2` to use platform-accelerated code if supported.
## [v0.8.3-post1] - 2021-08-23
[v0.8.3-post1]: https://git.embl.de/grp-zeller/GECCO/compare/v0.8.3...v0.8.3-post1
### Fixed
......
......@@ -69,7 +69,7 @@ Additional parameters of interest are:
- `--jobs`, which controls the number of threads that will be spawned by
GECCO whenever a step can be parallelized. The default, *0*, will
autodetect the number of CPUs on the machine using
[`multiprocessing.cpu_count`](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.cpu_count).
[`os.cpu_count`](https://docs.python.org/3/library/os.html#os.cpu_count).
- `--cds`, controlling the minimum number of consecutive genes a BGC region
must have to be detected by GECCO (default is 3).
- `--threshold`, controlling the minimum probability for a gene to be
......
{% extends "!layout.html" %}
{% block extrahead %}
{{ super() }}
<!-- EMBL usage tracker via Matomo -->
<script type="text/javascript">
var _paq = window._paq = window._paq || [];
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://tr-denbi.embl.de/heimdall/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '38']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.type='text/javascript'; g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
{% endblock %}
......@@ -10,4 +10,4 @@ See Also:
__author__ = "Martin Larralde"
__license__ = "GPLv3"
__version__ = "0.8.3-post1"
__version__ = "0.8.4"
......@@ -171,7 +171,7 @@ class Annotate(Command): # noqa: D101
task = self.progress.add_task(description=f"{hmm.id} v{hmm.version}", total=hmm.size, unit="domains", precision="")
callback = lambda h, t: self.progress.update(task, advance=1)
self.info("Starting", f"annotation with [bold blue]{hmm.id} v{hmm.version}[/]", level=2)
features = PyHMMER(hmm, self.jobs, whitelist).run(genes, progress=callback)
PyHMMER(hmm, self.jobs, whitelist).run(genes, progress=callback)
self.success("Finished", f"annotation with [bold blue]{hmm.id} v{hmm.version}[/]", level=2)
self.progress.update(task_id=task, visible=False)
......
......@@ -11,6 +11,7 @@ import os
import operator
import multiprocessing
import random
import re
import typing
from typing import Any, Dict, Union, Optional, List, TextIO, Mapping
......@@ -48,15 +49,19 @@ class Convert(Command): # noqa: D101
combination with other tools. The supported formats are listed
below:
* ``gecco convert gbk --format=bigslice``: convert and alias the
GenBank files in the given directory so that they can be loaded by
BiG-SLiCE. Output files are named ``*.regionNNN.gbk``.
* ``gecco convert gbk --format=fna``: convert the GenBank files to
FASTA files containing the nucleotide sequence of the cluster.
Output files are named ``*.fna``.
* ``gecco convert gbk --format=faa``: convert the GenBank files to
FASTA files containing the amino-acid sequences of all the proteins
in a cluster. Output files are named ``*.faa``.
``gecco convert gbk --format=bigslice``
Convert and alias the GenBank files in the given directory so that
they can be loaded by BiG-SLiCE. Output files are named
``*.regionNNN.gbk``.
``gecco convert gbk --format=fna``
Convert the GenBank files to FASTA files containing the nucleotide
sequence of the cluster. Output files are named ``*.fna``.
``gecco convert gbk --format=faa``
Convert the GenBank files to FASTA files containing the amino-acid
sequences of all the proteins in a cluster. Output files are
named ``*.faa``.
"""
......@@ -89,7 +94,7 @@ class Convert(Command): # noqa: D101
# load the original coordinates from the `*.clusters.tsv` files
coordinates = {}
types = {}
for cluster_file in self.progress.track(cluster_files, task_id=task, precision=""):
for cluster_file in self.progress.track(cluster_files, task_id=task):
cluster_fh = ctx.enter_context(open(cluster_file))
for row in ClusterTable.load(cluster_fh):
ty = ";".join(sorted(ty.name for ty in row.type.unpack()))
......@@ -102,7 +107,7 @@ class Convert(Command): # noqa: D101
task = self.progress.add_task("Converting", total=len(gbk_files), unit=unit, precision="")
done = 0
# rewrite GenBank files
for gbk_file in self.progress.track(gbk_files, task_id=task, total=len(gbk_files), precision=""):
for gbk_file in self.progress.track(gbk_files, task_id=task, total=len(gbk_files)):
# load record and ensure it comes from GECCO
record = Bio.SeqIO.read(gbk_file, "genbank")
if "GECCO-Data" not in record.annotations.get('structured_comment', {}):
......@@ -123,7 +128,9 @@ class Convert(Command): # noqa: D101
subregion_feature.qualifiers["label"] = [types[record.id]]
record.features.append(subregion_feature)
# rename the {id}_cluster_{N}.gbk file to {id}.region{N}.gbk
new_name = gbk_file.replace("_cluster_", ".region")
gbk_name = os.path.basename(gbk_file)
contig_id, cluster_n = re.search("^(.*)_cluster_(\d+).gbk", gbk_file).groups()
new_name = os.path.join(self.input_dir, "{}.region{:03}.gbk".format(contig_id, int(cluster_n)))
self.info(f"Rewriting {gbk_file!r} to {new_name!r}")
Bio.SeqIO.write(record, new_name, "genbank")
done += 1
......
......@@ -50,7 +50,7 @@ install_requires =
numpy ~=1.16 ; python_version >= '3.7'
psutil ~=5.8
pyhmmer ~=0.4.2
pyrodigal ~=0.5.0
pyrodigal ~=0.6.2
rich >=9.0.0
scikit-learn ~=0.24.0
scipy ~=1.4
......
import contextlib
import glob
import io
import os
import shutil
import tempfile
import textwrap
import unittest
from unittest import mock
import Bio.SeqIO
from Bio.Seq import Seq
import gecco.orf
import gecco.hmmer
import gecco.cli.commands.convert
from gecco.model import Cluster, ProductType, ClusterTable, FeatureTable
from gecco.cli import main
from gecco.cli.commands.run import Run
from gecco.cli.commands.convert import Convert
from ._base import TestCommand
class TestConvert(TestCommand, unittest.TestCase):
command_type = Convert
@property
def folder(self):
return os.path.dirname(os.path.abspath(__file__))
def setUp(self):
self.tmpdir = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.tmpdir)
def test_convert_gbk_bigslice(self):
# load the input sequence and pre-compute feature table
sequence = os.path.join(self.folder, "data", "BGC0001866.fna")
source = Bio.SeqIO.read(sequence, "fasta")
with open(os.path.join(self.folder, "data", "BGC0001866.features.tsv")) as f:
features = FeatureTable.load(f)
genes = list(features.to_genes())
for gene in genes:
gene.source = source
# create a cluster containing the whole sequence and write it down as
# a GenBank file to the mock "output" folder
cluster = Cluster("BGC0001866.1_cluster_1", genes, type=ProductType.Polyketide)
record = cluster.to_seq_record()
with open(os.path.join(self.tmpdir, "{}.gbk".format(cluster.id)), "w") as f:
Bio.SeqIO.write(record, f, "genbank")
# copy the `.clusters.tsv` and `.features.tsv` files
for tsv in ["clusters.tsv", "features.tsv"]:
shutil.copy(
os.path.join(self.folder, "data", "BGC0001866.{}".format(tsv)),
os.path.join(self.tmpdir, "BGC0001866.1.{}".format(tsv))
)
# run the `gecco convert gbk` command
argv = ["-vv", "convert", "gbk", "--input-dir", self.tmpdir, "--format", "bigslice"]
with io.StringIO() as stderr:
retcode = main(argv, stream=stderr)
self.assertEqual(retcode, 0, stderr.getvalue())
# check that the output folder has a new `regionXXX.gbk`
regions = list(map(os.path.basename, glob.glob(os.path.join(self.tmpdir, "*region*.gbk"))))
self.assertEqual(regions, ["BGC0001866.1.region001.gbk"])