......@@ -5,7 +5,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1...master
[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.2...master
## [v0.9.2] - 2022-04-11
[v0.9.2]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1...v0.9.2
### Added
- Padding of short sequences with empty genes when predicting probabilities in `ClusterCRF`.
## [v0.9.1] - 2022-04-05
[v0.9.1]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1-alpha4...v0.9.1
......
......@@ -10,4 +10,4 @@ See Also:
__author__ = "Martin Larralde"
__license__ = "GPLv3"
__version__ = "0.9.1"
__version__ = "0.9.2"
......@@ -64,9 +64,9 @@ class Run(Annotate): # noqa: D101
--antismash-sideload write an AntiSMASH v6 sideload JSON
file next to the output files.
--force-tsv always write TSV output files even
when they are empty (e.g. because
when they are empty (e.g. because
no genes or no clusters were found).
Parameters - Gene Calling:
-M, --mask Enable unknown region masking to
......@@ -82,6 +82,9 @@ class Run(Annotate): # noqa: D101
to be included. [default: 1e-9]
Parameters - Cluster Detection:
--no-pad disable padding of gene sequences
(used to predict BGCs in contigs
smaller than the CRF window length).
-c <N>, --cds <N> the minimum number of coding sequences a
valid cluster must contain. [default: 3]
-m <m>, --threshold <m> the probability threshold for cluster
......@@ -138,6 +141,7 @@ class Run(Annotate): # noqa: D101
self.antismash_sideload = self._check_flag("--antismash-sideload", bool)
self.force_tsv = self._check_flag("--force-tsv", bool)
self.mask = self._check_flag("--mask", bool)
self.no_pad = self._check_flag("--no-pad", bool)
except InvalidArgument:
raise CommandExit(1)
......@@ -175,7 +179,7 @@ class Run(Annotate): # noqa: D101
task = self.progress.add_task("Predicting marginals", total=len(genes), unit=unit, precision="")
return list(crf.predict_probabilities(
self.progress.track(genes, task_id=task, total=len(genes)),
cpus=self.jobs
pad=not self.no_pad,
))
def _extract_clusters(self, genes):
......
......@@ -139,7 +139,7 @@ class ClusterCRF(object):
**kwargs,
)
def predict_probabilities(self, genes: Iterable[Gene], *, cpus: Optional[int] = None) -> List[Gene]:
def predict_probabilities(self, genes: Iterable[Gene], *, pad: bool = True) -> List[Gene]:
"""Predict how likely each given gene is part of a gene cluster.
"""
# select the feature extraction method
......@@ -163,22 +163,38 @@ class ClusterCRF(object):
# extract features
sequence: List[Gene] = sorted(group, key=operator.attrgetter("start"))
feats: List[Dict[str, bool]] = extract_features(sequence)
delta: int = 0
# sequences shorter than the window: pad them with empty features, or skip them when padding is disabled (a warning is emitted either way)
if len(feats) < self.window_size:
warnings.warn(
f"Contig {sequence[0].source.id!r} does not contain enough"
f" genes ({len(sequence)}) for sliding window of size"
f" {self.window_size}"
)
predicted.extend(sequence)
continue
if pad:
unit = self.feature_type if self.window_size - len(feats) == 1 else f"{self.feature_type}s"
warnings.warn(
f"Contig {sequence[0].source.id!r} does not contain enough"
f" {self.feature_type}s ({len(sequence)}) for sliding window"
f" of size {self.window_size}, padding with"
f" {self.window_size - len(feats)} {unit}"
)
# insert empty feature dicts on both ends; when delta is odd, the extra one goes at the end
delta = self.window_size - len(feats)
for _ in range(delta // 2):
feats.insert(0, {})
for _ in range(delta // 2 + delta % 2):
feats.append({})
else:
warnings.warn(
f"Contig {sequence[0].source.id!r} does not contain enough"
f" {self.feature_type}s ({len(sequence)}) for sliding window"
f" of size {self.window_size}"
)
predicted.extend(sequence)
continue
# predict marginals over a sliding window, storing maximum probabilities
probabilities = numpy.zeros(len(sequence))
probabilities = numpy.zeros(max(len(sequence), self.window_size))
for win in sliding_window(len(feats), self.window_size, self.window_step):
marginals = [p['1'] for p in self.model.predict_marginals_single(feats[win])]
numpy.maximum(probabilities[win], marginals, out=probabilities[win])
# label genes with maximal probabilities
predicted.extend(annotate_probabilities(sequence, probabilities))
predicted.extend(annotate_probabilities(sequence, probabilities[delta//2:][:len(sequence)]))
# return the genes that were passed as input, now annotated with their BGC probabilities
return predicted
......
LOCUS BGC0001866.1_cluster_1 32633 bp DNA linear UNK 21-NOV-2021
LOCUS BGC0001866.1_cluster_1 32633 bp DNA linear UNK 06-APR-2022
DEFINITION BGC0001866.1 Byssochlamys spectabilis strain CBS 101075 chromosome
Unknown C8Q69scaffold_14, whole genome shotgun sequence.
ACCESSION BGC0001866.1_cluster_1
......@@ -15,15 +15,15 @@ REFERENCE 1
JOURNAL bioRxiv (2021.05.03.442509)
REMARK doi:10.1101/2021.05.03.442509
COMMENT ##GECCO-Data-START##
version :: GECCO v0.8.10
creation_date :: 2021-11-21T16:33:58.470847
version :: GECCO v0.9.1
creation_date :: 2022-04-06T01:08:36.965708
biosyn_class :: Polyketide
alkaloid_probability :: 0.0
polyketide_probability :: 0.98
alkaloid_probability :: 0.010000000000000009
polyketide_probability :: 0.96
ripp_probability :: 0.0
saccharide_probability :: 0.0
terpene_probability :: 0.0
nrp_probability :: 0.09999999999999998
terpene_probability :: 0.010000000000000009
nrp_probability :: 0.14
##GECCO-Data-END##
FEATURES Location/Qualifiers
CDS complement(1..1143)
......@@ -41,7 +41,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00394"
/db_xref="InterPro:IPR001117"
/note="e-value: 2.1941888078432915e-08"
/note="e-value: 2.262067179461254e-08"
/note="p-value: 8.178117062405111e-12"
/function="Multicopper oxidase"
/standard_name="PF00394"
......@@ -49,7 +49,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF07731"
/db_xref="InterPro:IPR011706"
/note="e-value: 3.9374169295176556e-23"
/note="e-value: 4.059222969454281e-23"
/note="p-value: 1.467542649838858e-26"
/function="Multicopper oxidase"
/standard_name="PF07731"
......@@ -93,7 +93,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00891"
/db_xref="InterPro:IPR001077"
/note="e-value: 4.743887678074703e-16"
/note="e-value: 4.890642309934635e-16"
/note="p-value: 1.7681280946979883e-19"
/function="O-methyltransferase domain"
/standard_name="PF00891"
......@@ -108,7 +108,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00135"
/db_xref="InterPro:IPR002018"
/note="e-value: 4.674605664377319e-21"
/note="e-value: 4.819217021121008e-21"
/note="p-value: 1.7423055029360116e-24"
/function="Carboxylesterase family"
/standard_name="PF00135"
......@@ -123,7 +123,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00135"
/db_xref="InterPro:IPR002018"
/note="e-value: 3.9706994470948554e-30"
/note="e-value: 4.0935350990176556e-30"
/note="p-value: 1.4799476135277136e-33"
/function="Carboxylesterase family"
/standard_name="PF00135"
......@@ -140,7 +140,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00135"
/db_xref="InterPro:IPR002018"
/note="e-value: 1.4185801852307574e-15"
/note="e-value: 1.4624647008379705e-15"
/note="p-value: 5.287291037013632e-19"
/function="Carboxylesterase family"
/standard_name="PF00135"
......@@ -160,7 +160,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF13434"
/db_xref="InterPro:IPR025700"
/note="e-value: 5.777178703900199e-08"
/note="e-value: 5.955898730893757e-08"
/note="p-value: 2.153253337271785e-11"
/function="L-lysine 6-monooxygenase (NADPH-requiring)"
/standard_name="PF13434"
......@@ -168,7 +168,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00743"
/db_xref="InterPro:IPR020946"
/note="e-value: 5.089108077410868e-07"
/note="e-value: 5.246542281818287e-07"
/note="p-value: 1.8967976434628658e-10"
/function="Flavin-binding monooxygenase-like"
/standard_name="PF00743"
......@@ -202,7 +202,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF07690"
/db_xref="InterPro:IPR011701"
/note="e-value: 5.839871260376694e-37"
/note="e-value: 6.020530714201243e-37"
/note="p-value: 2.1766199255969786e-40"
/function="Major Facilitator Superfamily"
/standard_name="PF07690"
......@@ -210,7 +210,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF06609"
/db_xref="InterPro:IPR010573"
/note="e-value: 9.543170598318239e-09"
/note="e-value: 9.83839354265682e-09"
/note="p-value: 3.55690294383833e-12"
/function="Fungal trichothecene efflux pump (TRI12)"
/standard_name="PF06609"
......@@ -235,8 +235,8 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF08493"
/db_xref="InterPro:IPR013700"
/note="e-value: 2.6165794251055913e-17"
/note="p-value: 9.752439154325723e-21"
/note="e-value: 2.686865976406516e-17"
/note="p-value: 9.713904470016327e-21"
/function="Aflatoxin regulatory protein"
/standard_name="PF08493"
CDS 16827..18797
......@@ -259,7 +259,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00109"
/db_xref="InterPro:IPR014030"
/note="e-value: 9.025888536170949e-60"
/note="e-value: 9.30510909096118e-60"
/note="p-value: 3.364103069761815e-63"
/function="Beta-ketoacyl synthase, N-terminal domain"
/standard_name="PF00109"
......@@ -267,23 +267,23 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF02801"
/db_xref="InterPro:IPR014031"
/note="e-value: 2.2171445990751238e-35"
/note="e-value: 2.2857331200304854e-35"
/note="p-value: 8.263677223537547e-39"
/function="Beta-ketoacyl synthase, C-terminal domain"
/standard_name="PF02801"
misc_feature 17937..18287
misc_feature 17937..18290
/inference="protein motif"
/db_xref="PFAM:PF16197"
/db_xref="InterPro:IPR032821"
/note="e-value: 3.8698172759236842e-25"
/note="p-value: 1.4423471024687604e-28"
/note="e-value: 4.800730099641783e-25"
/note="p-value: 1.7356218726109122e-28"
/function="Ketoacyl-synthetase C-terminal extension"
/standard_name="PF16197"
misc_feature 18360..18770
/inference="protein motif"
/db_xref="PFAM:PF00698"
/db_xref="InterPro:IPR014043"
/note="e-value: 1.0799913424517567e-26"
/note="e-value: 1.113401436161595e-26"
/note="p-value: 4.025312495161225e-30"
/function="Acyl transferase domain"
/standard_name="PF00698"
......@@ -314,7 +314,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00698"
/db_xref="InterPro:IPR014043"
/note="e-value: 2.639223271303753e-16"
/note="e-value: 2.7208690154402465e-16"
/note="p-value: 9.836836642950999e-20"
/function="Acyl transferase domain"
/standard_name="PF00698"
......@@ -322,14 +322,14 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF14765"
/db_xref="InterPro:IPR020807"
/note="e-value: 2.520598829779557e-60"
/note="e-value: 2.598574865139864e-60"
/note="p-value: 9.394703055458656e-64"
/function="Polyketide synthase dehydratase"
/standard_name="PF14765"
misc_feature 20786..21256
/inference="protein motif"
/db_xref="PFAM:PF13489"
/note="e-value: 1.0131254482174088e-12"
/note="e-value: 1.04446701072283e-12"
/note="p-value: 3.776091868123029e-16"
/function="Methyltransferase domain"
/standard_name="PF13489"
......@@ -337,23 +337,23 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF13847"
/db_xref="InterPro:IPR025714"
/note="e-value: 8.939870258494623e-11"
/note="p-value: 3.332042586095648e-14"
/note="e-value: 8.752004453621267e-11"
/note="p-value: 3.1641375465008194e-14"
/function="Methyltransferase domain"
/standard_name="PF13847"
misc_feature 20804..21097
/inference="protein motif"
/db_xref="PFAM:PF13649"
/db_xref="InterPro:IPR041698"
/note="e-value: 2.319131521369124e-13"
/note="p-value: 8.643799930559537e-17"
/note="e-value: 2.4253465299984994e-13"
/note="p-value: 8.76842563267715e-17"
/function="Methyltransferase domain"
/standard_name="PF13649"
misc_feature 20807..21103
/inference="protein motif"
/db_xref="PFAM:PF08242"
/db_xref="InterPro:IPR013217"
/note="e-value: 3.6288099491186147e-22"
/note="e-value: 3.7410690716593694e-22"
/note="p-value: 1.3525195486837923e-25"
/function="Methyltransferase domain"
/standard_name="PF08242"
......@@ -361,7 +361,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF08241"
/db_xref="InterPro:IPR013216"
/note="e-value: 5.245291385894328e-12"
/note="e-value: 5.4075572021556884e-12"
/note="p-value: 1.9550098344742185e-15"
/function="Methyltransferase domain"
/standard_name="PF08241"
......@@ -376,7 +376,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00107"
/db_xref="InterPro:IPR013149"
/note="e-value: 1.0960342036668699e-15"
/note="e-value: 1.1299405916297285e-15"
/note="p-value: 4.085106983476965e-19"
/function="Zinc-binding dehydrogenase"
/standard_name="PF00107"
......@@ -396,7 +396,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF08659"
/db_xref="InterPro:IPR013968"
/note="e-value: 1.5141662612831146e-61"
/note="e-value: 1.5610077818520667e-61"
/note="p-value: 5.643556695054471e-65"
/function="KR domain"
/standard_name="PF08659"
......@@ -404,7 +404,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00106"
/db_xref="InterPro:IPR002347"
/note="e-value: 1.1379002942545491e-07"
/note="e-value: 1.1731018314976082e-07"
/note="p-value: 4.2411490654288077e-11"
/function="short chain dehydrogenase"
/standard_name="PF00106"
......@@ -412,7 +412,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00550"
/db_xref="InterPro:IPR009081"
/note="e-value: 3.359618716013185e-10"
/note="e-value: 3.463550267794435e-10"
/note="p-value: 1.2521873708584363e-13"
/function="Phosphopantetheine attachment site"
/standard_name="PF00550"
......@@ -426,8 +426,8 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF16073"
/db_xref="InterPro:IPR032088"
/note="e-value: 1.3071857188363548e-23"
/note="p-value: 4.872104803713585e-27"
/note="e-value: 9.422238725791962e-24"
/note="p-value: 3.406449286258844e-27"
/function="Starter unit:ACP transacylase in aflatoxin
biosynthesis"
/standard_name="PF16073"
......@@ -459,8 +459,8 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF16073"
/db_xref="InterPro:IPR032088"
/note="e-value: 8.208876065249628e-11"
/note="p-value: 3.059588544632735e-14"
/note="e-value: 4.380197593141013e-11"
/note="p-value: 1.5835855362042708e-14"
/function="Starter unit:ACP transacylase in aflatoxin
biosynthesis"
/standard_name="PF16073"
......@@ -468,7 +468,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00109"
/db_xref="InterPro:IPR014030"
/note="e-value: 2.667462237983852e-82"
/note="e-value: 2.7499815692371726e-82"
/note="p-value: 9.942088102809735e-86"
/function="Beta-ketoacyl synthase, N-terminal domain"
/standard_name="PF00109"
......@@ -476,7 +476,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF02801"
/db_xref="InterPro:IPR014031"
/note="e-value: 2.4031043351141288e-34"
/note="e-value: 2.4774456171918303e-34"
/note="p-value: 8.956780973217029e-38"
/function="Beta-ketoacyl synthase, C-terminal domain"
/standard_name="PF02801"
......@@ -484,15 +484,15 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF16197"
/db_xref="InterPro:IPR032821"
/note="e-value: 2.535893425129411e-07"
/note="p-value: 9.451708628883381e-11"
/note="e-value: 8.475099126640419e-07"
/note="p-value: 3.0640271607521397e-10"
/function="Ketoacyl-synthetase C-terminal extension"
/standard_name="PF16197"
misc_feature 28322..29233
/inference="protein motif"
/db_xref="PFAM:PF00698"
/db_xref="InterPro:IPR014043"
/note="e-value: 4.597134671955754e-38"
/note="e-value: 4.739349423268586e-38"
/note="p-value: 1.7134307387088164e-41"
/function="Acyl transferase domain"
/standard_name="PF00698"
......@@ -509,7 +509,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF14765"
/db_xref="InterPro:IPR020807"
/note="e-value: 7.778696660229127e-11"
/note="e-value: 8.019334685871699e-11"
/note="p-value: 2.8992533209948296e-14"
/function="Polyketide synthase dehydratase"
/standard_name="PF14765"
......@@ -533,7 +533,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00550"
/db_xref="InterPro:IPR009081"
/note="e-value: 5.884377030377924e-14"
/note="e-value: 6.066413293337807e-14"
/note="p-value: 2.193207987468477e-17"
/function="Phosphopantetheine attachment site"
/standard_name="PF00550"
......@@ -541,7 +541,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00550"
/db_xref="InterPro:IPR009081"
/note="e-value: 3.9212317886052276e-10"
/note="e-value: 4.042537132792419e-10"
/note="p-value: 1.461510170930014e-13"
/function="Phosphopantetheine attachment site"
/standard_name="PF00550"
......@@ -549,7 +549,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00550"
/db_xref="InterPro:IPR009081"
/note="e-value: 1.367829688372301e-08"
/note="e-value: 1.4101442109719659e-08"
/note="p-value: 5.098135252971677e-12"
/function="Phosphopantetheine attachment site"
/standard_name="PF00550"
......@@ -557,7 +557,7 @@ FEATURES Location/Qualifiers
/inference="protein motif"
/db_xref="PFAM:PF00975"
/db_xref="InterPro:IPR001031"
/note="e-value: 6.711355516947163e-24"
/note="e-value: 6.91897478936856e-24"
/note="p-value: 2.5014370171252933e-27"
/function="Thioesterase domain"
/standard_name="PF00975"
......
sequence_id bgc_id start end average_p max_p type alkaloid_probability polyketide_probability ripp_probability saccharide_probability terpene_probability nrp_probability proteins domains
BGC0001866.1 BGC0001866.1_cluster_1 347 32979 0.9969495815733557 0.9999999447224028 Polyketide 0.0 0.98 0.0 0.0 0.0 0.09999999999999998 BGC0001866.1_1;BGC0001866.1_2;BGC0001866.1_3;BGC0001866.1_4;BGC0001866.1_5;BGC0001866.1_6;BGC0001866.1_7;BGC0001866.1_8;BGC0001866.1_9;BGC0001866.1_10;BGC0001866.1_11;BGC0001866.1_12;BGC0001866.1_13;BGC0001866.1_14;BGC0001866.1_15;BGC0001866.1_16;BGC0001866.1_17;BGC0001866.1_18;BGC0001866.1_19;BGC0001866.1_20;BGC0001866.1_21;BGC0001866.1_22;BGC0001866.1_23 PF00106;PF00107;PF00109;PF00135;PF00394;PF00550;PF00698;PF00743;PF00891;PF00975;PF02801;PF06609;PF07690;PF07731;PF08241;PF08242;PF08493;PF08659;PF13434;PF13489;PF13649;PF13847;PF14765;PF16073;PF16197
BGC0001866.1 BGC0001866.1_cluster_1 347 32979 0.9958958770931704 0.9999999976946022 Polyketide 0.010000000000000009 0.96 0.0 0.0 0.010000000000000009 0.14 BGC0001866.1_1;BGC0001866.1_2;BGC0001866.1_3;BGC0001866.1_4;BGC0001866.1_5;BGC0001866.1_6;BGC0001866.1_7;BGC0001866.1_8;BGC0001866.1_9;BGC0001866.1_10;BGC0001866.1_11;BGC0001866.1_12;BGC0001866.1_13;BGC0001866.1_14;BGC0001866.1_15;BGC0001866.1_16;BGC0001866.1_17;BGC0001866.1_18;BGC0001866.1_19;BGC0001866.1_20;BGC0001866.1_21;BGC0001866.1_22;BGC0001866.1_23 PF00106;PF00107;PF00109;PF00135;PF00394;PF00550;PF00698;PF00743;PF00891;PF00975;PF02801;PF06609;PF07690;PF07731;PF08241;PF08242;PF08493;PF08659;PF13434;PF13489;PF13649;PF13847;PF14765;PF16073;PF16197
sequence_id protein_id start end strand domain hmm i_evalue pvalue domain_start domain_end bgc_probability
BGC0001866.1 BGC0001866.1_1 347 1489 - PF00394 Pfam 2.1941888078432915e-08 8.178117062405111e-12 1 63 0.9852038761627908
BGC0001866.1 BGC0001866.1_1 347 1489 - PF07731 Pfam 3.9374169295176556e-23 1.467542649838858e-26 150 281 0.9852038761627908
BGC0001866.1 BGC0001866.1_6 3946 4389 + PF00891 Pfam 4.743887678074703e-16 1.7681280946979883e-19 17 121 0.9910535094227727
BGC0001866.1 BGC0001866.1_7 4683 5138 + PF00135 Pfam 4.674605664377319e-21 1.7423055029360116e-24 48 140 0.9913598896683397
BGC0001866.1 BGC0001866.1_8 5384 5812 + PF00135 Pfam 3.9706994470948554e-30 1.4799476135277136e-33 2 114 0.9925093258822111
BGC0001866.1 BGC0001866.1_9 5823 6599 + PF00135 Pfam 1.4185801852307574e-15 5.287291037013632e-19 2 209 0.9946019708257335
BGC0001866.1 BGC0001866.1_10 7758 9029 + PF13434 Pfam 5.777178703900199e-08 2.153253337271785e-11 13 124 0.9978201609931655
BGC0001866.1 BGC0001866.1_10 7758 9029 + PF00743 Pfam 5.089108077410868e-07 1.8967976434628658e-10 36 102 0.9978201609931655
BGC0001866.1 BGC0001866.1_13 11550 12662 + PF07690 Pfam 5.839871260376694e-37 2.1766199255969786e-40 1 362 0.9990971143689635
BGC0001866.1 BGC0001866.1_13 11550 12662 + PF06609 Pfam 9.543170598318239e-09 3.55690294383833e-12 17 244 0.9990971143689635
BGC0001866.1 BGC0001866.1_15 14920 15912 + PF08493 Pfam 2.6165794251055913e-17 9.752439154325723e-21 139 224 0.9999977987864139
BGC0001866.1 BGC0001866.1_16 17173 19143 + PF00109 Pfam 9.025888536170949e-60 3.364103069761815e-63 2 248 0.9999994272691842
BGC0001866.1 BGC0001866.1_16 17173 19143 + PF02801 Pfam 2.2171445990751238e-35 8.263677223537547e-39 257 368 0.9999994272691842
BGC0001866.1 BGC0001866.1_16 17173 19143 + PF16197 Pfam 3.8698172759236842e-25 1.4423471024687604e-28 371 487 0.9999994272691842
BGC0001866.1 BGC0001866.1_16 17173 19143 + PF00698 Pfam 1.0799913424517567e-26 4.025312495161225e-30 512 648 0.9999994272691842
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF00698 Pfam 2.639223271303753e-16 9.836836642950999e-20 2 151 0.9999940983719267
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF14765 Pfam 2.520598829779557e-60 9.394703055458656e-64 228 504 0.9999940983719267
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13489 Pfam 1.0131254482174088e-12 3.776091868123029e-16 661 817 0.9999940983719267
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13847 Pfam 8.939870258494623e-11 3.332042586095648e-14 666 776 0.9999940983719267
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13649 Pfam 2.319131521369124e-13 8.643799930559537e-17 667 764 0.9999940983719267
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF08242 Pfam 3.6288099491186147e-22 1.3525195486837923e-25 668 766 0.9999940983719267
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF08241 Pfam 5.245291385894328e-12 1.9550098344742185e-15 668 767 0.9999940983719267
BGC0001866.1 BGC0001866.1_18 22762 23235 + PF00107 Pfam 1.0960342036668699e-15 4.085106983476965e-19 12 117 0.9999176675645223
BGC0001866.1 BGC0001866.1_19 23268 24623 + PF08659 Pfam 1.5141662612831146e-61 5.643556695054471e-65 65 239 0.9999724741067139
BGC0001866.1 BGC0001866.1_19 23268 24623 + PF00106 Pfam 1.1379002942545491e-07 4.2411490654288077e-11 68 221 0.9999724741067139
BGC0001866.1 BGC0001866.1_19 23268 24623 + PF00550 Pfam 3.359618716013185e-10 1.2521873708584363e-13 384 437 0.9999724741067139
BGC0001866.1 BGC0001866.1_20 25769 26056 + PF16073 Pfam 1.3071857188363548e-23 4.872104803713585e-27 8 94 0.999988513111687
BGC0001866.1 BGC0001866.1_21 26544 29999 + PF16073 Pfam 8.208876065249628e-11 3.059588544632735e-14 2 47 0.9999999447224028
BGC0001866.1 BGC0001866.1_21 26544 29999 + PF00109 Pfam 2.667462237983852e-82 9.942088102809735e-86 178 426 0.9999999447224028
BGC0001866.1 BGC0001866.1_21 26544 29999 + PF02801 Pfam 2.4031043351141288e-34 8.956780973217029e-38 434 555 0.9999999447224028
BGC0001866.1 BGC0001866.1_21 26544 29999 + PF16197 Pfam 2.535893425129411e-07 9.451708628883381e-11 567 673 0.9999999447224028
BGC0001866.1 BGC0001866.1_21 26544 29999 + PF00698 Pfam 4.597134671955754e-38 1.7134307387088164e-41 709 1012 0.9999999447224028
BGC0001866.1 BGC0001866.1_22 30150 30890 + PF14765 Pfam 7.778696660229127e-11 2.8992533209948296e-14 39 244 0.9999460955852995
BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 5.884377030377924e-14 2.193207987468477e-17 67 128 0.9997314383315643
BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 3.9212317886052276e-10 1.461510170930014e-13 174 238 0.9997314383315643
BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 1.367829688372301e-08 5.098135252971677e-12 299 360 0.9997314383315643
BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00975 Pfam 6.711355516947163e-24 2.5014370171252933e-27 443 550 0.9997314383315643
BGC0001866.1 BGC0001866.1_1 347 1489 - PF00394 Pfam 2.262067179461254e-08 8.178117062405111e-12 1 63 0.9791890143072265
BGC0001866.1 BGC0001866.1_1 347 1489 - PF07731 Pfam 4.059222969454281e-23 1.467542649838858e-26 150 281 0.9791890143072265
BGC0001866.1 BGC0001866.1_6 3946 4389 + PF00891 Pfam 4.890642309934635e-16 1.7681280946979883e-19 17 121 0.9955095513800687
BGC0001866.1 BGC0001866.1_7 4683 5138 + PF00135 Pfam 4.819217021121008e-21 1.7423055029360116e-24 48 140 0.995982045872177
BGC0001866.1 BGC0001866.1_8 5384 5812 + PF00135 Pfam 4.0935350990176556e-30 1.4799476135277136e-33 2 114 0.9966491071789748
BGC0001866.1 BGC0001866.1_9 5823 6599 + PF00135 Pfam 1.4624647008379705e-15 5.287291037013632e-19 2 209 0.9975265367646511
BGC0001866.1 BGC0001866.1_10 7758 9029 + PF13434 Pfam 5.955898730893757e-08 2.153253337271785e-11 13 124 0.9986351193337516
BGC0001866.1 BGC0001866.1_10 7758 9029 + PF00743 Pfam 5.246542281818287e-07 1.8967976434628658e-10 36 102 0.9986351193337516
BGC0001866.1 BGC0001866.1_13 11550 12662 + PF07690 Pfam 6.020530714201243e-37 2.1766199255969786e-40 1 362 0.9994485509803548
BGC0001866.1 BGC0001866.1_13 11550 12662 + PF06609 Pfam 9.83839354265682e-09 3.55690294383833e-12 17 244 0.9994485509803548
BGC0001866.1 BGC0001866.1_15 14920 15912 + PF08493 Pfam 2.686865976406516e-17 9.713904470016327e-21 139 224 0.9999999296901834
BGC0001866.1 BGC0001866.1_16 17173 19143 + PF00109 Pfam 9.30510909096118e-60 3.364103069761815e-63 2 248 0.9999998571963613
BGC0001866.1 BGC0001866.1_16 17173 19143 + PF02801 Pfam 2.2857331200304854e-35 8.263677223537547e-39 257 368 0.9999998571963613
BGC0001866.1 BGC0001866.1_16 17173 19143 + PF16197 Pfam 4.800730099641783e-25 1.7356218726109122e-28 371 488 0.9999998571963613
BGC0001866.1 BGC0001866.1_16 17173 19143 + PF00698 Pfam 1.113401436161595e-26 4.025312495161225e-30 512 648 0.9999998571963613
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF00698 Pfam 2.7208690154402465e-16 9.836836642950999e-20 2 151 0.9999990994944158
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF14765 Pfam 2.598574865139864e-60 9.394703055458656e-64 228 504 0.9999990994944158
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13489 Pfam 1.04446701072283e-12 3.776091868123029e-16 661 817 0.9999990994944158
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13847 Pfam 8.752004453621267e-11 3.1641375465008194e-14 666 776 0.9999990994944158
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13649 Pfam 2.4253465299984994e-13 8.76842563267715e-17 667 764 0.9999990994944158
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF08242 Pfam 3.7410690716593694e-22 1.3525195486837923e-25 668 766 0.9999990994944158
BGC0001866.1 BGC0001866.1_17 19152 22424 + PF08241 Pfam 5.4075572021556884e-12 1.9550098344742185e-15 668 767 0.9999990994944158
BGC0001866.1 BGC0001866.1_18 22762 23235 + PF00107 Pfam 1.1299405916297285e-15 4.085106983476965e-19 12 117 0.9999802025553775
BGC0001866.1 BGC0001866.1_19 23268 24623 + PF08659 Pfam 1.5610077818520667e-61 5.643556695054471e-65 65 239 0.9999913868972266
BGC0001866.1 BGC0001866.1_19 23268 24623 + PF00106 Pfam 1.1731018314976082e-07 4.2411490654288077e-11 68 221 0.9999913868972266
BGC0001866.1 BGC0001866.1_19 23268 24623 + PF00550 Pfam 3.463550267794435e-10 1.2521873708584363e-13 384 437 0.9999913868972266
BGC0001866.1 BGC0001866.1_20 25769 26056 + PF16073 Pfam 9.422238725791962e-24 3.406449286258844e-27 8 94 0.9999994733759681
BGC0001866.1 BGC0001866.1_21 26544 29999 + PF16073 Pfam 4.380197593141013e-11 1.5835855362042708e-14 2 47 0.9999999976946022
BGC0001866.1 BGC0001866.1_21 26544 29999 + PF00109 Pfam 2.7499815692371726e-82 9.942088102809735e-86 178 426 0.9999999976946022
BGC0001866.1 BGC0001866.1_21 26544 29999 + PF02801 Pfam 2.4774456171918303e-34 8.956780973217029e-38 434 555 0.9999999976946022
BGC0001866.1 BGC0001866.1_21 26544 29999 + PF16197 Pfam 8.475099126640419e-07 3.0640271607521397e-10 567 673 0.9999999976946022
BGC0001866.1 BGC0001866.1_21 26544 29999 + PF00698 Pfam 4.739349423268586e-38 1.7134307387088164e-41 709 1012 0.9999999976946022
BGC0001866.1 BGC0001866.1_22 30150 30890 + PF14765 Pfam 8.019334685871699e-11 2.8992533209948296e-14 39 244 0.9999912059124727
BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 6.066413293337807e-14 2.193207987468477e-17 67 128 0.9998703656415205
BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 4.042537132792419e-10 1.461510170930014e-13 174 238 0.9998703656415205
BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 1.4101442109719659e-08 5.098135252971677e-12 299 360 0.9998703656415205
BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00975 Pfam 6.91897478936856e-24 2.5014370171252933e-27 443 550 0.9998703656415205
sequence_id protein_id start end strand average_p max_p
BGC0001866.1 BGC0001866.1_1 347 1489 - 0.9791890143072265 0.9791890143072265
BGC0001866.1 BGC0001866.1_2 1525 2016 + 0.9816626269970528 0.9816626269970528
BGC0001866.1 BGC0001866.1_3 2513 2722 - 0.9844997726878899 0.9844997726878899
BGC0001866.1 BGC0001866.1_4 2905 3378 + 0.9877300777686966 0.9877300777686966
BGC0001866.1 BGC0001866.1_5 3353 3922 + 0.9913872741253911 0.9913872741253911
BGC0001866.1 BGC0001866.1_6 3946 4389 + 0.9955095513800687 0.9955095513800687
BGC0001866.1 BGC0001866.1_7 4683 5138 + 0.995982045872177 0.995982045872177
BGC0001866.1 BGC0001866.1_8 5384 5812 + 0.9966491071789748 0.9966491071789748
BGC0001866.1 BGC0001866.1_9 5823 6599 + 0.9975265367646511 0.9975265367646511
BGC0001866.1 BGC0001866.1_10 7758 9029 + 0.9986351193337516 0.9986351193337516
BGC0001866.1 BGC0001866.1_11 9800 10384 + 0.9988029392597757 0.9988029392597757
BGC0001866.1 BGC0001866.1_12 11109 11537 + 0.999073142625125 0.999073142625125
BGC0001866.1 BGC0001866.1_13 11550 12662 + 0.9994485509803548 0.9994485509803548
BGC0001866.1 BGC0001866.1_14 12681 13127 + 0.9996778954036583 0.9996778954036583
BGC0001866.1 BGC0001866.1_15 14920 15912 + 0.9999999296901834 0.9999999296901834
BGC0001866.1 BGC0001866.1_16 17173 19143 + 0.9999998571963613 0.9999998571963613
BGC0001866.1 BGC0001866.1_17 19152 22424 + 0.9999990994944158 0.9999990994944158
BGC0001866.1 BGC0001866.1_18 22762 23235 + 0.9999802025553775 0.9999802025553775
BGC0001866.1 BGC0001866.1_19 23268 24623 + 0.9999913868972266 0.9999913868972266
BGC0001866.1 BGC0001866.1_20 25769 26056 + 0.9999994733759681 0.9999994733759681
BGC0001866.1 BGC0001866.1_21 26544 29999 + 0.9999999976946022 0.9999999976946022
BGC0001866.1 BGC0001866.1_22 30150 30890 + 0.9999912059124727 0.9999912059124727
BGC0001866.1 BGC0001866.1_23 30937 32979 + 0.9998703656415205 0.9998703656415205
......@@ -5,14 +5,14 @@
"subregions": [
{
"details": {
"alkaloid_probability": "0.000",
"average_p": "0.997",
"alkaloid_probability": "0.010",
"average_p": "0.996",
"max_p": "1.000",
"nrp_probability": "0.100",
"polyketide_probability": "0.980",
"nrp_probability": "0.140",
"polyketide_probability": "0.960",
"ripp_probability": "0.000",
"saccharide_probability": "0.000",
"terpene_probability": "0.000"
"terpene_probability": "0.010"
},
"end": 32979,
"label": "Polyketide",
......@@ -25,11 +25,13 @@
"configuration": {
"cds": "3",
"e-filter": "None",
"edge-distance": "0",
"mask": "False",
"postproc": "'gecco'",
"threshold": "0.3"
"threshold": "0.8"
},
"description": "Biosynthetic Gene Cluster prediction with Conditional Random Fields.",
"name": "GECCO",
"version": "0.8.10"
"version": "0.9.1"
}
}
\ No newline at end of file