Commit ba4d1c56 authored by Martin Larralde's avatar Martin Larralde
Browse files

Make `gecco.orf` handle STOP codons consistently

parent 516840aa
Pipeline #34992 passed with stages
in 2 minutes and 37 seconds
......@@ -186,18 +186,11 @@ class CDSFinder(ORFFinder):
if protein.id in ids:
raise ValueError(f"Duplicate gene identifier found in {record.id!r}: {protein.id!r}")
ids.add(protein.id)
# fix coordinates (using 1-based, leftmost start in `Gene`, no STOP codon)
if feature.location.strand == -1:
start = feature.location.start + 4
end = feature.location.end
else:
start = feature.location.start + 1
end = feature.location.end - 3
# wrap the gene into a Gene
yield Gene(
source=record,
start=start,
end=end,
start=feature.location.start + 1,
end=feature.location.end,
strand=Strand(feature.location.strand),
protein=protein,
)
......
......@@ -24,13 +24,8 @@ class TestCDSFinder(unittest.TestCase):
"""
finder = CDSFinder()
for gene in finder.find_genes([self.genome]):
subseq = self.genome.seq[gene.start-1:gene.end]
if gene.strand is Strand.Reverse:
subseq = subseq.reverse_complement()
# Don't compare first letter because it can be set as `M` in the
# CDS feature but actually correspond to a different letter when
# translating directly
self.assertEqual(subseq.translate()[1:], gene.protein.seq[1:])
# check that start is leftmost
self.assertLess(gene.start, gene.end)
def test_progress_callback(self):
"""Test that the progress callback is called for each genome.
......
......@@ -22,10 +22,17 @@ class TestPyrodigalFinder(unittest.TestCase):
cls.genome = Bio.SeqIO.read(os.path.join(folder, "data", "BGC0001737.fna"), "fasta")
cls.proteins = list(Bio.SeqIO.parse(os.path.join(folder, "data", "BGC0001737.faa"), "fasta"))
def test_stop_codon(self):
"""Test emitted genes always end with a '*' symbol representing the STOP codon.
"""
finder = PyrodigalFinder(cpus=1)
for gene in finder.find_genes([self.genome]):
self.assertEqual(gene.protein.seq[-1], "*")
def test_order(self):
"""Test proteins are emitted in the right order from the source file.
"""
finder = PyrodigalFinder(cpus=1)
finder = PyrodigalFinder(cpus=4)
for expected, actual in zip(self.proteins, finder.find_genes([self.genome])):
self.assertEqual(expected.id, actual.protein.id)
......@@ -45,6 +52,7 @@ class TestPyrodigalFinder(unittest.TestCase):
subseq = self.genome.seq[gene.start-1:gene.end]
if gene.strand is Strand.Reverse:
subseq = subseq.reverse_complement()
self.assertLess(gene.start, gene.end)
self.assertEqual(subseq.translate(), gene.protein.seq)
def test_progress_callback(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment