Commit fd64a45f authored by Thomas Schwarzl's avatar Thomas Schwarzl

added first drafts of optimization, added recursion feedback (counts, stop),...

added first drafts of optimization, added recursion feedback (counts, stop), added memory footprint and human readable formats, changed min matches before gap to a minimum of 1
parent c6e29aab
......@@ -29,6 +29,31 @@
# -> can be 0 to many nucleotides long, maximum gap size can be controlled
# -> total number of gaps can be controlled
# -> Stem length and loop length can be controlled
#
# Optimization:
# There are cases, e.g. with the backbone MggN (Match, Gap, Gap, Match Mate),
# where either A) the first gap g or B) the second gap g is blank, resulting in:
# A) MgN
# B) MgN
# which both yield the same sequences.
# There are of course cases where this leads to exactly the same evaluation
# of the same sequences. The longer the sequences get, exponentially more
# similar calculations are done; increasing the gap sizes can increase this
# exponentially as well - e.g. MggggggggggN
# Therefore a set is passed on with the sequences to be evaluated, also
# the backbone (e.g. MggggN), where the backbone matches the passed on sequences
# (so when a gap g is empty, then it will not be added to the backbone).
# The optimization is especially recommended when using a high number of
# gaps or optional loops. Keeping a list of unique sequences (set) at the runtime
# requires memory. Eventually this is a compromise between memory and cpu usage,
# however if you can afford the memory, it avoids a lot of duplicated calculations,
# ergo a lot of time.
#
#
# TODO: Test if match list has to be the same as well
#
# REQUIRED PYTHON VERSION: >= 3
# REQUIRED PACKAGES: psutil
# ---------------------------------------------------------------------------------
# Default variables
......@@ -65,6 +90,9 @@ MIN_MATCHES_BEFORE_FIRST_GAP = 2
# minimum GC content
MIN_GC_CONTENT = 0.0
# Optimize
OPTIMIZE = True
__HELP__ = '''
SYNOPSIS
......@@ -110,6 +138,10 @@ DESCRIPTION
minimum gc content of generated sequence
default: %s
[-o, --optimize]
optimize the recursive evalution by
avoiding duplication, memory intensive
[-t, --test] test run without sequence evaluation
[-h, --help] display help message
......@@ -167,6 +199,7 @@ def main():
f.stem_size = options.STEM_SIZE
f.min_matches_before_first_gap = options.MIN_MATCHES_BEFORE_FIRST_GAP
f.min_gc_content = options.MIN_GC_CONTENT
f.optimize = options.OPTIMIZE
#logging.getLogger().setLevel(logging.INFO)
......@@ -176,6 +209,7 @@ def main():
# if it is not a test run, print a version
if not options.TEST:
f.init_stemloop()
f.calculate_sequences()
LOGGER.warning(f.sequences_summary())
......@@ -261,6 +295,11 @@ if __name__ == '__main__':
default = False,
dest = "TEST")
parser.add_option('-o', '--optimize',
action = 'store_true',
default = False,
dest = "OPTIMIZE")
parser.add_option('-v', '--verbose',
action = 'store_true',
default = __VERBOSE__,
......@@ -280,8 +319,8 @@ if __name__ == '__main__':
raise Exception("maximal gap size has to be greater or equal 0")
if options.MAX_TOTAL_GAP_COUNT < 0:
raise Exception("maximal total gap count has to be greater or equal 0")
if options.MIN_MATCHES_BEFORE_FIRST_GAP < 0:
raise Exception("minimal matches before first gap has to be greater or equal 0")
if options.MIN_MATCHES_BEFORE_FIRST_GAP < 1:
raise Exception("minimal matches before first gap has to be greater or equal 1")
if options.STEM_SIZE < 1:
raise Exception("stem size has to be greater than 0")
......@@ -297,4 +336,5 @@ if __name__ == '__main__':
except Exception as exception:
print("------------[ Error ]--------------")
exceptionHandling(exception)
\ No newline at end of file
# TODO LOGGER
\ No newline at end of file
This diff is collapsed.
__author__ = 'Tom'
import sys
import sys, time, datetime, os, psutil, humanize
from ago import human
from lib.Nucleotide import Nucleotide
from lib.Match import Match
from lib.MatchMate import MatchMate
......@@ -20,7 +22,13 @@ class StemLoopFactory:
self.min_gc_content = 0
self.stemloop = None
self.sequences = None
self.optimize = None
self.recursion_count = 0
self.duplication_count = 0
self.gc_min_cutoff_counter = 0
self.calculate_sequences_time = 0
self.print_sequences_time = 0
self.memory_use = 0
def init_stemloop(self):
first = True
......@@ -94,12 +102,11 @@ class StemLoopFactory:
out = "# STEM LOOP GENERATOR v%s\n" % version
out += "# ========================================================== #\n"
out += "# stem size: %s\n" % self.stem_size
out += "# minimal loop size: %s\n" % self.min_loop_size
out += "# maximal loop size: %s\n" % self.max_loop_size
out += "# maximum gap size: %s\n" % self.max_gap_size
out += "# maximum total gap count: %s\n" % self.max_total_gap_count
out += "# minimal matches before first gap: %s\n" % self.min_matches_before_first_gap
out += "# minimal gc content [0-1]: %s\n" % self.min_gc_content
out += "# loop size: min %s, max %s\n" % (self.min_loop_size, self.max_loop_size)
out += "# gap size: max %s\n" % self.max_gap_size
out += "# total gap count: max %s\n" % self.max_total_gap_count
out += "# matches before first gap: min %s\n" % self.min_matches_before_first_gap
out += "# gc content [0-1]: min %s\n" % self.min_gc_content
out += "# ---------------------------------------------------------- #\n"
out += "# Backbone: \n"
out += "# \n"
......@@ -108,14 +115,33 @@ class StemLoopFactory:
out += "# \n"
out += "# M Match, N Mate, L Loop, O Loop (optional), g Gap\n"
out += "# \n"
out += "# minimal sequence length: %s\n" % self.min_length()
out += "# maximal sequence length: %s\n" % (self.min_length() + min(self.max_gap_size, self.max_total_gap_count))
out += "# total length: %s\n" % self.length()
out += "# sequence length: min %s, max %s, total %s\n" % (self.min_length(),
(self.min_length() + min(self.max_gap_size, self.max_total_gap_count)),
self.length())
out += "# ---------------------------------------------------------- #"
return(out)
def sequences_summary(self):
out = "# number of unique sequences: %s\n" % self.sequences_count()
out = "# %s recursions calculated in %s \n" % (humanize.intcomma(self.recursion_count),
human(datetime.datetime.now()
- datetime.timedelta(seconds=self.calculate_sequences_time),
precision = 10,
past_tense = '{}'))
if self.optimize:
out += "# %s duplicated recursions were stopped using optimization\n" % self.duplication_count
else:
out += "# no optimization was chosen\n"
if self.min_gc_content <= 0:
out += "# no minimal gc content cutoff was chosen\n"
else:
out += "# %s recursions stopped because of minimal gc content cutoff of %s\n" % (self.gc_min_cutoff_counter, self.min_gc_content)
out += "# %s memory was used \n" % humanize.naturalsize(self.memory_use)
out += "# ---------------------------------------------------------- #\n"
out += "# %s unique sequences were printed in %s \n" % (humanize.intcomma(self.sequences_count()),
human(datetime.datetime.now()
- datetime.timedelta(seconds=self.print_sequences_time),
precision = 10,
past_tense = '{}'))
out += "# example sequences: %s\n" % self.sequences_header(10)
out += "# ========================================================== #"
......@@ -127,8 +153,22 @@ class StemLoopFactory:
def calculate_sequences(self):
    """Evaluate the stem loop once and record the results and run statistics.

    Calls ``self.stemloop.eval`` with the maximum gap size, the minimal GC
    content and the optimize flag, and stores the four values it returns:

    * ``self.sequences``             -- the evaluated sequences
    * ``self.recursion_count``       -- number of recursions performed
    * ``self.duplication_count``     -- recursions stopped as duplicates
    * ``self.gc_min_cutoff_counter`` -- recursions stopped by the GC cutoff

    Also records the elapsed time of the evaluation in seconds
    (``self.calculate_sequences_time``) and the resident set size of the
    current process after the evaluation (``self.memory_use``).
    """
    self.check_init()

    # The evaluation is run exactly once, with the three-argument signature.
    # A leftover two-argument call from the previous revision ran the whole
    # recursion a second time and its result was overwritten immediately.
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    started = time.perf_counter()
    (self.sequences,
     self.recursion_count,
     self.duplication_count,
     self.gc_min_cutoff_counter) = self.stemloop.eval(self.max_gap_size,
                                                      self.min_gc_content,
                                                      self.optimize)
    self.calculate_sequences_time = time.perf_counter() - started

    # Memory is sampled once, after the evaluation has finished.
    process = psutil.Process(os.getpid())
    self.memory_use = process.memory_info().rss
def sequences_count(self):
    """Return how many sequences are currently stored in ``self.sequences``."""
    # check_sequences_init is defined elsewhere in the class; presumably it
    # rejects the call when no sequences have been calculated yet -- TODO confirm.
    self.check_sequences_init()
    total = len(self.sequences)
    return total
......@@ -136,6 +176,8 @@ class StemLoopFactory:
def print_sequences(self, file):
self.check_sequences_init()
start_time = time.time()
fh = sys.stdout
try:
......@@ -149,6 +191,10 @@ class StemLoopFactory:
fh.close()
except IOError as e:
print("IOError, problems opening or writing to the file:\n" + str(e))
end_time = time.time()
self.print_sequences_time = end_time - start_time
def sequences_header(self, i):
    """Return the first ``i`` entries of ``self.sequences`` as a list.

    Used by the sequence summary to show a handful of example sequences.

    :param i: number of leading entries to return; ``None`` returns all
              entries, a negative value drops the last ``abs(i)`` entries
              (plain slice semantics).
    :return: a new list with at most ``i`` entries, in iteration order.
    """
    # Local import keeps this method self-contained.
    from itertools import islice

    if i is not None and i < 0:
        # islice rejects negative bounds; fall back to ordinary slicing so
        # negative counts behave exactly as before.
        return list(self.sequences)[0:i]

    # Only the first i entries are pulled from the collection; the whole
    # (potentially very large) collection is never copied into a list.
    return list(islice(self.sequences, i))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment