Commit dda40887 authored by Thomas Schwarzl's avatar Thomas Schwarzl

init stem loop generator

parents
# STEMLOOPGENERATOR
# by Thomas Schwarzl, schwarzl@embl.de
# ---------------------------------------------------------------------------------
# The program builds up possible stem loop backbone structure
# and recursively produces all possible stem loops with
# certain properties like GC content, loop size or gaps between
# the matching pairs
# _
# (O)i | <- Loop Option
# L L | RNA LOOP
# L L _| ( usually between 4 and 9 nucleotides )
# M N |
# M N | RNA STEM
# (G)j |
# M N | ( occur between matching nucleotides
# M N | and can be optional or 1 or many nucleotides )
# (G)j |
# M N _|
#
# M ... nucleotide for match M::N
# N ... nucleotide for match N::M
# -> number of matching pairs for the stem can be controlled
# L ... nucleotide for Loop
# -> minimal and maximal number of nucleotides for loops can be controlled
# O ... optional nucleotide for Loop
# G ... Gap
# -> can only occur in the stem
# -> are optional
# -> can be 0 to many nucleotides long, maximum gap size can be controlled
# -> total number of gaps can be controlled
# -> Stem length and loop length can be controlled
# ---------------------------------------------------------------------------------
__author__ = 'Tom'
__VERBOSE__ = False
import lib
from lib.StemLoopFactory import StemLoopFactory
# maximum gap size
MAX_GAP_SIZE = 2
# maximum total gap count
MAX_TOTAL_GAP_COUNT = 0#3
# how many matches must be after the loops without gaps
MIN_MATCHES_BEFORE_FIRST_GAP = 2
# loop size
MIN_LOOP_SIZE = 4#4
# loop size
MAX_LOOP_SIZE = 4#8
# stem size
STEM_SIZE = 4#7
# minimum GC content
MIN_GC_CONTENT = 0.0
# out file name
OUTFILE = "out.txt"
#===================================================================================#
f = StemLoopFactory()
f.max_gap_size = MAX_GAP_SIZE
f.max_total_gap_count = MAX_TOTAL_GAP_COUNT
f.min_loop_size = MIN_LOOP_SIZE
f.max_loop_size = MAX_LOOP_SIZE
f.stem_size = STEM_SIZE
f.min_matches_before_first_gap = MIN_MATCHES_BEFORE_FIRST_GAP
f.min_gc_content = MIN_GC_CONTENT
f.init_stemloop()
print(f.summary())
f.calculate_sequences()
f.print_sequences(OUTFILE)
print(f.sequences_summary())
print(f.sequences_header(10))
# ================
# for debugging
# n = Match()
# n = n.add_next(Gap())
# n = n.add_next(Gap())
# n = n.add_next(Match())
# n = n.add_next(Gap())
# n = n.add_next(Gap())
# n = n.add_next(Match())
# n = n.add_next(Gap())
# n = n.add_next(Gap())
# n = n.add_next(Match())
# n = n.add_next(Gap())
# n = n.add_next(Gap())
# n = n.add_next(Match())
# n = n.add_next(Gap())
# n = n.add_next(Gap())
# n = n.add_next(Match())
# n = n.add_next(Match())
# n = n.add_next(Loop())
# n = n.add_next(Loop())
# n = n.add_next(Loop())
# n = n.add_next(Loop())
# n = n.add_next(MatchMate())
# n = n.add_next(MatchMate())
# n = n.add_next(Gap())
# n = n.add_next(Gap())
# n = n.add_next(MatchMate())
# n = n.add_next(Gap())
# n = n.add_next(Gap())
# n = n.add_next(MatchMate())
# n = n.add_next(Gap())
# n = n.add_next(Gap())
# n = n.add_next(MatchMate())
# n = n.add_next(Gap())
# n = n.add_next(Gap())
# n = n.add_next(MatchMate())
# n = n.add_next(Gap())
# n = n.add_next(Gap())
# n = n.add_next(MatchMate())
\ No newline at end of file
__author__ = 'Tom'
__VERBOSE__ = False
from lib.Nucleotide import Nucleotide
class Gap(Nucleotide):
shortname = "g"
states = ["", "A", "C", "G", "U"]
def __init__(self):
super().__init__()
def get_states(self, matchlist, match_index, gap_count, max_gap_count):
if gap_count >= max_gap_count:
return(([""], match_index, gap_count ))
else:
return(self.states, match_index, gap_count + 1)
# min length without gaps
def min_length(self):
if self.next is not None:
return self.next.min_length()
else:
return 0
\ No newline at end of file
__author__ = 'Tom'
__VERBOSE__ = False
from lib.Nucleotide import Nucleotide
# Class Loop derived from Nucleotide
class Loop(Nucleotide):
shortname = "L"
def __init__(self):
super().__init__()
\ No newline at end of file
__author__ = 'Tom'
__VERBOSE__ = False
from lib.Nucleotide import Nucleotide
class LoopOption(Nucleotide):
shortname = "O"
states = ["", "A", "C", "G", "U"]
def __init__(self):
super().__init__()
# min length without gaps
def min_length(self):
if self.next is not None:
return self.next.min_length()
else:
return 0
\ No newline at end of file
__author__ = 'Tom'
__VERBOSE__ = False
#import lib
#from Nucleotide import Nucleotide
from lib.Nucleotide import Nucleotide
# This class manages a stem loop as stem loops always start with a match
#
class Match(Nucleotide):
shortname = "M"
def __init__(self):
super().__init__()
# only matches have to report states
def report_state(self, state, matchlist):
if __VERBOSE__:
print("Reporting state %s to matchlist %s" % (state, matchlist))
matchlist += state #matchlist.append(state)
#print(matchlist)
return matchlist
\ No newline at end of file
__author__ = 'Tom'
__VERBOSE__ = False
from lib.Nucleotide import Nucleotide
# This
class MatchMate(Nucleotide):
states = [ "X" ]
shortname = "N"
def __init__(self):
super().__init__()
def get_states(self, matchlist, match_index, gap_count, max_gap_count):
mate_state = matchlist[ match_index ]
match_index += 1
if __VERBOSE__:
print("Got the mate state %s" % (mate_state))
if mate_state == "A":
return( ([ "U" ], match_index, gap_count ) )
elif mate_state == "U":
return( ([ "A" ], match_index, gap_count ) )
elif mate_state == "C":
return( ([ "G" ], match_index, gap_count ))
elif mate_state == "G":
return( ( [ "C" ], match_index, gap_count))
else:
Exception("unknown mate state %s" % mate_state)
__author__ = 'Tom'
__VERBOSE__ = False
class Nucleotide:
shortname = None
states = ["A", "C", "G", "U"]
def __init__(self):
self.next = None
def add_next(self, next):
if self.next is None:
self.next = next
else:
self.next.add_next(next)
return self
def evaluate(self, max_gap_count):
pass
def get_states(self, matchlist, match_index, gap_count, max_gap_count):
return ((self.states, match_index, gap_count))
# hook for 'Match' nucleotides to report state
def report_state(self, state, matchlist):
return matchlist
def length(self):
if self.next is not None:
return 1 + self.next.length()
else:
return 1
# without gaps
def min_length(self):
if self.next is not None:
return 1 + self.next.min_length()
else:
return 1
# This function finds all possible sequences and limits them to GC count
def do_evaluate(self, sequence, gap_count, max_gap_count, sequences, matchlist, match_index, step_count, total_length, gc_counts, min_gc_content):
# determines possible nucleotides (states) for the certain position
(states, match_index, gap_count) = self.get_states(matchlist, match_index, gap_count, max_gap_count)
# recursively explore all nucleotide variants (states)
for state in states:
# if C or G add to the GC count
new_gc_counts = gc_counts
if state == "C" or state == "G":
new_gc_counts += 1
# create new sequence
newsequence = sequence + state
# hook for matches to report
newmatchlist = self.report_state(state, matchlist)
# if there are still nucleotides to evaluate
if self.next is not None:
# check if gc content could not be reached for time optimisation purposes
# current gc count plus all the possible positions (which could be gs or cs)
# divided by the total length, results in the maximal gc count possible
max_possible_gc_content = (new_gc_counts + total_length + gap_count - step_count ) / (total_length + gap_count)
#DEBUG
#print("step: %s total: %s max poss: %s gcs: %s min wanted: %s seq: %s gaps: %s eval: %s" % (step_count, total_length, round(max_possible_gc_content,2) , new_gc_counts, min_gc_content, newsequence, gap_count, max_possible_gc_content > min_gc_content))
# if the maximal gc count possible is not higher than the min gc count,
# then stop, otherwise continue
if max_possible_gc_content > min_gc_content:
if __VERBOSE__:
print("Evaluating %s, adding %s, and going to next node" % (self.shortname, state))
new_step_count = step_count
if state != "":
new_step_count += 1
self.next.do_evaluate(newsequence, gap_count, max_gap_count, sequences, newmatchlist, match_index, new_step_count, total_length, new_gc_counts, min_gc_content)
else:
#print("max poss: %s min wanted: %s" % (max_possible_gc_content, min_gc_content))
pass # it is stopped
else: # if the end (a leaf) was recursively reached, add the sequence to the tree
if __VERBOSE__:
print("Evaluating %s, adding %s, adding to final list" % (self.shortname, state))
# check if GC constraints are met
if self.validate_sequence(newsequence, new_gc_counts, min_gc_content):
sequences.add(newsequence)
def eval(self, max_gap_count, min_gc_content):
sequences = set()
self.do_evaluate("", # sequence
0, # gap count
max_gap_count, # max gap count
sequences, # list of sequences
"", # match list
0, # match index
1, # step count
self.min_length(), # min total length
0, # gc counts
min_gc_content) # min gc content
return sequences
def validate_sequence(self, sequence, gc_counts, min_gc_content):
return gc_counts / len(sequence) >= min_gc_content
def __str__(self):
return "%s" % (self.shortname)
#return "%s(%s)" % (self.shortname, ",".join(self.states))
def str_all(self):
if self.next is not None:
return str(self) + self.next.str_all()
else:
return str(self)
\ No newline at end of file
__author__ = 'Tom'
from lib.Nucleotide import Nucleotide
from lib.Match import Match
from lib.MatchMate import MatchMate
from lib.Gap import Gap
from lib.Loop import Loop
from lib.LoopOption import LoopOption
class StemLoopFactory:
def __init__(self):
self.loops = list()
self.max_gap_size = 0
self.max_total_gap_count = 0
self.min_loop_size = 4
self.max_loop_size = 6
self.stem_size = 6
self.min_matches_before_first_gap = 0
self.min_gc_content = 0
self.stemloop = None
self.sequences = None
def init_stemloop(self):
first = True
# MATCHES
for i in range(0, self.stem_size):
# start with a match
if first:
n = Match()
first = False
else:
n = n.add_next(Match())
# GAPS
# prevent gaps in the beginning of the loops set by this
# parameter self.min_matches_before_first_gap
if i < self.stem_size - self.min_matches_before_first_gap:
for i in range(0, self.max_gap_size):
n = n.add_next(Gap())
# LOOP
for i in range(0, self.min_loop_size):
n = n.add_next(Loop())
# OPTIONAL LOOP
for i in range(0, self.max_loop_size - self.min_loop_size):
n = n.add_next(LoopOption())
# MATCHES
for i in range(0, self.stem_size - 1):
n = n.add_next(MatchMate())
# GAPS
# prevent gaps in the beginning of the loops set by this
# parameter self.min_matches_before_first_gap
if i >= self.min_matches_before_first_gap - 1:
for i in range(0, self.max_gap_size):
n = n.add_next(Gap())
# end with a match
n.add_next(MatchMate())
self.stemloop = n
def check_init(self):
if self.stemloop is None:
self.init_stemloop()
def min_length(self):
return self.stemloop.min_length()
def length(self):
self.check_init()
return self.stemloop.length()
def backbone(self):
return self.stemloop.str_all()
def summary(self):
out = "maximum gap size: %s\n" % self.max_gap_size
out += "maximum total gap count: %s\n" % self.max_total_gap_count
out += "minimal length: %s\n" % self.min_length()
out += "maximal length: %s\n" % (self.min_length() + min(self.max_gap_size, self.max_total_gap_count))
out += "total length: %s\n" % self.length()
out += self.backbone()
return(out)
def sequences_summary(self):
out = "number of unique sequences %s" % self.sequences_count()
return(out)
def check_sequences_init(self):
if self.sequences is None:
self.calculate_sequences()
def calculate_sequences(self):
self.check_init()
self.sequences = self.stemloop.eval(self.max_gap_size, self.min_gc_content)
def sequences_count(self):
self.check_sequences_init()
return(len(self.sequences))
def print_sequences(self, file):
self.check_sequences_init()
fh = open(file,'w')
for sequence in self.sequences:
fh.write("%s\n" % sequence)
fh.close()
def sequences_header(self, i):
return list(self.sequences)[0:i]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment