Commit dde78cae authored by ilibarra's avatar ilibarra
Browse files

updating motif_plotter

parent 8eb71450
from lib.utils import *
from lib.plot_utils import *
from matplotlib.font_manager import FontProperties
from matplotlib.textpath import TextPath
import matplotlib.patches as patches
from matplotlib.transforms import Affine2D
from lib.motif_plotter.information_helper import *
import pandas as pd
import tempfile
from math import log
from lib.Motif.MotifConverter import MotifConverter
from Bio import motifs
def make_text_elements(text, x=0.0, y=0.0, width=1.0, height=1.0, color='blue', edgecolor="black",
font = FontProperties(family='monospace')):
tp = TextPath((0.0, 0.0), text, size=1, prop=font)
bbox = tp.get_extents()
bwidth = bbox.x1 - bbox.x0
bheight = bbox.y1 - bbox.y0
trafo = Affine2D()
trafo.translate(-bbox.x0, -bbox.y0)
trafo.scale(1 / bwidth * width, 1 / bheight * height)
trafo.translate(x,y)
tp = tp.transformed(trafo)
return patches.PathPatch(tp, facecolor=color, edgecolor=edgecolor)
def make_bar_plot(axes, texts, heights, width=0.8, colors=None, edgecolor="black"):
"""
Makes a bar plot but each bar is not just a rectangle but an element from the texts list
:param axes: the axes that is modified
:param texts: a list of strings, where each element is plotted as a "bar"
:param heights: a list of the height of each texts element
:param width: the width of the bar. Default: 0.8
:param colors: A list of colors, a list with a single entry or None. Default: None, which is plotted as blue
:return: None
"""
texts = list(texts)
heights = list(heights)
n_elem = len(texts)
if n_elem != len(heights):
raise ValueError("Texts and heights must be of the same length")
if colors is None:
colors = ['blue'] * n_elem
elif len(colors) == 1:
colors *= n_elem
axes.set_ylim(min(0,min(heights)), max(0,max(heights)))
axes.set_xlim(0, n_elem)
for idx, (text, height, color) in enumerate(zip(texts, heights, colors)):
text_shape = make_text_elements(text, x=idx+(1-width)/2, y=0, width=width, height=height,
color=color, edgecolor=edgecolor)
axes.add_patch(text_shape)
def make_stacked_bar_plot(axes, texts, heights, width=0.8, colors=None, edgecolor="black",
**kwargs):
"""
Makes a stackedbar plot but each bar is not just a rectangle but an element from the texts list
:param axes: the axes that is modified
:param texts: a list of list of strings, where each element is plotted as a "bar"
:param heights: a list of lists of the height of each texts element
:param width: the width of the bar. Default: 0.8
:param colors:
:return: None
"""
if colors is None:
colors = [['blue'] * len(text) for text in texts]
elif len(colors) == 1:
colors = [colors * len(text) for text in texts]
if len(texts) != len(heights):
raise ValueError("Texts and heights must be of the same length")
for idx, (text, height, color) in enumerate(zip(texts, heights, colors)):
y_stack_pos = 0
y_stack_neg = 0
for jdx, (t, h, c) in enumerate(zip(text, height, color)):
if h > 0:
text_shape = make_text_elements(t, x=idx+(1-width)/2, y=y_stack_pos, width=width, height=h,
color=c, edgecolor=edgecolor)
y_stack_pos += h
axes.add_patch(text_shape)
elif h < 0:
text_shape = make_text_elements(t, x=idx + (1 - width) / 2, y=y_stack_neg, width=width, height=h,
color=c, edgecolor=edgecolor)
y_stack_neg += h
axes.add_patch(text_shape)
axes.autoscale()
axes.set_xlim(0, len(texts))
plt.title(kwargs.get('title', ''))
plt.ylabel('Bits', fontsize=13)
plt.xlabel('Position')
plt.xticks([i + 0.5 for i in range(len(heights))],
[i + 1 for i in range(len(heights))])
plt.ylim(kwargs.get('ymin', 0), kwargs.get('ymax', 2.0))
class ConsensusMotifPlotter:
def __init__(self, elements, weights, colors=None):
self.n_elem = len(elements)
self.colors = colors
self.elements = elements
self.weights = weights
@classmethod
def from_importance_scoring(cls, value):
nucleotides = [['A', 'C', 'G', 'T']] * len(value.Sequence)
scores = value.Scores
colors = [['#008000', '#0000cc', '#cc0000', '#ffb300']] * len(value.Sequence)
sorted_nucleotides = np.array(nucleotides)
sorted_scores = np.array(scores)
sorted_colors = np.array(colors)
order = np.absolute(scores).argsort()
for i, order in enumerate(order):
sorted_scores[i, :] = sorted_scores[i, order]
sorted_nucleotides[i, :] = sorted_nucleotides[i, order]
sorted_colors[i, :] = sorted_colors[i, order]
return cls(sorted_nucleotides, sorted_scores, sorted_colors)
@classmethod
def from_weighted_sequence(cls, ws):
colors_scheme = {'G': '#ffb300', 'A': '#008000', 'C': '#0000cc', 'T': '#cc0000', '_': '#333333'}
return cls([[x] if x != '_' else "#" for x in ws.seq], [[x] for x in ws.scores],
[[colors_scheme[x]] for x in ws.seq])
@classmethod
def from_jaspar(cls, jaspar_path, trim_left=0, trim_right=0):
pfm = motifs.read(open(jaspar_path), 'jaspar')
ppm = pfm.counts.normalize(pseudocounts=0.5)
# print ppm
return ConsensusMotifPlotter.from_bio_motif(ppm, trim_left=trim_left,
trim_right=trim_right)
@classmethod
def from_pfm_matrix(cls, pfm, scale_info_content=True,
bits_per_position=None, bases=['A', 'C', 'G', 'T'],
colors=['#008000', '#0000cc', '#ffb300', '#cc0000']):
pfm = pd.DataFrame(pfm, columns=[i for i in range(len(pfm[0]))])
pfm = pfm.rename({0: 'A', 1: 'C', 2: 'G', 3: 'T'})
for i in range(pfm.shape[1]):
pfm[i] = pfm[i] / pfm[i].sum()
ppm = pfm
motif_converter = MotifConverter()
tmp_path = tempfile.mkstemp()[1]
motif_converter.convert_ppm_to_jaspar_pfm(ppm, tmp_path, motif_id='test')
motif_converter.convert_jaspar_pfm_to_homer_ppm(tmp_path, tmp_path)
return ConsensusMotifPlotter.from_homer_ppm(tmp_path)
@classmethod
def from_bio_motif(cls, pwm, scale_info_content=True,
bits_per_position=None, bases=['A', 'C', 'G', 'T'],
colors=['#008000', '#0000cc', '#ffb300', '#cc0000'],
trim_left=0, trim_right=0):
n_elem = len(pwm[bases[0]])
# print "motif width", n_elem
colors_scheme = {k: v for k, v in zip(bases, colors)}
basess = []
scoress = []
colorss = []
if bits_per_position is None and scale_info_content:
bits_per_position = []
for i in range(len(pwm['A'])):
probs = [pwm[b][i] for b in 'ACGT']
information = 2 - (-sum([log(pi, 2) * pi for pi in probs]))
# print probs, information
bits_per_position.append(information)
for i in range(0, n_elem):
if bits_per_position is None:
scores = [(b, pwm[b if b in pwm else i][i if b in pwm else j], colors_scheme[b])
for j, b in enumerate(bases)]
else:
scores = [(b, pwm[b if b in pwm else i][i if b in pwm else j] * bits_per_position[i], colors_scheme[b])
for j, b in enumerate(bases)]
scores.sort(key=lambda t: t[1])
basess += [[x[0] for x in scores]]
scoress += [[x[1] for x in scores]]
colorss += [[x[2] for x in scores]]
if trim_left != 0:
basess = basess[trim_left:]
scoress = scoress[trim_left:]
colorss = colorss[trim_left:]
if trim_right != 0:
basess = basess[:-trim_right]
scoress = scoress[:-trim_right]
colorss = colorss[:-trim_right]
return cls(basess, scoress, colorss)
@staticmethod
def get_bits_per_position(ppm):
bits_per_position = []
for i in range(len(ppm['A'])):
probs = [ppm[b][i] for b in 'ACGT']
# print probs
# print probs
information = 2 - (-sum([log(pi, 2) * pi for pi in probs]))
# print probs, information
bits_per_position.append(information)
return bits_per_position
@classmethod
def from_homer_ppm(cls, ppm_path, scale_info_content=True,
bases=['A', 'C', 'G', 'T'], colors=['#008000', '#0000cc', '#ffb300', '#cc0000']):
# gemerate a PWM from our sequences
ppm = pd.read_csv(ppm_path, sep='\t', skiprows=[0], header=None)
ppm.columns = ['A', 'C', 'G', 'T']
ppm = ppm + 0.01
ppm = pd.DataFrame([[ri / sum(r.values) for ri in r.values] for i, r in ppm.iterrows()],
columns=['A', 'C', 'G', 'T'])
bits_per_position = ConsensusMotifPlotter.get_bits_per_position(ppm)
return ConsensusMotifPlotter.from_bio_motif(ppm, scale_info_content=scale_info_content,
bits_per_position=bits_per_position,
bases=bases,
colors=colors)
@classmethod
def from_ppm(cls, ppm, scale_info_content=True,
bases=['A', 'C', 'G', 'T'],
colors=['#008000', '#0000cc', '#ffb300', '#cc0000'],
data_output_path=None, **kwargs):
# gemerate a PWM from our sequences
ppm = ppm + 0.01
ppm = ppm.transpose()
bits_per_position = ConsensusMotifPlotter.get_bits_per_position(ppm)
if data_output_path is not None:
cPickle.dump(bits_per_position, open(data_output_path, 'wb'))
return ConsensusMotifPlotter.from_bio_motif(ppm, scale_info_content=scale_info_content,
bits_per_position=bits_per_position,
bases=bases,
colors=colors,
**kwargs)
@classmethod
def from_alignment(cls, alignment_path):
seqs = [s.strip().split(" ")[-1] for s in open(alignment_path) if len(s.strip().split(" ")[-1]) != 0 and not 'CLUSTAL' in s]
return ConsensusMotifPlotter.from_sequences(seqs)
@classmethod
def from_sequences(cls, sequences, scale_info_content=True,
bits_per_position=None, bases=['A', 'C', 'G', 'T'],
colors=['#008000', '#0000cc', '#ffb300', '#cc0000'],
other=None):
# build a reference ppm
# print 'n sequences', len(sequences)
ppm = [[(sum([s[i] == k for s in sequences]) + 0.25) / float(len(sequences) + 1)
for k in ['A', 'C', 'G', 'T']] for i in
range(len(sequences[0]))]
ppm = pd.DataFrame(ppm, columns=['A', 'C', 'G', 'T'])
if bits_per_position is None:
bits_per_position = []
for i in range(len(ppm['A'])):
probs = [ppm[b][i] for b in 'ACGT']
# print probs
information = 2 - (-sum([log(pi, 2) * pi for pi in probs]))
# print probs, information
bits_per_position.append(information)
# sometimes, we want to compare the information divergence between two plots
# so we calculate a difference
# print other
if other is not None:
ppm2 = [[(sum([s[i] == k for s in other]) + 0.25) / float(len(other) + 1)
for k in ['A', 'C', 'G', 'T']] for i in
range(len(other[0]))]
ppm2 = pd.DataFrame(ppm2, columns=['A', 'C', 'G', 'T'])
information_difference = []
for i in range(len(ppm['A'])):
probs = [ppm[b][i] for b in 'ACGT']
probs2 = [ppm2[b][i] for b in 'ACGT']
print(probs)
print(probs2)
info_diff = 2 - (-sum([log(abs((pi - pj) if (pi - pj) != 0 else 0.001), 2) * abs(pi - pj)
for pi, pj in zip(probs, probs2)]))
# print probs, information
information_difference.append(information)
diff = (ppm - ppm2).abs() + 0.001
# gemerate a PWM from our sequences
return ConsensusMotifPlotter.from_bio_motif(diff, scale_info_content=scale_info_content,
bits_per_position=info_diff,
bases=bases,
colors=colors)
# gemerate a PWM from our sequences
return ConsensusMotifPlotter.from_bio_motif(ppm, scale_info_content=scale_info_content,
bits_per_position=bits_per_position,
bases=bases,
colors=colors)
def plot(self, axes, **kwargs):
"""
Add the motif to an axes
:return: modifies the axes object with all the necessary characters
"""
make_stacked_bar_plot(axes, self.elements, self.weights, width=1,
colors=self.colors, edgecolor="none", **kwargs)
if kwargs.get('style', None) == 'empty':
plt.title('') # G%i, N=%i' % (i, len(g.sequences)))
plt.xticks([])
plt.yticks([])
plt.xlabel('')
plt.ylabel('')
despine_all()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment