Commit dc80e121 authored by Pierre Neveu

Add new file

parent 989649c0
# Authors: Hanna L. Sladitschek & Pierre A. Neveu
# 03.04.2019
# Requires Python 2.7 (>=2.7.5), scipy>=0.12, numpy>=1.7, matplotlib and nmf.py
# tested on Mac OS
import os
import math
import pickle
from scipy.cluster import vq, hierarchy
from numpy import *
from numpy.linalg import norm
from matplotlib.pyplot import figure, savefig, close
from nmf import *
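
# nmf() comes from the external nmf.py imported above; the calls below assume the
# signature nmf(V, Winit, Hinit, tol, timelimit, maxiter) returning the factor
# matrices (W, H) with V ~ W.H -- check the bundled nmf.py if its interface differs.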

def scection_step(data, indices_to_consider, min_expression, expression_range, nmf_runs=50, cluster_threshold=0.25):
    """One SCECTION splitting step: consensus NMF clustering of the samples in indices_to_consider.

    Returns [consensus connectivity matrix, dendrogram leaf order, cluster labels]."""
    nb_metaprofiles=2
    nb_samples=len(indices_to_consider)
    data_to_consider=data[:,indices_to_consider]
    # keep only genes expressed above min_expression in at least one sample
    # and with a fold change of at least expression_range across samples
    genes_indices=((amax(data_to_consider,axis=1)>=log(min_expression)/log(2))
                   & (amax(data_to_consider,axis=1)-amin(data_to_consider,axis=1)>=log(expression_range)/log(2))).nonzero()[0]
    clustering_consensus=zeros((nb_samples,nb_samples))
    if len(genes_indices)>1:
        data_to_consider=data_to_consider[genes_indices]
        # rescale expression levels to have mean=1 and variance=1
        samplematrix=array([row/mean(row) for row in data_to_consider])
        samplematrix=vq.whiten(samplematrix)
        # perform nmf_runs runs of NMF with two metaprofiles
        for k in range(nmf_runs):
            winit=random.rand(len(samplematrix),nb_metaprofiles)
            hinit=random.rand(nb_metaprofiles,len(samplematrix[0]))
            w,h=nmf(samplematrix,winit,hinit,0.0001,500,500)
            clusternb=h.argsort(0)[0]
            # count how often two samples fall into the same metaprofile
            clustering_consensus+=array([[1 if clusternb[elem]==clusternb[elem2] else 0
                                          for elem2 in range(nb_samples)]
                                         for elem in range(nb_samples)])
        clustering_consensus=clustering_consensus*1.0/nmf_runs
        # cluster the connectivity matrix
        z=hierarchy.average(clustering_consensus)
        cluster_order=hierarchy.dendrogram(z)['leaves']
        clustered=hierarchy.fcluster(z,cluster_threshold,criterion='distance')
        # if every sample forms its own cluster, keep the samples together instead
        if max(clustered)==nb_samples:
            clustered=list(ones((nb_samples,1)))
    else:
        # not enough variable genes: do not split this group
        cluster_order=range(nb_samples)
        clustered=list(ones((nb_samples,1)))
    return [clustering_consensus, cluster_order, clustered]

def scection_full(data, indices_to_consider, min_expression, expression_range, nmf_runs=50, cluster_threshold=0.25, saveoutputfig=True, fig_name='sample', pickleresults=True):
    """Run SCECTION iteratively: split the full sample set, then re-split every resulting
    cluster until no cluster can be divided further. Optionally save a consensus-matrix
    heatmap per split in scectiondump/ and pickle the results.

    Returns [scection_result, sample_indices] with one entry per SCECTION round."""
    if saveoutputfig or pickleresults:
        allfiles=os.listdir('.')
        if 'scectiondump' not in allfiles:
            os.mkdir('scectiondump')
    # first round of SCECTION
    scection_result=[]
    scection_i=1
    scection_j=1
    firststep=scection_step(data,indices_to_consider,min_expression,expression_range,nmf_runs,cluster_threshold)
    scection_result.append([firststep])
    sample_indices=[[array(indices_to_consider)]]
    if saveoutputfig:
        fig=figure()
        ax0 = fig.add_subplot(111)
        im=ax0.pcolormesh(((firststep[0])[firststep[1]])[:,firststep[1]])
        fig.colorbar(im,ax=ax0)
        ax0.set_aspect('equal')
        ax0.autoscale(tight=True)
        savefig('scectiondump/'+fig_name+str(scection_i).zfill(3)+'.'+str(scection_j).zfill(3)+'.pdf',facecolor='None',edgecolor='None',format='pdf')
        close()
    # subsequent SCECTION rounds
    tocontinue=True
    while tocontinue:
        scection_i+=1
        scection_j=1
        tocontinue=False
        scection_result.append([])
        sample_indices.append([])
        for i in range(len(scection_result[-2])):
            if len(scection_result[-2][i])>0 and max(scection_result[-2][i][2])>1:
                # the previous step split this group: run SCECTION on each sub-cluster
                for j in range(max(scection_result[-2][i][2])):
                    indices_to_consider_sub=(scection_result[-2][i][2]==j+1).nonzero()[0]
                    substep=scection_step(data,sample_indices[-2][i][indices_to_consider_sub],min_expression,expression_range,nmf_runs,cluster_threshold)
                    if saveoutputfig:
                        fig=figure()
                        ax0 = fig.add_subplot(111)
                        im=ax0.pcolormesh(((substep[0])[substep[1]])[:,substep[1]])
                        fig.colorbar(im,ax=ax0)
                        ax0.set_aspect('equal')
                        ax0.autoscale(tight=True)
                        savefig('scectiondump/'+fig_name+str(scection_i).zfill(3)+'.'+str(scection_j).zfill(3)+'.pdf',facecolor='None',edgecolor='None',format='pdf')
                        close()
                    scection_result[-1].append(substep)
                    sample_indices[-1].append(sample_indices[-2][i][indices_to_consider_sub])
                    scection_j+=1
                    tocontinue=True
            else:
                scection_result[-1].append([])
                sample_indices[-1].append([])
    if pickleresults:
        file_o=open('scectiondump/'+fig_name+'pickled','wb')
        pickle.dump([scection_result, sample_indices],file_o)
        file_o.close()
    return [scection_result, sample_indices]
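
# Reloading a pickled SCECTION run (a sketch; it assumes the default fig_name 'sample'
# was used, so scection_full wrote 'scectiondump/samplepickled'):
#   with open('scectiondump/samplepickled','rb') as f:
#       scection_result, sample_indices = pickle.load(f)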

def average_scection_profiles(data, sample_indices):
    """Average the expression profiles (columns of data) within each SCECTION sample group.

    Returns [average profiles (genes x groups), list of sample index arrays per group]."""
    averageprofiles=[]
    averageindices=[]
    for i in range(len(sample_indices)):
        for j in range(len(sample_indices[i])):
            if len(sample_indices[i][j])>0:
                averageprofiles.append(mean(data[:,sample_indices[i][j]], axis=1))
                averageindices.append(sample_indices[i][j])
    averageprofiles=transpose(array(averageprofiles))
    return [averageprofiles,averageindices]

def scection_hierarchy(averageindices):
    """For each sample group, list the earlier (larger) groups that contain it.

    Returns [index_hierarchy, indextopop], where indextopop holds the indices of the
    terminal groups, i.e. groups that were never split further."""
    index_hierarchy=[]
    indextopop=range(len(averageindices))
    for i in range(len(averageindices)-1,-1,-1):
        index_hierarchy.append([i])
        for j in range(i-1,-1,-1):
            if averageindices[i][0] in averageindices[j]:
                index_hierarchy[-1].append(j)
                if j in indextopop:
                    indextopop.remove(j)
    indextopop.sort()
    return [index_hierarchy, indextopop]

def combine_scection_output(data, scection_results_all, min_expression, expression_range, estimated_coverage):
    """Combine the SCECTION results of several runs (scection_results_all is a list of
    scection_full outputs on data). Keeps genes and clusters detected in enough runs
    (threshold set by estimated_coverage), consensus-clusters the average profiles of the
    terminal groups with k-means, and returns the sample index lists of the final clusters."""
    genes_indicesall=[]
    averageprofilesall=[]
    scection_hierarchyall=[]
    for i in range(len(scection_results_all)):
        data_to_consider=data[:,scection_results_all[i][1][0][0]]
        averageprofilesall.append(average_scection_profiles(data, scection_results_all[i][1]))
        scection_hierarchyall.append(scection_hierarchy(averageprofilesall[-1][1]))
        genes_indicesall.append(((amax(data_to_consider,axis=1)>=log(min_expression)/log(2))
                                 & (amax(data_to_consider,axis=1)-amin(data_to_consider,axis=1)>=log(expression_range)/log(2))).nonzero()[0])
    # get all genes with variable expression, keeping those detected in enough runs
    genes_indices=list(genes_indicesall[0])
    for i in range(1,len(scection_results_all)):
        genes_indices+=list(genes_indicesall[i])
    genes_indices.sort()
    newgenes_indices=[]
    for elem in set(genes_indices):
        if genes_indices.count(elem)>len(scection_results_all)*estimated_coverage-sqrt(len(scection_results_all)*estimated_coverage):
            newgenes_indices.append(elem)
    newgenes_indices.sort()
    genes_indices=list(newgenes_indices)
    # get associated sample indices for the average profiles of the terminal groups
    llall1=[]
    for i in range(len(scection_results_all)):
        llall1+=[averageprofilesall[i][1][elem] for elem in scection_hierarchyall[i][1]]
    llall1=array(llall1)
    # normalize data to have mean=0 and std=1 per gene
    data_to_consider=[]
    for i in range(len(scection_results_all)):
        data_to_consider0=(averageprofilesall[i][0])[genes_indices]
        for j in range(len(genes_indices)):
            data_to_consider0[j]=data_to_consider0[j]-mean(data_to_consider0[j])
        for j in range(len(genes_indices)):
            if std(data_to_consider0[j])>0:
                data_to_consider0[j]=data_to_consider0[j]/std(data_to_consider0[j])
        data_to_consider.append(data_to_consider0)
    expected_k=int(ceil(1/estimated_coverage*median([len(scection_hierarchyall[i][1]) for i in range(0,len(scection_results_all))])))
    data_to_considerall=data_to_consider[0][:,scection_hierarchyall[0][1]]
    for i in range(1,len(scection_results_all)):
        data_to_considerall=hstack((data_to_considerall,data_to_consider[i][:,scection_hierarchyall[i][1]]))
    nb_samples=len(data_to_considerall[0])
    # cluster profiles using k-means (50 repetitions to build a consensus)
    clustering_consensus=zeros((nb_samples,nb_samples))
    for kk in range(50):
        kclustered,kclusterlabels=vq.kmeans2(transpose(data_to_considerall),expected_k,iter=500,minit='points')
        ind_clusters=[(kclusterlabels==i).nonzero()[0] for i in range(expected_k)]
        # keep only clusters populated by enough profiles
        cluster_to_keep=[]
        for i in range(expected_k):
            if len(ind_clusters[i])>len(scection_results_all)*estimated_coverage-sqrt(len(scection_results_all)*estimated_coverage):
                cluster_to_keep.append(i)
        # reassign profiles from discarded clusters to the closest kept centroid
        for i in range(len(kclusterlabels)):
            if kclusterlabels[i] not in cluster_to_keep:
                dd=[norm(data_to_considerall[:,i]-kclustered[elem]) for elem in cluster_to_keep]
                kclusterlabels[i]=cluster_to_keep[argmin(dd)]
        clustering_consensus+=array([[1 if kclusterlabels[elem]==kclusterlabels[elem2] else 0
                                      for elem2 in range(nb_samples)]
                                     for elem in range(nb_samples)])
    clustering_consensus=clustering_consensus*1.0/50
    # use the k-means consensus to build the final clusters: connected components of
    # profiles that co-cluster in more than half of the repetitions
    clustering_consensus2=zeros((nb_samples,nb_samples))
    clustering_consensus2[(clustering_consensus>0.5).nonzero()]=1
    finalclustered=array([-1 for i in range(nb_samples)])
    stilltoattribute=range(nb_samples)
    cluster_nb=0
    for i in range(nb_samples):
        if i in stilltoattribute:
            ind=(clustering_consensus2[i]==1).nonzero()[0]
            grow_ind=set(ind)
            tocontinue=True
            while tocontinue:
                tocontinue=False
                for elem in grow_ind:
                    ind2=(clustering_consensus2[elem]==1).nonzero()[0]
                    grow_ind2=set(grow_ind | set(ind2))
                    if len(grow_ind2)>len(grow_ind):
                        tocontinue=True
                        grow_ind=set(grow_ind2)
            finalclustered[list(grow_ind)]=cluster_nb
            cluster_nb+=1
            for elem in grow_ind:
                if elem in stilltoattribute:
                    stilltoattribute.remove(elem)
    final_indices=[[] for i in range(max(finalclustered)+1)]
    for i in range(max(finalclustered)+1):
        for elem in (finalclustered==i).nonzero()[0]:
            final_indices[i].extend(llall1[elem])
    return final_indices
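
# Minimal usage sketch: it assumes a hypothetical file 'expression_matrix.npy' holding an
# expression matrix of shape (genes, samples); the gene filters above compare against log2
# values, so the matrix is taken to be log2-transformed. Thresholds are illustrative only.
if __name__ == '__main__':
    expression_matrix=load('expression_matrix.npy')
    all_samples=range(expression_matrix.shape[1])
    # iterative SCECTION partitioning of all samples
    scection_result,sample_indices=scection_full(expression_matrix,all_samples,
                                                 min_expression=10,expression_range=4,
                                                 saveoutputfig=False,pickleresults=False)
    # average expression profile of each identified sample group
    averageprofiles,averageindices=average_scection_profiles(expression_matrix,sample_indices)
    # split hierarchy between groups and indices of the terminal (unsplit) groups
    index_hierarchy,terminal_groups=scection_hierarchy(averageindices)
    print('%d terminal sample groups' % len(terminal_groups))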