# Authors: Hanna L. Sladitschek & Pierre A. Neveu
# 03.04.2019
# Requires Python 2.7 >=2.7.5, scipy>=0.12, numpy >=1.7 and nmf.py
# tested on Mac OS

"""SCECTION: recursive bisection clustering of single-cell expression samples.

Each step splits a set of samples in two via consensus rank-2 non-negative
matrix factorization (NMF); the split is applied recursively until no subset
subdivides further.  Results from several SCECTION runs can afterwards be
merged into final clusters by consensus k-means (combine_scection_output).

NOTE(review): figure/savefig/close (matplotlib) and norm are used below but
never imported explicitly -- presumably they are re-exported by
``from nmf import *``; verify against nmf.py.
"""

import os
import math
from scipy.cluster import vq
from numpy import *
from scipy.cluster import hierarchy
from nmf import *
import pickle


def scection_step(data, indices_to_consider, min_expression, expression_range, nmf_runs=50, cluster_threshold=0.25):
    """Perform one SCECTION bisection of the samples in ``indices_to_consider``.

    Parameters
    ----------
    data : 2-D array (genes x samples) of expression values, assumed to be
        log2-scale since the linear thresholds below are converted with
        log(x)/log(2) -- TODO confirm against the data loader.
    indices_to_consider : sequence of column (sample) indices to cluster.
    min_expression : linear-scale level a gene must reach in at least one
        sample to count as variable.
    expression_range : minimum linear-scale fold change across samples for a
        gene to count as variable.
    nmf_runs : number of random-restart NMF runs pooled into the consensus.
    cluster_threshold : distance cutoff used to flatten the hierarchical
        clustering of the consensus matrix.

    Returns
    -------
    [clustering_consensus, cluster_order, clustered]
        clustering_consensus : nb_samples x nb_samples matrix whose (i, j)
            entry is the fraction of NMF runs co-clustering samples i and j
            (all zeros when fewer than 2 variable genes were found).
        cluster_order : dendrogram leaf order of the consensus matrix
            (identity order when no clustering was performed).
        clustered : flat cluster label per sample (1-based, from fcluster),
            or a list of ones when no meaningful split was found.
    """
    nb_metaprofiles=2  # rank-2 NMF: each step attempts a binary split
    nb_samples=len(indices_to_consider)
    data_to_consider=data[:,indices_to_consider]

    # keep only "variable" genes: expressed above min_expression in at least
    # one sample AND with a fold change of at least expression_range across
    # samples (both thresholds converted to log2 units)
    genes_indices=((amax(data_to_consider,axis=1)>=log(min_expression)/log(2))
                   & (amax(data_to_consider,axis=1)-amin(data_to_consider,axis=1)>=log(expression_range)/log(2))).nonzero()[0]

    clustering_consensus=zeros((nb_samples,nb_samples))

    if len(genes_indices)>1:
        data_to_consider=data_to_consider[genes_indices]

        # rescale: mean=1 per gene row, then vq.whiten divides each column
        # by its standard deviation
        samplematrix=array([row/mean(row) for row in data_to_consider])
        samplematrix=vq.whiten(samplematrix)

        # pool nmf_runs random-restart NMF runs into a co-clustering matrix
        for k in range(nmf_runs):
            winit=random.rand(len(samplematrix),nb_metaprofiles)
            hinit=random.rand(nb_metaprofiles,len(samplematrix[0]))
            w,h=nmf(samplematrix,winit,hinit,0.0001,500,500)
            # assign each sample to the metaprofile with the SMALLEST h
            # coefficient; with 2 metaprofiles this produces the same binary
            # partition as taking the largest
            clusternb=h.argsort(0)[0]
            clustering_consensus+=array([[1 if clusternb[elem]==clusternb[elem2] else 0
                                          for elem2 in range(nb_samples)]
                                         for elem in range(nb_samples)])
        # fraction of runs in which each sample pair co-clustered
        clustering_consensus=clustering_consensus*1.0/nmf_runs

        # cluster the consensus (connectivity) matrix hierarchically
        # NOTE(review): hierarchy.dendrogram draws into the current figure by
        # default (no no_plot flag here) -- verify this is intended
        z=hierarchy.average(clustering_consensus)
        cluster_order=hierarchy.dendrogram(z)['leaves']
        clustered=hierarchy.fcluster(z,cluster_threshold,criterion='distance')
        # every sample in its own cluster means no reproducible split was
        # found: collapse to a single cluster so the recursion stops here
        if max(clustered)==nb_samples:
            clustered=list(ones((nb_samples,1)))

    else:
        # fewer than 2 variable genes: nothing to split on
        # (list(...) keeps the Python 2 behavior of range() under Python 3)
        cluster_order=list(range(nb_samples))
        clustered=list(ones((nb_samples,1)))

    return [clustering_consensus, cluster_order, clustered]


def scection_full(data, indices_to_consider, min_expression, expression_range, nmf_runs=50, cluster_threshold=0.25, saveoutputfig=True, fig_name='sample', pickleresults=True):
    """Run SCECTION recursively until no cluster subdivides further.

    Round 1 bisects all of ``indices_to_consider``; each later round re-runs
    scection_step on every cluster of the previous round that still contains
    more than one flat cluster.  Optionally saves a heatmap of the ordered
    consensus matrix for each step under ./scectiondump/ and pickles the full
    result there.

    Fix: ``nmf_runs`` and ``cluster_threshold`` are now forwarded to
    scection_step; previously they were accepted but silently ignored.

    Returns
    -------
    [scection_result, sample_indices]
        scection_result[r][c] : scection_step output for cluster c of round r
            ([] where a cluster was not re-split).
        sample_indices[r][c] : original sample indices belonging to that
            cluster ([] where a cluster was not re-split).
    """
    if saveoutputfig or pickleresults:
        allfiles=os.listdir('.')
        if 'scectiondump' not in allfiles:
            os.mkdir('scectiondump')

    # first round of SCECTION: bisect the full sample set
    scection_result=[]
    scection_i=1  # round counter, used in figure file names
    scection_j=1  # step-within-round counter, used in figure file names
    firststep=scection_step(data, indices_to_consider, min_expression, expression_range,
                            nmf_runs=nmf_runs, cluster_threshold=cluster_threshold)
    scection_result.append([firststep])
    sample_indices=[[array(indices_to_consider)]]

    if saveoutputfig:
        # heatmap of the consensus matrix reordered by dendrogram leaf order
        fig=figure()
        ax0 = fig.add_subplot(111)
        im=ax0.pcolormesh(((firststep[0])[firststep[1]])[:,firststep[1]])
        fig.colorbar(im,ax=ax0)
        ax0.set_aspect('equal')
        ax0.autoscale(tight=True)
        savefig('scectiondump/'+fig_name+str(scection_i).zfill(3)+'.'+str(scection_j).zfill(3)+'.pdf',facecolor='None',edgecolor='None',format='pdf')
        close()

    # subsequent SCECTION rounds: keep splitting until nothing subdivides
    tocontinue=True
    while tocontinue:
        scection_i+=1
        scection_j=1
        tocontinue=False
        scection_result.append([])
        sample_indices.append([])
        for i in range(len(scection_result[-2])):
            # re-split only clusters from the previous round that actually
            # split into more than one flat cluster
            if len(scection_result[-2][i])>0 and max(scection_result[-2][i][2])>1:
                for j in range(max(scection_result[-2][i][2])):
                    # fcluster labels are 1-based, hence j+1
                    indices_to_consider_sub=(scection_result[-2][i][2]==j+1).nonzero()[0]
                    substep=scection_step(data, sample_indices[-2][i][indices_to_consider_sub],
                                          min_expression, expression_range,
                                          nmf_runs=nmf_runs, cluster_threshold=cluster_threshold)

                    if saveoutputfig:
                        fig=figure()
                        ax0 = fig.add_subplot(111)
                        im=ax0.pcolormesh(((substep[0])[substep[1]])[:,substep[1]])
                        fig.colorbar(im,ax=ax0)
                        ax0.set_aspect('equal')
                        ax0.autoscale(tight=True)
                        savefig('scectiondump/'+fig_name+str(scection_i).zfill(3)+'.'+str(scection_j).zfill(3)+'.pdf',facecolor='None',edgecolor='None',format='pdf')
                        close()

                    scection_result[-1].append(substep)
                    sample_indices[-1].append(sample_indices[-2][i][indices_to_consider_sub])
                    scection_j+=1
                    tocontinue=True
            else:
                # placeholder so positions stay aligned across rounds
                scection_result[-1].append([])
                sample_indices[-1].append([])

    if pickleresults:
        # context manager guarantees the file is closed even if dump raises
        with open('scectiondump/'+fig_name+'pickled','wb') as file_o:
            pickle.dump([scection_result, sample_indices],file_o)

    return [scection_result, sample_indices]


def average_scection_profiles(data, sample_indices):
    """Average the expression profile of every SCECTION sample group.

    Parameters
    ----------
    data : 2-D array (genes x samples) of expression values.
    sample_indices : nested list of sample-index arrays as returned in
        scection_full's second output.

    Returns
    -------
    [averageprofiles, averageindices]
        averageprofiles : genes x groups array; column g is the mean profile
            over the samples of group g.
        averageindices : flat list of the sample-index arrays, in the same
            order as the columns of averageprofiles (empty groups skipped).
    """
    averageprofiles=[]
    averageindices=[]
    for i in range(len(sample_indices)):
        for j in range(len(sample_indices[i])):
            if len(sample_indices[i][j])>0:
                averageprofiles.append(mean(data[:,sample_indices[i][j]], axis=1))
                averageindices.append(sample_indices[i][j])
    # transpose so that each column is one group's average profile
    averageprofiles=transpose(array(averageprofiles))

    return [averageprofiles,averageindices]


def scection_hierarchy(averageindices):
    """Recover the containment hierarchy among SCECTION sample groups.

    For each group i (scanned from last to first) records [i] followed by
    all earlier groups j whose sample set contains group i's first sample --
    i.e. presumably group i's ancestors in the recursive splitting, since
    subgroups are appended after the groups they were split from (verify).

    Returns
    -------
    [index_hierarchy, indextopop]
        index_hierarchy : one [group, ancestors...] list per group, ordered
            from the last group to the first.
        indextopop : sorted indices of the groups that are not an ancestor of
            any later group (the leaves of the hierarchy).
    """
    index_hierarchy=[]
    # list(...) keeps Python 2 behavior: a mutable list we can .remove() from
    indextopop=list(range(len(averageindices)))
    for i in range(len(averageindices)-1,-1,-1):
        index_hierarchy.append([i])
        for j in range(i-1,-1,-1):
            # membership of the first sample suffices: groups are either
            # nested or disjoint in a recursive bisection
            if averageindices[i][0] in averageindices[j]:
                index_hierarchy[-1].append(j)
                if j in indextopop:
                    indextopop.remove(j)
    indextopop.sort()
    return [index_hierarchy, indextopop]


def combine_scection_output(data, scection_results_all, min_expression, expression_range, estimated_coverage):
    """Combine several SCECTION runs into consensus clusters of samples.

    Parameters
    ----------
    data : 2-D array (genes x samples) of expression values.
    scection_results_all : list of scection_full outputs
        (each ``[scection_result, sample_indices]``).
    min_expression, expression_range : variable-gene thresholds, as in
        scection_step.
    estimated_coverage : presumably the expected fraction of runs covering
        each sample/gene; it scales both the recurrence threshold
        (n*c - sqrt(n*c)) and the expected cluster count -- verify.

    Returns
    -------
    final_indices : list of lists; element k holds the original sample
        indices assigned to final consensus cluster k.
    """
    genes_indicesall=[]
    averageprofilesall=[]
    scection_hierarchyall=[]
    for i in range(len(scection_results_all)):
        # sample_indices[0][0] of each run is the full sample set of that run
        data_to_consider=data[:,scection_results_all[i][1][0][0]]

        averageprofilesall.append(average_scection_profiles(data, scection_results_all[i][1]))
        scection_hierarchyall.append(scection_hierarchy(averageprofilesall[-1][1]))

        # variable genes for this run (same criterion as scection_step)
        genes_indicesall.append(((amax(data_to_consider,axis=1)>=log(min_expression)/log(2))
                                 & (amax(data_to_consider,axis=1)-amin(data_to_consider,axis=1)>=log(expression_range)/log(2))).nonzero()[0])

    # keep genes that are variable in sufficiently many runs
    # (threshold: n*coverage minus one standard deviation of a Poisson count)
    genes_indices=list(genes_indicesall[0])
    for i in range(1,len(scection_results_all)):
        genes_indices+=list(genes_indicesall[i])
    genes_indices.sort()
    newgenes_indices=[]
    for elem in set(genes_indices):
        if genes_indices.count(elem)>len(scection_results_all)*estimated_coverage-sqrt(len(scection_results_all)*estimated_coverage):
            newgenes_indices.append(elem)
    newgenes_indices.sort()
    genes_indices=list(newgenes_indices)

    # associated sample indices for every leaf average profile, all runs
    llall1=[]
    for i in range(len(scection_results_all)):
        llall1+=[averageprofilesall[i][1][elem] for elem in scection_hierarchyall[i][1]]
    llall1=array(llall1)

    # normalize each gene row of each run's average profiles to mean=0,
    # std=1 (skip the division for constant rows)
    data_to_consider=[]
    for i in range(len(scection_results_all)):
        data_to_consider0=(averageprofilesall[i][0])[genes_indices]
        for j in range(len(genes_indices)):
            data_to_consider0[j]=data_to_consider0[j]-mean(data_to_consider0[j])
        for j in range(len(genes_indices)):
            if std(data_to_consider0[j])>0:
                data_to_consider0[j]=data_to_consider0[j]/std(data_to_consider0[j])
        data_to_consider.append(data_to_consider0)

    # expected number of consensus clusters: median leaf count per run,
    # scaled up by 1/coverage (1.0 forces float division under Python 2)
    expected_k=int(ceil(1.0/estimated_coverage*median([len(scection_hierarchyall[i][1]) for i in range(0,len(scection_results_all))])))

    # stack the leaf profiles of all runs side by side (genes x leaves)
    data_to_considerall=data_to_consider[0][:,scection_hierarchyall[0][1]]
    for i in range(1,len(scection_results_all)):
        data_to_considerall=hstack((data_to_considerall,data_to_consider[i][:,scection_hierarchyall[i][1]]))

    nb_samples=len(data_to_considerall[0])

    # consensus over 50 k-means runs on the leaf profiles
    clustering_consensus=zeros((nb_samples,nb_samples))

    for kk in range(50):
        kclustered,kclusterlabels=vq.kmeans2(transpose(data_to_considerall),expected_k,iter=500,minit='points')
        ind_clusters=[(kclusterlabels==i).nonzero()[0] for i in range(expected_k)]
        # keep only clusters recurring in enough runs (same threshold form
        # as for variable genes above)
        cluster_to_keep=[]
        for i in range(expected_k):
            if len(ind_clusters[i])>len(scection_results_all)*estimated_coverage-sqrt(len(scection_results_all)*estimated_coverage):
                cluster_to_keep.append(i)

        # reassign members of discarded clusters to the nearest kept centroid
        for i in range(len(kclusterlabels)):
            if kclusterlabels[i] not in cluster_to_keep:
                dd=[norm(data_to_considerall[:,i]-kclustered[elem]) for elem in cluster_to_keep]
                kclusterlabels[i]=cluster_to_keep[argmin(dd)]

        clustering_consensus+=array([[1 if kclusterlabels[elem]==kclusterlabels[elem2] else 0
                                      for elem2 in range(nb_samples)]
                                     for elem in range(nb_samples)])

    clustering_consensus=clustering_consensus*1.0/50

    # binarize the consensus: pairs co-clustering in >50% of runs are linked
    clustering_consensus2=zeros((nb_samples,nb_samples))
    clustering_consensus2[(clustering_consensus>0.5).nonzero()]=1

    finalclustered=array([-1 for i in range(nb_samples)])
    # list(...) keeps Python 2 behavior: a mutable list we can .remove() from
    stilltoattribute=list(range(nb_samples))

    # final clusters = connected components of the binarized consensus graph,
    # grown by fixed-point expansion of each seed's neighbourhood
    cluster_nb=0
    for i in range(nb_samples):
        if i in stilltoattribute:
            ind=(clustering_consensus2[i]==1).nonzero()[0]
            grow_ind=set(ind)
            tocontinue=True
            while tocontinue:
                tocontinue=False
                for elem in grow_ind:
                    ind2=(clustering_consensus2[elem]==1).nonzero()[0]
                    grow_ind2=set(grow_ind | set(ind2))
                    if len(grow_ind2)>len(grow_ind):
                        tocontinue=True
                        # rebinding grow_ind does not disturb the ongoing
                        # iteration over the previous set object
                        grow_ind=set(grow_ind2)

            finalclustered[list(grow_ind)]=cluster_nb
            cluster_nb+=1
            for elem in grow_ind:
                if elem in stilltoattribute:
                    stilltoattribute.remove(elem)

    # map each component back to the original sample indices of its leaves
    final_indices=[[] for i in range(max(finalclustered)+1)]
    for i in range(max(finalclustered)+1):
        for elem in (finalclustered==i).nonzero()[0]:
            final_indices[i].extend(llall1[elem])

    return final_indices