Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
morphoseq
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Pierre Neveu
morphoseq
Commits
dc80e121
Commit
dc80e121
authored
5 years ago
by
Pierre Neveu
Browse files
Options
Downloads
Patches
Plain Diff
Add new file
parent
989649c0
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
scection.py
+264
-0
264 additions, 0 deletions
scection.py
with
264 additions
and
0 deletions
scection.py
0 → 100644
+
264
−
0
View file @
dc80e121
# Authors: Hanna L. Sladitschek & Pierre A. Neveu
# 03.04.2019
# Requires Python 2.7 >=2.7.5, scipy>=0.12, numpy >=1.7 and nmf.py
# tested on Mac OS
import
os
import
math
from
scipy.cluster
import
vq
from
numpy
import
*
from
scipy.cluster
import
hierarchy
from
nmf
import
*
import
pickle
def
scection_step
(
data
,
indices_to_consider
,
min_expression
,
expression_range
,
nmf_runs
=
50
,
cluster_threshold
=
0.25
):
nb_metaprofiles
=
2
nb_samples
=
len
(
indices_to_consider
)
data_to_consider
=
data
[:,
indices_to_consider
]
# keep only genes that have in one sample and a fold change greater than expression_range across samples
genes_indices
=
((
amax
(
data_to_consider
,
axis
=
1
)
>=
log
(
min_expression
)
/
log
(
2
))
&
(
amax
(
data_to_consider
,
axis
=
1
)
-
amin
(
data_to_consider
,
axis
=
1
)
>=
log
(
expression_range
)
/
log
(
2
))).
nonzero
()[
0
]
clustering_consensus
=
zeros
((
nb_samples
,
nb_samples
))
if
len
(
genes_indices
)
>
1
:
data_to_consider
=
data_to_consider
[
genes_indices
]
# rescale expression levels to have mean=1 and variance=1
samplematrix
=
array
([
row
/
mean
(
row
)
for
row
in
data_to_consider
])
samplematrix
=
vq
.
whiten
(
samplematrix
)
# perform nmf_runs runs of NMF
for
k
in
range
(
nmf_runs
):
winit
=
random
.
rand
(
len
(
samplematrix
),
nb_metaprofiles
)
hinit
=
random
.
rand
(
nb_metaprofiles
,
len
(
samplematrix
[
0
]))
w
,
h
=
nmf
(
samplematrix
,
winit
,
hinit
,
0.0001
,
500
,
500
)
clusternb
=
h
.
argsort
(
0
)[
0
]
clustering_consensus
+=
array
([[
1
if
clusternb
[
elem
]
==
clusternb
[
elem2
]
else
0
for
elem2
in
range
(
nb_samples
)]
for
elem
in
range
(
nb_samples
)])
clustering_consensus
=
clustering_consensus
*
1.0
/
nmf_runs
#cluster the connectivity matrix
z
=
hierarchy
.
average
(
clustering_consensus
)
cluster_order
=
hierarchy
.
dendrogram
(
z
)[
'
leaves
'
]
clustered
=
hierarchy
.
fcluster
(
z
,
cluster_threshold
,
criterion
=
'
distance
'
)
if
max
(
clustered
)
==
nb_samples
:
clustered
=
list
(
ones
((
nb_samples
,
1
)))
else
:
cluster_order
=
range
(
nb_samples
)
clustered
=
list
(
ones
((
nb_samples
,
1
)))
return
[
clustering_consensus
,
cluster_order
,
clustered
]
def
scection_full
(
data
,
indices_to_consider
,
min_expression
,
expression_range
,
nmf_runs
=
50
,
cluster_threshold
=
0.25
,
saveoutputfig
=
True
,
fig_name
=
'
sample
'
,
pickleresults
=
True
):
if
saveoutputfig
or
pickleresults
:
allfiles
=
os
.
listdir
(
'
.
'
)
if
'
scectiondump
'
not
in
allfiles
:
os
.
mkdir
(
'
scectiondump
'
)
#first round of SCECTION
scection_result
=
[]
scection_i
=
1
scection_j
=
1
firststep
=
scection_step
(
data
,
indices_to_consider
,
min_expression
,
expression_range
)
scection_result
.
append
([
firststep
])
sample_indices
=
[[
array
(
indices_to_consider
)]]
if
saveoutputfig
:
fig
=
figure
()
ax0
=
fig
.
add_subplot
(
111
)
im
=
ax0
.
pcolormesh
(((
firststep
[
0
])[
firststep
[
1
]])[:,
firststep
[
1
]])
fig
.
colorbar
(
im
,
ax
=
ax0
)
ax0
.
set_aspect
(
'
equal
'
)
ax0
.
autoscale
(
tight
=
True
)
savefig
(
'
scectiondump/
'
+
fig_name
+
str
(
scection_i
).
zfill
(
3
)
+
'
.
'
+
str
(
scection_j
).
zfill
(
3
)
+
'
.pdf
'
,
facecolor
=
'
None
'
,
edgecolor
=
'
None
'
,
format
=
'
pdf
'
)
close
()
#subsequent SCECTION rounds
tocontinue
=
True
while
tocontinue
:
scection_i
+=
1
scection_j
=
1
tocontinue
=
False
scection_result
.
append
([])
sample_indices
.
append
([])
for
i
in
range
(
len
(
scection_result
[
-
2
])):
if
len
(
scection_result
[
-
2
][
i
])
>
0
and
max
(
scection_result
[
-
2
][
i
][
2
])
>
1
:
for
j
in
range
(
max
(
scection_result
[
-
2
][
i
][
2
])):
indices_to_consider_sub
=
(
scection_result
[
-
2
][
i
][
2
]
==
j
+
1
).
nonzero
()[
0
]
substep
=
scection_step
(
data
,
sample_indices
[
-
2
][
i
][
indices_to_consider_sub
],
min_expression
,
expression_range
)
if
saveoutputfig
:
fig
=
figure
()
ax0
=
fig
.
add_subplot
(
111
)
im
=
ax0
.
pcolormesh
(((
substep
[
0
])[
substep
[
1
]])[:,
substep
[
1
]])
fig
.
colorbar
(
im
,
ax
=
ax0
)
ax0
.
set_aspect
(
'
equal
'
)
ax0
.
autoscale
(
tight
=
True
)
savefig
(
'
scectiondump/
'
+
fig_name
+
str
(
scection_i
).
zfill
(
3
)
+
'
.
'
+
str
(
scection_j
).
zfill
(
3
)
+
'
.pdf
'
,
facecolor
=
'
None
'
,
edgecolor
=
'
None
'
,
format
=
'
pdf
'
)
close
()
scection_result
[
-
1
].
append
(
substep
)
sample_indices
[
-
1
].
append
(
sample_indices
[
-
2
][
i
][
indices_to_consider_sub
])
scection_j
+=
1
tocontinue
=
True
else
:
scection_result
[
-
1
].
append
([])
sample_indices
[
-
1
].
append
([])
if
pickleresults
:
file_o
=
open
(
'
scectiondump/
'
+
fig_name
+
'
pickled
'
,
'
wb
'
)
pickle
.
dump
([
scection_result
,
sample_indices
],
file_o
)
file_o
.
close
()
return
[
scection_result
,
sample_indices
]
def
average_scection_profiles
(
data
,
sample_indices
):
averageprofiles
=
[]
averageindices
=
[]
for
i
in
range
(
len
(
sample_indices
)):
for
j
in
range
(
len
(
sample_indices
[
i
])):
if
len
(
sample_indices
[
i
][
j
])
>
0
:
averageprofiles
.
append
(
mean
(
data
[:,
sample_indices
[
i
][
j
]],
axis
=
1
))
averageindices
.
append
(
sample_indices
[
i
][
j
])
averageprofiles
=
transpose
(
array
(
averageprofiles
))
return
[
averageprofiles
,
averageindices
]
def
scection_hierarchy
(
averageindices
):
index_hierarchy
=
[]
indextopop
=
range
(
len
(
averageindices
))
for
i
in
range
(
len
(
averageindices
)
-
1
,
-
1
,
-
1
):
index_hierarchy
.
append
([
i
])
for
j
in
range
(
i
-
1
,
-
1
,
-
1
):
if
averageindices
[
i
][
0
]
in
averageindices
[
j
]:
index_hierarchy
[
-
1
].
append
(
j
)
if
j
in
indextopop
:
indextopop
.
remove
(
j
)
indextopop
.
sort
()
return
[
index_hierarchy
,
indextopop
]
def
combine_scection_output
(
data
,
scection_results_all
,
min_expression
,
expression_range
,
estimated_coverage
):
genes_indicesall
=
[]
averageprofilesall
=
[]
scection_hierarchyall
=
[]
for
i
in
range
(
len
(
scection_results_all
)):
data_to_consider
=
data
[:,
scection_results_all
[
i
][
1
][
0
][
0
]]
averageprofilesall
.
append
(
average_scection_profiles
(
data
,
scection_results_all
[
i
][
1
]))
scection_hierarchyall
.
append
(
scection_hierarchy
(
averageprofilesall
[
-
1
][
1
]))
genes_indicesall
.
append
(((
amax
(
data_to_consider
,
axis
=
1
)
>=
log
(
min_expression
)
/
log
(
2
))
&
(
amax
(
data_to_consider
,
axis
=
1
)
-
amin
(
data_to_consider
,
axis
=
1
)
>=
log
(
expression_range
)
/
log
(
2
))).
nonzero
()[
0
])
# get all genes with variable expression
genes_indices
=
list
(
genes_indicesall
[
0
])
for
i
in
range
(
1
,
len
(
scection_results_all
)):
genes_indices
+=
list
(
genes_indicesall
[
i
])
genes_indices
.
sort
()
newgenes_indices
=
[]
for
elem
in
set
(
genes_indices
):
if
genes_indices
.
count
(
elem
)
>
len
(
scection_results_all
)
*
estimated_coverage
-
sqrt
(
len
(
scection_results_all
)
*
estimated_coverage
):
newgenes_indices
.
append
(
elem
)
newgenes_indices
.
sort
()
genes_indices
=
list
(
newgenes_indices
)
#get associate sample indices for all average profiles
llall1
=
[]
for
i
in
range
(
len
(
scection_results_all
)):
llall1
+=
[
averageprofilesall
[
i
][
1
][
elem
]
for
elem
in
scection_hierarchyall
[
i
][
1
]]
llall1
=
array
(
llall1
)
# normalize data to have mean=0 in and std=1
data_to_consider
=
[]
for
i
in
range
(
len
(
scection_results_all
)):
data_to_consider0
=
(
averageprofilesall
[
i
][
0
])[
genes_indices
]
for
j
in
range
(
len
(
genes_indices
)):
data_to_consider0
[
j
]
=
data_to_consider0
[
j
]
-
mean
(
data_to_consider0
[
j
])
for
j
in
range
(
len
(
genes_indices
)):
if
std
(
data_to_consider0
[
j
])
>
0
:
data_to_consider0
[
j
]
=
data_to_consider0
[
j
]
/
std
(
data_to_consider0
[
j
])
data_to_consider
.
append
(
data_to_consider0
)
expected_k
=
int
(
ceil
(
1
/
estimated_coverage
*
median
([
len
(
scection_hierarchyall
[
i
][
1
])
for
i
in
range
(
0
,
len
(
scection_results_all
))])))
data_to_considerall
=
data_to_consider
[
0
][:,
scection_hierarchyall
[
0
][
1
]]
for
i
in
range
(
1
,
len
(
scection_results_all
)):
data_to_considerall
=
hstack
((
data_to_considerall
,
data_to_consider
[
i
][:,
scection_hierarchyall
[
i
][
1
]]))
nb_samples
=
len
(
data_to_considerall
[
0
])
#cluster profiles using k-means
clustering_consensus
=
zeros
((
nb_samples
,
nb_samples
))
for
kk
in
range
(
50
):
kclustered
,
kclusterlabels
=
vq
.
kmeans2
(
transpose
(
data_to_considerall
),
expected_k
,
iter
=
500
,
minit
=
'
points
'
)
ind_clusters
=
[(
kclusterlabels
==
i
).
nonzero
()[
0
]
for
i
in
range
(
expected_k
)]
cluster_to_keep
=
[]
for
i
in
range
(
expected_k
):
if
len
(
ind_clusters
[
i
])
>
len
(
scection_results_all
)
*
estimated_coverage
-
sqrt
(
len
(
scection_results_all
)
*
estimated_coverage
):
cluster_to_keep
.
append
(
i
)
for
i
in
range
(
len
(
kclusterlabels
)):
if
kclusterlabels
[
i
]
not
in
cluster_to_keep
:
dd
=
[
norm
(
data_to_considerall
[:,
i
]
-
kclustered
[
elem
])
for
elem
in
cluster_to_keep
]
kclusterlabels
[
i
]
=
cluster_to_keep
[
argmin
(
dd
)]
clustering_consensus
+=
array
([[
1
if
kclusterlabels
[
elem
]
==
kclusterlabels
[
elem2
]
else
0
for
elem2
in
range
(
nb_samples
)]
for
elem
in
range
(
nb_samples
)])
clustering_consensus
=
clustering_consensus
*
1.0
/
50
#use k-means consensus clustering to build final clusters
clustering_consensus2
=
zeros
((
nb_samples
,
nb_samples
))
clustering_consensus2
[(
clustering_consensus
>
0.5
).
nonzero
()]
=
1
finalclustered
=
array
([
-
1
for
i
in
range
(
nb_samples
)])
stilltoattribute
=
range
(
nb_samples
)
cluster_nb
=
0
for
i
in
range
(
nb_samples
):
if
i
in
stilltoattribute
:
ind
=
(
clustering_consensus2
[
i
]
==
1
).
nonzero
()[
0
]
grow_ind
=
set
(
ind
)
tocontinue
=
True
while
tocontinue
:
tocontinue
=
False
for
elem
in
grow_ind
:
ind2
=
(
clustering_consensus2
[
elem
]
==
1
).
nonzero
()[
0
]
grow_ind2
=
set
(
grow_ind
|
set
(
ind2
))
if
len
(
grow_ind2
)
>
len
(
grow_ind
):
tocontinue
=
True
grow_ind
=
set
(
grow_ind2
)
finalclustered
[
list
(
grow_ind
)]
=
cluster_nb
cluster_nb
+=
1
for
elem
in
grow_ind
:
if
elem
in
stilltoattribute
:
stilltoattribute
.
remove
(
elem
)
final_indices
=
[[]
for
i
in
range
(
max
(
finalclustered
)
+
1
)]
for
i
in
range
(
max
(
finalclustered
)
+
1
):
for
elem
in
(
finalclustered
==
i
).
nonzero
()[
0
]:
final_indices
[
i
].
extend
(
llall1
[
elem
])
return
final_indices
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment