Commit 20466f4b authored by Amirreza Shaeiri's avatar Amirreza Shaeiri

Update Human/data_analysis/7_TF_correlations_change_promoter_radius.R, Human/data_analysis/6.random_forest_performance_batches_lung.R, Human/data_analysis/5.make_summary_for_significant_features.R, Human/data_analysis/4.fishers_tfs.R, Human/data_analysis/3.transfer_human_var_and_prior.R, Human/data_analysis/2.random_forest_performance.R, Human/data_analysis/1.summarize_boruta_results.R, Human/data_analysis/old/transfer_human_gene_groups.R, Human/data_analysis/old/transfer_human.R, Human/data_analysis/old/predict_priors.R, Human/data_analysis/old/huma_variation_analysis.Rmd, Human/data_analysis/old/conservation_between_fly_and_human.Rmd, Human/data_analysis/old/GO_enrichments.R, Human/data_analysis/old/3.feature_seletion_with_batchtools.R, Human/data_analysis/old/2.tissues_specific_tables.R, Human/data_analysis/old/1.calculate_average_variation_across_tissues.R files
parent 03165c8f
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(mlr))
suppressPackageStartupMessages(library(Boruta))
options(stringsAsFactors = FALSE)
source("/g/furlong/project/62_expression_variation/src/shared/utils.R")
config = load_config() # json file with all paths to data
project_folder = config$project_parameters$project_dir
outdir = file.path(project_folder, "/analysis/human_data")
human_path = "/g/scb2/zaugg/shaeiri/variation_prediction/table/final"
make_recursive_dir(outdir)
boruta_path = human_path
load_boruta_res = function(f, tissue, responce, dir = boruta_path) {
load(file.path(dir, f))
imp_var = boruta_get_importance(Boruta)
data.frame(feature = names(imp_var), median_importance = imp_var, tissue = tissue, responce = responce)
}
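# boruta_get_importance() is provided by the sourced utils.R; a hypothetical sketch of the
# assumed behaviour (median importance of the confirmed attributes), for reference only:
# boruta_get_importance = function(boruta_obj) {
#   stats = Boruta::attStats(boruta_obj)          # per-attribute importance statistics
#   keep = stats$decision == "Confirmed"
#   setNames(stats$medianImp[keep], rownames(stats)[keep])
# }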
df1 = load_boruta_res("Lung_resid.Rdata", "lung", "variation")
df2 = load_boruta_res("Muscle_resid.Rdata", "muscle", "variation")
df3 = load_boruta_res("Ovary_resid.Rdata", "ovary", "variation")
df4 = load_boruta_res("Lung_level.Rdata", "lung", "level")
df5 = load_boruta_res("Muscle_level.Rdata", "muscle", "level")
df6 = load_boruta_res("Ovary_level.Rdata", "ovary", "level")
df7 = load_boruta_res("share_mean_variation.Rdata", "average", "variation")
df8 = load_boruta_res("share_mean_median.Rdata", "average", "level")
df9 = load_boruta_res("prior_without.Rdata", "", "DE prior")
df = Reduce(rbind.data.frame, list(df1, df2, df3, df4, df5, df6, df7, df8, df9))
write.csv(df, file.path(outdir, "Boruta_median_importance_results.csv"), row.names = FALSE)
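# optional sanity check (sketch): the merge with the correlation table below assumes
# one importance value per feature / tissue / response combination
stopifnot(!any(duplicated(df[, c("feature", "tissue", "responce")])))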
#### Add correlations #####################################################################
df_share = read.csv(file.path(human_path, "share.csv"))
df_prior = read.csv(file.path(human_path, "prior.csv"))
df_lung = read.csv(file.path(human_path, "Lung.csv"))
df_muscle = read.csv(file.path(human_path, "Muscle.csv"))
df_ovary = read.csv(file.path(human_path, "Ovary.csv"))
cor1 = get_correl_for_feature_list(df_lung, "resid_cv", df1$feature) %>% rename(feature = var, correlation_with_responce = cor) %>% mutate(tissue = "lung", responce = "variation")
cor2 = get_correl_for_feature_list(df_muscle, "resid_cv", df2$feature) %>% rename(feature = var, correlation_with_responce = cor) %>% mutate(tissue = "muscle", responce = "variation")
cor3 = get_correl_for_feature_list(df_ovary, "resid_cv", df3$feature) %>% rename(feature = var, correlation_with_responce = cor) %>% mutate(tissue = "ovary", responce = "variation")
cor4 = get_correl_for_feature_list(df_lung, "median", df4$feature) %>% rename(feature = var, correlation_with_responce = cor) %>% mutate(tissue = "lung", responce = "level")
cor5 = get_correl_for_feature_list(df_muscle, "median", df5$feature) %>% rename(feature = var, correlation_with_responce = cor) %>% mutate(tissue = "muscle", responce = "level")
cor6 = get_correl_for_feature_list(df_ovary, "median", df6$feature) %>% rename(feature = var, correlation_with_responce = cor) %>% mutate(tissue = "ovary", responce = "level")
cor7 = get_correl_for_feature_list(df_share, "mean_variation", df7$feature) %>% rename(feature = var, correlation_with_responce = cor) %>% mutate(tissue = "average", responce = "variation")
cor8 = get_correl_for_feature_list(df_share, "mean_median", df8$feature) %>% rename(feature = var, correlation_with_responce = cor) %>% mutate(tissue = "average", responce = "level")
cor9 = get_correl_for_feature_list(df_prior, "DE_Prior_Rank", df9$feature) %>% rename(feature = var, correlation_with_responce = cor) %>% mutate(tissue = "", responce = "DE prior")
cor = Reduce(rbind.data.frame, list(cor1, cor2, cor3, cor4, cor5, cor6, cor7, cor8, cor9))
res_all = merge(df, cor, by = c("feature", "tissue", "responce")) %>% filter(feature != "mean_median") # mean_median was not removed as a predictor in the DE prior run, so drop it from the summary
write.csv(res_all, file.path(outdir, "Suppl_table_Feature_importance_and_correlations.csv"), row.names = FALSE)
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(ranger))
suppressPackageStartupMessages(library(mlr))
suppressPackageStartupMessages(library(Boruta))
options(stringsAsFactors = FALSE)
source("/g/furlong/project/62_expression_variation/src/shared/utils.R")
config = load_config() # json file with all paths to data
times = load_times() # time-points used for analysis
project_folder = config$project_parameters$project_dir
outdir = file.path(project_folder, "/analysis/random_forests")
make_recursive_dir(outdir)
human_path1 = file.path(project_folder, "/analysis/human_data")
human_path2 = "/g/scb2/zaugg/shaeiri/variation_prediction/table/final/"
# Boruta results
features = read.csv(file.path(human_path1, "Boruta_median_importance_results.csv"))
# tables
mt_av_var = read.csv(file.path(human_path2, "share.csv")) %>% rename(resid_cv = mean_variation, median = mean_median)
mt_prior = read.csv(file.path(human_path2, "prior.csv"))
mt_lung = read.csv(file.path(human_path2, "Lung.csv"))
mt_muscle = read.csv(file.path(human_path2, "Muscle.csv"))
mt_ovary = read.csv(file.path(human_path2, "Ovary.csv"))
# remove some columns
#remove_list = names(mt)[grep("_cv|_median|variation|_mean|Prior_Rank|gene_name|gene_id", names(mt))]
# full dataset, predict variation
#mt = preprocess_master_table(master_table, remove_median = FALSE, log_transf_median = TRUE)
######## RF full ###################################################
mt1 = mt_av_var %>% select(resid_cv, one_of(features$feature[features$tissue == "average" & features$responce == "variation"]))
mt2 = mt_av_var %>% select(median, one_of(features$feature[features$tissue == "average" & features$responce == "level"]))
mt3 = mt_prior %>% select(DE_Prior_Rank, one_of(features$feature[features$tissue == "" & features$responce == "DE prior"]))
mt4 = mt_lung %>% select(resid_cv, one_of(features$feature[features$tissue == "lung" & features$responce == "variation"]))
mt5 = mt_lung %>% select(median, one_of(features$feature[features$tissue == "lung" & features$responce == "level"]))
mt6 = mt_muscle %>% select(resid_cv, one_of(features$feature[features$tissue == "muscle" & features$responce == "variation"]))
mt7 = mt_muscle %>% select(median, one_of(features$feature[features$tissue == "muscle" & features$responce == "level"]))
mt8 = mt_ovary %>% select(resid_cv, one_of(features$feature[features$tissue == "ovary" & features$responce == "variation"]))
mt9 = mt_ovary %>% select(median, one_of(features$feature[features$tissue == "ovary" & features$responce == "level"]))
lrn = makeLearner("regr.ranger")
task1 = makeRegrTask(data = mt1, target = "resid_cv")
task2 = makeRegrTask(data = mt2, target = "median")
task3 = makeRegrTask(data = mt3, target = "DE_Prior_Rank")
task4 = makeRegrTask(data = mt4, target = "resid_cv")
task5 = makeRegrTask(data = mt5, target = "median")
task6 = makeRegrTask(data = mt6, target = "resid_cv")
task7 = makeRegrTask(data = mt7, target = "median")
task8 = makeRegrTask(data = mt8, target = "resid_cv")
task9 = makeRegrTask(data = mt9, target = "median")
rdesc.outer = makeResampleDesc("CV", iters = 5)
ms = list(rsq, rmse, mse)
bmr_full = benchmark(lrn, tasks = list(task1, task2, task3, task4, task5, task6, task7, task8, task9), resampling = rdesc.outer, measures = ms, show.info = TRUE)
df_full = data.frame(bmr_full)
# one row per CV iteration (5) per task, in the order the tasks were passed to benchmark()
df_full$response = rep(c("variation", "level", "DE prior", "variation", "level", "variation", "level", "variation", "level"), each = 5)
df_full$type = rep(c("average", "average", "", "lung", "lung", "muscle", "muscle", "ovary", "ovary"), each = 5)
write.csv(df_full, file.path(human_path1, "Random_forest_results.csv"), row.names = FALSE)
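# optional: the fold-aggregated view (mean R^2 / RMSE / MSE per task) is often easier
# to read than the per-iteration table written above
print(getBMRAggrPerformances(bmr_full, as.df = TRUE))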
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(ranger))
suppressPackageStartupMessages(library(mlr))
# prior
#human_path = file.path(project_folder, "/analysis/human_data")
#df = read.csv(file.path(human_path, "variation_prior.csv"))
path = "/g/scb2/zaugg/shaeiri/variation_prediction/table/final/prior.csv"
df = read.csv(path)
outdir = "/g/furlong/project/62_expression_variation/analysis/human_data"
# Boruta results
features = read.csv(file.path(outdir, "Boruta_median_importance_results.csv"))
# select features
df1 = df %>% select(gene_id, mean_variation, DE_Prior_Rank, one_of(features$feature[features$tissue == "average" & features$responce == "variation"]))
# variation
ranger(mean_variation ~ ., df1 %>% select(-DE_Prior_Rank))
# prior
ranger(DE_Prior_Rank ~ ., df1 %>% select(-mean_variation))
# transfer
mt_var = df1 %>% select(-DE_Prior_Rank) %>% mutate(bin_var = cut(mean_variation, quantile(mean_variation, 0:3/3), include.lowest = TRUE, labels = c(-1, 0, 1))) %>%
filter(bin_var %in% c(-1, 1)) %>% select(-mean_variation) %>% mutate(bin_var = factor(bin_var))
mt_prior = df1 %>% select(-mean_variation) %>% mutate(bin_var = cut(DE_Prior_Rank, quantile(DE_Prior_Rank, 0:3/3), include.lowest = TRUE, labels = c(-1, 0, 1))) %>%
filter(bin_var %in% c(-1, 1)) %>% select(-DE_Prior_Rank) %>% mutate(bin_var = factor(bin_var))
top_prior = mt_prior$gene_id[which(mt_prior$bin_var == 1)]
bottom_prior = mt_prior$gene_id[which(mt_prior$bin_var == -1)]
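# quick check: after dropping the middle tertile the two retained classes should be
# roughly balanced (exact sizes depend on ties at the quantile cut-points)
print(table(mt_var$bin_var))
print(table(mt_prior$bin_var))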
# overall performance
lrn = makeLearner("classif.ranger", predict.type = "prob")
ms = list(auc, mmce, acc)
task1 = makeClassifTask(data = mt_var %>% select(-gene_id), target = "bin_var")
task2 = makeClassifTask(data = mt_prior %>% select(-gene_id), target = "bin_var")
# cross-validation
r1 = crossval(learner = lrn, task1, iters = 5L, measures = ms)
print(calculateConfusionMatrix(r1$pred))
r2 = crossval(learner = lrn, task2, iters = 5L, measures = ms)
print(calculateConfusionMatrix(r2$pred))
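# the ResampleResult objects also expose the fold-aggregated measures directly
print(r1$aggr)  # mean auc / mmce / acc over the 5 folds, variation model
print(r2$aggr)  # same for the DE prior model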
# total number of genes
#n_genes = nrow(mt_var)
#test_size = 0.2 * n_genes
n_genes = nrow(mt_prior)
test_size = 0.5 * n_genes
# trained models
pred_all = data.frame(matrix(ncol = 6, nrow = 0))
names(pred_all) = c("learner", "fpr", "tpr", "threshold", "i", "AUC")
for (i in 1:10) {
# select 50% of genes for testing (same number from high and low class)
test_top = sample(top_prior, test_size / 2)
test_bottom = sample(bottom_prior, test_size / 2)
# test dataset
test_set = c(test_top, test_bottom)
mt_test = mt_prior[mt_prior$gene_id %in% test_set, ] %>% select(-gene_id)
# prepare dataset for training (exclude test set genes, variation or prior dataset)
df_sel = df1[!df1$gene_id %in% test_set, ]
#mt_train = df1[-test_set, ]
mt_train_var = df_sel %>% select(-DE_Prior_Rank, -gene_id) %>% mutate(bin_var = cut(mean_variation, quantile(mean_variation, 0:3/3), include.lowest = TRUE, labels = c(-1, 0, 1))) %>%
filter(bin_var %in% c(-1, 1)) %>% select(-mean_variation) %>% mutate(bin_var = factor(bin_var))
mt_train_prior = df_sel %>% select(-mean_variation, -gene_id) %>% mutate(bin_var = cut(DE_Prior_Rank, quantile(DE_Prior_Rank, 0:3/3), include.lowest = TRUE, labels = c(-1, 0, 1))) %>%
filter(bin_var %in% c(-1, 1)) %>% select(-DE_Prior_Rank) %>% mutate(bin_var = factor(bin_var))
# train models
lrn = makeLearner("classif.ranger", predict.type = "prob")
ms = list(auc, mmce, acc)
task1 = makeClassifTask(data = mt_train_var, target = "bin_var")
task2 = makeClassifTask(data = mt_train_prior, target = "bin_var")
mod1 = train(lrn, task1)
mod2 = train(lrn, task2)
# prediction
task_pred = makeClassifTask(data = mt_test, target = "bin_var")
pred1 = predict(mod1, task_pred)
pred2 = predict(mod2, task_pred)
# performance
AUC1 = round(performance(pred1, measures = auc), 2)
print(AUC1)
AUC2 = round(performance(pred2, measures = auc), 2)
print(AUC2)
# ROC curve data
rocl = generateThreshVsPerfData(list(train_var = pred1, train_prior = pred2), list(fpr, tpr), aggregate = TRUE)
# collect ROC curve points for this repeat
df_roc = rocl$data
df_roc$i = i
df_roc$AUC = ifelse(df_roc$learner == "train_var", AUC1, AUC2)
pred_all = rbind.data.frame(pred_all, df_roc)
}
write.csv(pred_all, file.path(outdir, "transfer_learning_performance_human.csv"), row.names = FALSE)
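# optional summary (sketch): mean test AUC per training scheme over the 10 repeats,
# using the one-AUC-per-repeat values stored alongside the ROC points
print(aggregate(AUC ~ learner, data = unique(pred_all[, c("learner", "i", "AUC")]), FUN = mean))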
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
suppressMessages(library(data.table))
suppressMessages(library(gridExtra))
suppressMessages(library(ggrepel))
suppressMessages(library(ggpubr))
options(stringsAsFactors = FALSE)
source("/g/furlong/project/62_expression_variation/src/shared/utils.R")
config = load_config()
project_folder = config$project_parameters$project_dir
outdir = file.path(project_folder, "/analysis/human_data")
make_recursive_dir(outdir)
path = "/g/scb2/zaugg/shaeiri/variation_prediction/table/final/prior.csv"
mt = read.csv(path)
outdir = "/g/furlong/project/62_expression_variation/analysis/human_data"
# Boruta results
features = read.csv(file.path(outdir, "Boruta_median_importance_results.csv"))
# select features
mt %<>% select(gene_id, mean_variation, one_of(features$feature[features$tissue == "average" & features$responce == "variation"]))
mt$var_type = "narrow"
mt$var_type[mt$percent_of_broad > 0.8 & mt$CpG_length > 0] = "broad, with CGI"
mt$var_type[mt$percent_of_broad > 0.8 & mt$CpG_length == 0] = "broad, no CGI"
groups = c("broad, with CGI", "broad, no CGI", "narrow")
# alternative classification (used for the tests below): broad promoters split by their median variation
med_var_broad = median(mt[mt$percent_of_broad > 0.8, "mean_variation"])
mt$var_type = "narrow"
mt$var_type[mt$percent_of_broad > 0.8 & mt$mean_variation >= med_var_broad] = "broad, high-var"
mt$var_type[mt$percent_of_broad > 0.8 & mt$mean_variation < med_var_broad] = "broad, low-var"
#mt$var_type = factor(mt$var_type, levels = c("broad", "narrow-low", "narrow-high"))
groups = c("broad, low-var", "broad, high-var", "narrow")
mt_filt = mt[ , -grep("mean|num|length|percent|width|TATA|content", names(mt))]
features = names(mt_filt)[2:85]
################# 1 group vs 2 others ###########################################
out_df = data.frame(matrix(ncol = 9, nrow = 0))
names(out_df) = c("group", "group_val", "feature", "num_group_feature", "num_nongroup_feature", "num_group_nofeature", "num_nongroup_nofeature", "pval", "odds_ratio")
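# get_fisher_group_tf() is defined in the sourced utils.R; a hypothetical sketch of the
# assumed behaviour (a 2x2 Fisher test of "has feature" vs. "is in group"), shown only
# for reference; argument and column names are guesses from the calls below:
# get_fisher_group_tf = function(data, feature, group_col, group) {
#   in_group = data[[group_col]] == group
#   has_feat = data[[feature]] > 0
#   tab = table(in_group, has_feat)[c("TRUE", "FALSE"), c("TRUE", "FALSE")]
#   ft = fisher.test(tab)
#   data.frame(group = group_col, group_val = group, feature = feature,
#              num_group_feature = tab[1, 1], num_nongroup_feature = tab[2, 1],
#              num_group_nofeature = tab[1, 2], num_nongroup_nofeature = tab[2, 2],
#              pval = ft$p.value, odds_ratio = unname(ft$estimate))
# }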
for(group in groups) {
print(group)
for(feature in features) {
print(feature)
# one group against the two others
df = get_fisher_group_tf(mt_filt, feature, "var_type", group)
out_df = rbind.data.frame(out_df, df)
}
}
out_df$N_peaks = rowSums(out_df[ ,4:5])
out_df$pval_bh = p.adjust(out_df$pval, method = "BH", n = nrow(out_df))
out_df$num_tfs = length(features)
out_df_filt = out_df %>% filter(pval_bh < 0.01 & odds_ratio > 2)
write.csv(out_df_filt, file.path(outdir, "fisher_tests_tf_peaks.csv"), row.names = FALSE)
############## Motifs ############################
features = names(mt)[grep("cisbp\\..*prox|ohler", names(mt))]
out_df_motifs = data.frame(matrix(ncol = 9, nrow = 0))
names(out_df_motifs) = c("group", "group_val", "feature", "num_group_feature", "num_nongroup_feature", "num_group_nofeature", "num_nongroup_nofeature", "pval", "odds_ratio")
for(group in groups) {
print(group)
for(feature in features) {
print(feature)
df = get_fisher_group_tf(mt, feature, "var_type", group)
out_df_motifs = rbind.data.frame(out_df_motifs, df)
}
}
out_df_motifs$N_peaks = rowSums(out_df_motifs[ ,4:5])
out_df_motifs$pval_bh = p.adjust(out_df_motifs$pval, method = "BH", n = nrow(out_df_motifs))
out_df_motifs$num_motifs = length(features)
# significant TFs (at least one comparison)
enr_tfs = unique(out_df_motifs$feature[out_df_motifs$pval_bh < 0.01 & out_df_motifs$N_peaks > 100] )
out_df_filt_motifs = out_df_motifs %>% filter(feature %in% enr_tfs)
write.csv(out_df_filt_motifs, file.path(outdir, "fisher_tests_motifs.csv"), row.names = FALSE)
########## other features ##############
features = c("ohler_maj.TATA", "is_tf", "is_housekeeping")
out_df_features = data.frame(matrix(ncol = 9, nrow = 0))
names(out_df_features) = c("group", "group_val", "feature", "num_group_feature", "num_nongroup_feature", "num_group_nofeature", "num_nongroup_nofeature", "pval", "odds_ratio")
for(group in groups) {
print(group)
for(feature in features) {
print(feature)
df = get_fisher_group_tf(mt, feature, "var_type", group)
out_df_features = rbind.data.frame(out_df_features, df)
}
}
write.csv(out_df_features, file.path(outdir, "fisher_tests_selected_features.csv"), row.names = FALSE)
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(ranger))
suppressPackageStartupMessages(library(mlr))
suppressPackageStartupMessages(library(Boruta))
options(stringsAsFactors = FALSE)
source("/g/furlong/project/62_expression_variation/src/shared/utils.R")
config = load_config() # json file with all paths to data
project_folder = config$project_parameters$project_dir
# master_table = load_master_table()
# mt = preprocess_master_table(master_table, remove_median = FALSE, log_transf_median = TRUE)
outdir = file.path(project_folder, "/analysis/random_forests")
######################################################################
######## RF full #####################################################
######################################################################
human_path = file.path(project_folder, "/analysis/human_data")
boruta_path = "/g/furlong/project/62_expression_variation/analysis/feature_selection_with_boruta_human/"
################################### Mean variation and prior #######################################
mt = read.csv(file.path(human_path, "variation_prior.csv"))
mt_lung = read.csv(file.path(human_path, "mt_lung.csv"))
######## Boruta results ###################################################
boruta_var = readRDS(file.path(boruta_path, "predict_mean_variation_glob/results/1.rds"))
imp_var = boruta_get_importance(boruta_var)
boruta_var = readRDS(file.path(boruta_path, "predict_mean_level/results/1.rds"))
imp_lev = boruta_get_importance(boruta_var)
boruta_var = readRDS(file.path(boruta_path, "predict_DE_Prior_Rank/results/1.rds"))
imp_prior = boruta_get_importance(boruta_var)
boruta_var = readRDS(file.path(boruta_path, "predict_resid_cv_lung/results/1.rds"))
imp_var_lung = boruta_get_importance(boruta_var)
boruta_var = readRDS(file.path(boruta_path, "predict_median_lung/results/1.rds"))
imp_lev_lung = boruta_get_importance(boruta_var)
boruta_var = readRDS(file.path(boruta_path, "predict_resid_cv_ovary/results/1.rds"))
imp_var_ov = boruta_get_importance(boruta_var)
boruta_var = readRDS(file.path(boruta_path, "predict_median_ovary/results/1.rds"))
imp_lev_ov = boruta_get_importance(boruta_var)
boruta_var = readRDS(file.path(boruta_path, "predict_resid_cv_muscle/results/1.rds"))
imp_var_mus = boruta_get_importance(boruta_var)
boruta_var = readRDS(file.path(boruta_path, "predict_median_muscle/results/1.rds"))
imp_lev_mus = boruta_get_importance(boruta_var)
data.frame(var = names(imp_var), med_imp = imp_var)
data.frame(var = names(imp_lev), med_imp = imp_lev)
data.frame(var = names(imp_prior), med_imp = imp_prior)
data.frame(var = names(imp_var_lung), med_imp = imp_var_lung)
data.frame(var = names(imp_lev_lung), med_imp = imp_lev_lung)
# importance for variation
boruta_var = readRDS(config$feature_selection$resid_cv)
imp_var = boruta_get_importance(boruta_var)
imp_var = data.frame(var = names(imp_var), med_imp_var = imp_var)
# importance for level
boruta_med = readRDS(config$feature_selection$median)
imp_med = boruta_get_importance(boruta_med)
imp_med = data.frame(var = names(imp_med), med_imp_med = imp_med)
# importance for shape
boruta_shape = readRDS(config$feature_selection$shape)
imp_shape = boruta_get_importance(boruta_shape)
imp_shape = data.frame(var = names(imp_shape), med_imp_shape = imp_shape)
# importance for variation in narrow
boruta_var = readRDS(config$feature_selection$resid_cv_narrow)
imp_var_narrow = boruta_get_importance(boruta_var)
imp_var_narrow = data.frame(var = names(imp_var_narrow), med_imp_narrow_var = imp_var_narrow)
# importance for variation in broad
boruta_var = readRDS(config$feature_selection$resid_cv_broad)
imp_var_broad = boruta_get_importance(boruta_var)
imp_var_broad = data.frame(var = names(imp_var_broad), med_imp_broad_var = imp_var_broad)
# importance for level in narrow
boruta_var = readRDS(config$feature_selection$median_narrow)
imp_med_narrow = boruta_get_importance(boruta_var)
imp_med_narrow = data.frame(var = names(imp_med_narrow), med_imp_narrow_lev = imp_med_narrow)
# importance for level in broad
boruta_var = readRDS(config$feature_selection$median_broad)
imp_med_broad = boruta_get_importance(boruta_var)
imp_med_broad = data.frame(var = names(imp_med_broad), med_imp_broad_lev = imp_med_broad)
boruta_imp = merge_df_list(list(imp_var, imp_med, imp_shape, imp_var_broad, imp_var_narrow, imp_med_narrow, imp_med_broad), merge_by = "var", all_both = TRUE)
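# merge_df_list() also comes from utils.R; a hypothetical sketch of the assumed behaviour
# (sequential full outer join on the merge column), for reference only:
# merge_df_list = function(df_list, merge_by, all_both = TRUE) {
#   Reduce(function(x, y) merge(x, y, by = merge_by, all = all_both), df_list)
# }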
#boruta_imp[is.na(boruta_imp)] = 0.1 # pseudocount for non-significant features
######## Correlations with response ########################################
# all factors must be converted to numeric or logical before computing correlations
# check which features still violate this condition:
# boruta_imp$var[!unlist(lapply(boruta_imp$var, function(x) is.numeric(mt[,x]) | is.logical(mt[,x]) ))]
mt$conserv_rank = as.numeric(mt$conserv_rank)
mt$shape = as.numeric(mt$shape)
mt$dhs_time_profile.prox = as.numeric(mt$dhs_time_profile.prox)
mt$dhs_tissue_profile.prox = as.numeric(mt$dhs_tissue_profile.prox)
mt$dhs_time_profile.dist = as.numeric(mt$dhs_time_profile.dist)
mt$dhs_tissue_profile.dist = as.numeric(mt$dhs_tissue_profile.dist)
mt[, grep("modERN_cisbp", names(mt))] = apply(mt[, grep("modERN_cisbp", names(mt))], 2, as.numeric)
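# a more general alternative (sketch, assuming any remaining offenders are factor columns):
# fct_cols = names(mt)[sapply(mt, is.factor)]
# mt[fct_cols] = lapply(mt[fct_cols], function(x) as.numeric(as.character(x)))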
# correlations
cor1 = get_correl_for_feature_list(mt, "resid_cv", boruta_imp$var) %>% rename(cor_var = cor)
cor2 = get_correl_for_feature_list(mt, "median", boruta_imp$var) %>% rename(cor_med = cor)
cor3 = get_correl_for_feature_list(mt, "shape", boruta_imp$var) %>% rename(cor_shape = cor)
cor4 = get_correl_for_feature_list(mt, "major_shape_ind", boruta_imp$var) %>% rename(cor_shape_ind = cor)
cor5 = get_correl_for_feature_list(mt %>% filter(shape == 1), "resid_cv", boruta_imp$var) %>% rename(cor_var_broad = cor)
cor6 = get_correl_for_feature_list(mt %>% filter(shape == 2), "resid_cv", boruta_imp$var) %>% rename(cor_var_narrow = cor)
cor7 = get_correl_for_feature_list(mt %>% filter(shape == 1), "median", boruta_imp$var) %>% rename(cor_med_broad = cor)
cor8 = get_correl_for_feature_list(mt %>% filter(shape == 2), "median", boruta_imp$var) %>% rename(cor_med_narrow = cor)
cor = merge_df_list(list(cor1, cor2, cor3, cor4, cor5, cor6, cor7, cor8), all_both = TRUE, merge_by = "var")
######## Correlations among features #################################
tmp = mt[ , cor$var]
cor_features = data.frame(cor(tmp))
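# if any selected feature still contains NAs, cor() returns NA for the affected pairs;
# a pairwise-complete alternative (sketch) would be:
# cor_features = data.frame(cor(tmp, use = "pairwise.complete.obs"))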