Skip to content
Snippets Groups Projects
Commit 7a7d22c6 authored by Yunfeng Huang's avatar Yunfeng Huang
Browse files

Upload New File - R code for pairwise gene-gene interaction analysis on...

Upload New File - R code for pairwise gene-gene interaction analysis on PTV-burden with lead GWAS SNP for lipids in the UKBB
parent 55bb4920
No related branches found
No related tags found
1 merge request!2Upload R scripts for the UKBB analysis
#######################################################
######### UKBB lipids PTV-SNP interaction #############
######### Author: Yunfeng Huang #############
#######################################################
# Read phenotype data
library(data.table)
library(dplyr)
lipids_pheno <- as.data.frame(fread("~/lipids_heiko/LoF_UKB/2021/lipids_lofs_pheno.txt.gz"))
# merge PTV and SNP genotype data
# geno <- fread("/camhpc/home/yhuang5/lipids_heiko/UKB_SNP_extraction/ukb_chr1_lipids_CAD_28GWASlead_subset.raw")
# geno <- select(geno,1,7:ncol(geno))
# for(i in c(2,5,6,7,8,10,11,17,19,22)){
# temp <- fread(paste("/camhpc/home/yhuang5/lipids_heiko/UKB_SNP_extraction/ukb_chr",i,"_lipids_CAD_28GWASlead_subset.raw",sep=""))
# temp <- select(temp,1,7:ncol(temp))
# geno <- merge(geno,temp,by="FID")
# rm(temp)
# print(i)
# }
# rm(i)
#
# lipids_pheno <- merge(lipids_pheno,geno,by.x = "eid",by.y = "FID")
# ptv <- colnames(lipids_pheno)[6:35]
# snps <- colnames(lipids_pheno)[36:63]
# Test for PTV-SNP interaction through robust linear model fitting and extract BIC as well as interaction model coefficients (beta and t-value)
library(MASS)
# hdl
hdl_results <- as.data.frame(matrix(NA,840,12))
n <- 0
for(i in 1:length(ptv)){
for(j in 1:length(snps)){
n <- n+1
temp_data <- na.omit(select(lipids_pheno,c("hdl_res",ptv[i],snps[j])))
colnames(temp_data) <- c("hdl_res","ptv","snp")
model1 <- rlm(hdl_res~ptv,data = temp_data)
model2 <- rlm(hdl_res~snp,data = temp_data)
model3 <- rlm(hdl_res~ptv+snp,data = temp_data)
hdl_results[n,1] <- ptv[i]
hdl_results[n,2] <- snps[j]
hdl_results[n,3] <- BIC(model1)
hdl_results[n,4] <- BIC(model2)
hdl_results[n,5] <- BIC(model3)
c11 <- sum(temp_data$ptv>0 & temp_data$snp>0)
c10 <- sum(temp_data$ptv>0 & temp_data$snp==0)
if(c11>0 & c10>0){
model4 <- rlm(hdl_res~ptv+snp+ptv:snp,data = temp_data)
hdl_results[n,6] <- BIC(model4)
hdl_results[n,7] <- summary(model4)$coeff[2,1]
hdl_results[n,8] <- summary(model4)$coeff[3,1]
hdl_results[n,9] <- summary(model4)$coeff[4,1]
hdl_results[n,10] <- summary(model4)$coeff[2,3]
hdl_results[n,11] <- summary(model4)$coeff[3,3]
hdl_results[n,12] <- summary(model4)$coeff[4,3]
} else {
hdl_results[n,7] <- summary(model3)$coeff[2,1]
hdl_results[n,8] <- summary(model3)$coeff[3,1]
hdl_results[n,10] <- summary(model3)$coeff[2,3]
hdl_results[n,11] <- summary(model3)$coeff[3,3]
}
if(ceiling(n/100)==(n/100)){print(n)}
}
}
colnames(hdl_results) <- c("ptv_gene","SNP","model1_BIC","model2_BIC","model3_BIC","model4_BIC","PTV_E","SNP_E","PTV_SNP_E","PTV_t","SNP_t","PTV_SNP_t")
# ldl
ldl_results <- as.data.frame(matrix(NA,840,12))
n <- 0
for(i in 1:length(ptv)){
for(j in 1:length(snps)){
n <- n+1
temp_data <- na.omit(select(lipids_pheno,c("ldl_res",ptv[i],snps[j])))
colnames(temp_data) <- c("ldl_res","ptv","snp")
model1 <- rlm(ldl_res~ptv,data = temp_data)
model2 <- rlm(ldl_res~snp,data = temp_data)
model3 <- rlm(ldl_res~ptv+snp,data = temp_data)
ldl_results[n,1] <- ptv[i]
ldl_results[n,2] <- snps[j]
ldl_results[n,3] <- BIC(model1)
ldl_results[n,4] <- BIC(model2)
ldl_results[n,5] <- BIC(model3)
c11 <- sum(temp_data$ptv>0 & temp_data$snp>0)
c10 <- sum(temp_data$ptv>0 & temp_data$snp==0)
if(c11>0 & c10>0){
model4 <- rlm(ldl_res~ptv+snp+ptv:snp,data = temp_data)
ldl_results[n,6] <- BIC(model4)
ldl_results[n,7] <- summary(model4)$coeff[2,1]
ldl_results[n,8] <- summary(model4)$coeff[3,1]
ldl_results[n,9] <- summary(model4)$coeff[4,1]
ldl_results[n,10] <- summary(model4)$coeff[2,3]
ldl_results[n,11] <- summary(model4)$coeff[3,3]
ldl_results[n,12] <- summary(model4)$coeff[4,3]
} else {
ldl_results[n,7] <- summary(model3)$coeff[2,1]
ldl_results[n,8] <- summary(model3)$coeff[3,1]
ldl_results[n,10] <- summary(model3)$coeff[2,3]
ldl_results[n,11] <- summary(model3)$coeff[3,3]
}
if(ceiling(n/100)==(n/100)){print(n)}
}
}
colnames(ldl_results) <- c("ptv_gene","SNP","model1_BIC","model2_BIC","model3_BIC","model4_BIC","PTV_E","SNP_E","PTV_SNP_E","PTV_t","SNP_t","PTV_SNP_t")
# tc
tc_results <- as.data.frame(matrix(NA,840,12))
n <- 0
for(i in 1:length(ptv)){
for(j in 1:length(snps)){
n <- n+1
temp_data <- na.omit(select(lipids_pheno,c("tc_res",ptv[i],snps[j])))
colnames(temp_data) <- c("tc_res","ptv","snp")
model1 <- rlm(tc_res~ptv,data = temp_data)
model2 <- rlm(tc_res~snp,data = temp_data)
model3 <- rlm(tc_res~ptv+snp,data = temp_data)
tc_results[n,1] <- ptv[i]
tc_results[n,2] <- snps[j]
tc_results[n,3] <- BIC(model1)
tc_results[n,4] <- BIC(model2)
tc_results[n,5] <- BIC(model3)
c11 <- sum(temp_data$ptv>0 & temp_data$snp>0)
c10 <- sum(temp_data$ptv>0 & temp_data$snp==0)
if(c11>0 & c10>0){
model4 <- rlm(tc_res~ptv+snp+ptv:snp,data = temp_data)
tc_results[n,6] <- BIC(model4)
tc_results[n,7] <- summary(model4)$coeff[2,1]
tc_results[n,8] <- summary(model4)$coeff[3,1]
tc_results[n,9] <- summary(model4)$coeff[4,1]
tc_results[n,10] <- summary(model4)$coeff[2,3]
tc_results[n,11] <- summary(model4)$coeff[3,3]
tc_results[n,12] <- summary(model4)$coeff[4,3]
} else {
tc_results[n,7] <- summary(model3)$coeff[2,1]
tc_results[n,8] <- summary(model3)$coeff[3,1]
tc_results[n,10] <- summary(model3)$coeff[2,3]
tc_results[n,11] <- summary(model3)$coeff[3,3]
}
if(ceiling(n/100)==(n/100)){print(n)}
}
}
colnames(tc_results) <- c("ptv_gene","SNP","model1_BIC","model2_BIC","model3_BIC","model4_BIC","PTV_E","SNP_E","PTV_SNP_E","PTV_t","SNP_t","PTV_SNP_t")
# tg
tg_results <- as.data.frame(matrix(NA,840,12))
n <- 0
for(i in 1:length(ptv)){
for(j in 1:length(snps)){
n <- n+1
temp_data <- na.omit(select(lipids_pheno,c("tg_res",ptv[i],snps[j])))
colnames(temp_data) <- c("tg_res","ptv","snp")
model1 <- rlm(tg_res~ptv,data = temp_data)
model2 <- rlm(tg_res~snp,data = temp_data)
model3 <- rlm(tg_res~ptv+snp,data = temp_data)
tg_results[n,1] <- ptv[i]
tg_results[n,2] <- snps[j]
tg_results[n,3] <- BIC(model1)
tg_results[n,4] <- BIC(model2)
tg_results[n,5] <- BIC(model3)
c11 <- sum(temp_data$ptv>0 & temp_data$snp>0)
c10 <- sum(temp_data$ptv>0 & temp_data$snp==0)
if(c11>0 & c10>0){
model4 <- rlm(tg_res~ptv+snp+ptv:snp,data = temp_data)
tg_results[n,6] <- BIC(model4)
tg_results[n,7] <- summary(model4)$coeff[2,1]
tg_results[n,8] <- summary(model4)$coeff[3,1]
tg_results[n,9] <- summary(model4)$coeff[4,1]
tg_results[n,10] <- summary(model4)$coeff[2,3]
tg_results[n,11] <- summary(model4)$coeff[3,3]
tg_results[n,12] <- summary(model4)$coeff[4,3]
} else {
tg_results[n,7] <- summary(model3)$coeff[2,1]
tg_results[n,8] <- summary(model3)$coeff[3,1]
tg_results[n,10] <- summary(model3)$coeff[2,3]
tg_results[n,11] <- summary(model3)$coeff[3,3]
}
if(ceiling(n/100)==(n/100)){print(n)}
}
}
colnames(tg_results) <- c("ptv_gene","SNP","model1_BIC","model2_BIC","model3_BIC","model4_BIC","PTV_E","SNP_E","PTV_SNP_E","PTV_t","SNP_t","PTV_SNP_t")
# merge results for FDR-correction
hdl_results$trait <- "HDL"
ldl_results$trait <- "LDL"
tc_results$trait <- "TC"
tg_results$trait <- "TG"
hdl_results$PTV_P <- pnorm(abs(hdl_results$PTV_t),lower.tail=F)*2
hdl_results$SNP_P <- pnorm(abs(hdl_results$SNP_t),lower.tail=F)*2
hdl_results$PTV_SNP_P <- pnorm(abs(hdl_results$PTV_SNP_t),lower.tail=F)*2
ldl_results$PTV_P <- pnorm(abs(ldl_results$PTV_t),lower.tail=F)*2
ldl_results$SNP_P <- pnorm(abs(ldl_results$SNP_t),lower.tail=F)*2
ldl_results$PTV_SNP_P <- pnorm(abs(ldl_results$PTV_SNP_t),lower.tail=F)*2
tc_results$PTV_P <- pnorm(abs(tc_results$PTV_t),lower.tail=F)*2
tc_results$SNP_P <- pnorm(abs(tc_results$SNP_t),lower.tail=F)*2
tc_results$PTV_SNP_P <- pnorm(abs(tc_results$PTV_SNP_t),lower.tail=F)*2
tg_results$PTV_P <- pnorm(abs(tg_results$PTV_t),lower.tail=F)*2
tg_results$SNP_P <- pnorm(abs(tg_results$SNP_t),lower.tail=F)*2
tg_results$PTV_SNP_P <- pnorm(abs(tg_results$PTV_SNP_t),lower.tail=F)*2
all_results <- rbind(hdl_results,ldl_results,tc_results,tg_results)
getrs <- function(x){return(strsplit(x,"_")[[1]][1])}
all_results$rsid <- sapply(all_results$SNP,getrs)
snp_anno <- fread("~/lipids_heiko/SNP_LoF_interaction/2021/SNP_gene.anno")
all_results <- merge(all_results,select(snp_anno,`Lead SNP`,Gene),by.x = "rsid",by.y = "Lead SNP",all.x = T)
all_results$keep <- NA
for(i in 1:nrow(all_results)){
all_results$keep[i] <- ifelse(grepl(all_results$ptv_gene[i],all_results$Gene[i])==F,1,0)
}
all_results$keep <- ifelse(all_results$rsid=="rs12027135" & all_results$ptv_gene=="LDLR",1,all_results$keep)
all_results <- filter(all_results,keep==1)
all_results_long <- as.data.frame(matrix(NA,nrow(all_results)*3,5))
colnames(all_results_long) <- c("trait","PTV","SNP","name","P")
for(i in 1:nrow(all_results)){
all_results_long[(3*i-2):(3*i),1] <- all_results$trait[i]
all_results_long[(3*i-2):(3*i),2] <- all_results$ptv_gene[i]
all_results_long[(3*i-2):(3*i),3] <- all_results$SNP[i]
all_results_long[(3*i-2):(3*i),4] <- c("PTV","SNP","PTV_SNP")
all_results_long[(3*i-2):(3*i),5] <- as.numeric(all_results[i,15:17])
if(ceiling(i/100)==(i/100)){print(i)}
}
all_results_long$fdr_q <- p.adjust(all_results_long$P,"fdr")
temp <- as.data.frame(matrix(NA,nrow(all_results),6))
colnames(temp) <- c("trait","PTV","SNP","PTV_fdr_q","SNP_fdr_q","PTV_SNP_fdr_q")
for(i in 1:nrow(temp)){
temp[i,1] <- all_results_long$trait[(3*i-2)]
temp[i,2] <- all_results_long$PTV[(3*i-2)]
temp[i,3] <- all_results_long$SNP[(3*i-2)]
temp[i,4] <- all_results_long$fdr_q[(3*i-2)]
temp[i,5] <- all_results_long$fdr_q[(3*i-1)]
temp[i,6] <- all_results_long$fdr_q[(3*i)]
if(ceiling(i/100)==(i/100)){print(i)}
}
all_results_merge <- merge(all_results,temp,by.x = c("trait","ptv_gene","SNP"),by.y = c("trait","PTV","SNP"))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment