The code below loops through the 228 linear regressions of prostate cancer (PCa) status on log-transformed and scaled to z-scored metabolites. Then, a function is written to take 500 random bootstrap samples of the output of the 228 regressions.
#####################################################
# ProtecT Observational Analyses: Linear Bootstrap #
#####################################################
# Load packages
#install.packages('Rserve', repos='http://cran.r-project.org')
#install.packages('MASS', repos='http://cran.r-project.org')
#install.packages('metafor', repos='http://cran.r-project.org')
#install.packages('AER', repos='http://cran.r-project.org')
#install.packages('RColorBrewer', repos='http://cran.r-project.org')
#install.packages('aod', repos='http://cran.r-project.org')
#install.packages('ggplot2', repos='http://cran.r-project.org')
#install.packages('Rcpp', repos='http://cran.r-project.org')
#install.packages('data.table', repos='http://cran.r-project.org')
#install.packages("Cairo", repos='http://cran.r-project.org')
#install.packages('boot', repos='http://cran.r-project.org')
#install.packages('sandwich', repos='http://cran.r-project.org')
#install.packages('lmtest', repos='http://cran.r-project.org')
library('MASS')
library('metafor')
library('AER')
library('RColorBrewer')
library('aod')
library('ggplot2')
library('Rcpp')
library('Rserve')
library('data.table')
library('Cairo')
library('boot')
library('sandwich')
library('lmtest')
#install.packages("devtools")
#library('devtools')
#devtools::install_github("MRCIEU/MRInstruments")
#library(MRInstruments)
#install_github("MRCIEU/TwoSampleMR")
#library('TwoSampleMR')
# Read in the data
setwd('/panfs/panasas01/sscm/ca16591')
cleaned_met3 <- read.csv("/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets/metabol.data.clean.demo3.20.02.17.csv")#BlueCrystal
data <- cleaned_met3
#cleaned_met3 <- read.csv("C:/Users/ca16591/Dropbox/Bristol/metabol.data.clean.demo3.20.02.17.csv")#Oakfield
#data <- cleaned_met3
#cleaned_met3 <- read.csv("~/Dropbox/Bristol/metabol.data.clean.demo3.20.02.17.csv") #laptop
#data <- cleaned_met3
# Create metabolite name lists for use in loops
mname=colnames(data)[-c(1:93)]
sname=scan('/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets/short_list_Pro_Prac1.txt',what='')#BlueCrystal
#sname=scan('C:/Users/ca16591/Dropbox/Bristol/short_list_Pro_Prac1.txt',what='')#Oakfield
#sname=scan('~/Dropbox/Bristol/short_list_Pro_Prac1.txt',what='') #laptop
smname=mname[-c(229:231)]
data$casecontrol_n=ifelse(data$casecontrol=="Control", 0,1)
#completedData <- read.csv("~/Dropbox/Bristol/completedData_imputation.csv")#laptop
#completedData <- read.csv("C:/Users/ca16591/Dropbox/Bristol/completedData_imputation.csv")#Oakfield
completedData <- read.csv("/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets/completedData_imputation.csv")#BlueCrystal
data$Glol <- completedData$Glol
data$pounds = (data$p1dhl_wght_stones)*14
data$weight = data$pounds + data$p1dhl_wght_pounds
data$feet_to_inches = (data$p1dhl_tall_ft)*12
data$height_inches = data$p1dhl_tall_in + data$feet_to_inches
data$inches = (data$height_inches^2)
data$inches_squared = (data$inches)
data$bmi=(data$weight*703)/data$inches_squared
data$fac_centre <- as.factor(data$centre)
data$Glol <- completedData$Glol
# Function to loop through the associations
logtransform<-function(dataset,smname)
{
tx=dataset[,smname]
tx=apply(tx,2,function(x){if (min(x,na.rm=T)==0) x=x+1 else x })
tx=log(tx)
dataset[,smname]=tx
dataset
}
linearRegress<-function(metabolite.name,exposure.name,dataset,covariate.name=TRUE,
metabolite.log='T',subset=NULL)
{
##subseting
tx=dataset
if(!is.null(subset)) {ind=with(tx,eval(parse(text=subset))); tx=tx[ind,]}
##log-transform and scaling the metabolites
logname=metabolite.name
scalename=metabolite.name
if (metabolite.log=='T') {tx=logtransform(tx,logname)}
tx[,scalename]=scale(tx[,scalename])
##linear regression of exposure with metabolites
add=numeric()
fom=formula(paste('met~',paste(c('casecontrol_n', covariate.name=c('age', 'fac_centre','fhhistpca')),collapse='+')))
for (j in 1:length(metabolite.name))
{
met=tx[,metabolite.name[j]];
fit=lm(fom,data=tx)
temp=c(summary(fit)$coef['casecontrol_n',],confint(fit)['casecontrol_n',]);
add=rbind(add,temp);
}
rownames(add)=metabolite.name
add=data.frame(add,check.names = F)
add
}
#
result_linear=linearRegress(smname,'casecontrol_n', data, covariate.name=c('age', 'fac_centre','fhhistpca'))
head(result_linear)
# This is the function which takes a random bootstrap sample and performs the analysis.
Funk<-function(Boot.Data){
###If your only input is the dataframe, then
A<-linearRegress(smname,'casecontrol_n', Boot.Data, covariate.name=c('age', 'fac_centre','fhhistpca'))
A <- as.data.frame(A)
A$Metabolites <- rownames(A)
return (A)
}
# This will return the output for linearRegress for the random bootstrap sample, running 500 times, one time for each dataset.
# First make 500 placeholders, which will be defined as random bootstrap dataframes.
beta.obs.boot <- numeric(500)
# Then the for loop of 500 dataframes...
for(i in 1:500) {
# Randomly sample a number of rows equal to the size of the original dataframe, using the original data frame
this.ind <- sample(nrow(data),nrow(data), replace=TRUE)
Boot.Data<-data[this.ind,]
beta.obs.boot[i] <- list(Boot.Data)
}
# Provies a big list of bootstrap dataframes which will be the input for the "Funk" function above
# "Sapply" to provide the set of random bootstrap dataframes, and after a lifetime of waiting, a list of each output of linearRegress for the corresponding random bootstrap dataframes is given
LSR.Est<-sapply(beta.obs.boot,Funk)
# Extract the coefficients, and calculate standard errors, 95% CIs, and p-values
##Effect
booted <- as.data.frame(LSR.Est[,1])
for(i in 2:500) {
booted <- rbind(booted, as.data.frame(LSR.Est[,i]))
}
head(booted)
summary(booted)
random_LSR.Est <- as.data.frame(LSR.Est[,1])
Mets <- random_LSR.Est$Metabolites
Mets_list <- levels(Mets)
funk.1 <- function(m){
mean <- mean(booted$Estimate[booted$Metabolites==Mets_list[m]])
se <- sqrt(var(booted$Estimate[booted$Metabolites==Mets_list[m]]))
return(list(mean,se))
}
funk.1.out <- mapply(funk.1, 1:228)
mean.EST <- unlist(funk.1.out[1,])
se.EST <- unlist(funk.1.out[2,])
big_df <- as.data.frame(mean.EST,Mets_list)
big_df$se <- se.EST
big_df$E_CI_min<- big_df$mean-(1.96*big_df$se)
big_df$E_CI_max<- big_df$mean+(1.96*big_df$se)
big_df = big_df[order(big_df$se),]
#head(big_df, n=50)
df.master <- big_df
write.csv(df.master, "/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets/df.master.linear.csv")/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets/
#!/bin/bash
#PBS -l nodes=1:ppn=1,walltime=48:00:00
#PBS -N Boot_Mets_linear
cd "/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets"
echo Running on host `hostname`
echo Time is `date`
echo Directory is `pwd`
echo PBS job ID is $PBS_JOBID
echo This jobs runs on the following machines:
echo `cat $PBS_NODEFILE | uniq`
module add languages/R-3.3.2-ATLAS
R CMD BATCH Boot_loop_linear.Rdf.master <- read.csv("M:/data/protect/_devs/PROTECT_Clinical/data/Boot_ProtecT_Mets/df.master.linear.csv")
df.master = df.master[order(df.master$se),]
#head(df.master)
#dim(df.master)
df.master.sub <- subset(df.master, df.master$E_CI_min<0 & df.master$E_CI_max<0 |df.master$E_CI_min>0 & df.master$E_CI_max>0)
#dim(df.master.sub)
#head(df.master.sub)
df.master.sub = df.master.sub[order(df.master.sub$mean.EST),]
#head(df.master.sub)
#tail(df.master.sub)
summary(df.master$se)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02666 0.02942 0.02972 0.02994 0.03051 0.03335
# Calculate the p-value
df.master$t <- df.master$mean.EST/df.master$se
df.master$p<-2*pnorm(-abs(df.master$t))
df.master = df.master[order(df.master$p),]
#head(df.master, n=100)
write.csv(df.master, "C:/Users/ca16591/Dropbox/Bristol/df.master.linear.p.csv")