Save the code below as Boot_loop_linear.R

The code below loops through the 228 linear regressions of prostate cancer (PCa) status on log-transformed and scaled to z-scored metabolites. Then, a function is written to take 500 random bootstrap samples of the output of the 228 regressions.

#####################################################
# ProtecT Observational Analyses: Linear Bootstrap  #
#####################################################

# Load packages 

#install.packages('Rserve',  repos='http://cran.r-project.org') 
#install.packages('MASS',   repos='http://cran.r-project.org')  
#install.packages('metafor',    repos='http://cran.r-project.org')  
#install.packages('AER', repos='http://cran.r-project.org') 
#install.packages('RColorBrewer', repos='http://cran.r-project.org')    
#install.packages('aod',    repos='http://cran.r-project.org')  
#install.packages('ggplot2',    repos='http://cran.r-project.org')  
#install.packages('Rcpp',   repos='http://cran.r-project.org')
#install.packages('data.table',  repos='http://cran.r-project.org')
#install.packages("Cairo", repos='http://cran.r-project.org')
#install.packages('boot', repos='http://cran.r-project.org')
#install.packages('sandwich', repos='http://cran.r-project.org')
#install.packages('lmtest', repos='http://cran.r-project.org')

library('MASS') 
library('metafor')  
library('AER')  
library('RColorBrewer') 
library('aod')
library('ggplot2')
library('Rcpp')
library('Rserve')
library('data.table')
library('Cairo')
library('boot')
library('sandwich')
library('lmtest')
#install.packages("devtools")
#library('devtools')
#devtools::install_github("MRCIEU/MRInstruments")
#library(MRInstruments) 
#install_github("MRCIEU/TwoSampleMR")
#library('TwoSampleMR')

# Read in the data 

setwd('/panfs/panasas01/sscm/ca16591')

cleaned_met3 <- read.csv("/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets/metabol.data.clean.demo3.20.02.17.csv")#BlueCrystal
data <- cleaned_met3


#cleaned_met3 <- read.csv("C:/Users/ca16591/Dropbox/Bristol/metabol.data.clean.demo3.20.02.17.csv")#Oakfield
#data <- cleaned_met3

#cleaned_met3 <- read.csv("~/Dropbox/Bristol/metabol.data.clean.demo3.20.02.17.csv") #laptop
#data <- cleaned_met3

# Create metabolite name lists for use in loops

mname=colnames(data)[-c(1:93)]  
sname=scan('/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets/short_list_Pro_Prac1.txt',what='')#BlueCrystal
#sname=scan('C:/Users/ca16591/Dropbox/Bristol/short_list_Pro_Prac1.txt',what='')#Oakfield    
#sname=scan('~/Dropbox/Bristol/short_list_Pro_Prac1.txt',what='')   #laptop
smname=mname[-c(229:231)]
data$casecontrol_n=ifelse(data$casecontrol=="Control", 0,1)
#completedData <- read.csv("~/Dropbox/Bristol/completedData_imputation.csv")#laptop
#completedData <- read.csv("C:/Users/ca16591/Dropbox/Bristol/completedData_imputation.csv")#Oakfield
completedData <- read.csv("/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets/completedData_imputation.csv")#BlueCrystal
data$Glol <- completedData$Glol
data$pounds = (data$p1dhl_wght_stones)*14
data$weight = data$pounds + data$p1dhl_wght_pounds
data$feet_to_inches = (data$p1dhl_tall_ft)*12
data$height_inches = data$p1dhl_tall_in + data$feet_to_inches
data$inches = (data$height_inches^2)
data$inches_squared = (data$inches)
data$bmi=(data$weight*703)/data$inches_squared 
data$fac_centre <- as.factor(data$centre)

data$Glol <- completedData$Glol

# Function to loop through the associations 

logtransform<-function(dataset,smname)  
{   
  tx=dataset[,smname]   
  tx=apply(tx,2,function(x){if (min(x,na.rm=T)==0) x=x+1 else x })  
  tx=log(tx)    
  dataset[,smname]=tx   
  dataset   
}   

linearRegress<-function(metabolite.name,exposure.name,dataset,covariate.name=TRUE,  
                        metabolite.log='T',subset=NULL) 
{   
  ##subseting   
  tx=dataset    
  if(!is.null(subset)) {ind=with(tx,eval(parse(text=subset))); tx=tx[ind,]} 
  
  ##log-transform and scaling the metabolites   
  logname=metabolite.name
  scalename=metabolite.name
  if (metabolite.log=='T') {tx=logtransform(tx,logname)}
  tx[,scalename]=scale(tx[,scalename])
  
  ##linear regression of exposure with metabolites      
  add=numeric() 
  fom=formula(paste('met~',paste(c('casecontrol_n', covariate.name=c('age', 'fac_centre','fhhistpca')),collapse='+')))  
  for (j in 1:length(metabolite.name))  
  { 
    met=tx[,metabolite.name[j]];    
    fit=lm(fom,data=tx) 
    temp=c(summary(fit)$coef['casecontrol_n',],confint(fit)['casecontrol_n',]); 
    add=rbind(add,temp);    
  }     
  rownames(add)=metabolite.name 
  add=data.frame(add,check.names = F)   
  add   
}   
#
result_linear=linearRegress(smname,'casecontrol_n', data, covariate.name=c('age', 'fac_centre','fhhistpca'))    
head(result_linear)

# This is the function which takes a random bootstrap sample and performs the analysis.

Funk<-function(Boot.Data){
  
  ###If your only input is the dataframe, then
  A<-linearRegress(smname,'casecontrol_n', Boot.Data, covariate.name=c('age', 'fac_centre','fhhistpca'))    
  A <- as.data.frame(A)
  A$Metabolites <- rownames(A)
  
  return (A)
}

# This will return the output for linearRegress for the random bootstrap sample, running 500 times, one time for each dataset.

# First make 500 placeholders, which will be defined as random bootstrap dataframes.
  beta.obs.boot <- numeric(500)

# Then the for loop of 500 dataframes...
  for(i in 1:500) {
  
# Randomly sample a number of rows equal to the size of the original dataframe, using the original data frame
  
  this.ind <- sample(nrow(data),nrow(data), replace=TRUE)
  Boot.Data<-data[this.ind,]
  
  beta.obs.boot[i] <- list(Boot.Data)
}

# Provies a big list of bootstrap dataframes which will be the input for the "Funk" function above

# "Sapply" to provide the set of random bootstrap dataframes, and after a lifetime of waiting, a list of each output of linearRegress for the corresponding random bootstrap dataframes is given

  LSR.Est<-sapply(beta.obs.boot,Funk)

# Extract the coefficients, and calculate standard errors, 95% CIs, and p-values

##Effect

booted <- as.data.frame(LSR.Est[,1])
for(i in 2:500) {
  booted <-  rbind(booted, as.data.frame(LSR.Est[,i]))
}

head(booted)
summary(booted)

random_LSR.Est <- as.data.frame(LSR.Est[,1])
Mets <- random_LSR.Est$Metabolites 
Mets_list <- levels(Mets)

funk.1 <- function(m){
  mean <- mean(booted$Estimate[booted$Metabolites==Mets_list[m]])
  se <-  sqrt(var(booted$Estimate[booted$Metabolites==Mets_list[m]]))
  return(list(mean,se))  
}

funk.1.out <- mapply(funk.1, 1:228)
mean.EST <- unlist(funk.1.out[1,])
se.EST <- unlist(funk.1.out[2,])


big_df <- as.data.frame(mean.EST,Mets_list)
big_df$se <- se.EST
big_df$E_CI_min<- big_df$mean-(1.96*big_df$se)
big_df$E_CI_max<- big_df$mean+(1.96*big_df$se)

big_df = big_df[order(big_df$se),]
#head(big_df, n=50)

df.master <- big_df

write.csv(df.master, "/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets/df.master.linear.csv")

Set up a folder in my home drive for BlueCrystal

/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets/

Write a short bash script that calls the code above and save as Boot_script_linear.sh

#!/bin/bash
#PBS -l nodes=1:ppn=1,walltime=48:00:00
#PBS -N Boot_Mets_linear

cd "/panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets"

echo Running on host `hostname`
echo Time is `date`
echo Directory is `pwd`
echo PBS job ID is $PBS_JOBID
echo This jobs runs on the following machines:
echo `cat $PBS_NODEFILE | uniq`

module add languages/R-3.3.2-ATLAS
R CMD BATCH Boot_loop_linear.R

Within /panfs/panasas01/sscm/ca16591/Boot_ProtecT_Mets/, type ‘qsub Boot_script_linear.sh’

Read back in R and look at results

df.master <- read.csv("M:/data/protect/_devs/PROTECT_Clinical/data/Boot_ProtecT_Mets/df.master.linear.csv")

df.master = df.master[order(df.master$se),]
#head(df.master)
#dim(df.master)

df.master.sub <- subset(df.master, df.master$E_CI_min<0 & df.master$E_CI_max<0 |df.master$E_CI_min>0 & df.master$E_CI_max>0)
#dim(df.master.sub)
#head(df.master.sub)
df.master.sub = df.master.sub[order(df.master.sub$mean.EST),]
#head(df.master.sub)
#tail(df.master.sub)

summary(df.master$se)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02666 0.02942 0.02972 0.02994 0.03051 0.03335
# Calculate the p-value 

df.master$t <- df.master$mean.EST/df.master$se
df.master$p<-2*pnorm(-abs(df.master$t))
df.master = df.master[order(df.master$p),]
#head(df.master, n=100)


write.csv(df.master, "C:/Users/ca16591/Dropbox/Bristol/df.master.linear.p.csv")