install.packages('Rserve',  
                 repos='http://cran.r-project.org') 
install.packages('MASS',    
                 repos='http://cran.r-project.org') 
install.packages('metafor', 
                 repos='http://cran.r-project.org') 
install.packages('AER', 
                 repos='http://cran.r-project.org') 
install.packages('RColorBrewer',    
                 repos='http://cran.r-project.org') 

install.packages('aod', 
                 repos='http://cran.r-project.org') 
install.packages('ggplot2', 
                 repos='http://cran.r-project.org') 
install.packages('Rcpp',    
                 repos='http://cran.r-project.org')
install.packages('data.table',  
                 repos='http://cran.r-project.org')
install.packages("Cairo")
library('MASS') 
library('metafor')  
library('AER')  
library('RColorBrewer') 
library('aod')
library('ggplot2')
library('Rcpp')
library('Rserve')
library('data.table')
library('Cairo')
install.packages("devtools")
library('devtools')
devtools::install_github("MRCIEU/MRInstruments")
library(MRInstruments) 
install_github("MRCIEU/TwoSampleMR")
library('TwoSampleMR')

Reading in the data for ProtecT & looping through the associations of metabolites on PCa

# Data merging and cleaning
setwd("C:/Users/ca16591/Dropbox/Bristol") #Oakfield
rm(list=ls())   
data <- read.table("brainshake.txt", header=TRUE, na.strings=c("NDEF","NA","TAG"))
dim(data)   

# Sort by sampleid (smallest to largest)

attach(data)
newdata <- data[order(sampleid),] 

# Read in map file with subject and sampleids

map <- read.delim("Z:/data/protect/metabolomic/brainshake/released/2016-07-07/data/data.id_map.tsv")

# Read in demographics data

#PCa_demo <-  read.csv("C:/Users/ca16591/Dropbox/Bristol/Michael_PCa2.demo/PCa2_demo.csv")

PCa_demo <-  read.csv("C:/Users/ca16591/Dropbox/Bristol/PCa3_demo.csv")
PCa_demo$subjectid <- PCa_demo$p0subjectid
PCa_demo$dob <- PCa_demo$p0dob

dim(PCa_demo)

# Identify overlap in sampleids from map and PCa_demo

mydata <- merge(PCa_demo, map, by=c("subjectid")) 
write.csv(mydata, "mydata.csv")

library(data.table)

mydata$subjectid <- factor(mydata$subjectid)
levels(mydata$subjectid)

test <- mydata[order(mydata$subjectid), ]
DT <- data.table(test, key = 'subjectid')

# Keep just the first observation for subjects with multiple metabolite measurements

new2 <- do.call(rbind, lapply(split(test, test$subjectid), `[`, 1, ))
dim(new2)

write.csv(new2, "new2.csv")

# Merge the metabolomic data ('data') with the demographic data (new2)

metab_demo <- merge(new2, data, by=c("sampleid")) 
data <- metab_demo

#write.csv(data, "metabol.data.clean.demo.tsv") #I resaved this as 12-15_cleaned_met3.txt
write.csv(data, "metabol.data.clean.demo3.csv") 

# Look in Excel and delete extra first column from merging
# Newest merge from Michael D., extra column count is (26*3)+14
# Format the date and dob variables in the year-month-day format
# Create the age variable 
# Make all the 'missing' and 'don't know' classifications in fhhistpca 'NA' and change 'yes' to 1 and 'no' to 0

cleaned_met3 <- read.csv("C:/Users/ca16591/Dropbox/Bristol/metabol.data.clean.demo3.20.02.17.csv")
data <- cleaned_met3

# Create metabolite name lists for use in loops

mname=colnames(data)[-c(1:93)]  
sname=scan('mininames.txt',what='') 
smname=mname[-c(229:231)]
write.csv(data, "final_protect_merged_cleaned_02-20-2017.csv") 

# Function to loop through the associations 

logRegress<-function(outcome.name,metabolite.name,dataset,covariate.name=NULL,  
                     metabolite.log='F',subset=NULL)    
{   
  ##subseting   
  tx=dataset    
  if(!is.null(subset)) {ind=with(tx,eval(parse(text=subset))); tx=tx[ind,]} 
  
  ##log-transform and scaling the metabolites   
  logname=metabolite.name
  scalename=metabolite.name
  if (metabolite.log=='T') {tx=logtransform(tx,logname)}
  tx[,scalename]=scale(tx[,scalename])
  
  ##logistic regression of exposure with metabolites        
  add=numeric() 
  fom=formula(paste('casecontrol~',paste(c('met', covariate.name),collapse='+')))   
  for (j in 1:length(metabolite.name))  
  { 
    met=tx[,metabolite.name[j]];    
    fit=glm(fom,data=tx, family="binomial") 
    temp=c(summary(fit)$coef['met',],confint(fit)['met',]); 
    add=rbind(add,temp);    
  }     
  rownames(add)=metabolite.name 
  add=data.frame(add,check.names = F)   
  add   
}   
result=logRegress('casecontrol',smname,data,covariate.name=c('age', 'centre','fhhistpca'))  
result$Estimate = exp(result$Estimate)
result$`2.5 %` = exp(result$`2.5 %`)
result$`97.5 %` = exp(result$`97.5 %`)
copy_result <- result
copy_result$beta <- copy_result$Estimate
copy_result$pval <- copy_result$'Pr(>|z|)'
copy_result$lci <- copy_result$'2.5 %'
copy_result$uci <- copy_result$'97.5 %'
copy_result = copy_result[order(copy_result$pval),]
myvarsc <- c("beta","pval", 'lci','uci')
newdatac <- copy_result[myvarsc]
sum(newdatac$pval<0.0014)
copy_result_sig <- subset(newdatac, newdatac$pval<0.0014)
copy_result_sig

write.csv(copy_result_sig, "protect_results.csv")
## Warning: package 'RColorBrewer' was built under R version 3.3.2

# Explore the results with copy_result but use 'result' for forestplot

copy_result$beta <- copy_result$Estimate
copy_result$pval <- copy_result$'Pr(>|z|)'
copy_result$lci <- copy_result$'2.5 %'
copy_result$uci <- copy_result$'97.5 %'
copy_result = copy_result[order(copy_result$pval),]
myvarsc <- c("beta","pval", 'lci','uci')
newdatac <- copy_result[myvarsc]
sum(newdatac$pval<0.0014)
copy_result_sig <- subset(newdatac, newdatac$pval<0.0014)
copy_result_sig

Obtain multiple-testing correction threshold with PCA

List PCa results under corrected p-value threshold of 0.0014

copy_result$beta <- copy_result$Estimate
copy_result$pval <- copy_result$'Pr(>|z|)'
copy_result$lci <- copy_result$'2.5 %'
copy_result$uci <- copy_result$'97.5 %'
copy_result = copy_result[order(copy_result$pval),]
myvarsc <- c("beta","pval", 'lci','uci')
newdatac <- copy_result[myvarsc]
sum(newdatac$pval<0.0014)
## [1] 24
copy_result_sig <- subset(newdatac, newdatac$pval<0.0014)
copy_result_sig
##                   beta         pval       lci       uci
## Glol         0.6553527 2.664802e-11 0.5780110 0.7412617
## M_VLDL_PL_.  0.8732285 7.323548e-06 0.8228883 0.9264453
## S_HDL_C      0.8732623 1.125733e-05 0.8217311 0.9274349
## S_HDL_CE     0.8774507 2.112785e-05 0.8259049 0.9317173
## Tyr          1.1376343 2.454522e-05 1.0716897 1.2081405
## Ile          1.1345019 4.146335e-05 1.0682633 1.2053172
## Leu          1.1318246 6.118838e-05 1.0656001 1.2028308
## SFA.FA       1.1274312 8.459010e-05 1.0621770 1.1971785
## Alb          0.8957930 3.175461e-04 0.8435500 0.9509507
## XS_VLDL_PL_. 0.8966989 4.138705e-04 0.8436937 0.9523114
## Val          1.1101809 6.384085e-04 1.0456760 1.1790166
## IDL_FC       0.9014798 6.439810e-04 0.8492479 0.9567330
## M_HDL_FC_.   0.8998385 6.531297e-04 0.8464827 0.9557802
## S_VLDL_TG_.  1.1088164 6.551682e-04 1.0449512 1.1768208
## L_LDL_FC     0.9022708 7.261058e-04 0.8499374 0.9576444
## M_VLDL_TG_.  1.1068419 7.645546e-04 1.0433930 1.1743955
## M_HDL_FC     0.9044687 9.626556e-04 0.8520305 0.9599479
## S_VLDL_C_.   0.9050544 1.001738e-03 0.8527355 0.9603868
## FAw6.FA      0.9051374 1.026871e-03 0.8527105 0.9605002
## VLDL_D       1.1039821 1.055388e-03 1.0406156 1.1714342
## IDL_PL       0.9055591 1.117105e-03 0.8530232 0.9611483
## S_VLDL_CE_.  0.9060277 1.144429e-03 0.8536167 0.9614573
## S_HDL_L      0.9043873 1.227302e-03 0.8503707 0.9606276
## M_LDL_PL_.   1.1064544 1.238442e-03 1.0409811 1.1770598
data <- read.csv("C:/Users/ca16591/Dropbox/Bristol/final_protect_merged_cleaned_02-20-2017.csv")
data <- data[c(-1)]
attach(data)
mname=colnames(data)[-c(1:93)]
smname=mname[-c(229:231)]   
ncomp<-function(data,metabolite.name)   
{   
  tx=data;      
  ptx=scale(tx[,metabolite.name])   
  pca=princomp(~.,data=data.frame(ptx),na.action=na.exclude)    
  var=pca$sdev^2 / sum(pca$sdev^2)  
  cumvar=cumsum(var)    
  n=which(cumvar>=0.99)[1]  
  n 
}   
ncomp(data,smname)
0.05/37
newp <- round(0.001351351,4)

p.Funk <-function(Dat){
  p.Dat <-subset(Dat,Dat$'pvalue'<0.0014)
  return(p.Dat)
}
TEST <-p.Funk(copy_result)

ProtecT results by stage

copy_result_stage$beta <- copy_result_stage$Estimate
copy_result_stage$pval <- copy_result_stage$'Pr(>|z|)'
copy_result_stage$lci <- copy_result_stage$'2.5 %'
copy_result_stage$uci <- copy_result_stage$'97.5 %'
copy_result_stage= copy_result_stage[order(copy_result_stage$pval),]
myvarsc <- c("beta","pval", 'lci','uci')
newdatac <- copy_result_stage[myvarsc]
sum(newdatac$pval<0.0014)
## [1] 11
copy_result_stagesig <- subset(newdatac, newdatac$pval<0.0014)
copy_result_stagesig
##                   beta         pval       lci       uci
## S_HDL_PL_.   1.3284078 0.0001159095 1.1483494 1.5335032
## S_LDL_PL_.   1.2762402 0.0003202600 1.1141019 1.4544262
## M_LDL_C_.    0.7860330 0.0003507433 0.6905401 0.9001361
## M_LDL_PL_.   1.2736280 0.0003508704 1.1115246 1.4507506
## S_HDL_CE_.   0.7731154 0.0003920311 0.6715387 0.8930492
## S_HDL_C_.    0.7736563 0.0003924222 0.6722889 0.8933916
## S_LDL_C_.    0.7881754 0.0004068676 0.6923828 0.9023682
## S_LDL_CE_.   0.7879145 0.0004398147 0.6914414 0.9027815
## M_LDL_CE_.   0.7906808 0.0005693380 0.6935472 0.9068247
## PUFA.FA      0.7815378 0.0007005327 0.6783451 0.9023850
## XS_VLDL_FC_. 0.7890715 0.0008364721 0.6879315 0.9093618

ProtecT results by grade

copy_result_grade$beta <- copy_result_grade$Estimate
copy_result_grade$pval <- copy_result_grade$'Pr(>|z|)'
copy_result_grade$lci <- copy_result_grade$'2.5 %'
copy_result_grade$uci <- copy_result_grade$'97.5 %'
copy_result_grade= copy_result_grade[order(copy_result_grade$pval),]
myvarsc <- c("beta","pval", 'lci','uci')
newdatac <- copy_result_grade[myvarsc]
sum(newdatac$pval<0.0014)
## [1] 1
copy_result_gradesig <- subset(newdatac, newdatac$pval<0.0014)
copy_result_gradesig
##                beta        pval      lci      uci
## M_LDL_TG_. 1.175836 0.001097428 1.066641 1.296052

## ProtecT results by grade (>8 & <=8) 
copy_result_grade4$beta <- copy_result_grade4$Estimate
copy_result_grade4$pval <- copy_result_grade4$'Pr(>|z|)'
copy_result_grade4$lci <- copy_result_grade4$'2.5 %'
copy_result_grade4$uci <- copy_result_grade4$'97.5 %'
copy_result_grade4= copy_result_grade4[order(copy_result_grade4$pval),]
myvarsc <- c("beta","pval", 'lci','uci')
newdatac <- copy_result_grade4[myvarsc]
sum(newdatac$pval<0.0014)
## [1] 8
copy_result_grade4sig <- subset(newdatac, newdatac$pval<0.0014)
copy_result_grade4sig
##                   beta         pval       lci       uci
## S_HDL_C_.    0.7201618 0.0002377088 0.6066479 0.8619930
## S_HDL_CE_.   0.7199488 0.0002463473 0.6060987 0.8621997
## S_HDL_PL_.   1.3776690 0.0005781698 1.1435806 1.6483907
## XS_VLDL_FC_. 0.7440629 0.0006145425 0.6313257 0.8872124
## S_HDL_CE     0.7349189 0.0007917589 0.6161985 0.8837708
## XS_VLDL_PL_. 0.7474369 0.0008556690 0.6333012 0.8931350
## IDL_FC_.     0.7617759 0.0008787680 0.6517801 0.9008792
## S_LDL_C_.    0.7644256 0.0012497011 0.6532419 0.9068541

MR of impact of metabolites on PCa (incidence)

library('data.table')
library('Cairo')
library('devtools')
library('MRInstruments') 
library('TwoSampleMR')

Phenotype.vec<-c(metab_qtls[,1])

Funk.1<-function(PHENO) 
{   
  #loop over all metabolites 
  exposure_dat <- format_metab_qtls(subset(metab_qtls, phenotype==PHENO))
  
  outcome_dat <- read_outcome_data(
    snps = metab_qtls$SNP,
    filename = "App_266_results2.txt",
    sep = "\t",
    snp_col = "SNP",
    beta_col = "OR",
    se_col = "SE",
    effect_allele_col = "Effect",
    other_allele_col = "Baseline",
    eaf_col = "EAFcontrols",
    samplesize_col = "NumCalled"
  )
  
  #harmonization
  dat <- harmonise_data(exposure_dat, outcome_dat, action=2)
  
  #mr analysis
  #  res <- mr(dat)
  res <-  mr(dat, method_list=c("mr_two_sample_ml"))

  # "mr_ivw"
  
  #head(res)
  
  #sensitivity analyses
  het.dat<-mr_heterogeneity(dat)
  
  p1 <- mr_scatter_plot(res, dat)
  
   # Report
  
  #rep <- mr_report(dat)
  
  return(list(res,het.dat,p1, res))
}

outcome.test <-Funk.1(Phenotype.vec)

#outcome.testf <- data.frame(outcome.test[1])
#outcome.test[3]
#outcome.test[2]
#head(outcome.test[1])
#head(exp(outcome.test[,6]))
#outcome.test[4]

OT<-outcome.test[1]
OT<-data.frame(OT)
head(OT)
OT$exp_b <- exp(OT$b)
OT$beta <- OT$exp_b
OT$Metabolite <- OT$exposure
OT$lci1 <- OT$b-(1.96*OT$se)
OT$lci <- exp(OT$lci1)
head(OT$lci)
OT$uci1 <- OT$b+(1.96*OT$se)
OT$uci <- exp(OT$uci1)
write.csv(OT, "pca.mr.csv")
#######
sorted_OT = OT[order(OT$'pval'),]
head(sorted_OT)
write.csv(OT, "PRACTICAL_MR1.csv")

#OT$'Std. Error' <- OT$se
#OT$'Pr(>|z|)' <- OT$pval  
#OT$'2.5 %' <- OT$lci
#OT$'97.5 %' <- OT$uci 
PRACTICAL_MR1 <- sorted_OT

head(PRACTICAL_MR1)
#For correlation plot
myvarscor2 <- c("Metabolite","beta","lci", "uci", "pval", "method")
PRACTICAL_MR1 <- sorted_OT[myvarscor2]
sum(PRACTICAL_MR1$pval<0.0014)
write.csv(PRACTICAL_MR1, "PRACTICAL_MR1.csv")

MR Results

library('data.table')
library('Cairo')
myvarscor2 <- c("Metabolite","beta","lci", "uci", "pval", "method")
PRACTICAL_MR1 <- sorted_OT[myvarscor2]
sum(PRACTICAL_MR1$pval<0.0014)
## [1] 1
head(PRACTICAL_MR1)
##     Metabolite     beta      lci      uci         pval
## 23        Crea 1.410868 1.200116 1.658631 3.046832e-05
## 29          Gp 1.167327 1.044425 1.304692 6.414927e-03
## 32         Pyr 1.292627 1.034110 1.615771 2.415705e-02
## 31         Pyr 1.292758 1.028809 1.624425 2.754147e-02
## 22         Lac 1.379063 1.034514 1.838367 2.842687e-02
## 134        Ala 1.196719 1.018176 1.406572 2.936834e-02
##                        method
## 23         Maximum likelihood
## 29         Maximum likelihood
## 32  Inverse variance weighted
## 31         Maximum likelihood
## 22  Inverse variance weighted
## 134 Inverse variance weighted

Get the intersection of the observational and MR metabolite names

#' Open the ProtecT results in Excel and change the names of the metabolites to the period vs hyphen form. Then also look to see if any of the names do match but with periods in wrong place. 

ProtecT <- read.csv("C:/Users/ca16591/Dropbox/Bristol/protect_results_periods_double.csv")
head(ProtecT)
##    Metabolite     beta Std..Error  z.value        pval      lci      uci
## 1  XXL.VLDL.P 1.091716 0.03069816 2.858515 0.004256291 1.028304 1.159880
## 2  XXL.VLDL.L 1.091028 0.03068892 2.838809 0.004528226 1.027669 1.159122
## 3 XXL.VLDL.PL 1.090463 0.03068849 2.821980 0.004772812 1.027142 1.158527
## 4  XXL.VLDL.C 1.077180 0.03052235 2.435815 0.014858280 1.014872 1.143933
## 5 XXL.VLDL.CE 1.062743 0.03042754 1.999933 0.045507518 1.001385 1.128308
## 6 XXL.VLDL.FC 1.091182 0.03066571 2.845577 0.004433099 1.027861 1.159233
dim(ProtecT)
## [1] 228   7
PRACTICAL_MR1 <- read.csv("C:/Users/ca16591/Dropbox/Bristol/PRACTICAL_MR1.csv")
dim(PRACTICAL_MR1)
## [1] 114  17
ProtecT.t <- ProtecT[which(ProtecT$Metabolite %in% PRACTICAL_MR1$Metabolite),]
dim(ProtecT.t)
## [1] 105   7
PRACTICAL.MRt<- PRACTICAL_MR1[which(PRACTICAL_MR1$Metabolite %in% ProtecT.t$Metabolite),]
dim(PRACTICAL.MRt)
## [1] 105  17
source('corPlot_mods.R')

Correlation Plot