install.packages('Rserve', repos='http://cran.r-project.org')
install.packages('MASS', repos='http://cran.r-project.org')
install.packages('metafor', repos='http://cran.r-project.org')
install.packages('AER', repos='http://cran.r-project.org')
install.packages('RColorBrewer', repos='http://cran.r-project.org')
install.packages('aod', repos='http://cran.r-project.org')
install.packages('ggplot2', repos='http://cran.r-project.org')
install.packages('Rcpp', repos='http://cran.r-project.org')
install.packages('data.table', repos='http://cran.r-project.org')
install.packages("Cairo")
library('MASS')
library('metafor')
library('AER')
library('RColorBrewer')
library('aod')
library('ggplot2')
library('Rcpp')
library('Rserve')
library('data.table')
library('Cairo')
install.packages("devtools")
library('devtools')
devtools::install_github("MRCIEU/MRInstruments")
library(MRInstruments)
install_github("MRCIEU/TwoSampleMR")
library('TwoSampleMR')

# Data merging and cleaning
setwd("C:/Users/ca16591/Dropbox/Bristol") #Oakfield
rm(list=ls())
data <- read.table("brainshake.txt", header=TRUE, na.strings=c("NDEF","NA","TAG"))
dim(data)
# Sort by sampleid (smallest to largest)
attach(data)
newdata <- data[order(sampleid),]
# Read in map file with subject and sampleids
map <- read.delim("Z:/data/protect/metabolomic/brainshake/released/2016-07-07/data/data.id_map.tsv")
# Read in demographics data
#PCa_demo <- read.csv("C:/Users/ca16591/Dropbox/Bristol/Michael_PCa2.demo/PCa2_demo.csv")
PCa_demo <- read.csv("C:/Users/ca16591/Dropbox/Bristol/PCa3_demo.csv")
PCa_demo$subjectid <- PCa_demo$p0subjectid
PCa_demo$dob <- PCa_demo$p0dob
dim(PCa_demo)
# Identify overlap in sampleids from map and PCa_demo
mydata <- merge(PCa_demo, map, by=c("subjectid"))
write.csv(mydata, "mydata.csv")
library(data.table)
mydata$subjectid <- factor(mydata$subjectid)
levels(mydata$subjectid)
test <- mydata[order(mydata$subjectid), ]
DT <- data.table(test, key = 'subjectid')
# Keep just the first observation for subjects with multiple metabolite measurements
new2 <- do.call(rbind, lapply(split(test, test$subjectid), `[`, 1, ))
dim(new2)
write.csv(new2, "new2.csv")
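# The keyed data.table built above offers an equivalent one-row-per-subject
# selection; a minimal sketch for comparison ('new2' from the base-R approach
# is what the rest of the script uses):
new2_dt <- unique(DT, by = "subjectid")
dim(new2_dt)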
# Merge the metabolomic data ('data') with the demographic data (new2)
metab_demo <- merge(new2, data, by=c("sampleid"))
data <- metab_demo
#write.csv(data, "metabol.data.clean.demo.tsv") #I resaved this as 12-15_cleaned_met3.txt
write.csv(data, "metabol.data.clean.demo3.csv")
# Open the file in Excel and delete the extra first column created by the merge
# Newest merge from Michael D.: the extra column count is (26*3)+14
# Format the date and dob variables in the year-month-day format
# Create the age variable
# Make all the 'missing' and 'don't know' classifications in fhhistpca 'NA' and change 'yes' to 1 and 'no' to 0
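# The three steps above were done by hand; a minimal R sketch of the same
# cleaning, assuming 'date' and 'dob' are stored as day/month/year text and
# fhhistpca uses yes/no/missing/don't know labels (these formats are
# assumptions, not checked against the raw file):
# data$date <- as.Date(data$date, format = "%d/%m/%Y")
# data$dob  <- as.Date(data$dob,  format = "%d/%m/%Y")
# data$age  <- as.numeric(data$date - data$dob) / 365.25
# data$fhhistpca[data$fhhistpca %in% c("missing", "don't know")] <- NA
# data$fhhistpca <- ifelse(data$fhhistpca == "yes", 1,
#                          ifelse(data$fhhistpca == "no", 0, NA))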
cleaned_met3 <- read.csv("C:/Users/ca16591/Dropbox/Bristol/metabol.data.clean.demo3.20.02.17.csv")
data <- cleaned_met3
# Create metabolite name lists for use in loops
mname=colnames(data)[-c(1:93)]
sname=scan('mininames.txt',what='')
smname=mname[-c(229:231)]
write.csv(data, "final_protect_merged_cleaned_02-20-2017.csv")
# Function to loop through the associations
logRegress <- function(outcome.name, metabolite.name, dataset, covariate.name=NULL,
                       metabolite.log='F', subset=NULL)
{
  ## subsetting
  tx=dataset
  if(!is.null(subset)) {ind=with(tx,eval(parse(text=subset))); tx=tx[ind,]}
  ## log-transform and scale the metabolites
  logname=metabolite.name
  scalename=metabolite.name
  if (metabolite.log=='T') {tx=logtransform(tx,logname)}
  tx[,scalename]=scale(tx[,scalename])
  ## logistic regression of the outcome on each metabolite in turn, adjusted
  ## for the covariates
  add=numeric()
  fom=formula(paste(outcome.name,'~',paste(c('met', covariate.name),collapse='+')))
  for (j in 1:length(metabolite.name))
  {
    met=tx[,metabolite.name[j]]  # current metabolite, picked up by the formula
    fit=glm(fom, data=tx, family="binomial")
    temp=c(summary(fit)$coef['met',], confint(fit)['met',])
    add=rbind(add,temp)
  }
  rownames(add)=metabolite.name
  add=data.frame(add,check.names = F)
  add
}
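# logtransform() is not defined in this section; a minimal sketch of the helper
# it presumably refers to (natural-log transform of the named metabolite
# columns; zero or negative values are not handled here):
logtransform <- function(dataset, varnames)
{
  dataset[, varnames] <- log(dataset[, varnames])
  dataset
}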
result=logRegress('casecontrol',smname,data,covariate.name=c('age', 'centre','fhhistpca'))
result$Estimate = exp(result$Estimate)
result$`2.5 %` = exp(result$`2.5 %`)
result$`97.5 %` = exp(result$`97.5 %`)
copy_result <- result
copy_result$beta <- copy_result$Estimate
copy_result$pval <- copy_result$'Pr(>|z|)'
copy_result$lci <- copy_result$'2.5 %'
copy_result$uci <- copy_result$'97.5 %'
copy_result <- copy_result[order(copy_result$pval),]
myvarsc <- c("beta","pval", 'lci','uci')
newdatac <- copy_result[myvarsc]
sum(newdatac$pval<0.0014)
## [1] 24
copy_result_sig <- subset(newdatac, newdatac$pval<0.0014)
write.csv(copy_result_sig, "protect_results.csv")
# Explore the results with copy_result but use 'result' for forestplot
copy_result_sig
##                    beta         pval       lci       uci
## Glol 0.6553527 2.664802e-11 0.5780110 0.7412617
## M_VLDL_PL_. 0.8732285 7.323548e-06 0.8228883 0.9264453
## S_HDL_C 0.8732623 1.125733e-05 0.8217311 0.9274349
## S_HDL_CE 0.8774507 2.112785e-05 0.8259049 0.9317173
## Tyr 1.1376343 2.454522e-05 1.0716897 1.2081405
## Ile 1.1345019 4.146335e-05 1.0682633 1.2053172
## Leu 1.1318246 6.118838e-05 1.0656001 1.2028308
## SFA.FA 1.1274312 8.459010e-05 1.0621770 1.1971785
## Alb 0.8957930 3.175461e-04 0.8435500 0.9509507
## XS_VLDL_PL_. 0.8966989 4.138705e-04 0.8436937 0.9523114
## Val 1.1101809 6.384085e-04 1.0456760 1.1790166
## IDL_FC 0.9014798 6.439810e-04 0.8492479 0.9567330
## M_HDL_FC_. 0.8998385 6.531297e-04 0.8464827 0.9557802
## S_VLDL_TG_. 1.1088164 6.551682e-04 1.0449512 1.1768208
## L_LDL_FC 0.9022708 7.261058e-04 0.8499374 0.9576444
## M_VLDL_TG_. 1.1068419 7.645546e-04 1.0433930 1.1743955
## M_HDL_FC 0.9044687 9.626556e-04 0.8520305 0.9599479
## S_VLDL_C_. 0.9050544 1.001738e-03 0.8527355 0.9603868
## FAw6.FA 0.9051374 1.026871e-03 0.8527105 0.9605002
## VLDL_D 1.1039821 1.055388e-03 1.0406156 1.1714342
## IDL_PL 0.9055591 1.117105e-03 0.8530232 0.9611483
## S_VLDL_CE_. 0.9060277 1.144429e-03 0.8536167 0.9614573
## S_HDL_L 0.9043873 1.227302e-03 0.8503707 0.9606276
## M_LDL_PL_. 1.1064544 1.238442e-03 1.0409811 1.1770598
data <- read.csv("C:/Users/ca16591/Dropbox/Bristol/final_protect_merged_cleaned_02-20-2017.csv")
data <- data[c(-1)]
attach(data)
mname=colnames(data)[-c(1:93)]
smname=mname[-c(229:231)]
# Estimate the number of principal components needed to explain 99% of the
# variance in the metabolite data (used to set the multiple-testing threshold below)
ncomp <- function(data, metabolite.name)
{
  tx=data
  ptx=scale(tx[,metabolite.name])
  pca=princomp(~., data=data.frame(ptx), na.action=na.exclude)
  var=pca$sdev^2 / sum(pca$sdev^2)  # proportion of variance per component
  cumvar=cumsum(var)                # cumulative proportion of variance
  n=which(cumvar>=0.99)[1]          # first component at which 99% is reached
  n
}
ncomp(data,smname)
# Threshold for 37 independent tests: 0.05/37 = 0.00135, rounded to 0.0014
0.05/37
newp <- round(0.001351351,4)
# Helper to keep only metabolites passing the multiple-testing threshold
p.Funk <- function(Dat){
  p.Dat <- subset(Dat, Dat$pval < 0.0014)
  return(p.Dat)
}
TEST <- p.Funk(copy_result)

# ProtecT results by stage ('copy_result_stage' is presumably from a
# stage-restricted logRegress run, not shown here)
copy_result_stage$beta <- copy_result_stage$Estimate
copy_result_stage$pval <- copy_result_stage$'Pr(>|z|)'
copy_result_stage$lci <- copy_result_stage$'2.5 %'
copy_result_stage$uci <- copy_result_stage$'97.5 %'
copy_result_stage= copy_result_stage[order(copy_result_stage$pval),]
myvarsc <- c("beta","pval", 'lci','uci')
newdatac <- copy_result_stage[myvarsc]
sum(newdatac$pval<0.0014)
## [1] 11
copy_result_stagesig <- subset(newdatac, newdatac$pval<0.0014)
copy_result_stagesig
##                    beta         pval       lci       uci
## S_HDL_PL_. 1.3284078 0.0001159095 1.1483494 1.5335032
## S_LDL_PL_. 1.2762402 0.0003202600 1.1141019 1.4544262
## M_LDL_C_. 0.7860330 0.0003507433 0.6905401 0.9001361
## M_LDL_PL_. 1.2736280 0.0003508704 1.1115246 1.4507506
## S_HDL_CE_. 0.7731154 0.0003920311 0.6715387 0.8930492
## S_HDL_C_. 0.7736563 0.0003924222 0.6722889 0.8933916
## S_LDL_C_. 0.7881754 0.0004068676 0.6923828 0.9023682
## S_LDL_CE_. 0.7879145 0.0004398147 0.6914414 0.9027815
## M_LDL_CE_. 0.7906808 0.0005693380 0.6935472 0.9068247
## PUFA.FA 0.7815378 0.0007005327 0.6783451 0.9023850
## XS_VLDL_FC_. 0.7890715 0.0008364721 0.6879315 0.9093618
# ProtecT results by grade ('copy_result_grade' is presumably from a
# grade-restricted logRegress run, not shown here)
copy_result_grade$beta <- copy_result_grade$Estimate
copy_result_grade$pval <- copy_result_grade$'Pr(>|z|)'
copy_result_grade$lci <- copy_result_grade$'2.5 %'
copy_result_grade$uci <- copy_result_grade$'97.5 %'
copy_result_grade= copy_result_grade[order(copy_result_grade$pval),]
myvarsc <- c("beta","pval", 'lci','uci')
newdatac <- copy_result_grade[myvarsc]
sum(newdatac$pval<0.0014)
## [1] 1
copy_result_gradesig <- subset(newdatac, newdatac$pval<0.0014)
copy_result_gradesig
##                   beta        pval      lci      uci
## M_LDL_TG_. 1.175836 0.001097428 1.066641 1.296052
# ProtecT results by grade (>8 & <=8); 'copy_result_grade4' is presumably from
# the corresponding grade-restricted logRegress run (not shown here)
copy_result_grade4$beta <- copy_result_grade4$Estimate
copy_result_grade4$pval <- copy_result_grade4$'Pr(>|z|)'
copy_result_grade4$lci <- copy_result_grade4$'2.5 %'
copy_result_grade4$uci <- copy_result_grade4$'97.5 %'
copy_result_grade4= copy_result_grade4[order(copy_result_grade4$pval),]
myvarsc <- c("beta","pval", 'lci','uci')
newdatac <- copy_result_grade4[myvarsc]
sum(newdatac$pval<0.0014)
## [1] 8
copy_result_grade4sig <- subset(newdatac, newdatac$pval<0.0014)
copy_result_grade4sig
##                    beta         pval       lci       uci
## S_HDL_C_. 0.7201618 0.0002377088 0.6066479 0.8619930
## S_HDL_CE_. 0.7199488 0.0002463473 0.6060987 0.8621997
## S_HDL_PL_. 1.3776690 0.0005781698 1.1435806 1.6483907
## XS_VLDL_FC_. 0.7440629 0.0006145425 0.6313257 0.8872124
## S_HDL_CE 0.7349189 0.0007917589 0.6161985 0.8837708
## XS_VLDL_PL_. 0.7474369 0.0008556690 0.6333012 0.8931350
## IDL_FC_. 0.7617759 0.0008787680 0.6517801 0.9008792
## S_LDL_C_. 0.7644256 0.0012497011 0.6532419 0.9068541
library('data.table')
library('Cairo')
library('devtools')
library('MRInstruments')
library('TwoSampleMR')
# Phenotype (metabolite) names taken from the first column of the metab_qtls
# data supplied by MRInstruments
Phenotype.vec <- c(metab_qtls[,1])
Funk.1 <- function(PHENO)
{
  # Exposure: metabolite QTLs for the requested phenotype(s)
  # (%in% so that a vector of phenotypes can be passed, as is done below)
  exposure_dat <- format_metab_qtls(subset(metab_qtls, phenotype %in% PHENO))
  # Outcome: prostate cancer association results; the file's OR column is read
  # in as the effect size (beta_col)
  outcome_dat <- read_outcome_data(
    snps = metab_qtls$SNP,
    filename = "App_266_results2.txt",
    sep = "\t",
    snp_col = "SNP",
    beta_col = "OR",
    se_col = "SE",
    effect_allele_col = "Effect",
    other_allele_col = "Baseline",
    eaf_col = "EAFcontrols",
    samplesize_col = "NumCalled"
  )
  # Harmonise exposure and outcome alleles
  dat <- harmonise_data(exposure_dat, outcome_dat, action=2)
  # MR analysis
  # res <- mr(dat)
  res <- mr(dat, method_list=c("mr_two_sample_ml"))
  # "mr_ivw"
  # head(res)
  # Sensitivity analyses
  het.dat <- mr_heterogeneity(dat)
  p1 <- mr_scatter_plot(res, dat)
  # Report
  # rep <- mr_report(dat)
  return(list(res, het.dat, p1, res))  # note: res is returned as elements 1 and 4
}
outcome.test <-Funk.1(Phenotype.vec)
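# A hedged per-metabolite alternative (not run here): apply Funk.1 to one
# phenotype at a time and stack the MR estimates.
# per_metab <- lapply(Phenotype.vec, Funk.1)
# per_metab_res <- do.call(rbind, lapply(per_metab, `[[`, 1))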
#outcome.testf <- data.frame(outcome.test[1])
#outcome.test[3]
#outcome.test[2]
#head(outcome.test[1])
#head(exp(outcome.test[,6]))
#outcome.test[4]
OT<-outcome.test[1]
OT<-data.frame(OT)
head(OT)
OT$exp_b <- exp(OT$b)
OT$beta <- OT$exp_b
OT$Metabolite <- OT$exposure
OT$lci1 <- OT$b-(1.96*OT$se)
OT$lci <- exp(OT$lci1)
head(OT$lci)
OT$uci1 <- OT$b+(1.96*OT$se)
OT$uci <- exp(OT$uci1)
write.csv(OT, "pca.mr.csv")
#######
sorted_OT = OT[order(OT$'pval'),]
head(sorted_OT)
write.csv(OT, "PRACTICAL_MR1.csv")
#OT$'Std. Error' <- OT$se
#OT$'Pr(>|z|)' <- OT$pval
#OT$'2.5 %' <- OT$lci
#OT$'97.5 %' <- OT$uci
PRACTICAL_MR1 <- sorted_OT
head(PRACTICAL_MR1)
# For correlation plot
myvarscor2 <- c("Metabolite","beta","lci", "uci", "pval", "method")
PRACTICAL_MR1 <- sorted_OT[myvarscor2]
sum(PRACTICAL_MR1$pval<0.0014)
## [1] 1
write.csv(PRACTICAL_MR1, "PRACTICAL_MR1.csv")
library('data.table')
library('Cairo')
head(PRACTICAL_MR1)
##     Metabolite     beta      lci      uci         pval
## 23 Crea 1.410868 1.200116 1.658631 3.046832e-05
## 29 Gp 1.167327 1.044425 1.304692 6.414927e-03
## 32 Pyr 1.292627 1.034110 1.615771 2.415705e-02
## 31 Pyr 1.292758 1.028809 1.624425 2.754147e-02
## 22 Lac 1.379063 1.034514 1.838367 2.842687e-02
## 134 Ala 1.196719 1.018176 1.406572 2.936834e-02
## method
## 23 Maximum likelihood
## 29 Maximum likelihood
## 32 Inverse variance weighted
## 31 Maximum likelihood
## 22 Inverse variance weighted
## 134 Inverse variance weighted
#' Open the ProtecT results in Excel and change the metabolite names to the period-vs-hyphen form. Then check whether any names that should match differ only in where the periods sit.
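# A hedged R alternative to the manual renaming (assuming the only mismatch is
# hyphens/underscores vs periods; any other naming differences still need a
# manual check):
# protect_names_fixed <- gsub("[-_]", ".", rownames(copy_result_sig))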
ProtecT <- read.csv("C:/Users/ca16591/Dropbox/Bristol/protect_results_periods_double.csv")
head(ProtecT)
##   Metabolite     beta Std..Error  z.value        pval      lci      uci
## 1 XXL.VLDL.P 1.091716 0.03069816 2.858515 0.004256291 1.028304 1.159880
## 2 XXL.VLDL.L 1.091028 0.03068892 2.838809 0.004528226 1.027669 1.159122
## 3 XXL.VLDL.PL 1.090463 0.03068849 2.821980 0.004772812 1.027142 1.158527
## 4 XXL.VLDL.C 1.077180 0.03052235 2.435815 0.014858280 1.014872 1.143933
## 5 XXL.VLDL.CE 1.062743 0.03042754 1.999933 0.045507518 1.001385 1.128308
## 6 XXL.VLDL.FC 1.091182 0.03066571 2.845577 0.004433099 1.027861 1.159233
dim(ProtecT)
## [1] 228 7
PRACTICAL_MR1 <- read.csv("C:/Users/ca16591/Dropbox/Bristol/PRACTICAL_MR1.csv")
dim(PRACTICAL_MR1)
## [1] 114 17
ProtecT.t <- ProtecT[which(ProtecT$Metabolite %in% PRACTICAL_MR1$Metabolite),]
dim(ProtecT.t)
## [1] 105 7
PRACTICAL.MRt<- PRACTICAL_MR1[which(PRACTICAL_MR1$Metabolite %in% ProtecT.t$Metabolite),]
dim(PRACTICAL.MRt)
## [1] 105 17
source('corPlot_mods.R')
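# corPlot_mods.R is not shown here; a minimal sketch of the kind of
# observational-vs-MR comparison it presumably draws, using the merged data
# frames above (the plot itself is an assumption, not the sourced code):
both <- merge(ProtecT.t, PRACTICAL.MRt, by = "Metabolite",
              suffixes = c(".protect", ".mr"))
plot(both$beta.protect, both$beta.mr,
     xlab = "ProtecT odds ratio",
     ylab = "MR odds ratio (PRACTICAL)")
abline(0, 1, lty = 2)  # line of equality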