#/////////////////////////////////////////////////////////////////////////////
# Libraries
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
library("gplots")
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library("car")
library(limma)
library(Hmisc)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following object is masked from '.env':
##
## %nin%
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(VIM)
## Loading required package: colorspace
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
##
##
## Attaching package: 'VIM'
##
## The following object is masked from 'package:datasets':
##
## sleep
library(Amelia)
## Loading required package: foreign
## Loading required package: Rcpp
## Loading required package: RcppArmadillo
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.2, built: 2013-04-03)
## ## Copyright (C) 2005-2014 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(ggplot2)
library(RColorBrewer)
library(car)
library(ggplot2)
library(reshape)
library(scales)
#/////////////////////////////////////////////////////////////////////////////
# Read data
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
setwd("~/ANDIS/2014-01_Metabolomics/Analysis/")
MetaData <- read.csv("../RawData//ANDIS_Matrix_for_metabolomics.forAnalysis.20140121.csv",row.names=1)
MetaData$ID <- paste("ANDIS",MetaData$INDEX,sep="_")
rownames(MetaData) <- MetaData$ID
RawDataIp <- read.csv("../RawData/ANDIS_RP_ESIp_140318.txt",row.names=1,comment.char = "#",sep="\t")
#RawDataIn <- read.csv("../RawData/ANDIS_RP_ESIn_140318.txt",row.names=1,comment.char = "#",sep="\t")
# Fic colnames and remove non-metabolites from dataframe
colnames(RawDataIp) <- sapply(strsplit(colnames(RawDataIp), "\\."), `[`, 1)
RawDataIpS <- as.matrix(RawDataIp[,c(2:50,59:156)])
#colnames(RawDataIn) <- sapply(strsplit(colnames(RawDataIn), "\\."), `[`, 1)
#RawDataInS <- as.matrix(RawDataIn[,c(2:50,59:156)])
setdiff(MetaData$ID,colnames(RawDataIpS))
## [1] "ANDIS_15" "ANDIS_77" "ANDIS_140"
MetaDataS <- MetaData[colnames(RawDataIpS),]
RawDataIpS[RawDataIpS==0] <- NA
png("missingness.png")
barplot(100*colSums(apply(RawDataIpS,2,is.na))/nrow(RawDataIpS),las=2,main="NA per samle",ylab="% of metabolites with NA")
dev.off()
## pdf
## 2
#/////////////////////////////////////////////////////////////////////////////
# Imputera missing data
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
# Missing data
missmap(as.data.frame(RawDataIpS),col=c("blue","grey"))
RawDataIpS_imputed<-apply(RawDataIpS,1,impute)
missmap(as.data.frame(t(RawDataIpS_imputed)),col=c("blue","grey"))
RawData_complet <- RawDataIpS[complete.cases(RawDataIpS),order(MetaDataS$patid)]
RawDataIpS <- RawDataIpS[,order(MetaDataS$patid)]
RDClog3 <- RawDataIpS
MetaData3 <- MetaDataS[colnames(RDClog3),]
MetaData3$DiaCol <- as.character(recode(MetaData3$diagresult_id, "'LDA'='pink';'T2DCLC'='black';'T1DABS'='steelblue'"))
#/////////////////////////////////////////////////////////////////////////////
# QC
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
png("intensity_dist.png",w=1200,h=800,pointsize=18)
par(mfrow=c(2,1))
boxplot(RDClog3,las=2,col=MetaData3$DiaCol)
plotDensities(RDClog3,group=MetaData3$diagresult_id)
## Error: 'x' contains missing values
dev.off()
## pdf
## 2
par(mfrow=c(1,1))
SampleCor<-cor(RDClog3,method="spearman", use = "pairwise.complete.obs")
#qqPlot(colMeans(SampleCor))
hmcol<-brewer.pal(12,"RdBu")
## Warning: n too large, allowed maximum for palette RdBu is 11
## Returning the palette you asked for with that many colors
pdf("Correlation_matrix_samples.pdf")
heatmap.2(SampleCor, col=bluered(101), scale="none", ColSideColors=as.character(MetaData3$DiaCol),
RowSideColors=redgreen(ncol(SampleCor)),
key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5)
dev.off()
## pdf
## 2
#
# SampleCorMetabo<-cor(t(RDClog3),method="pearson")
#
# rownames(SampleCorMetabo) <- 1:2010
#
# pdf("Correlation_matrix_metabolites.pdf")
#
# heatmap.2(SampleCorMetabo, col=redgreen(75), scale="none", ColSideColors=as.character(MetaData3$DiaCol),
# key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5)
#
# dev.off()
#
# ##
d <- dist(t(RDClog3),method="euclidean") # euclidean distances between the rows
fit <- cmdscale(d,eig=TRUE, k=3) # k is the number of dim
#fit # view results
x <- fit$points[,1]
y <- fit$points[,2]
z <- fit$points[,3]
Comp <- data.frame(PC1=x,PC2=y,PC3=z)
PropoVarExplained <- fit$eig/sum(fit$eig)*100
qplot(PC1,PC2,data=Comp,colour=MetaData3$INDEX,shape=MetaData3$diagresult_id,size=2)+
xlab(paste("PC1 (",prettyNum(PropoVarExplained[1],digits=3)," %)",sep=""))+
ylab(paste("PC2 (",prettyNum(PropoVarExplained[2],digits=3)," %)",sep=""))
qplot(PC1,PC3,data=Comp,colour=MetaData3$diagresult_id,size=2)+
xlab(paste("PC2 (",prettyNum(PropoVarExplained[2],digits=3)," %)",sep=""))+
ylab(paste("PC3 (",prettyNum(PropoVarExplained[3],digits=3)," %)",sep=""))
qplot(PC2,PC3,data=Comp,colour=MetaData3$INDEX,shape=MetaData3$diagresult_id,size=2)+
xlab(paste("PC1 (",prettyNum(PropoVarExplained[1],digits=3)," %)",sep=""))+
ylab(paste("PC3 (",prettyNum(PropoVarExplained[3],digits=3)," %)",sep=""))
#/////////////////////////////////////////////////////////////////////////////
# Batch effect removal
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
#
#
# library(sva)
#
# # Use imputed or complete
# batchdata <- t(RawDataIpS_imputed)
#
#
# n.sv = num.sv(batchdata,model.matrix(~MetaData3$diagresult_id),method="leek")
#
# RDClog3.batchrm <- removeBatchEffect(x=batchdata,batch=MetaData3$INDEX)
#
#
#
# png("intensity_dist.png")
# par(mfrow=c(1,2))
# boxplot(RDClog3.batchrm,las=2,col=MetaData3$DiaCol)
#
# plotDensities(RDClog3.batchrm,group=MetaData3$diagresult_id)
# dev.off()
#
#
# par(mfrow=c(1,1))
# SampleCor<-cor(RDClog3.batchrm,method="spearman")
#
# qqPlot(colMeans(SampleCor))
#
# pdf("Correlation_matrix_raw.pdf")
#
# heatmap.2(SampleCor, col=redgreen(75), scale="none", ColSideColors=as.character(MetaData3$DiaCol),
# key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5)
#
# dev.off()
#
#
#
#
# ##
#/////////////////////////////////////////////////////////////////////////////
# Inference
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
MetaData3$DiagNum <- as.numeric(recode(MetaData3$diagresult_id, "'LDA'='1';'T2DCLC'='2';'T1DABS'='0'"))
# select Model
Diagnosis <- MetaData3$DiagNum
Index <- MetaData3$INDEX
designlist <- list(
None=cbind(Int=rep(1,length(Diagnosis))),
Diagnosis=cbind(Int=1,Diagnosis=Diagnosis),
Index=cbind(Int=1,Index=Index),
Both=cbind(Int=1,DiagnosisIndex=Diagnosis*Index),
Add=cbind(Int=1,Diagnosis=Diagnosis,Index=Index),
Full=cbind(Int=1,Diagnosis=Diagnosis,Index=Index,DiagnosisIndex=Diagnosis*Index)
)
out <- selectModel(t(RawDataIpS_imputed),designlist)
table(out$pref)
##
## None Diagnosis Index Both Add Full
## 4806 487 1408 679 112 149
# no extra covariate
design <- model.matrix(~MetaData3$DiagNum+MetaData3$INDEX)
fit <- lmFit(RawDataIpS[,],design)
fit <- eBayes(fit)
tt<-topTable(fit, coef="MetaData3$DiagNum", number=10000,adjust="BH")
head(tt)
## logFC AveExpr t P.Value adj.P.Val B
## 943.3522@2.76802 0.8445 20.88 9.950 3.893e-18 2.975e-14 30.57
## 1489.1746@12.541768 0.8839 21.31 9.834 1.042e-17 3.981e-14 29.61
## 823.2984@3.0197556 0.7507 16.27 9.629 2.659e-17 6.772e-14 28.72
## 468.3844@10.327345 -0.7967 20.83 -9.557 4.087e-17 7.808e-14 28.30
## 133.0159@0.6088531 0.3848 18.15 9.250 4.578e-16 6.996e-13 25.95
## 676.2307@0.9869868 0.6118 17.35 8.971 1.296e-15 1.650e-12 24.95
table(tt$adj.P.Val < .05)
##
## FALSE TRUE
## 6219 1422
par(mfrow=c(1,1))
qqPlot(fit$t[,2])
volcanoplot(fit,coef=2,highlight=8,names=rownames(fit$lods),cex=1)
plotMA(fit,array=2,main="MA-plot, marked = top 30 lowest p-value",ylab="M / log2 FC",
xlab="A/Average Intensity")
abline(h=0,col="red")
top30 <- head(tt,n=30)
points(top30$AveExpr,top30$logFC,cex=0.8,col="blue")
#write.csv(file="topMetabolites_high_vs_low.csv",tt)
MetaData3$diagresult_id <- relevel(MetaData3$diagresult_id,ref="T1DABS")
par(mfrow=c(3,3))
for (i in 1:9) {
boxplot(RawDataIpS[rownames(tt)[i],]~MetaData3$diagresult_id,ylab=rownames(tt)[i],
col=unique(MetaData3$DiaCol),main=prettyNum(tt$adj.P.Val[i],digits=2))
}
write.csv(tt,file="topTable_anova_ESIp.csv")
#
#
# ClusterData <- RawData_complet[,order(MetaData3$diagresult_id)]
#
#
# cl <- kmeans(ClusterData , 6)
#
#
# #At this stage I also make a new variable indicating cluster membership as below.
# # I have a good #idea of what my clusters will be called so
# #I gave them those names in the second line of the code.
# #Then I put it together and put the data in a form that is good for graphing.
#
# cl.cluster<-as.factor(cl$cluster)
#
# #cl.cluster<-as.vector(recode (cluster, "1='FC'; 2='FV'; 3='SO'; 4= 'OS' ",
# # as.numeric.result=FALSE) )
#
# clustT1WIN2<- data.frame (ClusterData , cl.cluster)
# clustT1WIN2$.row=rownames(ClusterData)
# molten2 <- melt(clustT1WIN2, id = c(".row", "cl.cluster") )
#
# #OK set up the graph background.
# #Following the ggplot book I also create a jit parameter cause it is
# #much easier to alter this and type it in than the full code over and over again.
#
# pcp_cl <- ggplot(molten2, aes(variable, value, group = .row, colour = cl.cluster) )
# jit<- position_jitter (width = .08, height = .08)
#
# #Ok first graph the cluster means.
#
# pcp_cl + stat_summary(aes(group = cl.cluster), fun.y = mean, geom = "line")
#
# #Then we produce a colourful but uninformative parallel coordinates
# #plot with a bit of alpha blending and jitter.
#
# pcp_cl + geom_line(position = jit, alpha = 1/5)
#
# #All code up to this point is as per Wickham but
# #I also add the cluster means graph that we
# #first produced as well as changing the angle of the x axis text so it is readable.
#
# pcp_cl + geom_line(position = jit, colour = alpha("black", 1/4))+
# stat_summary(aes(group = cl.cluster), fun.y = mean, geom = "line", size = 1.5 )+
# facet_wrap(~ cl.cluster)+ opts(axis.text.x=theme_text(angle=-45, hjust=0) )+
# theme(axis.text.x=element_text(rep(c("red","blue"),each=18)))
#
#
#
#