#/////////////////////////////////////////////////////////////////////////////
# Libraries
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
library("gplots")
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library("car")
library(limma)
library(Hmisc)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following object is masked from '.env':
##
## %nin%
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(VIM)
## Loading required package: colorspace
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
##
##
## Attaching package: 'VIM'
##
## The following object is masked from 'package:datasets':
##
## sleep
library(Amelia)
## Loading required package: foreign
## Loading required package: Rcpp
## Loading required package: RcppArmadillo
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.2, built: 2013-04-03)
## ## Copyright (C) 2005-2014 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(ggplot2)
library(RColorBrewer)
library(car)
library(ggplot2)
library(reshape)
library(scales)
#/////////////////////////////////////////////////////////////////////////////
# Read data
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
setwd("~/ANDIS/2014-01_Metabolomics/Analysis/")
# metadata
MetaData <- read.csv("../RawData//ANDIS_Matrix_for_metabolomics.forAnalysis.20140121.csv",row.names=1)
MetaData$ID <- paste("ANDIS",MetaData$INDEX,sep="_")
rownames(MetaData) <- MetaData$ID
# metabolomics
RawDataIn <- read.csv("../RawData/ANDIS_RP_ESIn_140318.txt",row.names=1,comment.char = "#",sep="\t")
# Fic colnames and remove non-metabolites from dataframe
colnames(RawDataIn) <- sapply(strsplit(colnames(RawDataIn), "\\."), `[`, 1)
RawDataInS <- as.matrix(RawDataIn[,c(1:99)])
#colnames(RawDataIn) <- sapply(strsplit(colnames(RawDataIn), "\\."), `[`, 1)
#RawDataInS <- as.matrix(RawDataIn[,c(2:50,59:156)])
setdiff(MetaData$ID,colnames(RawDataInS))
## [1] "ANDIS_5" "ANDIS_6" "ANDIS_7" "ANDIS_9" "ANDIS_11"
## [6] "ANDIS_17" "ANDIS_18" "ANDIS_20" "ANDIS_22" "ANDIS_31"
## [11] "ANDIS_37" "ANDIS_42" "ANDIS_45" "ANDIS_47" "ANDIS_52"
## [16] "ANDIS_55" "ANDIS_56" "ANDIS_58" "ANDIS_59" "ANDIS_60"
## [21] "ANDIS_68" "ANDIS_69" "ANDIS_73" "ANDIS_74" "ANDIS_75"
## [26] "ANDIS_76" "ANDIS_77" "ANDIS_80" "ANDIS_90" "ANDIS_93"
## [31] "ANDIS_95" "ANDIS_96" "ANDIS_98" "ANDIS_101" "ANDIS_103"
## [36] "ANDIS_104" "ANDIS_109" "ANDIS_110" "ANDIS_111" "ANDIS_113"
## [41] "ANDIS_114" "ANDIS_117" "ANDIS_122" "ANDIS_123" "ANDIS_129"
## [46] "ANDIS_133" "ANDIS_136" "ANDIS_137" "ANDIS_138" "ANDIS_139"
## [51] "ANDIS_140"
MetaDataS <- MetaData[colnames(RawDataInS),]
RawDataInS[RawDataInS==0] <- NA
png("missingness.png")
barplot(100*colSums(apply(RawDataInS,2,is.na))/nrow(RawDataInS),las=2,main="NA per samle",ylab="% of metabolites with NA")
dev.off()
## pdf
## 2
#/////////////////////////////////////////////////////////////////////////////
# Imputera missing data
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
# Missing data
missmap(as.data.frame(RawDataInS),col=c("blue","grey"))
RawDataInS_imputed<-apply(RawDataInS,1,impute)
missmap(as.data.frame(t(RawDataInS_imputed)),col=c("blue","grey"))
RawData_complet <- RawDataInS[complete.cases(RawDataInS),order(MetaDataS$patid)]
RawDataInS <- RawDataInS[,order(MetaDataS$patid)]
RDClog3 <- RawDataInS
MetaData3 <- MetaDataS[colnames(RDClog3),]
MetaData3$DiaCol <- as.character(recode(MetaData3$diagresult_id, "'LDA'='pink';'T2DCLC'='black';'T1DABS'='steelblue'"))
#/////////////////////////////////////////////////////////////////////////////
# QC
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
png("intensity_dist.png",w=1200,h=800,pointsize=18)
par(mfrow=c(2,1))
boxplot(RDClog3,las=2,col=MetaData3$DiaCol)
plotDensities(RDClog3,group=MetaData3$diagresult_id)
## Error: 'x' contains missing values
dev.off()
## pdf
## 2
par(mfrow=c(1,1))
SampleCor<-cor(RDClog3,method="spearman", use = "pairwise.complete.obs")
#qqPlot(colMeans(SampleCor))
hmcol<-brewer.pal(12,"RdBu")
## Warning: n too large, allowed maximum for palette RdBu is 11
## Returning the palette you asked for with that many colors
pdf("Correlation_matrix_samples.pdf")
heatmap.2(SampleCor, col=bluered(101), scale="none", ColSideColors=as.character(MetaData3$DiaCol),
RowSideColors=redgreen(ncol(SampleCor)),
key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5)
dev.off()
## pdf
## 2
#
# SampleCorMetabo<-cor(t(RDClog3),method="pearson")
#
# rownames(SampleCorMetabo) <- 1:2010
#
# pdf("Correlation_matrix_metabolites.pdf")
#
# heatmap.2(SampleCorMetabo, col=redgreen(75), scale="none", ColSideColors=as.character(MetaData3$DiaCol),
# key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5)
#
# dev.off()
#
# ##
d <- dist(t(RDClog3),method="euclidean") # euclidean distances between the rows
fit <- cmdscale(d,eig=TRUE, k=3) # k is the number of dim
fit # view results
## $points
## [,1] [,2] [,3]
## ANDIS_1 -45.764 5.5263 -62.4639
## ANDIS_2 -37.789 -3.6748 -63.2096
## ANDIS_3 -41.584 -9.7425 -52.3843
## ANDIS_4 -37.896 -14.6059 -46.6289
## ANDIS_8 -36.653 3.8569 -33.0596
## ANDIS_10 -35.595 -27.0872 -24.6627
## ANDIS_12 -47.516 18.6994 -16.9712
## ANDIS_13 -32.985 -26.7723 -24.7268
## ANDIS_14 -29.846 -3.1798 -32.6195
## ANDIS_15 -35.260 -23.1269 -16.4639
## ANDIS_16 -40.945 2.9511 -15.4338
## ANDIS_19 -48.411 15.6938 -1.7897
## ANDIS_21 -37.668 26.6617 -10.1342
## ANDIS_23 -34.738 -23.8295 -7.7259
## ANDIS_24 -34.509 -21.6500 0.5451
## ANDIS_25 -36.817 -26.5163 -8.6594
## ANDIS_26 -36.248 -17.2154 -7.2717
## ANDIS_27 -38.883 10.0633 -11.3839
## ANDIS_28 -29.119 -27.6402 -8.9357
## ANDIS_29 31.629 -13.7619 -28.9312
## ANDIS_30 33.772 4.0906 -26.4355
## ANDIS_32 -29.674 27.5048 -0.2679
## ANDIS_33 -32.756 17.6122 -8.7398
## ANDIS_34 -41.537 15.9313 8.2326
## ANDIS_35 -39.772 27.4407 7.2881
## ANDIS_36 -20.060 34.1900 -2.7112
## ANDIS_38 -29.762 1.3044 -6.7610
## ANDIS_39 -27.288 -34.5455 5.4264
## ANDIS_40 -25.282 -2.3324 6.6179
## ANDIS_41 -23.055 -34.7235 19.4354
## ANDIS_43 -36.396 19.7258 -1.9264
## ANDIS_44 30.758 14.4963 -15.2753
## ANDIS_46 -21.994 -37.7419 29.7101
## ANDIS_48 -30.590 -21.8136 24.0630
## ANDIS_49 -27.331 -28.8581 20.7602
## ANDIS_50 -35.850 10.0555 15.8428
## ANDIS_51 -29.649 -29.3407 29.8859
## ANDIS_53 -44.984 16.8512 15.2327
## ANDIS_54 -26.741 3.1967 11.7616
## ANDIS_57 -23.539 -20.5833 19.7389
## ANDIS_61 -24.726 2.7680 16.7486
## ANDIS_62 -16.101 -25.4160 17.8480
## ANDIS_63 -29.683 20.9933 32.2268
## ANDIS_64 -15.870 -21.8401 26.7700
## ANDIS_65 -23.925 20.2941 17.4473
## ANDIS_66 -30.786 14.4006 23.4236
## ANDIS_67 -21.452 19.0860 29.9095
## ANDIS_70 -22.021 16.1926 22.6290
## ANDIS_71 -33.702 22.0563 29.2051
## ANDIS_72 -27.660 14.5569 30.5046
## ANDIS_78 -11.728 -9.4941 7.6881
## ANDIS_79 -18.014 25.1393 33.9927
## ANDIS_81 -21.933 7.5746 11.1364
## ANDIS_82 -20.672 -8.5396 23.9248
## ANDIS_83 -9.283 30.6374 9.6777
## ANDIS_84 -4.259 21.8721 0.8192
## ANDIS_85 1.617 -19.4523 6.7297
## ANDIS_86 10.200 -12.2512 3.0065
## ANDIS_87 -3.199 31.6892 5.9020
## ANDIS_88 19.583 -12.6825 6.5198
## ANDIS_89 46.990 -17.4296 10.1076
## ANDIS_91 17.303 23.3418 -2.3238
## ANDIS_92 44.678 -33.8446 9.7912
## ANDIS_94 42.339 -11.7350 -10.7137
## ANDIS_97 21.170 -24.9704 -3.1676
## ANDIS_99 25.956 -21.7453 2.6806
## ANDIS_100 37.846 -8.2020 0.1471
## ANDIS_102 17.800 -26.0764 1.7465
## ANDIS_105 30.368 5.9976 -8.0591
## ANDIS_106 8.542 5.2075 4.8238
## ANDIS_107 19.852 -0.8222 1.7633
## ANDIS_108 14.002 25.2722 8.7423
## ANDIS_112 34.706 -12.5473 3.3853
## ANDIS_115 28.347 37.5974 -8.5489
## ANDIS_116 35.804 17.9753 -2.9616
## ANDIS_118 20.764 -10.5092 7.6915
## ANDIS_119 38.269 -16.9829 3.6248
## ANDIS_120 40.691 15.3636 4.7765
## ANDIS_121 43.224 3.6077 2.4006
## ANDIS_124 44.731 -21.7613 6.8732
## ANDIS_125 49.778 -30.9040 0.6342
## ANDIS_126 40.136 18.9279 -4.7107
## ANDIS_127 40.496 16.1832 -13.4429
## ANDIS_128 35.063 41.7729 -5.2699
## ANDIS_130 50.657 -5.2572 7.4464
## ANDIS_131 39.899 15.3226 -7.9795
## ANDIS_132 43.496 -19.4761 -2.4218
## ANDIS_134 49.031 -19.5626 -2.8370
## ANDIS_135 49.318 -35.2502 -1.6002
## ANDIS_141 49.445 19.5222 -0.2347
## ANDIS_142 49.438 -15.5573 -8.2656
## ANDIS_143 46.942 19.1849 7.9759
## ANDIS_144 45.068 15.3169 3.7564
## ANDIS_145 42.492 4.7176 -11.1533
## ANDIS_146 50.173 7.2063 5.7567
## ANDIS_147 48.841 -7.3803 0.3245
## ANDIS_148 43.286 19.9334 -12.8224
## ANDIS_149 49.614 14.9724 -7.7180
## ANDIS_150 35.386 47.8952 -15.2949
##
## $eig
## [1] 1.172e+05 4.248e+04 3.417e+04 2.363e+04 1.692e+04 1.300e+04
## [7] 1.059e+04 9.422e+03 8.795e+03 7.762e+03 7.515e+03 7.255e+03
## [13] 7.078e+03 6.850e+03 6.201e+03 6.046e+03 5.862e+03 5.726e+03
## [19] 5.384e+03 5.338e+03 5.112e+03 5.012e+03 4.920e+03 4.840e+03
## [25] 4.711e+03 4.679e+03 4.511e+03 4.382e+03 4.319e+03 4.181e+03
## [31] 4.168e+03 4.020e+03 3.996e+03 3.903e+03 3.867e+03 3.783e+03
## [37] 3.770e+03 3.713e+03 3.656e+03 3.609e+03 3.551e+03 3.473e+03
## [43] 3.400e+03 3.309e+03 3.275e+03 3.242e+03 3.192e+03 3.184e+03
## [49] 3.130e+03 3.101e+03 3.028e+03 2.999e+03 2.952e+03 2.926e+03
## [55] 2.875e+03 2.822e+03 2.777e+03 2.761e+03 2.664e+03 2.639e+03
## [61] 2.633e+03 2.603e+03 2.574e+03 2.556e+03 2.550e+03 2.518e+03
## [67] 2.448e+03 2.425e+03 2.403e+03 2.350e+03 2.338e+03 2.273e+03
## [73] 2.246e+03 2.187e+03 2.158e+03 2.139e+03 2.103e+03 2.077e+03
## [79] 2.052e+03 2.014e+03 2.006e+03 1.953e+03 1.926e+03 1.903e+03
## [85] 1.895e+03 1.829e+03 1.788e+03 1.749e+03 1.673e+03 1.661e+03
## [91] 1.626e+03 1.581e+03 1.553e+03 1.538e+03 1.513e+03 1.424e+03
## [97] 1.400e+03 1.305e+03 -4.313e-13
##
## $x
## NULL
##
## $ac
## [1] 0
##
## $GOF
## [1] 0.3397 0.3397
x <- fit$points[,1]
y <- fit$points[,2]
z <- fit$points[,3]
Comp <- data.frame(PC1=x,PC2=y,PC3=z)
PropoVarExplained <- fit$eig/sum(fit$eig)*100
qplot(PC1,PC2,data=Comp,colour=MetaData3$INDEX,shape=MetaData3$diagresult_id,size=2)+
xlab(paste("PC1 (",prettyNum(PropoVarExplained[1],digits=3)," %)",sep=""))+
ylab(paste("PC2 (",prettyNum(PropoVarExplained[2],digits=3)," %)",sep=""))
qplot(PC1,PC3,data=Comp,colour=MetaData3$diagresult_id,size=2)+
xlab(paste("PC2 (",prettyNum(PropoVarExplained[2],digits=3)," %)",sep=""))+
ylab(paste("PC3 (",prettyNum(PropoVarExplained[3],digits=3)," %)",sep=""))
qplot(PC2,PC3,data=Comp,colour=MetaData3$INDEX,shape=MetaData3$diagresult_id,size=2)+
xlab(paste("PC1 (",prettyNum(PropoVarExplained[1],digits=3)," %)",sep=""))+
ylab(paste("PC3 (",prettyNum(PropoVarExplained[3],digits=3)," %)",sep=""))
#/////////////////////////////////////////////////////////////////////////////
# Batch effect removal
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
#
#
# library(sva)
#
# # Use imputed or complete
# batchdata <- t(RawDataIpS_imputed)
#
#
# n.sv = num.sv(batchdata,model.matrix(~MetaData3$diagresult_id),method="leek")
#
# RDClog3.batchrm <- removeBatchEffect(x=batchdata,batch=MetaData3$INDEX)
#
#
#
# png("intensity_dist.png")
# par(mfrow=c(1,2))
# boxplot(RDClog3.batchrm,las=2,col=MetaData3$DiaCol)
#
# plotDensities(RDClog3.batchrm,group=MetaData3$diagresult_id)
# dev.off()
#
#
# par(mfrow=c(1,1))
# SampleCor<-cor(RDClog3.batchrm,method="spearman")
#
# qqPlot(colMeans(SampleCor))
#
# pdf("Correlation_matrix_raw.pdf")
#
# heatmap.2(SampleCor, col=redgreen(75), scale="none", ColSideColors=as.character(MetaData3$DiaCol),
# key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5)
#
# dev.off()
#
#
#
#
# ##
#/////////////////////////////////////////////////////////////////////////////
# Inference
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
MetaData3$DiagNum <- as.numeric(recode(MetaData3$diagresult_id, "'LDA'='1';'T2DCLC'='2';'T1DABS'='0'"))
# select Model
Diagnosis <- MetaData3$DiagNum
Index <- MetaData3$INDEX
designlist <- list(
None=cbind(Int=rep(1,length(Diagnosis))),
Diagnosis=cbind(Int=1,Diagnosis=Diagnosis),
Index=cbind(Int=1,Index=Index),
Both=cbind(Int=1,DiagnosisIndex=Diagnosis*Index),
Add=cbind(Int=1,Diagnosis=Diagnosis,Index=Index),
Full=cbind(Int=1,Diagnosis=Diagnosis,Index=Index,DiagnosisIndex=Diagnosis*Index)
)
out <- selectModel(RawData_complet,designlist)
table(out$pref)
##
## None Diagnosis Index Both Add Full
## 147 258 286 134 387 568
# extra covariate
design <- model.matrix(~MetaData3$DiagNum+MetaData3$INDEX)
fit <- lmFit(RawDataInS[,],design)
## Warning: Partial NA coefficients for 95 probe(s)
fit <- eBayes(fit)
tt<-topTable(fit, coef="MetaData3$DiagNum", number=10000,adjust="BH")
head(tt)
## logFC AveExpr t P.Value adj.P.Val B
## 676.229@0.89185095 1.750 16.30 10.522 1.002e-17 6.211e-14 29.86
## 426.407@11.354644 -1.615 17.24 -10.026 1.173e-16 2.657e-13 27.44
## 440.3855@10.9604025 -1.197 17.84 -9.987 1.286e-16 2.657e-13 27.34
## 496.4129@10.920985 -2.016 21.93 -9.844 2.906e-16 3.830e-13 26.55
## 965.3308@2.760915 1.423 19.57 9.852 3.088e-16 3.830e-13 26.49
## 717.5305@12.563882 1.389 20.35 9.643 7.203e-16 6.961e-13 25.65
table(tt$adj.P.Val < .05)
##
## FALSE TRUE
## 4071 2130
par(mfrow=c(1,1))
qqPlot(fit$t[,2])
volcanoplot(fit,coef=2,highlight=8,names=rownames(fit$lods),cex=1)
plotMA(fit,array=2,main="MA-plot, marked = top 30 lowest p-value",ylab="M / log2 FC",
xlab="A/Average Intensity")
abline(h=0,col="red")
top30 <- head(tt,n=30)
points(top30$AveExpr,top30$logFC,cex=0.8,col="blue")
#write.csv(file="topMetabolites_high_vs_low.csv",tt)
MetaData3$diagresult_id <- relevel(MetaData3$diagresult_id,ref="T1DABS")
par(mfrow=c(3,3))
for (i in 1:9) {
boxplot(RawDataInS[rownames(tt)[i],]~MetaData3$diagresult_id,ylab=rownames(tt)[i],
col=unique(MetaData3$DiaCol),main=prettyNum(tt$adj.P.Val[i],digits=2))
}
write.csv(tt,file="topTable_anova_ESIp.csv")
#
#
# ClusterData <- RawData_complet[,order(MetaData3$diagresult_id)]
#
#
# cl <- kmeans(ClusterData , 6)
#
#
# #At this stage I also make a new variable indicating cluster membership as below.
# # I have a good #idea of what my clusters will be called so
# #I gave them those names in the second line of the code.
# #Then I put it together and put the data in a form that is good for graphing.
#
# cl.cluster<-as.factor(cl$cluster)
#
# #cl.cluster<-as.vector(recode (cluster, "1='FC'; 2='FV'; 3='SO'; 4= 'OS' ",
# # as.numeric.result=FALSE) )
#
# clustT1WIN2<- data.frame (ClusterData , cl.cluster)
# clustT1WIN2$.row=rownames(ClusterData)
# molten2 <- melt(clustT1WIN2, id = c(".row", "cl.cluster") )
#
# #OK set up the graph background.
# #Following the ggplot book I also create a jit parameter cause it is
# #much easier to alter this and type it in than the full code over and over again.
#
# pcp_cl <- ggplot(molten2, aes(variable, value, group = .row, colour = cl.cluster) )
# jit<- position_jitter (width = .08, height = .08)
#
# #Ok first graph the cluster means.
#
# pcp_cl + stat_summary(aes(group = cl.cluster), fun.y = mean, geom = "line")
#
# #Then we produce a colourful but uninformative parallel coordinates
# #plot with a bit of alpha blending and jitter.
#
# pcp_cl + geom_line(position = jit, alpha = 1/5)
#
# #All code up to this point is as per Wickham but
# #I also add the cluster means graph that we
# #first produced as well as changing the angle of the x axis text so it is readable.
#
# pcp_cl + geom_line(position = jit, colour = alpha("black", 1/4))+
# stat_summary(aes(group = cl.cluster), fun.y = mean, geom = "line", size = 1.5 )+
# facet_wrap(~ cl.cluster)+ opts(axis.text.x=theme_text(angle=-45, hjust=0) )+
# theme(axis.text.x=element_text(rep(c("red","blue"),each=18)))
#
#
#
#