#/////////////////////////////////////////////////////////////////////////////
#   Libraries
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\


library("gplots")

## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess

library("car")
library(limma)
library(Hmisc)

## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from '.env':
## 
##     %nin%
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

library(VIM)

## Loading required package: colorspace
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## 
## 
## Attaching package: 'VIM'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep

library(Amelia)

## Loading required package: foreign
## Loading required package: Rcpp
## Loading required package: RcppArmadillo
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.2, built: 2013-04-03)
## ## Copyright (C) 2005-2014 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

library(ggplot2)
library(RColorBrewer)
library(car)
library(ggplot2)
library(reshape)
library(scales)


#/////////////////////////////////////////////////////////////////////////////
#   Read data
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\


setwd("~/ANDIS/2014-01_Metabolomics/Analysis/")

MetaData <- read.csv("../RawData//ANDIS_Matrix_for_metabolomics.forAnalysis.20140121.csv",row.names=1)
MetaData$ID <- paste("ANDIS",MetaData$INDEX,sep="_")
rownames(MetaData) <- MetaData$ID
RawDataIp <- read.csv("../RawData/ANDIS_RP_ESIp_140318.txt",row.names=1,comment.char = "#",sep="\t")
#RawDataIn <- read.csv("../RawData/ANDIS_RP_ESIn_140318.txt",row.names=1,comment.char = "#",sep="\t")

# Fic colnames and remove non-metabolites from dataframe
colnames(RawDataIp) <- sapply(strsplit(colnames(RawDataIp), "\\."), `[`, 1)
RawDataIpS <- as.matrix(RawDataIp[,c(2:50,59:156)])
#colnames(RawDataIn) <- sapply(strsplit(colnames(RawDataIn), "\\."), `[`, 1)
#RawDataInS <- as.matrix(RawDataIn[,c(2:50,59:156)])

setdiff(MetaData$ID,colnames(RawDataIpS))

## [1] "ANDIS_15"  "ANDIS_77"  "ANDIS_140"


MetaDataS <- MetaData[colnames(RawDataIpS),]

RawDataIpS[RawDataIpS==0] <- NA

png("missingness.png")
barplot(100*colSums(apply(RawDataIpS,2,is.na))/nrow(RawDataIpS),las=2,main="NA per samle",ylab="% of metabolites with NA")
dev.off()

## pdf 
##   2


#/////////////////////////////////////////////////////////////////////////////
#  Imputera missing data
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\


# Missing data



missmap(as.data.frame(RawDataIpS),col=c("blue","grey"))

plot of chunk unnamed-chunk-1


RawDataIpS_imputed<-apply(RawDataIpS,1,impute)

missmap(as.data.frame(t(RawDataIpS_imputed)),col=c("blue","grey"))

plot of chunk unnamed-chunk-1


RawData_complet <- RawDataIpS[complete.cases(RawDataIpS),order(MetaDataS$patid)]

RawDataIpS <- RawDataIpS[,order(MetaDataS$patid)]







RDClog3 <- RawDataIpS
MetaData3 <- MetaDataS[colnames(RDClog3),]


MetaData3$DiaCol <- as.character(recode(MetaData3$diagresult_id, "'LDA'='pink';'T2DCLC'='black';'T1DABS'='steelblue'"))

#/////////////////////////////////////////////////////////////////////////////
#  QC
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

png("intensity_dist.png",w=1200,h=800,pointsize=18)
par(mfrow=c(2,1))
boxplot(RDClog3,las=2,col=MetaData3$DiaCol)

plotDensities(RDClog3,group=MetaData3$diagresult_id)

## Error: 'x' contains missing values

dev.off()

## pdf 
##   2



par(mfrow=c(1,1))
SampleCor<-cor(RDClog3,method="spearman", use = "pairwise.complete.obs")

#qqPlot(colMeans(SampleCor))
hmcol<-brewer.pal(12,"RdBu")

## Warning: n too large, allowed maximum for palette RdBu is 11
## Returning the palette you asked for with that many colors


pdf("Correlation_matrix_samples.pdf")

heatmap.2(SampleCor, col=bluered(101), scale="none", ColSideColors=as.character(MetaData3$DiaCol),
          RowSideColors=redgreen(ncol(SampleCor)),
          key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5)

dev.off()

## pdf 
##   2

# 
# SampleCorMetabo<-cor(t(RDClog3),method="pearson")
# 
# rownames(SampleCorMetabo) <- 1:2010
# 
# pdf("Correlation_matrix_metabolites.pdf")
# 
# heatmap.2(SampleCorMetabo, col=redgreen(75), scale="none", ColSideColors=as.character(MetaData3$DiaCol),
#           key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5)
# 
# dev.off()
# 
# ##

d <- dist(t(RDClog3),method="euclidean") # euclidean distances between the rows
fit <- cmdscale(d,eig=TRUE, k=3) # k is the number of dim
#fit # view results

x <- fit$points[,1]
y <- fit$points[,2]
z <- fit$points[,3]

Comp <- data.frame(PC1=x,PC2=y,PC3=z)

PropoVarExplained <- fit$eig/sum(fit$eig)*100

qplot(PC1,PC2,data=Comp,colour=MetaData3$INDEX,shape=MetaData3$diagresult_id,size=2)+
  xlab(paste("PC1 (",prettyNum(PropoVarExplained[1],digits=3)," %)",sep=""))+
  ylab(paste("PC2 (",prettyNum(PropoVarExplained[2],digits=3)," %)",sep=""))

plot of chunk unnamed-chunk-1


qplot(PC1,PC3,data=Comp,colour=MetaData3$diagresult_id,size=2)+
  xlab(paste("PC2 (",prettyNum(PropoVarExplained[2],digits=3)," %)",sep=""))+
  ylab(paste("PC3 (",prettyNum(PropoVarExplained[3],digits=3)," %)",sep=""))

plot of chunk unnamed-chunk-1


qplot(PC2,PC3,data=Comp,colour=MetaData3$INDEX,shape=MetaData3$diagresult_id,size=2)+
  xlab(paste("PC1 (",prettyNum(PropoVarExplained[1],digits=3)," %)",sep=""))+
  ylab(paste("PC3 (",prettyNum(PropoVarExplained[3],digits=3)," %)",sep=""))

plot of chunk unnamed-chunk-1





#/////////////////////////////////////////////////////////////////////////////
# Batch effect removal
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
# 
# 
# library(sva)
# 
# # Use imputed or complete
# batchdata <- t(RawDataIpS_imputed)
# 
# 
# n.sv = num.sv(batchdata,model.matrix(~MetaData3$diagresult_id),method="leek")
# 
# RDClog3.batchrm <- removeBatchEffect(x=batchdata,batch=MetaData3$INDEX)
# 
# 
# 
# png("intensity_dist.png")
# par(mfrow=c(1,2))
# boxplot(RDClog3.batchrm,las=2,col=MetaData3$DiaCol)
# 
# plotDensities(RDClog3.batchrm,group=MetaData3$diagresult_id)
# dev.off()
# 
# 
# par(mfrow=c(1,1))
# SampleCor<-cor(RDClog3.batchrm,method="spearman")
# 
# qqPlot(colMeans(SampleCor))
# 
# pdf("Correlation_matrix_raw.pdf")
# 
# heatmap.2(SampleCor, col=redgreen(75), scale="none", ColSideColors=as.character(MetaData3$DiaCol),
#           key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5)
# 
# dev.off()
# 
# 
# 
# 
# ##





#/////////////////////////////////////////////////////////////////////////////
# Inference
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\



MetaData3$DiagNum <- as.numeric(recode(MetaData3$diagresult_id, "'LDA'='1';'T2DCLC'='2';'T1DABS'='0'"))



# select Model

Diagnosis <- MetaData3$DiagNum
Index <- MetaData3$INDEX

designlist <- list(
  None=cbind(Int=rep(1,length(Diagnosis))),
  Diagnosis=cbind(Int=1,Diagnosis=Diagnosis),
  Index=cbind(Int=1,Index=Index),
  Both=cbind(Int=1,DiagnosisIndex=Diagnosis*Index),
  Add=cbind(Int=1,Diagnosis=Diagnosis,Index=Index),
  Full=cbind(Int=1,Diagnosis=Diagnosis,Index=Index,DiagnosisIndex=Diagnosis*Index)
)

out <- selectModel(t(RawDataIpS_imputed),designlist)

table(out$pref)

## 
##      None Diagnosis     Index      Both       Add      Full 
##      4806       487      1408       679       112       149




# no extra covariate

design <- model.matrix(~MetaData3$DiagNum+MetaData3$INDEX)
fit <- lmFit(RawDataIpS[,],design)
fit <- eBayes(fit)

tt<-topTable(fit, coef="MetaData3$DiagNum", number=10000,adjust="BH")
head(tt)

##                       logFC AveExpr      t   P.Value adj.P.Val     B
## 943.3522@2.76802     0.8445   20.88  9.950 3.893e-18 2.975e-14 30.57
## 1489.1746@12.541768  0.8839   21.31  9.834 1.042e-17 3.981e-14 29.61
## 823.2984@3.0197556   0.7507   16.27  9.629 2.659e-17 6.772e-14 28.72
## 468.3844@10.327345  -0.7967   20.83 -9.557 4.087e-17 7.808e-14 28.30
## 133.0159@0.6088531   0.3848   18.15  9.250 4.578e-16 6.996e-13 25.95
## 676.2307@0.9869868   0.6118   17.35  8.971 1.296e-15 1.650e-12 24.95

table(tt$adj.P.Val < .05)

## 
## FALSE  TRUE 
##  6219  1422


par(mfrow=c(1,1))
qqPlot(fit$t[,2])

plot of chunk unnamed-chunk-1



volcanoplot(fit,coef=2,highlight=8,names=rownames(fit$lods),cex=1)

plot of chunk unnamed-chunk-1



plotMA(fit,array=2,main="MA-plot, marked = top 30 lowest p-value",ylab="M / log2 FC",
       xlab="A/Average Intensity")
abline(h=0,col="red")
top30 <- head(tt,n=30)
points(top30$AveExpr,top30$logFC,cex=0.8,col="blue")

plot of chunk unnamed-chunk-1



#write.csv(file="topMetabolites_high_vs_low.csv",tt)

MetaData3$diagresult_id <- relevel(MetaData3$diagresult_id,ref="T1DABS")

par(mfrow=c(3,3))
for (i in 1:9) {
  boxplot(RawDataIpS[rownames(tt)[i],]~MetaData3$diagresult_id,ylab=rownames(tt)[i],
          col=unique(MetaData3$DiaCol),main=prettyNum(tt$adj.P.Val[i],digits=2))
}

plot of chunk unnamed-chunk-1



write.csv(tt,file="topTable_anova_ESIp.csv")

# 
# 
# ClusterData <- RawData_complet[,order(MetaData3$diagresult_id)]
# 
# 
# cl <- kmeans(ClusterData , 6)
# 
# 
# #At this stage I also make a new variable indicating cluster membership as below.
# # I have a good #idea of what my clusters will be called so 
# #I gave them those names in the second line of the code. 
# #Then I put it together and put the data in a form that is good for graphing.
# 
# cl.cluster<-as.factor(cl$cluster)
# 
# #cl.cluster<-as.vector(recode (cluster, "1='FC'; 2='FV'; 3='SO'; 4= 'OS' ", 
# #                              as.numeric.result=FALSE) )
# 
# clustT1WIN2<- data.frame (ClusterData , cl.cluster) 
# clustT1WIN2$.row=rownames(ClusterData)
# molten2 <- melt(clustT1WIN2, id = c(".row", "cl.cluster") )
# 
# #OK set up the graph background. 
# #Following the ggplot book I also create a jit parameter cause it is
# #much easier to alter this and type it in than the full code over and over again.
# 
# pcp_cl <- ggplot(molten2,   aes(variable, value, group = .row, colour = cl.cluster) )          
# jit<- position_jitter (width = .08, height = .08)
# 
# #Ok first graph the cluster means.
# 
# pcp_cl + stat_summary(aes(group = cl.cluster), fun.y = mean,   geom = "line")
# 
# #Then we produce a colourful but uninformative parallel coordinates 
# #plot with a bit of alpha blending and jitter.
# 
# pcp_cl + geom_line(position = jit, alpha = 1/5)
# 
# #All code up to this point is as per Wickham but 
# #I also add the cluster means graph that we
# #first produced as well as changing the angle of the x axis text so it is readable.
# 
# pcp_cl + geom_line(position = jit, colour =  alpha("black", 1/4))+
#   stat_summary(aes(group = cl.cluster), fun.y = mean,   geom = "line", size = 1.5 )+
#   facet_wrap(~ cl.cluster)+ opts(axis.text.x=theme_text(angle=-45, hjust=0) )+
#   theme(axis.text.x=element_text(rep(c("red","blue"),each=18)))
# 
# 
# 
#