##1
require(Biobase)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
Sys.setenv(OMPATH="Users/Sara/Dropbox/Courses/BS831/Code")
OMPATH="C:/Users/Sara/Dropbox/Courses/BS831/Code"
CPM <- readRDS( file.path(OMPATH,"HNSC_htseq_normalized_AEvsG1vsG3.RDS") )
##1
table(CPM$grade)
table(CPM$stage)
stage <- factor(c("AE","stage.lo","stage.lo","stage.lo","stage.hi")[CPM$stage],
                levels=c("AE","stage.lo","stage.hi"))
CPM$stage <- stage
table(CPM$grade,CPM$stage)
read.gmt <- function( gmt.file ) {
  gmt <- scan(gmt.file,"character",sep="\n")
  gmt <- lapply(gmt,function(Z) unlist(strsplit(Z,"\t"))[-2])
  names(gmt) <- sapply(gmt,function(Z) Z[1])
  gmt <- lapply(gmt,function(Z) Z[-1])
}
hall <- read.gmt( file.path(OMPATH,"h.all.v6.1.symbols.gmt") )
print(head(names(hall)))

g1vsg3 <- CPM[,CPM$grade %in% c("g1","g3")]
g1vsg3$grade <- droplevels(g1vsg3$grade)
exprs(g1vsg3) <- log2(exprs(g1vsg3)+1)

DM1 <- g1vsg3[,!is.na(pData(g1vsg3)[,"grade"])] 
table(droplevels(pData(DM1)[,"grade"]))

group1<-exprs(DM1)[,DM1$grade=="g1"]
group2<-exprs(DM1)[,DM1$grade=="g3"]

ttest <- data.frame(t(sapply(1:nrow(group1), 
                             function(i){
                               res <- t.test(x = group1[i, ], y = group2[i,], alternative ="two.sided")
                               res.list <- c(t.score=res$statistic,t.pvalue = res$p.value)
                               return(res.list)
                             })))
rownames(ttest)<- make.names(fData(DM1)$hgnc_symbol, unique=TRUE)
head(ttest)

ttestOrd <- order(ttest[,'t.score.t'],decreasing=FALSE)
ttestRes1<-(ttest[ttestOrd,])
head(ttestRes1[ttestOrd,])

KShall<- data.frame(t(sapply(1:50, function(i){
  geneIDX <- match(toupper(hall[[i]][]),toupper(rownames(ttestRes1))) # convert all to capital case before matching
  
  rank<-data.frame(rbind(hall[[i]][],geneIDX))
  rank1<- rank[, colSums(is.na(rank)) < 1] 
  
  dataset<-dim(fData(DM1))[1]
  geneset<-as.numeric(as.vector(unname(unlist(rank1[2,]))))
  ks<-ksGenescore(dataset, geneset, do.plot=FALSE)
  ks.list <- c(ks.score=ks$statistic,p.value = ks$p.value)
  return(ks.list)
})))
rownames(KShall)<-names(hall)
KShall$q.value <- p.adjust(KShall$p.value,method="BH")
rownames(KShall) <- gsub("HALLMARK_","",rownames(KShall))
print(head(KShall))
saveRDS( KShall, file=file.path("final.exercise1.RDS"))

##2

ksGenescore <- function
(
  n.x,               # length of ranked list
  y,                 # positions of geneset items in ranked list (basically, ranks)
  do.pval=TRUE,      # compute asymptotic p-value
  alternative=c("two.sided","greater","less"),
  do.plot=F,         # draw the ES plot
  bare=FALSE,        # return score & p-value only (a 2-tuple)
  cls.lev=c(0,1),    # class labels to display
  absolute=FALSE,    # takes max - min score rather than the maximum deviation from null
  plot.labels=FALSE, # hits' labels
  ...                # additional plot arguments
)
{
  # efficient version of ks.score (should give same results as ks.test, when weight=NULL)
  #
  alternative <- match.arg(alternative)
  #DNAME <- paste( "1:", n.x, " and ", deparse(substitute(y)), sep="" )
  #METHOD <- "Two-sample Kolmogorov-Smirnov test"
  n.y <- length(y)
  if ( n.y < 1 )  stop("Not enough y data")
  if ( any(y>n.x) ) stop( "y must be <= n.x: ", max(y) )
  if ( any(y<1) ) stop( "y must be positive: ", min(y) )
  x.axis <- y.axis <- NULL
  
  # KS score
  #
  y <- sort(y)
  n <- n.x * n.y/(n.x + n.y)
  hit <- 1/n.y
  mis <- 1/n.x
  
  ## to compute score, only the y positions and their immediate preceding
  ## ..positions are needed
  ##
  Y <- sort(c(y-1,y)); Y <- Y[diff(Y)!=0]; y.match <- match(y,Y)
  D <- rep( 0, length(Y) ); D[y.match] <- (1:n.y)
  zero <- which(D==0)[-1]; D[zero] <- D[zero-1]
  
  z <- D*hit - Y*mis
  
  score <- if (absolute) max(z)-min(z) else z[which.max(abs(z))]
  names(score) <- "D"
  
  if (do.plot) {
    x.axis <- Y;
    y.axis <- z;
    if(Y[1]>0) {
      x.axis <- c(0,x.axis);
      y.axis <- c(0,y.axis);
    }
    if ( max(Y)<n.x ) {
      x.axis <- c(x.axis,n.x)
      y.axis <- c(y.axis,0)
    }
    plot( x.axis, y.axis, type="l",
          xlab=paste("up-regulated for class ", cls.lev[2], " (KS>0) vs ",
                     "up-regulated for class ", cls.lev[1], " (KS<0)", sep="" ),
          ylab="gene hits",...)
    abline(h=0)
    abline(v=n.x/2,lty=3)
    axis(1,at=y,labels=plot.labels,tcl=0.25,las=2)
    i.max <- which.max(abs(y.axis))
    points( x.axis[i.max], y.axis[i.max], pch=20, col="red")
    text(x.axis[i.max]+n.x/20,y.axis[i.max],round(y.axis[i.max],2))
  }
  if ( !do.pval ) {
    return(score)
  }
  ## ELSE compute p-value as in function ks.test but return signed statistic
  ##
  tmp <- suppressWarnings(ks.test(1:n.x,y,alternative=alternative))
  tmp$statistic <- score # use the signed statistic
  return( if (bare) c(tmp$statistic, tmp$p.value) else tmp )
}  

ks.test( exprs(CPM1)[1,], "pnorm", mean=mean(exprs(CPM1)[1,]), sd=sd(exprs(CPM1)[1,]) )
kt<-list()
##named interval14329
KStester<-c()
for(i in 1:14329){
  kt[[i]] <- ks.test(exprs(CPM1)[i,], "pnorm", mean=mean(exprs(CPM1)[i,]), sd=sd(exprs(CPM1)[i,]) )
  KStester$statistic.D[i]=kt[[i]]$statistic
  KStester$pvalue[i]=kt[[i]]$p.value
}
KStester$q.value <- p.adjust(KStester$pvalue,method="BH")
KStable<-data.frame(KStester$statistic.D,KStester$pvalue,KStester$q.value)
row.names(KStester)<-row.names(exprs(CPM1))
colnames(KStester)<-c("statistic.D","p.value","q.value")
print(head(KStable))
saveRDS(KStester, file=file.path("final.exercise2.RDS"))


##3 set up for rows/columns
source( file.path(OMPATH, "misc.R") )
install.packages("pheatmap")
install.packages("heatmap.plus")
require(pheatmap)
require(heatmap.plus)
source( file.path(OMPATH, "variationFilter.R") )
CPM2 <- variationFilter(CPM1,ngenes=2000, do.plot=FALSE)
hc.col <- hclust(dist(t(exprs(CPM2))),method="ward.D")
hc.row <- hclust(dist(t(exprs(CPM2))),method="ward.D")
bwrPalette <- colGradient(c("blue","white","red"),length=13)

annot <- pData(CPM2)[,c("grade","stage")]
annotCol <- list(
  grade = c("white","green","darkgreen"),
  stage = c("white","green","darkgreen")
)
names(annotCol$grade) <- levels(annot$grade)
names(annotCol$stage) <- levels(annot$stage)

pheatmap(exprs(CPM2),
         color=bwrPalette,
         annotation_col = annot,
         annotation_colors = annotCol,
         cluster_rows=hc.row, # the result of the hclust call above
         cluster_cols=hc.col, # ditto
         show_rownames = FALSE,
         show_colnames = FALSE,
         scale = "row")

#3 Fischer tests
C3 <- cutree(hc.col,3)
annot1 <- annot
annotCol1 <- annotCol
annot1$cluster <- factor(C3)
annotCol1$cluster <- c("yellow","orange","purple")
names(annotCol1$cluster) <- levels(annot1$cluster)

pheatmap(exprs(CPM2),
         color=bwrPalette,
         annotation_col = annot1,
         annotation_colors = annotCol1,
         cluster_rows=hc.row,
         cluster_cols=hc.col,
         show_rownames = FALSE,
         show_colnames = FALSE,
         scale = "row")
cluster.grade <- list(cluster1='one of {"AE","g1","g3"}',
                      cluster2='one of {"AE","g1","g3"}',
                      cluster3='one of {"AE","g1","g3"}')
cluster.stage <- list(cluster1='one of {"AE","stage.lo","stage.hi"}',
                      cluster2='one of {"AE","stage.lo","stage.hi"}',
                      saveRDS(cluster.grade,file=file.path("exercise3.grade.RDS")))
                      saveRDS(cluster.stage,file=file.path("exercise3.stage.RDS"))
                      
## 4
install.packages("cba")
require(cba)
hc.colp <- hcopt(dist(t(exprs(CPM2))),method="ward.D") 
hc.rowp <- hcopt(dist(t(exprs(CPM2))),method="ward.D")
hclust.pairs <- cbind(hc.col$order[-ncol(CPM2)],hc.col$order[-1])
hclust.dist <- apply(hclust.pairs,1,function(X) dist(t(exprs(CPM2)[,X])))
hcopt.pairs <- cbind(hc.colp$order[-ncol(CPM2)],hc.colp$order[-1])
hcopt.dist <- apply(hcopt.pairs,1,function(X) dist(t(exprs(CPM2)[,X])))
                      
DIST <- data.frame(hclust=sort(hclust.dist),
hcopt=sort(hcopt.dist))
saveRDS(dist, file=file.path("exercise4.RDS"))
print(head(DIST))
plot(DIST$hclust,DIST$hcopt,xlab = 'sorted hclust distances',ylab = 'sorted hcopt distances')
abline(a=0,b=1,col="red")
                      

##5
require(limma)
require(Biobase)
install.packages("CBMRtools")
require(CBMRtools)
if ( !library(caret,logical.return=TRUE) ) {
  install.packages(c("caret","pROC"),repos="http://cran.r-project.org")
  require(caret)
}
require(limma)
featureSelect <- function( DAT, CLS, nfeat, balanced=TRUE )
{
  ## BEGIN input checks
  if ( class(DAT)!="ExpressionSet" ) stop( "'ExpressionSet' object expcted: ", class(DAT) )
  if ( length(CLS)!=ncol(DAT) ) stop( "CLS and DAT have incompatible sizes" )
  if ( length(unique(CLS))!=2 ) stop( "CLS must be a binary feature" )
  if ( nfeat<1 | nfeat>nrow(DAT) ) stop( "nfeat must be in [1,nrow(DAT)]" )
  ## END checks
  
  design= model.matrix(~as.factor(CLS))
  fitTrn <- lmFit(DAT,design)
  fitTrn <- eBayes(fitTrn)
  TT <- topTable(fitTrn,coef=2,number=Inf)
  
  DAT1 <- {
    if ( balanced ) # pick half markers in each direction
      DAT[c(match(rownames(TT)[order(TT$t,decreasing=TRUE)[1:ceiling(nfeat/2)]],featureNames(DAT)),
            match(rownames(TT)[order(TT$t,decreasing=FALSE)[1:ceiling(nfeat/2)]],featureNames(DAT))),]
    else            # pick top markers irrespective of direction
      DAT[match(rownames(TT)[order(abs(TT$t),decreasing=TRUE)[1:nfeat]],featureNames(DAT)),]
  }
  list(dat=DAT1,tbl=TT[match(featureNames(DAT1),rownames(TT)),])
}

grouplo <- exprs(CPM)[,CPM$stage=='stage.lo']
dim(group1)
grouphi <- exprs(CPM)[,CPM$stage=='stage.hi']
dim(group2)

DM1 <- g1vsg3[,!is.na(pData(g1vsg3)[,"grade"])] 
table(droplevels(pData(DM1)[,"grade"]))
group1<-exprs(DM1)[,DM1$grade=="g1"]
group2<-exprs(DM1)[,DM1$grade=="g3"]

DM1000 <- variationFilter(DM1,ngenes=1000,score="mad",do.plot=FALSE)
DM1000$GRADE <- factor(DM1000$grade)              # ensuring phenoype is a factor
DM1000$STAGE <- factor(DM1000$stage) 

discovery <- data.frame(t(Biobase::exprs(DM1000)))
discoveryLab <- factor(discovery$grade, levels = c("g1", "g3"))

validation <- data.frame(t(Biobase::exprs(DM1000)))
validationLab <- factor(validation$, levels = c("g1", "g3"))

fitControl <- trainControl(method="cv",
                           number=10,
                           classProbs=T,
                           summaryFunction=twoClassSummary)

print(fitControl)
set.seed(1234) 


RF <- train(x=discovery,
            y=discoveryLab,
            method="rf",
            trControl=fitControl,
            tuneGrid=expand.grid(mtry=c(20, 50, 100, 200)),
            metric='ROC')
## ii and iii
RF$bestTune
RF$finalModel

install.packages("rmarkdown")
require(rmarkdown)
## Error: <text>:278:36: unexpected ','
## 277: validation <- data.frame(t(Biobase::exprs(DM1000)))
## 278: validationLab <- factor(validation$,
##                                         ^