##1
require(Biobase)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colMeans,
## colnames, colSums, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
## rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
Sys.setenv(OMPATH="Users/Sara/Dropbox/Courses/BS831/Code")
OMPATH="C:/Users/Sara/Dropbox/Courses/BS831/Code"
CPM <- readRDS( file.path(OMPATH,"HNSC_htseq_normalized_AEvsG1vsG3.RDS") )
##1
table(CPM$grade)
table(CPM$stage)
stage <- factor(c("AE","stage.lo","stage.lo","stage.lo","stage.hi")[CPM$stage],
levels=c("AE","stage.lo","stage.hi"))
CPM$stage <- stage
table(CPM$grade,CPM$stage)
read.gmt <- function( gmt.file ) {
gmt <- scan(gmt.file,"character",sep="\n")
gmt <- lapply(gmt,function(Z) unlist(strsplit(Z,"\t"))[-2])
names(gmt) <- sapply(gmt,function(Z) Z[1])
gmt <- lapply(gmt,function(Z) Z[-1])
}
hall <- read.gmt( file.path(OMPATH,"h.all.v6.1.symbols.gmt") )
print(head(names(hall)))
g1vsg3 <- CPM[,CPM$grade %in% c("g1","g3")]
g1vsg3$grade <- droplevels(g1vsg3$grade)
exprs(g1vsg3) <- log2(exprs(g1vsg3)+1)
DM1 <- g1vsg3[,!is.na(pData(g1vsg3)[,"grade"])]
table(droplevels(pData(DM1)[,"grade"]))
group1<-exprs(DM1)[,DM1$grade=="g1"]
group2<-exprs(DM1)[,DM1$grade=="g3"]
ttest <- data.frame(t(sapply(1:nrow(group1),
function(i){
res <- t.test(x = group1[i, ], y = group2[i,], alternative ="two.sided")
res.list <- c(t.score=res$statistic,t.pvalue = res$p.value)
return(res.list)
})))
rownames(ttest)<- make.names(fData(DM1)$hgnc_symbol, unique=TRUE)
head(ttest)
ttestOrd <- order(ttest[,'t.score.t'],decreasing=FALSE)
ttestRes1<-(ttest[ttestOrd,])
head(ttestRes1[ttestOrd,])
KShall<- data.frame(t(sapply(1:50, function(i){
geneIDX <- match(toupper(hall[[i]][]),toupper(rownames(ttestRes1))) # convert all to capital case before matching
rank<-data.frame(rbind(hall[[i]][],geneIDX))
rank1<- rank[, colSums(is.na(rank)) < 1]
dataset<-dim(fData(DM1))[1]
geneset<-as.numeric(as.vector(unname(unlist(rank1[2,]))))
ks<-ksGenescore(dataset, geneset, do.plot=FALSE)
ks.list <- c(ks.score=ks$statistic,p.value = ks$p.value)
return(ks.list)
})))
rownames(KShall)<-names(hall)
KShall$q.value <- p.adjust(KShall$p.value,method="BH")
rownames(KShall) <- gsub("HALLMARK_","",rownames(KShall))
print(head(KShall))
saveRDS( KShall, file=file.path("final.exercise1.RDS"))
##2
ksGenescore <- function
(
n.x, # length of ranked list
y, # positions of geneset items in ranked list (basically, ranks)
do.pval=TRUE, # compute asymptotic p-value
alternative=c("two.sided","greater","less"),
do.plot=F, # draw the ES plot
bare=FALSE, # return score & p-value only (a 2-tuple)
cls.lev=c(0,1), # class labels to display
absolute=FALSE, # takes max - min score rather than the maximum deviation from null
plot.labels=FALSE, # hits' labels
... # additional plot arguments
)
{
# efficient version of ks.score (should give same results as ks.test, when weight=NULL)
#
alternative <- match.arg(alternative)
#DNAME <- paste( "1:", n.x, " and ", deparse(substitute(y)), sep="" )
#METHOD <- "Two-sample Kolmogorov-Smirnov test"
n.y <- length(y)
if ( n.y < 1 ) stop("Not enough y data")
if ( any(y>n.x) ) stop( "y must be <= n.x: ", max(y) )
if ( any(y<1) ) stop( "y must be positive: ", min(y) )
x.axis <- y.axis <- NULL
# KS score
#
y <- sort(y)
n <- n.x * n.y/(n.x + n.y)
hit <- 1/n.y
mis <- 1/n.x
## to compute score, only the y positions and their immediate preceding
## ..positions are needed
##
Y <- sort(c(y-1,y)); Y <- Y[diff(Y)!=0]; y.match <- match(y,Y)
D <- rep( 0, length(Y) ); D[y.match] <- (1:n.y)
zero <- which(D==0)[-1]; D[zero] <- D[zero-1]
z <- D*hit - Y*mis
score <- if (absolute) max(z)-min(z) else z[which.max(abs(z))]
names(score) <- "D"
if (do.plot) {
x.axis <- Y;
y.axis <- z;
if(Y[1]>0) {
x.axis <- c(0,x.axis);
y.axis <- c(0,y.axis);
}
if ( max(Y)<n.x ) {
x.axis <- c(x.axis,n.x)
y.axis <- c(y.axis,0)
}
plot( x.axis, y.axis, type="l",
xlab=paste("up-regulated for class ", cls.lev[2], " (KS>0) vs ",
"up-regulated for class ", cls.lev[1], " (KS<0)", sep="" ),
ylab="gene hits",...)
abline(h=0)
abline(v=n.x/2,lty=3)
axis(1,at=y,labels=plot.labels,tcl=0.25,las=2)
i.max <- which.max(abs(y.axis))
points( x.axis[i.max], y.axis[i.max], pch=20, col="red")
text(x.axis[i.max]+n.x/20,y.axis[i.max],round(y.axis[i.max],2))
}
if ( !do.pval ) {
return(score)
}
## ELSE compute p-value as in function ks.test but return signed statistic
##
tmp <- suppressWarnings(ks.test(1:n.x,y,alternative=alternative))
tmp$statistic <- score # use the signed statistic
return( if (bare) c(tmp$statistic, tmp$p.value) else tmp )
}
ks.test( exprs(CPM1)[1,], "pnorm", mean=mean(exprs(CPM1)[1,]), sd=sd(exprs(CPM1)[1,]) )
kt<-list()
##named interval14329
KStester<-c()
for(i in 1:14329){
kt[[i]] <- ks.test(exprs(CPM1)[i,], "pnorm", mean=mean(exprs(CPM1)[i,]), sd=sd(exprs(CPM1)[i,]) )
KStester$statistic.D[i]=kt[[i]]$statistic
KStester$pvalue[i]=kt[[i]]$p.value
}
KStester$q.value <- p.adjust(KStester$pvalue,method="BH")
KStable<-data.frame(KStester$statistic.D,KStester$pvalue,KStester$q.value)
row.names(KStester)<-row.names(exprs(CPM1))
colnames(KStester)<-c("statistic.D","p.value","q.value")
print(head(KStable))
saveRDS(KStester, file=file.path("final.exercise2.RDS"))
##3 set up for rows/columns
source( file.path(OMPATH, "misc.R") )
install.packages("pheatmap")
install.packages("heatmap.plus")
require(pheatmap)
require(heatmap.plus)
source( file.path(OMPATH, "variationFilter.R") )
CPM2 <- variationFilter(CPM1,ngenes=2000, do.plot=FALSE)
hc.col <- hclust(dist(t(exprs(CPM2))),method="ward.D")
hc.row <- hclust(dist(t(exprs(CPM2))),method="ward.D")
bwrPalette <- colGradient(c("blue","white","red"),length=13)
annot <- pData(CPM2)[,c("grade","stage")]
annotCol <- list(
grade = c("white","green","darkgreen"),
stage = c("white","green","darkgreen")
)
names(annotCol$grade) <- levels(annot$grade)
names(annotCol$stage) <- levels(annot$stage)
pheatmap(exprs(CPM2),
color=bwrPalette,
annotation_col = annot,
annotation_colors = annotCol,
cluster_rows=hc.row, # the result of the hclust call above
cluster_cols=hc.col, # ditto
show_rownames = FALSE,
show_colnames = FALSE,
scale = "row")
#3 Fischer tests
C3 <- cutree(hc.col,3)
annot1 <- annot
annotCol1 <- annotCol
annot1$cluster <- factor(C3)
annotCol1$cluster <- c("yellow","orange","purple")
names(annotCol1$cluster) <- levels(annot1$cluster)
pheatmap(exprs(CPM2),
color=bwrPalette,
annotation_col = annot1,
annotation_colors = annotCol1,
cluster_rows=hc.row,
cluster_cols=hc.col,
show_rownames = FALSE,
show_colnames = FALSE,
scale = "row")
cluster.grade <- list(cluster1='one of {"AE","g1","g3"}',
cluster2='one of {"AE","g1","g3"}',
cluster3='one of {"AE","g1","g3"}')
cluster.stage <- list(cluster1='one of {"AE","stage.lo","stage.hi"}',
cluster2='one of {"AE","stage.lo","stage.hi"}',
saveRDS(cluster.grade,file=file.path("exercise3.grade.RDS")))
saveRDS(cluster.stage,file=file.path("exercise3.stage.RDS"))
## 4
install.packages("cba")
require(cba)
hc.colp <- hcopt(dist(t(exprs(CPM2))),method="ward.D")
hc.rowp <- hcopt(dist(t(exprs(CPM2))),method="ward.D")
hclust.pairs <- cbind(hc.col$order[-ncol(CPM2)],hc.col$order[-1])
hclust.dist <- apply(hclust.pairs,1,function(X) dist(t(exprs(CPM2)[,X])))
hcopt.pairs <- cbind(hc.colp$order[-ncol(CPM2)],hc.colp$order[-1])
hcopt.dist <- apply(hcopt.pairs,1,function(X) dist(t(exprs(CPM2)[,X])))
DIST <- data.frame(hclust=sort(hclust.dist),
hcopt=sort(hcopt.dist))
saveRDS(dist, file=file.path("exercise4.RDS"))
print(head(DIST))
plot(DIST$hclust,DIST$hcopt,xlab = 'sorted hclust distances',ylab = 'sorted hcopt distances')
abline(a=0,b=1,col="red")
##5
require(limma)
require(Biobase)
install.packages("CBMRtools")
require(CBMRtools)
if ( !library(caret,logical.return=TRUE) ) {
install.packages(c("caret","pROC"),repos="http://cran.r-project.org")
require(caret)
}
require(limma)
featureSelect <- function( DAT, CLS, nfeat, balanced=TRUE )
{
## BEGIN input checks
if ( class(DAT)!="ExpressionSet" ) stop( "'ExpressionSet' object expcted: ", class(DAT) )
if ( length(CLS)!=ncol(DAT) ) stop( "CLS and DAT have incompatible sizes" )
if ( length(unique(CLS))!=2 ) stop( "CLS must be a binary feature" )
if ( nfeat<1 | nfeat>nrow(DAT) ) stop( "nfeat must be in [1,nrow(DAT)]" )
## END checks
design= model.matrix(~as.factor(CLS))
fitTrn <- lmFit(DAT,design)
fitTrn <- eBayes(fitTrn)
TT <- topTable(fitTrn,coef=2,number=Inf)
DAT1 <- {
if ( balanced ) # pick half markers in each direction
DAT[c(match(rownames(TT)[order(TT$t,decreasing=TRUE)[1:ceiling(nfeat/2)]],featureNames(DAT)),
match(rownames(TT)[order(TT$t,decreasing=FALSE)[1:ceiling(nfeat/2)]],featureNames(DAT))),]
else # pick top markers irrespective of direction
DAT[match(rownames(TT)[order(abs(TT$t),decreasing=TRUE)[1:nfeat]],featureNames(DAT)),]
}
list(dat=DAT1,tbl=TT[match(featureNames(DAT1),rownames(TT)),])
}
grouplo <- exprs(CPM)[,CPM$stage=='stage.lo']
dim(group1)
grouphi <- exprs(CPM)[,CPM$stage=='stage.hi']
dim(group2)
DM1 <- g1vsg3[,!is.na(pData(g1vsg3)[,"grade"])]
table(droplevels(pData(DM1)[,"grade"]))
group1<-exprs(DM1)[,DM1$grade=="g1"]
group2<-exprs(DM1)[,DM1$grade=="g3"]
DM1000 <- variationFilter(DM1,ngenes=1000,score="mad",do.plot=FALSE)
DM1000$GRADE <- factor(DM1000$grade) # ensuring phenoype is a factor
DM1000$STAGE <- factor(DM1000$stage)
discovery <- data.frame(t(Biobase::exprs(DM1000)))
discoveryLab <- factor(discovery$grade, levels = c("g1", "g3"))
validation <- data.frame(t(Biobase::exprs(DM1000)))
validationLab <- factor(validation$, levels = c("g1", "g3"))
fitControl <- trainControl(method="cv",
number=10,
classProbs=T,
summaryFunction=twoClassSummary)
print(fitControl)
set.seed(1234)
RF <- train(x=discovery,
y=discoveryLab,
method="rf",
trControl=fitControl,
tuneGrid=expand.grid(mtry=c(20, 50, 100, 200)),
metric='ROC')
## ii and iii
RF$bestTune
RF$finalModel
install.packages("rmarkdown")
require(rmarkdown)
## Error: <text>:278:36: unexpected ','
## 277: validation <- data.frame(t(Biobase::exprs(DM1000)))
## 278: validationLab <- factor(validation$,
## ^