Yiming Zhang
Load the packages
library(MASS)
library(reshape)
## Loading required package: plyr
##
## Attaching package: 'reshape'
##
## The following objects are masked from 'package:plyr':
##
## rename, round_any
library(car)
library(limma)
library(e1071)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following object is masked from 'package:reshape':
##
## expand
##
## Loaded glmnet 1.9-5
library(ROCR)
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library(CMA)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:Matrix':
##
## as.vector
##
## The following object is masked from 'package:limma':
##
## plotMA
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
##
## Attaching package: 'CMA'
##
## The following object is masked from 'package:ROCR':
##
## prediction
##
## The following object is masked from 'package:e1071':
##
## tune
##
## The following object is masked from 'package:plyr':
##
## join
library(GEOquery)
## Setting options('download.file.method.GEOquery'='auto')
library(lattice)
library(class)
##
## Attaching package: 'class'
##
## The following object is masked from 'package:reshape':
##
## condense
load("class_LNstatus.Rdata")
Look at the data
table(pData(train.es)$LnStatus)
##
## neg pos
## 48 48
table(pData(test.es)$LnStatus)
##
## neg pos
## 9 11
tapply(pData(train.es)$LnRatio, pData(train.es)$LnStatus, summary)
## $neg
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
##
## $pos
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.040 0.070 0.110 0.194 0.228 0.960
tapply(pData(test.es)$LnRatio, pData(test.es)$LnStatus, summary)
## $neg
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
##
## $pos
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.050 0.180 0.500 0.457 0.610 0.940
Look at expression of 3 randomly picked genes in both training and test sets
set.seed(123)
(getMe <- sample(1:nrow(train.es), size = 3))
## [1] 6970 19105 9912
For training data
trDat <- trainDat[getMe, ]
str(trDat)
## num [1:3, 1:96] 11.33 8.96 8.16 10.61 9.26 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:3] "208697_s_at" "228754_at" "213117_at"
## ..$ : chr [1:96] "GSM570518" "GSM570519" "GSM570520" "GSM570521" ...
trDat <- data.frame(LnStatus = pData(train.es)$LnStatus, Set = rep("train",
nrow(pData(train.es))), t(trDat))
str(trDat)
## 'data.frame': 96 obs. of 5 variables:
## $ LnStatus : Factor w/ 2 levels "neg","pos": 1 1 2 1 1 2 1 1 2 1 ...
## $ Set : Factor w/ 1 level "train": 1 1 1 1 1 1 1 1 1 1 ...
## $ X208697_s_at: num 11.3 10.6 11.8 11.9 11.3 ...
## $ X228754_at : num 8.96 9.26 8.64 9.18 8.27 ...
## $ X213117_at : num 8.16 7.95 6.91 7.11 7.63 ...
plotDat.train <- melt(trDat, id = c("LnStatus", "Set"), variable_name = "gene")
colnames(plotDat.train)[colnames(plotDat.train) == "value"] = "gExp"
For test data
tDat <- testDat[getMe, ]
str(tDat)
## num [1:3, 1:20] 11.93 8.24 7.62 11.21 8.09 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:3] "208697_s_at" "228754_at" "213117_at"
## ..$ : chr [1:20] "GSM570498" "GSM570499" "GSM570500" "GSM570501" ...
tDat <- data.frame(LnStatus = pData(test.es)$LnStatus, Set = rep("test", nrow(pData(test.es))),
t(tDat))
str(tDat)
## 'data.frame': 20 obs. of 5 variables:
## $ LnStatus : Factor w/ 2 levels "neg","pos": 1 1 1 1 1 1 1 1 1 2 ...
## $ Set : Factor w/ 1 level "test": 1 1 1 1 1 1 1 1 1 1 ...
## $ X208697_s_at: num 11.9 11.2 11.3 12.1 11.2 ...
## $ X228754_at : num 8.24 8.09 8.08 9.5 7.92 ...
## $ X213117_at : num 7.62 8.12 7.67 6.98 7.75 ...
plotDat.test <- melt(tDat, id = c("LnStatus", "Set"), variable_name = "gene")
colnames(plotDat.test)[colnames(plotDat.test) == "value"] = "gExp"
plotDat <- rbind(plotDat.train, plotDat.test)
Plot 3 randomly picked genes in both training and test sets
stripplot(gExp ~ LnStatus | gene + Set, plotDat, grid = TRUE, group = LnStatus,
auto.key = TRUE, jitter.data = TRUE)
Use 6 folds
nfold <- 6
tabTrain <- table(train.es$LnStatus)
indlist <- sapply(names(tabTrain), function(z) which(train.es$LnStatus == z),
simplify = FALSE)
set.seed(1234)
# Each row contains 8 pos and 8 negative samples.
fold.pos <- matrix(sample(indlist[["pos"]]), nrow = nfold)
fold.neg <- matrix(sample(indlist[["neg"]]), nrow = nfold)
splits <- GenerateLearningsets(y = train.es$LnStatus, method = "CV", fold = 6,
strat = TRUE)
# Define here the constants that you will not evaluate. For example, I will
# use the top-50 limma genes
ngenes <- 50
nmethod <- 7 #number of methods you plan to compare.
# Define here an output objects to store results
pr.err <- matrix(-1, nfold, nmethod, dimnames = list(paste0("Fold", 1:nfold),
c("1NN", "5NN", "10NN", "15NN", "LDA", "Logit", "SVM")))
for (i in 1:nfold) {
# Test Fold for the i-th step
testdat.fold <- trainDat[, c(fold.pos[i, ], fold.neg[i, ])]
# I will create a factor of classes for the test set of the i_th fold
testclass.fold <- train.es$LnStatus[c(fold.pos[i, ], fold.neg[i, ])]
# The rest of the samples are the training set for the i-th step
traindat.fold <- trainDat[, -c(fold.pos[i, ], fold.neg[i, ])]
trainclass.fold <- train.es$LnStatus[-c(fold.pos[i, ], fold.neg[i, ])]
# Step 1: feature selection (do you remember limma?).
# Note that a different set of genes will be selected for each fold! you can
# then compare how consistent these sets were.
limma.dat <- as.data.frame(traindat.fold)
desMat <- model.matrix(~trainclass.fold, limma.dat) #design matrix
trainFit <- lmFit(limma.dat, desMat)
eBtrainFit <- eBayes(trainFit)
# top-50 limma genes
top.fold <- topTable(eBtrainFit, coef = which(colnames(coef(trainFit)) !=
"(Intercept)"), n = ngenes, sort.by = "P")
# Retain the top-50 limma genes from the train and test sets
traindat.fold <- traindat.fold[rownames(top.fold), ]
testdat.fold <- testdat.fold[rownames(top.fold), ]
# STEP 2: select a classifier Set a counter for the method tested
l <- 0
# kNN classifiers
for (kk in c(1, 5, 10, 15)) {
# every time you get inside this loop, the l counter gets redefined (i.e.,
# 1, 2, etc for method 1, method 2, etc)
l <- l + 1
# knn needs samples in rows
yhat.knn <- knn(train = t(traindat.fold), test = t(testdat.fold), cl = trainclass.fold,
k = kk)
# Store the prediction error for each kk within this fold
pr.err[i, l] <- mean(testclass.fold != yhat.knn)
} #end of kNN loop
# LDA method. Note that you can change the prior parameter to reflect a
# different proportion of case and control samples. The default is to use
# the class proportions from the training set.
m.lda <- lda(x = t(traindat.fold), group = trainclass.fold, prior = c(0.5,
0.5))
yhat.lda <- predict(m.lda, newdata = t(testdat.fold))$class
pr.err[i, "LDA"] <- mean(testclass.fold != yhat.lda)
# Logit
glm.dat <- data.frame(t(traindat.fold), group = trainclass.fold)
m.log <- glm(group ~ ., data = glm.dat, family = binomial)
pr.log <- predict(m.log, newdata = data.frame(t(testdat.fold)), type = "response")
pr.cl <- rep(0, length(testclass.fold))
pr.cl[pr.log > 1/2] <- "pos"
pr.cl[pr.log <= 1/2] <- "neg"
pr.cl <- factor(pr.cl)
pr.err[i, "Logit"] <- mean(pr.cl != testclass.fold)
# SVM
m.svm <- svm(x = t(traindat.fold), y = trainclass.fold, cost = 1, type = "C-classification",
kernel = "linear")
pr.svm <- predict(m.svm, newdata = t(testdat.fold))
pr.err[i, "SVM"] <- mean(pr.svm != testclass.fold)
} #end of CV loop
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cv.err <- colMeans(pr.err)
# mean - 1 sd (sd of the 6 error rates)
ls <- cv.err - apply(pr.err, 2, sd)
# mean + 1 sd (sd of the 6 error rates)
us <- cv.err + apply(pr.err, 2, sd)
# plot the results
plot(1:nmethod, cv.err, ylim = c(0, 1), xlim = c(1, (nmethod + 0.5)), type = "n",
axes = FALSE, xlab = "Classifier", ylab = "Error rate", main = "6-fold CV Error")
for (j in 1:ncol(pr.err)) points(jitter(rep(j, 6), factor = 2), jitter(pr.err[,
j]), cex = 0.8, pch = "X", col = "gray")
for (i in 1:nmethod) lines(c(i, i), c(ls[i], us[i]), lwd = 2, col = "gray")
points(1:nmethod, ls, pch = 19, col = "red")
points(1:nmethod, us, pch = 19, col = "green")
points(1:nmethod, cv.err, pch = 19, cex = 1.5, col = "black")
axis(2, ylab = "Error rate")
axis(1, 1:nmethod, colnames(pr.err))
box()
yhat.knn <- knn(train = t(trainDat), test = t(testDat), cl = train.es$LnStatus,
k = 10)
# Store the prediction error for each kk within this fold
pr.errTest <- mean(test.es$LnStatus != yhat.knn)
pr.errTest
## [1] 0.45
featureScores <- GeneSelection(X = t(trainDat), y = train.es$LnStatus, learningsets = splits,
method = "limma")
## GeneSelection: iteration 1
## GeneSelection: iteration 2
## GeneSelection: iteration 3
## GeneSelection: iteration 4
## GeneSelection: iteration 5
## GeneSelection: iteration 6
# Compare list of selected genes using:
toplist(featureScores)
## top 10 genes for iteration 1
##
## index importance
## 1 9265 25.38
## 2 6936 24.85
## 3 21592 24.56
## 4 1702 23.99
## 5 21919 23.99
## 6 19932 21.29
## 7 6938 20.60
## 8 22524 20.18
## 9 17847 18.78
## 10 6937 18.75
# We can aggregate the results across the 6 splits.
seliter <- numeric()
for (i in 1:nfold) seliter <- c(seliter, toplist(featureScores, iter = i, top = 10,
show = FALSE)$index)
sort(table(seliter), dec = T) # summarize
## seliter
## 1702 9265 6936 21919 808 6938 18958 19932 20571 21592 23206 23567
## 6 5 3 3 2 2 2 2 2 2 2 2
## 18 377 767 2690 3386 6937 7182 7183 8447 10254 10292 13581
## 1 1 1 1 1 1 1 1 1 1 1 1
## 13620 13802 15997 16094 16165 17847 18668 19152 19265 19402 19526 19577
## 1 1 1 1 1 1 1 1 1 1 1 1
## 21533 22524 22943
## 1 1 1
# Choose the 20 probes which are chosen most commonly in the 6 splits
bestprobes <- as.numeric(names(sort(table(seliter), dec = T)))[1:20]
# examine the annotations. I just selected a few columns from the fData of
# the eSet.
fData(dat)[bestprobes, c("Gene Symbol", "Gene Title", "ENTREZ_GENE_ID", "Representative Public ID")]
## Gene Symbol
## 1569472_s_at TTC3
## 212384_at ATP6V1G2 /// BAT1
## 208661_s_at TTC3
## 237746_at SFRS11
## 1556088_at RPAIN
## 208663_s_at TTC3
## 228510_at ATAT1
## 230609_at CLINT1
## 232740_at MCM3AP-AS
## 236196_at
## 242562_at DNAJC24
## 243751_at
## 1552283_s_at ZDHHC11
## 1554182_at TRIM73 /// TRIM74
## 1555920_at CBX3
## 201440_at DDX23
## 202182_at KAT2A
## 208662_s_at TTC3
## 208920_at SRI
## 208921_s_at SRI
## Gene Title
## 1569472_s_at tetratricopeptide repeat domain 3
## 212384_at ATPase, H+ transporting, lysosomal 13kDa, V1 subunit G2 /// HLA-B associated transcript 1
## 208661_s_at tetratricopeptide repeat domain 3
## 237746_at Splicing factor, arginine/serine-rich 11
## 1556088_at RPA interacting protein
## 208663_s_at tetratricopeptide repeat domain 3
## 228510_at alpha tubulin acetyltransferase 1
## 230609_at clathrin interactor 1
## 232740_at MCM3AP antisense RNA (non-protein coding)
## 236196_at
## 242562_at DnaJ (Hsp40) homolog, subfamily C, member 24
## 243751_at
## 1552283_s_at zinc finger, DHHC-type containing 11
## 1554182_at tripartite motif-containing 73 /// tripartite motif-containing 74
## 1555920_at Chromobox homolog 3 (HP1 gamma homolog, Drosophila)
## 201440_at DEAD (Asp-Glu-Ala-Asp) box polypeptide 23
## 202182_at K(lysine) acetyltransferase 2A
## 208662_s_at tetratricopeptide repeat domain 3
## 208920_at sorcin
## 208921_s_at sorcin
## ENTREZ_GENE_ID Representative Public ID
## 1569472_s_at 7267 BC026260
## 212384_at 534 /// 7919 AI282485
## 208661_s_at 7267 AW510696
## 237746_at 9295 AI168187
## 1556088_at 84268 AK098491
## 208663_s_at 7267 AI652848
## 228510_at 79969 AL566825
## 230609_at 9685 BF510429
## 232740_at 114044 BC002458
## 236196_at BF939032
## 242562_at 120526 AW772288
## 243751_at AA709148
## 1552283_s_at 79844 NM_024786
## 1554182_at 375593 /// 378108 BC033871
## 1555920_at 11335 BU683892
## 201440_at 9416 NM_004818
## 202182_at 2648 NM_021078
## 208662_s_at 7267 AI885338
## 208920_at 6717 AV752215
## 208921_s_at 6717 L12387
m <- matrix(which(dat$Set == "patient type: training set"), 1)
full.learningset <- new("learningsets", learnmatrix = m, method = "my own",
ntrain = 96, iter = 1)
fullFeatureScores <- GeneSelection(X = t(exprs(dat)), learningsets = full.learningset,
y = dat$LnStatus, method = "t.test")
## GeneSelection: iteration 1
testclassif <- classification(X = t(exprs(dat)), y = dat$LnStatus, learningsets = full.learningset,
genesel = fullFeatureScores, nbgene = 100, classifier = pknnCMA, k = 5)
## iteration 1
# Evaluation:
tres <- testclassif[[1]]
ftable(tres)
## number of missclassifications: 11
## missclassification rate: 0.55
## sensitivity: 0.545
## specificity: 0.333
## predicted
## true 0 1
## 0 3 6
## 1 5 6
roc(tres)