Abrar — Apr 2, 2014, 1:22 PM
library(MASS)
library(reshape)
Loading required package: plyr
Attaching package: 'reshape'
The following objects are masked from 'package:plyr':
rename, round_any
library(car)
library(limma)
library(e1071)
Loading required package: class
Attaching package: 'class'
The following object is masked from 'package:reshape':
condense
library(glmnet)
Loading required package: Matrix
Attaching package: 'Matrix'
The following object is masked from 'package:reshape':
expand
Loaded glmnet 1.9-5
library(ROCR)
Loading required package: gplots
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009
Attaching package: 'gplots'
The following object is masked from 'package:stats':
lowess
library(CMA)
Loading required package: Biobase
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:parallel':
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
clusterExport, clusterMap, parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply, parSapplyLB
The following object is masked from 'package:Matrix':
as.vector
The following object is masked from 'package:limma':
plotMA
The following object is masked from 'package:stats':
xtabs
The following objects are masked from 'package:base':
anyDuplicated, append, as.data.frame, as.vector, cbind,
colnames, duplicated, eval, evalq, Filter, Find, get,
intersect, is.unsorted, lapply, Map, mapply, match, mget,
order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
table, tapply, union, unique, unlist
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Attaching package: 'CMA'
The following object is masked from 'package:ROCR':
prediction
The following object is masked from 'package:e1071':
tune
The following object is masked from 'package:plyr':
join
library(GEOquery)
Setting options('download.file.method.GEOquery'='auto')
library(lattice)
##################################################################
# Data Preparation
##################################################################
if (file.exists("class_LNstatus.Rdata")) {
# if previously downloaded
load("class_LNstatus.Rdata")
} else {
datgeo <- getGEO("GSE23177", GSEMatrix = TRUE)
dat <- datgeo[[1]] #Note that dat is an ExpressionSet
str(pData(dat), max.level = 0)
# extract only those variables of interest
pData(dat) <- subset(pData(dat), select = c("characteristics_ch1.2", "characteristics_ch1.3", "characteristics_ch1"))
names(pData(dat)) <- c("LnStatus", "LnRatio", "Set")
# Note: LnRatio will not be used in this Seminar. However, you can use it to
# try some of the regularization techniques learned in class (see the glmnet
# sketch after the data summaries below)
# split the ExpressionSet into training and test sets.
train.es <- dat[, dat$Set == "patient type: training set"]
test.es <- dat[, dat$Set != "patient type: training set"]
# Re-label factor
pData(train.es)$LnStatus <- recode(pData(train.es)$LnStatus, "levels(pData(train.es)$LnStatus)[1]='neg'; else='pos'", levels = c("neg", "pos"))
pData(test.es)$LnStatus <- recode(pData(test.es)$LnStatus, "levels(pData(test.es)$LnStatus)[1]='neg'; else='pos'", levels = c("neg", "pos"))
# create data matrices with expression values (probesets in rows). Some of
# the functions we will use do not take ExpressionSets as objects
trainDat <- exprs(train.es)
testDat <- exprs(test.es)
# Redefine the quantitative variable LnRatio to make it a numeric variable.
ntrain <- dim(pData(train.es))[1]
ntest <- dim(pData(test.es))[1]
pData(train.es)$LnRatio <- as.numeric(unlist(strsplit(as.vector(unlist(pData(train.es)$LnRatio)), ":", fixed = TRUE))[(1:ntrain) * 2])
pData(test.es)$LnRatio <- as.numeric(unlist(strsplit(as.vector(unlist(pData(test.es)$LnRatio)), ":", fixed = TRUE))[(1:ntest) * 2])
# save the data to avoid future re-downloading
save(dat, trainDat, testDat, train.es, test.es, file = "class_LNstatus.Rdata")
}
# understand your data for classification
table(pData(train.es)$LnStatus)
neg pos
48 48
table(pData(test.es)$LnStatus)
neg pos
9 11
# understand the continuous response
tapply(pData(train.es)$LnRatio, pData(train.es)$LnStatus, summary)
$neg
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 0 0 0
$pos
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.040 0.070 0.110 0.194 0.228 0.960
tapply(pData(test.es)$LnRatio, pData(test.es)$LnStatus, summary)
$neg
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 0 0 0
$pos
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.050 0.180 0.500 0.457 0.610 0.940
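As noted in the data-preparation step, LnRatio is not analyzed further in this seminar, but it could be used to try the regularization techniques from class. A minimal sketch, under the assumption that you model LnRatio with the lasso (glmnet is already loaded) on the node-positive training samples only, since LnRatio is identically zero for the node-negative cases:
pos.idx <- which(pData(train.es)$LnStatus == "pos")
x.pos <- t(trainDat[, pos.idx])  # samples in rows, probes in columns
y.pos <- pData(train.es)$LnRatio[pos.idx]
cvfit <- cv.glmnet(x.pos, y.pos, alpha = 1)  # lasso; lambda picked by 10-fold CV
plot(cvfit)
cf <- as.matrix(coef(cvfit, s = "lambda.min"))[-1, 1]  # per-probe coefficients, intercept dropped
names(cf)[cf != 0]  # probes retained by the lasso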
# look at the expression of 3 randomly picked genes in both training and
# test sets
set.seed(1234)
(getMe <- sample(1:nrow(train.es), size = 3))
[1] 2756 15082 14766
# training data
trDat <- trainDat[getMe, ]
str(trDat)
num [1:3, 1:96] 7.19 9.17 8.38 7.13 9.38 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:3] "201513_at" "223139_s_at" "222698_s_at"
..$ : chr [1:96] "GSM570518" "GSM570519" "GSM570520" "GSM570521" ...
trDat <- data.frame(LnStatus = pData(train.es)$LnStatus, Set = rep("train", nrow(pData(train.es))), t(trDat))
str(trDat)
'data.frame': 96 obs. of 5 variables:
$ LnStatus : Factor w/ 2 levels "neg","pos": 1 1 2 1 1 2 1 1 2 1 ...
$ Set : Factor w/ 1 level "train": 1 1 1 1 1 1 1 1 1 1 ...
$ X201513_at : num 7.19 7.13 7.39 6.86 6.96 ...
$ X223139_s_at: num 9.17 9.38 9.03 9.55 9.5 ...
$ X222698_s_at: num 8.38 8.24 7.23 7.87 8.45 ...
plotDat.train <- melt(trDat, id = c("LnStatus", "Set"), variable_name = "gene")
colnames(plotDat.train)[colnames(plotDat.train) == "value"] = "gExp"
# test data
tDat <- testDat[getMe, ]
str(tDat)
num [1:3, 1:20] 6.05 9.15 7.55 6.87 8.95 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:3] "201513_at" "223139_s_at" "222698_s_at"
..$ : chr [1:20] "GSM570498" "GSM570499" "GSM570500" "GSM570501" ...
tDat <- data.frame(LnStatus = pData(test.es)$LnStatus, Set = rep("test", nrow(pData(test.es))), t(tDat))
str(tDat)
'data.frame': 20 obs. of 5 variables:
$ LnStatus : Factor w/ 2 levels "neg","pos": 1 1 1 1 1 1 1 1 1 2 ...
$ Set : Factor w/ 1 level "test": 1 1 1 1 1 1 1 1 1 1 ...
$ X201513_at : num 6.05 6.87 6.71 8 6.54 ...
$ X223139_s_at: num 9.15 8.95 9.09 9.81 9.2 ...
$ X222698_s_at: num 7.55 8.34 8.32 7.33 8.14 ...
plotDat.test <- melt(tDat, id = c("LnStatus", "Set"), variable_name = "gene")
colnames(plotDat.test)[colnames(plotDat.test) == "value"] = "gExp"
plotDat <- rbind(plotDat.train, plotDat.test)
stripplot(gExp ~ LnStatus | gene + Set, plotDat, grid = TRUE, groups = LnStatus, auto.key = TRUE, jitter.data = TRUE)
##################################################################
# Classification
##################################################################
# Cross validation splits
nfold <- 6
tabTrain <- table(train.es$LnStatus)
indlist <- sapply(names(tabTrain), function(z) which(train.es$LnStatus == z), simplify = FALSE)
set.seed(1234)
# Each row of fold.pos (fold.neg) holds the indices of the 8 positive (8 negative) samples assigned to one of the 6 folds.
fold.pos <- matrix(sample(indlist[["pos"]]), nrow = nfold)
fold.neg <- matrix(sample(indlist[["neg"]]), nrow = nfold)
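A quick sanity check on these hand-rolled splits (not part of the original seminar code): each fold should contain 8 positive and 8 negative samples, and the folds together should use each of the 96 training samples exactly once.
stopifnot(ncol(fold.pos) == 8, ncol(fold.neg) == 8)
all.folds <- sort(c(as.vector(fold.pos), as.vector(fold.neg)))
stopifnot(all(all.folds == seq_len(ncol(trainDat))))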
##################################################################
# Exercise 1
##################################################################
# Exercise 1: generate stratified 6-fold CV splits with CMA. As written, each
# iteration overwrites `splits`, so only the final learningsets object is kept
# (it is the one reused in the CMA section below).
for (i in 1:100){
splits <- GenerateLearningsets(y = train.es$LnStatus, method = "CV", fold = 6, strat = TRUE)
}
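If the goal is to keep all 100 sets of splits and study how they vary, a minimal sketch is to collect them in a list; splitList is a hypothetical name not used elsewhere in this analysis.
splitList <- replicate(100, GenerateLearningsets(y = train.es$LnStatus, method = "CV", fold = 6, strat = TRUE), simplify = FALSE)
length(splitList)  # 100 learningsets objects, each usable with GeneSelection() / classification()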
##################################################################
# Loop for feature selection and modeling + Exercise 1
##################################################################
# Define the constants that will stay fixed across folds. For example, I will
# use the top-50 limma genes
ngenes <- 50
nmethod <- 7 #number of methods you plan to compare.
# Define an output object to store the results
pr.err <- matrix(-1, nfold, nmethod, dimnames = list(paste0("Fold", 1:nfold), c("1NN", "5NN", "10NN", "15NN", "LDA", "Logit", "SVM")))
for (i in 1:nfold) {
# Test Fold for the i-th step
testdat.fold <- trainDat[, c(fold.pos[i, ], fold.neg[i, ])]
# Create a factor of class labels for the test set of the i-th fold
testclass.fold <- train.es$LnStatus[c(fold.pos[i, ], fold.neg[i, ])]
# The rest of the samples are the training set for the i-th step
traindat.fold <- trainDat[, -c(fold.pos[i, ], fold.neg[i, ])]
trainclass.fold <- train.es$LnStatus[-c(fold.pos[i, ], fold.neg[i, ])]
# Step 1: feature selection (do you remember limma?).
# Note that a different set of genes will be selected for each fold! You can
# then compare how consistent these sets are across folds.
limma.dat <- as.data.frame(traindat.fold)
desMat <- model.matrix(~trainclass.fold, limma.dat) #design matrix
trainFit <- lmFit(limma.dat, desMat)
eBtrainFit <- eBayes(trainFit)
# top-50 limma genes; select a fixed number of genes rather than filtering by
# p-value, since a hard cutoff (e.g. p.value = 1e-4) can return zero genes in
# some folds and stop the loop with "Error: a dimension is zero"
top.fold <- topTable(eBtrainFit, coef = which(colnames(coef(trainFit)) != "(Intercept)"), number = ngenes, sort.by = "P")
# Retain the top-50 limma genes from the train and test sets
traindat.fold <- traindat.fold[rownames(top.fold), ]
testdat.fold <- testdat.fold[rownames(top.fold), ]
# STEP 2: select a classifier. Set a counter for the method being tested
l <- 0
# kNN classifiers
for (kk in c(1, 5, 10, 15)) {
# each pass through this loop increments the counter l (1, 2, ... for
# method 1, method 2, ...)
l <- l + 1
# knn needs samples in rows
yhat.knn <- knn(train = t(traindat.fold), test = t(testdat.fold), cl = trainclass.fold,
k = kk)
# Store the prediction error for each kk within this fold
pr.err[i, l] <- mean(testclass.fold != yhat.knn)
} #end of kNN loop
# LDA method. Note that you can change the prior parameter to reflect a
# different proportion of case and control samples. The default is to use
# the class proportions from the training set (a sketch of that default call
# follows right after the CV loop).
m.lda <- lda(x = t(traindat.fold), grouping = trainclass.fold, prior = c(0.5, 0.5))
yhat.lda <- predict(m.lda, newdata = t(testdat.fold))$class
pr.err[i, "LDA"] <- mean(testclass.fold != yhat.lda)
# Logit
glm.dat <- data.frame(t(traindat.fold), group = trainclass.fold)
m.log <- glm(group ~ ., data = glm.dat, family = binomial)
pr.log <- predict(m.log, newdata = data.frame(t(testdat.fold)), type = "response")
pr.cl <- factor(ifelse(pr.log > 1/2, "pos", "neg"), levels = levels(testclass.fold))
pr.err[i, "Logit"] <- mean(pr.cl != testclass.fold)
# SVM
m.svm <- svm(x = t(traindat.fold), y = trainclass.fold, cost = 1, type = "C-classification",
kernel = "linear")
pr.svm <- predict(m.svm, newdata = t(testdat.fold))
pr.err[i, "SVM"] <- mean(pr.svm != testclass.fold)
} #end of CV loop
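As the comment above the lda() call notes, the prior can also be left at MASS's default, the class proportions observed in the training data. A sketch of that variant, run here outside the loop for illustration using the objects left over from the last fold:
m.lda.default <- lda(x = t(traindat.fold), grouping = trainclass.fold)  # prior defaults to observed class proportions
m.lda.default$prior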
##################################################################
# Error Rates
##################################################################
cv.err <- colMeans(pr.err)
# mean - 1 sd (sd of the 6 error rates)
ls <- cv.err - apply(pr.err, 2, sd)
# mean + 1 sd (sd of the 6 error rates)
us <- cv.err + apply(pr.err, 2, sd)
# plot the results
plot(1:nmethod, cv.err, ylim = c(0, 1), xlim = c(1, (nmethod + 0.5)), type = "n",
axes = FALSE, xlab = "Classifier", ylab = "Error rate", main = "6-fold CV Error")
for (j in 1:ncol(pr.err)) points(jitter(rep(j, 6), factor = 2), jitter(pr.err[, j]), cex = 0.8, pch = "X", col = "gray")
for (i in 1:nmethod) lines(c(i, i), c(ls[i], us[i]), lwd = 2, col = "gray")
points(1:nmethod, ls, pch = 19, col = "red")
points(1:nmethod, us, pch = 19, col = "green")
points(1:nmethod, cv.err, pch = 19, cex = 1.5, col = "black")
axis(2)
axis(1, 1:nmethod, colnames(pr.err))
box()
##################################################################
# Testing the selected model
##################################################################
yhat.knn <- knn(train = t(trainDat), test = t(testDat), cl = train.es$LnStatus, k = 10)
# Prediction error on the 20 held-out test samples
pr.errTest <- mean(test.es$LnStatus != yhat.knn)
pr.errTest
[1] 0.55
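To see where this 0.55 test error comes from, a confusion matrix of the 10-NN predictions against the true test labels (assuming yhat.knn from the call above is still in the workspace):
table(predicted = yhat.knn, true = test.es$LnStatus)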
##################################################################
# CMA
##################################################################
featureScores <- GeneSelection(X = t(trainDat), y = train.es$LnStatus, learningsets = splits, method = "limma")
GeneSelection: iteration 1
GeneSelection: iteration 2
GeneSelection: iteration 3
GeneSelection: iteration 4
GeneSelection: iteration 5
GeneSelection: iteration 6
# Compare list of selected genes using:
toplist(featureScores)
top 10 genes for iteration 1
index importance
1 1702 19.88
2 9265 19.28
3 10254 18.84
4 18958 16.55
5 14544 16.53
6 6936 16.49
7 23551 15.05
8 6001 14.92
9 20571 14.67
10 23567 14.58
# We can aggregate the results across the 6 splits.
seliter <- numeric()
for (i in 1:nfold) seliter <- c(seliter, toplist(featureScores, iter = i, top = 10, show = FALSE)$index)
sort(table(seliter), dec = T) # summarize
seliter
9265 1702 6936 10254 23567 14544 19932 21592 21919 23206 23930 18
6 5 4 3 3 2 2 2 2 2 2 1
767 808 944 1174 6001 6938 7183 10292 11052 14084 14378 15493
1 1 1 1 1 1 1 1 1 1 1 1
16094 17847 18958 19068 19238 19410 19562 19821 20571 21369 22328 23069
1 1 1 1 1 1 1 1 1 1 1 1
23504 23551
1 1
# Keep the 20 probes selected most often across the 6 splits
bestprobes <- as.numeric(names(sort(table(seliter), dec = T)))[1:20]
# examine the annotations. I just selected a few columns from the fData of
# the eSet.
fData(dat)[bestprobes, c("Gene Symbol", "Gene Title", "ENTREZ_GENE_ID", "Representative Public ID")]
Gene Symbol
212384_at ATP6V1G2 /// BAT1
1569472_s_at TTC3
208661_s_at TTC3
213593_s_at TRA2A
243751_at
222439_s_at THRAP3
230609_at CLINT1
236196_at
237746_at SFRS11
242562_at DNAJC24
37117_at ARHGAP8 /// PRR5-ARHGAP8
1552283_s_at ZDHHC11
1555920_at CBX3
1556088_at RPAIN
1557193_at
1558747_at SMCHD1
205655_at MDM4
208663_s_at TTC3
208921_s_at SRI
213649_at SRSF7
Gene Title
212384_at ATPase, H+ transporting, lysosomal 13kDa, V1 subunit G2 /// HLA-B associated transcript 1
1569472_s_at tetratricopeptide repeat domain 3
208661_s_at tetratricopeptide repeat domain 3
213593_s_at transformer 2 alpha homolog (Drosophila)
243751_at
222439_s_at thyroid hormone receptor associated protein 3
230609_at clathrin interactor 1
236196_at
237746_at Splicing factor, arginine/serine-rich 11
242562_at DnaJ (Hsp40) homolog, subfamily C, member 24
37117_at Rho GTPase activating protein 8 /// PRR5-ARHGAP8 readthrough
1552283_s_at zinc finger, DHHC-type containing 11
1555920_at Chromobox homolog 3 (HP1 gamma homolog, Drosophila)
1556088_at RPA interacting protein
1557193_at
1558747_at structural maintenance of chromosomes flexible hinge domain containing 1
205655_at Mdm4 p53 binding protein homolog (mouse)
208663_s_at tetratricopeptide repeat domain 3
208921_s_at sorcin
213649_at serine/arginine-rich splicing factor 7
ENTREZ_GENE_ID Representative Public ID
212384_at 534 /// 7919 AI282485
1569472_s_at 7267 BC026260
208661_s_at 7267 AW510696
213593_s_at 29896 AW978896
243751_at AA709148
222439_s_at 9967 BE967048
230609_at 9685 BF510429
236196_at BF939032
237746_at 9295 AI168187
242562_at 120526 AW772288
37117_at 23779 /// 553158 Z83838
1552283_s_at 79844 NM_024786
1555920_at 11335 BU683892
1556088_at 84268 AK098491
1557193_at AI085450
1558747_at 23347 AA336502
205655_at 4194 NM_002393
208663_s_at 7267 AI652848
208921_s_at 6717 L12387
213649_at 6432 AA524053
# Build a single learning set that contains all 96 training-set samples (a 1 x 96 index matrix)
m <- matrix(which(dat$Set == "patient type: training set"), 1)
full.learningset <- new("learningsets", learnmatrix = m, method = "my own", ntrain = 96, iter = 1)
fullFeatureScores <- GeneSelection(X = t(exprs(dat)), learningsets = full.learningset, y = dat$LnStatus, method = "t.test")
GeneSelection: iteration 1
testclassif <- classification(X = t(exprs(dat)), y = dat$LnStatus, learningsets = full.learningset, genesel = fullFeatureScores, nbgene = 100, classifier = pknnCMA, k = 5)
iteration 1
# Evaluation:
tres <- testclassif[[1]]
ftable(tres)
number of missclassifications: 11
missclassification rate: 0.55
sensitivity: 0.545
specificity: 0.333
predicted
true 0 1
0 3 6
1 5 6
roc(tres)
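Beyond the ROC curve, CMA's evaluation() can summarize the same classification result; a brief sketch (which measures you inspect is up to you, these two are just examples):
evaluation(testclassif, measure = "misclassification")
evaluation(testclassif, measure = "auc")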