Seminar 10: Classification

Yiming Zhang

Load the packages

library(MASS)
library(reshape)

## Loading required package: plyr
## 
## Attaching package: 'reshape'
## 
## The following objects are masked from 'package:plyr':
## 
##     rename, round_any

library(car)
library(limma)
library(e1071)
library(glmnet)

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following object is masked from 'package:reshape':
## 
##     expand
## 
## Loaded glmnet 1.9-5

library(ROCR)

## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess

library(CMA)

## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:Matrix':
## 
##     as.vector
## 
## The following object is masked from 'package:limma':
## 
##     plotMA
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist
## 
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'CMA'
## 
## The following object is masked from 'package:ROCR':
## 
##     prediction
## 
## The following object is masked from 'package:e1071':
## 
##     tune
## 
## The following object is masked from 'package:plyr':
## 
##     join

library(GEOquery)

## Setting options('download.file.method.GEOquery'='auto')

library(lattice)
library(class)

## 
## Attaching package: 'class'
## 
## The following object is masked from 'package:reshape':
## 
##     condense

Data preparation

load("class_LNstatus.Rdata")

Look at the data

table(pData(train.es)$LnStatus)

## 
## neg pos 
##  48  48

table(pData(test.es)$LnStatus)

## 
## neg pos 
##   9  11

tapply(pData(train.es)$LnRatio, pData(train.es)$LnStatus, summary)

## $neg
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0 
## 
## $pos
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.040   0.070   0.110   0.194   0.228   0.960

tapply(pData(test.es)$LnRatio, pData(test.es)$LnStatus, summary)

## $neg
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0 
## 
## $pos
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.050   0.180   0.500   0.457   0.610   0.940

Look at expression of 3 randomly picked genes in both training and test sets

set.seed(123)
(getMe <- sample(1:nrow(train.es), size = 3))

## [1]  6970 19105  9912

For training data

trDat <- trainDat[getMe, ]
str(trDat)

##  num [1:3, 1:96] 11.33 8.96 8.16 10.61 9.26 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:3] "208697_s_at" "228754_at" "213117_at"
##   ..$ : chr [1:96] "GSM570518" "GSM570519" "GSM570520" "GSM570521" ...

trDat <- data.frame(LnStatus = pData(train.es)$LnStatus, Set = rep("train", 
    nrow(pData(train.es))), t(trDat))
str(trDat)

## 'data.frame':    96 obs. of  5 variables:
##  $ LnStatus    : Factor w/ 2 levels "neg","pos": 1 1 2 1 1 2 1 1 2 1 ...
##  $ Set         : Factor w/ 1 level "train": 1 1 1 1 1 1 1 1 1 1 ...
##  $ X208697_s_at: num  11.3 10.6 11.8 11.9 11.3 ...
##  $ X228754_at  : num  8.96 9.26 8.64 9.18 8.27 ...
##  $ X213117_at  : num  8.16 7.95 6.91 7.11 7.63 ...

plotDat.train <- melt(trDat, id = c("LnStatus", "Set"), variable_name = "gene")
colnames(plotDat.train)[colnames(plotDat.train) == "value"] = "gExp"

For test data

tDat <- testDat[getMe, ]
str(tDat)

##  num [1:3, 1:20] 11.93 8.24 7.62 11.21 8.09 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:3] "208697_s_at" "228754_at" "213117_at"
##   ..$ : chr [1:20] "GSM570498" "GSM570499" "GSM570500" "GSM570501" ...

tDat <- data.frame(LnStatus = pData(test.es)$LnStatus, Set = rep("test", nrow(pData(test.es))), 
    t(tDat))
str(tDat)

## 'data.frame':    20 obs. of  5 variables:
##  $ LnStatus    : Factor w/ 2 levels "neg","pos": 1 1 1 1 1 1 1 1 1 2 ...
##  $ Set         : Factor w/ 1 level "test": 1 1 1 1 1 1 1 1 1 1 ...
##  $ X208697_s_at: num  11.9 11.2 11.3 12.1 11.2 ...
##  $ X228754_at  : num  8.24 8.09 8.08 9.5 7.92 ...
##  $ X213117_at  : num  7.62 8.12 7.67 6.98 7.75 ...

plotDat.test <- melt(tDat, id = c("LnStatus", "Set"), variable_name = "gene")
colnames(plotDat.test)[colnames(plotDat.test) == "value"] = "gExp"

plotDat <- rbind(plotDat.train, plotDat.test)

Plot 3 randomly picked genes in both training and test sets

stripplot(gExp ~ LnStatus | gene + Set, plotDat, grid = TRUE, group = LnStatus, 
    auto.key = TRUE, jitter.data = TRUE)

plot of chunk unnamed-chunk-11

Classification

Feature and Model Selection

Cross validation splits

Use 6 folds

nfold <- 6

tabTrain <- table(train.es$LnStatus)

indlist <- sapply(names(tabTrain), function(z) which(train.es$LnStatus == z), 
    simplify = FALSE)

set.seed(1234)

# Each row contains 8 pos and 8 negative samples.

fold.pos <- matrix(sample(indlist[["pos"]]), nrow = nfold)
fold.neg <- matrix(sample(indlist[["neg"]]), nrow = nfold)

splits <- GenerateLearningsets(y = train.es$LnStatus, method = "CV", fold = 6, 
    strat = TRUE)

Loop for feature selection and modeling

# Define here the constants that you will not evaluate. For example, I will
# use the top-50 limma genes

ngenes <- 50
nmethod <- 7  #number of methods you plan to compare. 

# Define here an output objects to store results
pr.err <- matrix(-1, nfold, nmethod, dimnames = list(paste0("Fold", 1:nfold), 
    c("1NN", "5NN", "10NN", "15NN", "LDA", "Logit", "SVM")))

for (i in 1:nfold) {

    # Test Fold for the i-th step
    testdat.fold <- trainDat[, c(fold.pos[i, ], fold.neg[i, ])]
    # I will create a factor of classes for the test set of the i_th fold
    testclass.fold <- train.es$LnStatus[c(fold.pos[i, ], fold.neg[i, ])]


    # The rest of the samples are the training set for the i-th step
    traindat.fold <- trainDat[, -c(fold.pos[i, ], fold.neg[i, ])]
    trainclass.fold <- train.es$LnStatus[-c(fold.pos[i, ], fold.neg[i, ])]

    # Step 1: feature selection (do you remember limma?).

    # Note that a different set of genes will be selected for each fold! you can
    # then compare how consistent these sets were.

    limma.dat <- as.data.frame(traindat.fold)
    desMat <- model.matrix(~trainclass.fold, limma.dat)  #design matrix
    trainFit <- lmFit(limma.dat, desMat)
    eBtrainFit <- eBayes(trainFit)

    # top-50 limma genes
    top.fold <- topTable(eBtrainFit, coef = which(colnames(coef(trainFit)) != 
        "(Intercept)"), n = ngenes, sort.by = "P")

    # Retain the top-50 limma genes from the train and test sets
    traindat.fold <- traindat.fold[rownames(top.fold), ]
    testdat.fold <- testdat.fold[rownames(top.fold), ]


    # STEP 2: select a classifier Set a counter for the method tested
    l <- 0

    # kNN classifiers
    for (kk in c(1, 5, 10, 15)) {
        # every time you get inside this loop, the l counter gets redefined (i.e.,
        # 1, 2, etc for method 1, method 2, etc)
        l <- l + 1

        # knn needs samples in rows
        yhat.knn <- knn(train = t(traindat.fold), test = t(testdat.fold), cl = trainclass.fold, 
            k = kk)
        # Store the prediction error for each kk within this fold
        pr.err[i, l] <- mean(testclass.fold != yhat.knn)
    }  #end of kNN loop

    # LDA method. Note that you can change the prior parameter to reflect a
    # different proportion of case and control samples. The default is to use
    # the class proportions from the training set.

    m.lda <- lda(x = t(traindat.fold), group = trainclass.fold, prior = c(0.5, 
        0.5))
    yhat.lda <- predict(m.lda, newdata = t(testdat.fold))$class
    pr.err[i, "LDA"] <- mean(testclass.fold != yhat.lda)

    # Logit
    glm.dat <- data.frame(t(traindat.fold), group = trainclass.fold)
    m.log <- glm(group ~ ., data = glm.dat, family = binomial)

    pr.log <- predict(m.log, newdata = data.frame(t(testdat.fold)), type = "response")
    pr.cl <- rep(0, length(testclass.fold))
    pr.cl[pr.log > 1/2] <- "pos"
    pr.cl[pr.log <= 1/2] <- "neg"

    pr.cl <- factor(pr.cl)
    pr.err[i, "Logit"] <- mean(pr.cl != testclass.fold)

    # SVM
    m.svm <- svm(x = t(traindat.fold), y = trainclass.fold, cost = 1, type = "C-classification", 
        kernel = "linear")
    pr.svm <- predict(m.svm, newdata = t(testdat.fold))

    pr.err[i, "SVM"] <- mean(pr.svm != testclass.fold)
}  #end of CV loop

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

Error Rate

cv.err <- colMeans(pr.err)

# mean - 1 sd (sd of the 6 error rates)
ls <- cv.err - apply(pr.err, 2, sd)

# mean + 1 sd (sd of the 6 error rates)
us <- cv.err + apply(pr.err, 2, sd)

# plot the results
plot(1:nmethod, cv.err, ylim = c(0, 1), xlim = c(1, (nmethod + 0.5)), type = "n", 
    axes = FALSE, xlab = "Classifier", ylab = "Error rate", main = "6-fold CV Error")

for (j in 1:ncol(pr.err)) points(jitter(rep(j, 6), factor = 2), jitter(pr.err[, 
    j]), cex = 0.8, pch = "X", col = "gray")

for (i in 1:nmethod) lines(c(i, i), c(ls[i], us[i]), lwd = 2, col = "gray")
points(1:nmethod, ls, pch = 19, col = "red")
points(1:nmethod, us, pch = 19, col = "green")
points(1:nmethod, cv.err, pch = 19, cex = 1.5, col = "black")
axis(2, ylab = "Error rate")
axis(1, 1:nmethod, colnames(pr.err))

box()

plot of chunk unnamed-chunk-15

Testing the selected model

yhat.knn <- knn(train = t(trainDat), test = t(testDat), cl = train.es$LnStatus, 
    k = 10)
# Store the prediction error for each kk within this fold
pr.errTest <- mean(test.es$LnStatus != yhat.knn)
pr.errTest

## [1] 0.45

CMA

featureScores <- GeneSelection(X = t(trainDat), y = train.es$LnStatus, learningsets = splits, 
    method = "limma")

## GeneSelection: iteration 1 
## GeneSelection: iteration 2 
## GeneSelection: iteration 3 
## GeneSelection: iteration 4 
## GeneSelection: iteration 5 
## GeneSelection: iteration 6


# Compare list of selected genes using:
toplist(featureScores)

## top  10  genes for iteration  1 
##  
##    index importance
## 1   9265      25.38
## 2   6936      24.85
## 3  21592      24.56
## 4   1702      23.99
## 5  21919      23.99
## 6  19932      21.29
## 7   6938      20.60
## 8  22524      20.18
## 9  17847      18.78
## 10  6937      18.75


# We can aggregate the results across the 6 splits.

seliter <- numeric()
for (i in 1:nfold) seliter <- c(seliter, toplist(featureScores, iter = i, top = 10, 
    show = FALSE)$index)
sort(table(seliter), dec = T)  # summarize

## seliter
##  1702  9265  6936 21919   808  6938 18958 19932 20571 21592 23206 23567 
##     6     5     3     3     2     2     2     2     2     2     2     2 
##    18   377   767  2690  3386  6937  7182  7183  8447 10254 10292 13581 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## 13620 13802 15997 16094 16165 17847 18668 19152 19265 19402 19526 19577 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## 21533 22524 22943 
##     1     1     1


# Choose the 20 probes which are chosen most commonly in the 6 splits
bestprobes <- as.numeric(names(sort(table(seliter), dec = T)))[1:20]

# examine the annotations. I just selected a few columns from the fData of
# the eSet.
fData(dat)[bestprobes, c("Gene Symbol", "Gene Title", "ENTREZ_GENE_ID", "Representative Public ID")]

##                    Gene Symbol
## 1569472_s_at              TTC3
## 212384_at    ATP6V1G2 /// BAT1
## 208661_s_at               TTC3
## 237746_at               SFRS11
## 1556088_at               RPAIN
## 208663_s_at               TTC3
## 228510_at                ATAT1
## 230609_at               CLINT1
## 232740_at            MCM3AP-AS
## 236196_at                     
## 242562_at              DNAJC24
## 243751_at                     
## 1552283_s_at           ZDHHC11
## 1554182_at   TRIM73 /// TRIM74
## 1555920_at                CBX3
## 201440_at                DDX23
## 202182_at                KAT2A
## 208662_s_at               TTC3
## 208920_at                  SRI
## 208921_s_at                SRI
##                                                                                             Gene Title
## 1569472_s_at                                                         tetratricopeptide repeat domain 3
## 212384_at    ATPase, H+ transporting, lysosomal 13kDa, V1 subunit G2 /// HLA-B associated transcript 1
## 208661_s_at                                                          tetratricopeptide repeat domain 3
## 237746_at                                                     Splicing factor, arginine/serine-rich 11
## 1556088_at                                                                     RPA interacting protein
## 208663_s_at                                                          tetratricopeptide repeat domain 3
## 228510_at                                                            alpha tubulin acetyltransferase 1
## 230609_at                                                                        clathrin interactor 1
## 232740_at                                                    MCM3AP antisense RNA (non-protein coding)
## 236196_at                                                                                             
## 242562_at                                                 DnaJ (Hsp40) homolog, subfamily C, member 24
## 243751_at                                                                                             
## 1552283_s_at                                                      zinc finger, DHHC-type containing 11
## 1554182_at                           tripartite motif-containing 73 /// tripartite motif-containing 74
## 1555920_at                                         Chromobox homolog 3 (HP1 gamma homolog, Drosophila)
## 201440_at                                                    DEAD (Asp-Glu-Ala-Asp) box polypeptide 23
## 202182_at                                                               K(lysine) acetyltransferase 2A
## 208662_s_at                                                          tetratricopeptide repeat domain 3
## 208920_at                                                                                       sorcin
## 208921_s_at                                                                                     sorcin
##                 ENTREZ_GENE_ID Representative Public ID
## 1569472_s_at              7267                 BC026260
## 212384_at         534 /// 7919                 AI282485
## 208661_s_at               7267                 AW510696
## 237746_at                 9295                 AI168187
## 1556088_at               84268                 AK098491
## 208663_s_at               7267                 AI652848
## 228510_at                79969                 AL566825
## 230609_at                 9685                 BF510429
## 232740_at               114044                 BC002458
## 236196_at                                      BF939032
## 242562_at               120526                 AW772288
## 243751_at                                      AA709148
## 1552283_s_at             79844                NM_024786
## 1554182_at   375593 /// 378108                 BC033871
## 1555920_at               11335                 BU683892
## 201440_at                 9416                NM_004818
## 202182_at                 2648                NM_021078
## 208662_s_at               7267                 AI885338
## 208920_at                 6717                 AV752215
## 208921_s_at               6717                   L12387

m <- matrix(which(dat$Set == "patient type: training set"), 1)

full.learningset <- new("learningsets", learnmatrix = m, method = "my own", 
    ntrain = 96, iter = 1)

fullFeatureScores <- GeneSelection(X = t(exprs(dat)), learningsets = full.learningset, 
    y = dat$LnStatus, method = "t.test")

## GeneSelection: iteration 1


testclassif <- classification(X = t(exprs(dat)), y = dat$LnStatus, learningsets = full.learningset, 
    genesel = fullFeatureScores, nbgene = 100, classifier = pknnCMA, k = 5)

## iteration 1


# Evaluation:
tres <- testclassif[[1]]
ftable(tres)

## number of missclassifications:  11 
## missclassification rate:  0.55 
## sensitivity: 0.545 
## specificity: 0.333 
##     predicted
## true 0 1
##    0 3 6
##    1 5 6

roc(tres)

plot of chunk unnamed-chunk-18