Seminar 10: Supervised Learning, Classification, Cross-validation & Variable Selection

We will be using a dataset from Smeets et al. (2010), who measured Affymetrix expression profiles from primary breast tumors. Their data set contains 24236 genes on 116 samples. The lymph-node (LN) status is known for each sample: 59 are LN positive and 57 LN negative. Samples were divided into two parts: 96 samples (48 LN positive and 48 LN negative) were used as a “training” set and 20 samples (11 LN positive and 9 LN negative) were used as a “test” set. There is also a quantitative measure, “LnRatio”, the fraction of affected lymph nodes, presumably reflecting “how bad” the LnStatus is.
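
The code below calls functions from several packages; load them up front (a minimal setup sketch; the package-to-function mapping is inferred from the calls in this seminar):

library(GEOquery)  # getGEO()
library(Biobase)   # ExpressionSet accessors: pData(), exprs(), fData()
library(car)       # recode()
library(reshape)   # melt() with the variable_name argument
library(lattice)   # stripplot()
library(limma)     # lmFit(), eBayes(), topTable()
library(class)     # knn()
library(MASS)      # lda()
library(e1071)     # svm()
library(CMA)       # GenerateLearningsets(), GeneSelection(), classification(), roc()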

if (file.exists("class_LNstatus.Rdata")) {
    # if previously downloaded
    load("class_LNstatus.Rdata")
} else {
    # if downloading for the first time: takes several minutes! getGEO() returns a list
    datgeo <- getGEO("GSE23177", GSEMatrix = TRUE)
    dat <- datgeo[[1]]  # note that dat is an ExpressionSet

    str(pData(dat), max.level = 0)

    # extract only those variables of interest
    pData(dat) <- subset(pData(dat), select = c("characteristics_ch1.2", "characteristics_ch1.3", 
        "characteristics_ch1"))
    names(pData(dat)) <- c("LnStatus", "LnRatio", "Set")

    # Note: LnRatio will not be used in this seminar, but you can use it to
    # try some of the regularization techniques learned in class

    # split the ExpressionSet into training and test sets.
    train.es <- dat[, dat$Set == "patient type: training set"]
    test.es <- dat[, dat$Set != "patient type: training set"]

    # Re-label factor
    pData(train.es)$LnStatus <- recode(pData(train.es)$LnStatus, "levels(pData(train.es)$LnStatus)[1]='neg'; else='pos'", 
        levels = c("neg", "pos"))

    pData(test.es)$LnStatus <- recode(pData(test.es)$LnStatus, "levels(pData(test.es)$LnStatus)[1]='neg'; else='pos'", 
        levels = c("neg", "pos"))

    # create data matrices with expression values (probesets in rows); some of
    # the functions we will use do not accept ExpressionSet objects
    trainDat <- exprs(train.es)
    testDat <- exprs(test.es)

    # Redefine the quantitative variable LnRatio to make it a numeric variable.
    ntrain <- dim(pData(train.es))[1]
    ntest <- dim(pData(test.es))[1]

    pData(train.es)$LnRatio <- as.numeric(unlist(strsplit(as.vector(unlist(pData(train.es)$LnRatio)), 
        ":", fixed = TRUE))[(1:ntrain) * 2])
    pData(test.es)$LnRatio <- as.numeric(unlist(strsplit(as.vector(unlist(pData(test.es)$LnRatio)), 
        ":", fixed = TRUE))[(1:ntest) * 2])

    # save the data to avoid future re-downloading
    save(dat, trainDat, testDat, train.es, test.es, file = "class_LNstatus.Rdata")
}
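
A quick check that the split matches the description above (a sketch; expect 24236 probesets by 96 and by 20 samples):

dim(trainDat)  # 24236    96
dim(testDat)   # 24236    20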

We will explore the dataset first.

# understand your data for classification
table(pData(train.es)$LnStatus)
## 
## neg pos 
##  48  48
table(pData(test.es)$LnStatus)
## 
## neg pos 
##   9  11
tapply(pData(train.es)$LnRatio, pData(train.es)$LnStatus, summary)
## $neg
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0 
## 
## $pos
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.040   0.070   0.110   0.194   0.228   0.960
tapply(pData(test.es)$LnRatio, pData(test.es)$LnStatus, summary)
## $neg
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0 
## 
## $pos
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.050   0.180   0.500   0.457   0.610   0.940
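
As a consistency check, LnRatio should be exactly zero for every LN-negative sample (a one-line sketch):

# TRUE if no LN-negative training sample has a nonzero LnRatio
all(pData(train.es)$LnRatio[pData(train.es)$LnStatus == "neg"] == 0)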

We will look at the expression of 3 randomly selected genes in both the training and test sets

# look at the expression of 3 randomly picked genes in both training and
# test sets
set.seed(1234)
(getMe <- sample(1:nrow(train.es), size = 3))
## [1]  2756 15082 14766
# training data
trDat <- trainDat[getMe, ]
str(trDat)
##  num [1:3, 1:96] 7.19 9.17 8.38 7.13 9.38 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:3] "201513_at" "223139_s_at" "222698_s_at"
##   ..$ : chr [1:96] "GSM570518" "GSM570519" "GSM570520" "GSM570521" ...
trDat <- data.frame(LnStatus = pData(train.es)$LnStatus, Set = rep("train", 
    nrow(pData(train.es))), t(trDat))
str(trDat)
## 'data.frame':    96 obs. of  5 variables:
##  $ LnStatus    : Factor w/ 2 levels "neg","pos": 1 1 2 1 1 2 1 1 2 1 ...
##  $ Set         : Factor w/ 1 level "train": 1 1 1 1 1 1 1 1 1 1 ...
##  $ X201513_at  : num  7.19 7.13 7.39 6.86 6.96 ...
##  $ X223139_s_at: num  9.17 9.38 9.03 9.55 9.5 ...
##  $ X222698_s_at: num  8.38 8.24 7.23 7.87 8.45 ...
plotDat.train <- melt(trDat, id = c("LnStatus", "Set"), variable_name = "gene")
colnames(plotDat.train)[colnames(plotDat.train) == "value"] <- "gExp"
# test data
tDat <- testDat[getMe, ]
str(tDat)
##  num [1:3, 1:20] 6.05 9.15 7.55 6.87 8.95 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:3] "201513_at" "223139_s_at" "222698_s_at"
##   ..$ : chr [1:20] "GSM570498" "GSM570499" "GSM570500" "GSM570501" ...
tDat <- data.frame(LnStatus = pData(test.es)$LnStatus, Set = rep("test", nrow(pData(test.es))), 
    t(tDat))
str(tDat)
## 'data.frame':    20 obs. of  5 variables:
##  $ LnStatus    : Factor w/ 2 levels "neg","pos": 1 1 1 1 1 1 1 1 1 2 ...
##  $ Set         : Factor w/ 1 level "test": 1 1 1 1 1 1 1 1 1 1 ...
##  $ X201513_at  : num  6.05 6.87 6.71 8 6.54 ...
##  $ X223139_s_at: num  9.15 8.95 9.09 9.81 9.2 ...
##  $ X222698_s_at: num  7.55 8.34 8.32 7.33 8.14 ...
plotDat.test <- melt(tDat, id = c("LnStatus", "Set"), variable_name = "gene")
colnames(plotDat.test)[colnames(plotDat.test) == "value"] <- "gExp"

plotDat <- rbind(plotDat.train, plotDat.test)

# plot 3 randomly picked genes in both training and test sets
stripplot(gExp ~ LnStatus | gene + Set, plotDat, grid = TRUE, group = LnStatus, 
    auto.key = TRUE, jitter.data = TRUE)

[figure: stripplot of the 3 randomly picked genes by LnStatus, in the training and test sets]

Classification

The prediction of a discrete response is usually referred to as classification. We will use the dataset from Smeets et al. to find the best-performing classifier and use it to predict the LnStatus of the 20 samples in the test set, i.e., classify them as “lymph node positive” or “negative”.

Feature and Model Selection

We will first identify the best set of features to train the model with, using cross-validation. To do this, we will divide the training set into 6 folds (the authors used 10 folds). We also want the proportion of positive and negative examples in each fold to be approximately the same as in the full data set (i.e., stratified 6-fold CV with 8 positive and 8 negative samples within each fold). In each round of cross-validation, one fold serves as the test data and the remaining folds are used to select features and train the different classifiers.

Cross-validation splits

nfold <- 6

tabTrain <- table(train.es$LnStatus)

indlist <- sapply(names(tabTrain), function(z) which(train.es$LnStatus == z), 
    simplify = FALSE)

set.seed(1234)

# Each row of fold.pos / fold.neg holds the 8 positive / 8 negative samples of one fold.

fold.pos <- matrix(sample(indlist[["pos"]]), nrow = nfold)
fold.neg <- matrix(sample(indlist[["neg"]]), nrow = nfold)
splits <- GenerateLearningsets(y = train.es$LnStatus, method = "CV", fold = 6, 
    strat = TRUE)
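
A quick sanity check on the hand-made folds (a sketch): every fold should come back with 8 negative and 8 positive samples.

# check stratification: one column per fold, counting neg/pos members
apply(cbind(fold.pos, fold.neg), 1, function(idx) table(train.es$LnStatus[idx]))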

## Exercise 1: We will perform 100 runs of cross-validation before selecting a model to test. For this example, we will use only the top 50 genes identified by limma.

# generate 100 independent stratified 6-fold splits and keep them all
splits.list <- lapply(1:100, function(i) GenerateLearningsets(y = train.es$LnStatus, 
    method = "CV", fold = 6, strat = TRUE))
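
To actually use those 100 runs, wrap the per-fold loop below in a helper and average the error rates across runs. A sketch, where cvError() is a hypothetical function that takes one learningsets object and returns the nfold-by-nmethod error matrix (like pr.err below):

# cvError() is hypothetical: it should run the fold loop below on one split
run.err <- sapply(splits.list, function(s) colMeans(cvError(s)))
rowMeans(run.err)  # per-method error rate averaged over the 100 runs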
# Define here the constants that you will not vary. For example, I will
# use the top-50 limma genes

ngenes <- 50
nmethod <- 7  #number of methods you plan to compare. 

# Define here an output object to store results
pr.err <- matrix(-1, nfold, nmethod, dimnames = list(paste0("Fold", 1:nfold), 
    c("1NN", "5NN", "10NN", "15NN", "LDA", "Logit", "SVM")))

for (i in 1:nfold) {

    # Test Fold for the i-th step
    testdat.fold <- trainDat[, c(fold.pos[i, ], fold.neg[i, ])]
    # create a factor with the classes of the i-th fold's test set
    testclass.fold <- train.es$LnStatus[c(fold.pos[i, ], fold.neg[i, ])]


    # The rest of the samples are the training set for the i-th step
    traindat.fold <- trainDat[, -c(fold.pos[i, ], fold.neg[i, ])]
    trainclass.fold <- train.es$LnStatus[-c(fold.pos[i, ], fold.neg[i, ])]

    # Step 1: feature selection (do you remember limma?).

    # Note that a different set of genes will be selected for each fold! You
    # can then compare how consistent these sets are.

    limma.dat <- as.data.frame(traindat.fold)
    desMat <- model.matrix(~trainclass.fold, limma.dat)  #design matrix
    trainFit <- lmFit(limma.dat, desMat)
    eBtrainFit <- eBayes(trainFit)

    # top-50 limma genes
    top.fold <- topTable(eBtrainFit, coef = which(colnames(coef(trainFit)) != 
        "(Intercept)"), n = ngenes, sort.by = "P")

    # Retain the top-50 limma genes from the train and test sets
    traindat.fold <- traindat.fold[rownames(top.fold), ]
    testdat.fold <- testdat.fold[rownames(top.fold), ]


    # STEP 2: select a classifier. Set a counter for the methods tested.
    l <- 0

    # kNN classifiers
    for (kk in c(1, 5, 10, 15)) {
        # each pass through this loop increments the method counter l (i.e.,
        # l = 1 for method 1, l = 2 for method 2, etc.)
        l <- l + 1

        # knn needs samples in rows
        yhat.knn <- knn(train = t(traindat.fold), test = t(testdat.fold), cl = trainclass.fold, 
            k = kk)
        # Store the prediction error for each kk within this fold
        pr.err[i, l] <- mean(testclass.fold != yhat.knn)
    }  #end of kNN loop

    # LDA method. Note that you can change the prior parameter to reflect a
    # different proportion of case and control samples. The default is to use
    # the class proportions from the training set.

    m.lda <- lda(x = t(traindat.fold), grouping = trainclass.fold, prior = c(0.5, 
        0.5))
    yhat.lda <- predict(m.lda, newdata = t(testdat.fold))$class
    pr.err[i, "LDA"] <- mean(testclass.fold != yhat.lda)

    # Logit
    glm.dat <- data.frame(t(traindat.fold), group = trainclass.fold)
    m.log <- glm(group ~ ., data = glm.dat, family = binomial)

    pr.log <- predict(m.log, newdata = data.frame(t(testdat.fold)), type = "response")
    pr.cl <- factor(ifelse(pr.log > 0.5, "pos", "neg"), levels = levels(testclass.fold))
    pr.err[i, "Logit"] <- mean(pr.cl != testclass.fold)

    # SVM
    m.svm <- svm(x = t(traindat.fold), y = trainclass.fold, cost = 1, type = "C-classification", 
        kernel = "linear")
    pr.svm <- predict(m.svm, newdata = t(testdat.fold))

    pr.err[i, "SVM"] <- mean(pr.svm != testclass.fold)
}  #end of CV loop
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (this pair of warnings is issued in each of the 6 folds, 12 warnings in all)

These glm warnings are expected: with 50 predictors and only 80 training samples per fold, the two classes are perfectly separable, so the logistic regression coefficients diverge and the fitted probabilities go to 0 or 1.

Error rates

Now you can get the average prediction error for all methods.

cv.err <- colMeans(pr.err)

# mean - 1 sd (sd of the 6 error rates)
ls <- cv.err - apply(pr.err, 2, sd)

# mean + 1 sd (sd of the 6 error rates)
us <- cv.err + apply(pr.err, 2, sd)

# plot the results
plot(1:nmethod, cv.err, ylim = c(0, 1), xlim = c(1, (nmethod + 0.5)), type = "n", 
    axes = FALSE, xlab = "Classifier", ylab = "Error rate", main = "6-fold CV Error")

for (j in 1:ncol(pr.err)) points(jitter(rep(j, 6), factor = 2), jitter(pr.err[, 
    j]), cex = 0.8, pch = "X", col = "gray")

for (i in 1:nmethod) lines(c(i, i), c(ls[i], us[i]), lwd = 2, col = "gray")
points(1:nmethod, ls, pch = 19, col = "red")
points(1:nmethod, us, pch = 19, col = "green")
points(1:nmethod, cv.err, pch = 19, cex = 1.5, col = "black")
axis(2)
axis(1, 1:nmethod, colnames(pr.err))

box()

[figure: 6-fold CV error rate for each classifier: per-fold errors (gray X's), mean (black), mean ± 1 SD (red/green)]
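
To rank the classifiers numerically rather than reading them off the plot (a one-line sketch):

sort(cv.err)  # mean CV error, best classifier first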

Testing the selected model

Based on the CV results above, we now pick a single classifier (here, 10NN) and assess its performance on the 20 test samples (note that, for simplicity, the model is trained on all probesets rather than the 50 selected per fold).

yhat.knn <- knn(train = t(trainDat), test = t(testDat), cl = train.es$LnStatus, 
    k = 10)
# prediction error on the 20 test samples
pr.errTest <- mean(test.es$LnStatus != yhat.knn)
pr.errTest
## [1] 0.65
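
To see how the errors split between the two classes, cross-tabulate truth against prediction (a quick sketch):

# confusion matrix on the test set: rows = true LnStatus, columns = 10NN call
table(true = test.es$LnStatus, predicted = yhat.knn)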

CMA

The Bioconductor package CMA automates this workflow: it generates the learning sets, runs feature selection within each one, and fits and evaluates classifiers.

featureScores <- GeneSelection(X = t(trainDat), y = train.es$LnStatus, learningsets = splits, 
    method = "limma")
## GeneSelection: iteration 1 
## GeneSelection: iteration 2 
## GeneSelection: iteration 3 
## GeneSelection: iteration 4 
## GeneSelection: iteration 5 
## GeneSelection: iteration 6

# Compare the lists of selected genes using:
toplist(featureScores)
## top  10  genes for iteration  1 
##  
##    index importance
## 1  10254      24.36
## 2   9265      24.22
## 3  18668      21.86
## 4    767      21.68
## 5  23567      20.29
## 6  13802      20.28
## 7   1702      19.79
## 8  18860      19.63
## 9  13581      19.53
## 10 23206      19.33

# We can aggregate the results across the 6 splits.

seliter <- numeric()
for (i in 1:nfold) seliter <- c(seliter, toplist(featureScores, iter = i, top = 10, 
    show = FALSE)$index)
sort(table(seliter), decreasing = TRUE)  # summarize
## seliter
##  9265  1702   808  6936 10254 21919  6938 14544 16094 19068 19932 23206 
##     6     5     3     3     3     3     2     2     2     2     2     2 
## 23567   767  1377  1491  6001  6937  7183 11052 11093 13581 13802 14249 
##     2     1     1     1     1     1     1     1     1     1     1     1 
## 14378 15997 18668 18860 19152 19697 19958 20690 21064 21592 22524 23208 
##     1     1     1     1     1     1     1     1     1     1     1     1

# Choose the 20 probes most commonly selected across the 6 splits
bestprobes <- as.numeric(names(sort(table(seliter), decreasing = TRUE)))[1:20]

# examine the annotations. I just selected a few columns from the fData of
# the eSet.
fData(dat)[bestprobes, c("Gene Symbol", "Gene Title", "ENTREZ_GENE_ID", "Representative Public ID")]
##                    Gene Symbol
## 212384_at    ATP6V1G2 /// BAT1
## 1569472_s_at              TTC3
## 1556088_at               RPAIN
## 208661_s_at               TTC3
## 213593_s_at              TRA2A
## 237746_at               SFRS11
## 208663_s_at               TTC3
## 222439_s_at             THRAP3
## 224872_at                DIP2B
## 228697_at                HINT3
## 230609_at               CLINT1
## 242562_at              DNAJC24
## 243751_at                     
## 1555920_at                CBX3
## 1560850_at                    
## 1563881_at                    
## 205655_at                 MDM4
## 208662_s_at               TTC3
## 208921_s_at                SRI
## 215220_s_at                TPR
##                                                                                             Gene Title
## 212384_at    ATPase, H+ transporting, lysosomal 13kDa, V1 subunit G2 /// HLA-B associated transcript 1
## 1569472_s_at                                                         tetratricopeptide repeat domain 3
## 1556088_at                                                                     RPA interacting protein
## 208661_s_at                                                          tetratricopeptide repeat domain 3
## 213593_s_at                                                   transformer 2 alpha homolog (Drosophila)
## 237746_at                                                     Splicing factor, arginine/serine-rich 11
## 208663_s_at                                                          tetratricopeptide repeat domain 3
## 222439_s_at                                              thyroid hormone receptor associated protein 3
## 224872_at                                      DIP2 disco-interacting protein 2 homolog B (Drosophila)
## 228697_at                                                 histidine triad nucleotide binding protein 3
## 230609_at                                                                        clathrin interactor 1
## 242562_at                                                 DnaJ (Hsp40) homolog, subfamily C, member 24
## 243751_at                                                                                             
## 1555920_at                                         Chromobox homolog 3 (HP1 gamma homolog, Drosophila)
## 1560850_at                                                                                            
## 1563881_at                                                                                            
## 205655_at                                                     Mdm4 p53 binding protein homolog (mouse)
## 208662_s_at                                                          tetratricopeptide repeat domain 3
## 208921_s_at                                                                                     sorcin
## 215220_s_at                                   translocated promoter region (to activated MET oncogene)
##              ENTREZ_GENE_ID Representative Public ID
## 212384_at      534 /// 7919                 AI282485
## 1569472_s_at           7267                 BC026260
## 1556088_at            84268                 AK098491
## 208661_s_at            7267                 AW510696
## 213593_s_at           29896                 AW978896
## 237746_at              9295                 AI168187
## 208663_s_at            7267                 AI652848
## 222439_s_at            9967                 BE967048
## 224872_at             57609                 AB040896
## 228697_at            135114                 AW731710
## 230609_at              9685                 BF510429
## 242562_at            120526                 AW772288
## 243751_at                                   AA709148
## 1555920_at            11335                 BU683892
## 1560850_at                                  BC016831
## 1563881_at                                  AL831897
## 205655_at              4194                NM_002393
## 208662_s_at            7267                 AI885338
## 208921_s_at            6717                   L12387
## 215220_s_at            7175                 AK023111
# use all 96 training samples as a single learning set (a one-row matrix of indices)
m <- matrix(which(dat$Set == "patient type: training set"), 1)

full.learningset <- new("learningsets", learnmatrix = m, method = "my own", 
    ntrain = 96, iter = 1)

fullFeatureScores <- GeneSelection(X = t(exprs(dat)), learningsets = full.learningset, 
    y = dat$LnStatus, method = "t.test")
## GeneSelection: iteration 1

# classify with probabilistic kNN (pknnCMA), k = 5, using the top 100 genes
testclassif <- classification(X = t(exprs(dat)), y = dat$LnStatus, learningsets = full.learningset, 
    genesel = fullFeatureScores, nbgene = 100, classifier = pknnCMA, k = 5)
## iteration 1

# Evaluation:
tres <- testclassif[[1]]
ftable(tres)
## number of missclassifications:  11 
## missclassification rate:  0.55 
## sensitivity: 0.545 
## specificity: 0.333 
##     predicted
## true 0 1
##    0 3 6
##    1 5 6
roc(tres)

[figure: ROC curve produced by roc(tres)]
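
CMA can also compute performance measures programmatically; for instance (a sketch, assuming CMA's evaluation() interface):

# average the misclassification rate over the learning sets (here just one)
evaluation(testclassif, measure = "misclassification")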