Abrar — Apr 2, 2014, 1:22 PM
library(MASS)
library(reshape)
Loading required package: plyr
Attaching package: 'reshape'
The following objects are masked from 'package:plyr':
rename, round_any
library(car)
library(limma)
library(e1071)
Loading required package: class
Attaching package: 'class'
The following object is masked from 'package:reshape':
condense
library(glmnet)
Loading required package: Matrix
Attaching package: 'Matrix'
The following object is masked from 'package:reshape':
expand
Loaded glmnet 1.9-5
library(ROCR)
Loading required package: gplots
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009
Attaching package: 'gplots'
The following object is masked from 'package:stats':
lowess
library(CMA)
Loading required package: Biobase
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:parallel':
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
clusterExport, clusterMap, parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply, parSapplyLB
The following object is masked from 'package:Matrix':
as.vector
The following object is masked from 'package:limma':
plotMA
The following object is masked from 'package:stats':
xtabs
The following objects are masked from 'package:base':
anyDuplicated, append, as.data.frame, as.vector, cbind,
colnames, duplicated, eval, evalq, Filter, Find, get,
intersect, is.unsorted, lapply, Map, mapply, match, mget,
order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
table, tapply, union, unique, unlist
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Attaching package: 'CMA'
The following object is masked from 'package:ROCR':
prediction
The following object is masked from 'package:e1071':
tune
The following object is masked from 'package:plyr':
join
library(GEOquery)
Setting options('download.file.method.GEOquery'='auto')
library(lattice)
##################################################################
# Data Preparation
##################################################################
if (file.exists("class_LNstatus.Rdata")) {
# if previously downloaded
load("class_LNstatus.Rdata")
} else {
datgeo <- getGEO("GSE23177", GSEMatrix = TRUE)
dat <- datgeo[[1]] #Note that dat is an ExpressionSet
str(pData(dat), max.level = 0)
# extract only those variables of interest
pData(dat) <- subset(pData(dat), select = c("characteristics_ch1.2", "characteristics_ch1.3", "characteristics_ch1"))
names(pData(dat)) <- c("LnStatus", "LnRatio", "Set")
# Note: LnRatio will not be used in this Seminar. However, you can use it to
# try some of the regularization techniques learned in class (see the glmnet
# sketch after the data summaries below)
# split the ExpressionSet into training and test sets.
train.es <- dat[, dat$Set == "patient type: training set"]
test.es <- dat[, dat$Set != "patient type: training set"]
# Re-label factor
pData(train.es)$LnStatus <- recode(pData(train.es)$LnStatus, "levels(pData(train.es)$LnStatus)[1]='neg'; else='pos'", levels = c("neg", "pos"))
pData(test.es)$LnStatus <- recode(pData(test.es)$LnStatus, "levels(pData(test.es)$LnStatus)[1]='neg'; else='pos'", levels = c("neg", "pos"))
# create data matrices with expression values (probesets in rows). Some of
# the functions we will use do not take ExpressionSets as objects
trainDat <- exprs(train.es)
testDat <- exprs(test.es)
# Redefine the quantitative variable LnRatio to make it a numeric variable.
ntrain <- dim(pData(train.es))[1]
ntest <- dim(pData(test.es))[1]
pData(train.es)$LnRatio <- as.numeric(unlist(strsplit(as.vector(unlist(pData(train.es)$LnRatio)), ":", fixed = TRUE))[(1:ntrain) * 2])
pData(test.es)$LnRatio <- as.numeric(unlist(strsplit(as.vector(unlist(pData(test.es)$LnRatio)), ":", fixed = TRUE))[(1:ntest) * 2])
# save the data to avoid future re-downloading
save(dat, trainDat, testDat, train.es, test.es, file = "class_LNstatus.Rdata")
}
# understand your data for classification
table(pData(train.es)$LnStatus)
neg pos
48 48
table(pData(test.es)$LnStatus)
neg pos
9 11
# understand the continuous response
tapply(pData(train.es)$LnRatio, pData(train.es)$LnStatus, summary)
$neg
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 0 0 0
$pos
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.040 0.070 0.110 0.194 0.228 0.960
tapply(pData(test.es)$LnRatio, pData(test.es)$LnStatus, summary)
$neg
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 0 0 0
$pos
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.050 0.180 0.500 0.457 0.610 0.940
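As noted in the data-preparation step, LnRatio is not analyzed further in this seminar, but it could be used to try the regularization techniques from class. A minimal sketch, under the assumption that you model LnRatio with the lasso (glmnet is already loaded) on the node-positive training samples only, since LnRatio is identically zero for the node-negative cases:
pos.idx <- which(pData(train.es)$LnStatus == "pos")
x.pos <- t(trainDat[, pos.idx])  # samples in rows, probes in columns
y.pos <- pData(train.es)$LnRatio[pos.idx]
cvfit <- cv.glmnet(x.pos, y.pos, alpha = 1)  # lasso; lambda picked by 10-fold CV
plot(cvfit)
cf <- as.matrix(coef(cvfit, s = "lambda.min"))[-1, 1]  # per-probe coefficients, intercept dropped
names(cf)[cf != 0]  # probes retained by the lasso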
# look at the expression of 3 randomly picked genes in both training and
# test sets
set.seed(1234)
(getMe <- sample(1:nrow(train.es), size = 3))
[1] 2756 15082 14766
# training data
trDat <- trainDat[getMe, ]
str(trDat)
num [1:3, 1:96] 7.19 9.17 8.38 7.13 9.38 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:3] "201513_at" "223139_s_at" "222698_s_at"
..$ : chr [1:96] "GSM570518" "GSM570519" "GSM570520" "GSM570521" ...
trDat <- data.frame(LnStatus = pData(train.es)$LnStatus, Set = rep("train", nrow(pData(train.es))), t(trDat))
str(trDat)
'data.frame': 96 obs. of 5 variables:
$ LnStatus : Factor w/ 2 levels "neg","pos": 1 1 2 1 1 2 1 1 2 1 ...
$ Set : Factor w/ 1 level "train": 1 1 1 1 1 1 1 1 1 1 ...
$ X201513_at : num 7.19 7.13 7.39 6.86 6.96 ...
$ X223139_s_at: num 9.17 9.38 9.03 9.55 9.5 ...
$ X222698_s_at: num 8.38 8.24 7.23 7.87 8.45 ...
plotDat.train <- melt(trDat, id = c("LnStatus", "Set"), variable_name = "gene")
colnames(plotDat.train)[colnames(plotDat.train) == "value"] = "gExp"
# test data
tDat <- testDat[getMe, ]
str(tDat)
num [1:3, 1:20] 6.05 9.15 7.55 6.87 8.95 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:3] "201513_at" "223139_s_at" "222698_s_at"
..$ : chr [1:20] "GSM570498" "GSM570499" "GSM570500" "GSM570501" ...
tDat <- data.frame(LnStatus = pData(test.es)$LnStatus, Set = rep("test", nrow(pData(test.es))), t(tDat))
str(tDat)
'data.frame': 20 obs. of 5 variables:
$ LnStatus : Factor w/ 2 levels "neg","pos": 1 1 1 1 1 1 1 1 1 2 ...
$ Set : Factor w/ 1 level "test": 1 1 1 1 1 1 1 1 1 1 ...
$ X201513_at : num 6.05 6.87 6.71 8 6.54 ...
$ X223139_s_at: num 9.15 8.95 9.09 9.81 9.2 ...
$ X222698_s_at: num 7.55 8.34 8.32 7.33 8.14 ...
plotDat.test <- melt(tDat, id = c("LnStatus", "Set"), variable_name = "gene")
colnames(plotDat.test)[colnames(plotDat.test) == "value"] = "gExp"
plotDat <- rbind(plotDat.train, plotDat.test)
stripplot(gExp ~ LnStatus | gene + Set, plotDat, grid = TRUE, groups = LnStatus, auto.key = TRUE, jitter.data = TRUE)
##################################################################
# Classification
##################################################################
# Cross validation splits
nfold <- 6
tabTrain <- table(train.es$LnStatus)
indlist <- sapply(names(tabTrain), function(z) which(train.es$LnStatus == z), simplify = FALSE)
set.seed(1234)
# Each row of fold.pos (fold.neg) holds the indices of the 8 positive (8 negative) samples assigned to one of the 6 folds.
fold.pos <- matrix(sample(indlist[["pos"]]), nrow = nfold)
fold.neg <- matrix(sample(indlist[["neg"]]), nrow = nfold)
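A quick sanity check on these hand-rolled splits (not part of the original seminar code): each fold should contain 8 positive and 8 negative samples, and the folds together should use each of the 96 training samples exactly once.
stopifnot(ncol(fold.pos) == 8, ncol(fold.neg) == 8)
all.folds <- sort(c(as.vector(fold.pos), as.vector(fold.neg)))
stopifnot(all(all.folds == seq_len(ncol(trainDat))))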
##################################################################
# Exercise 1
##################################################################
# Exercise 1: generate stratified 6-fold CV splits with CMA. As written, each
# iteration overwrites `splits`, so only the final learningsets object is kept
# (it is the one reused in the CMA section below).
for (i in 1:100){
splits <- GenerateLearningsets(y = train.es$LnStatus, method = "CV", fold = 6, strat = TRUE)
}
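If the goal is to keep all 100 sets of splits and study how they vary, a minimal sketch is to collect them in a list; splitList is a hypothetical name not used elsewhere in this analysis.
splitList <- replicate(100, GenerateLearningsets(y = train.es$LnStatus, method = "CV", fold = 6, strat = TRUE), simplify = FALSE)
length(splitList)  # 100 learningsets objects, each usable with GeneSelection() / classification()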
##################################################################
# Loop for feature selection and modeling + Exercise 1
##################################################################
# Define the constants that will stay fixed across folds. For example, I will
# use the top-50 limma genes
ngenes <- 50
nmethod <- 7 #number of methods you plan to compare.
# Define an output object to store the results
pr.err <- matrix(-1, nfold, nmethod, dimnames = list(paste0("Fold", 1:nfold), c("1NN", "5NN", "10NN", "15NN", "LDA", "Logit", "SVM")))
for (i in 1:nfold) {
# Test Fold for the i-th step
testdat.fold <- trainDat[, c(fold.pos[i, ], fold.neg[i, ])]
# Create a factor of class labels for the test set of the i-th fold
testclass.fold <- train.es$LnStatus[c(fold.pos[i, ], fold.neg[i, ])]
# The rest of the samples are the training set for the i-th step
traindat.fold <- trainDat[, -c(fold.pos[i, ], fold.neg[i, ])]
trainclass.fold <- train.es$LnStatus[-c(fold.pos[i, ], fold.neg[i, ])]
# Step 1: feature selection (do you remember limma?).
# Note that a different set of genes will be selected for each fold! You can
# then compare how consistent these sets are across folds.
limma.dat <- as.data.frame(traindat.fold)
desMat <- model.matrix(~trainclass.fold, limma.dat) #design matrix
trainFit <- lmFit(limma.dat, desMat)
eBtrainFit <- eBayes(trainFit)
# top-50 limma genes; select a fixed number of genes rather than filtering by
# p-value, since a hard cutoff (e.g. p.value = 1e-4) can return zero genes in
# some folds and stop the loop with "Error: a dimension is zero"
top.fold <- topTable(eBtrainFit, coef = which(colnames(coef(trainFit)) != "(Intercept)"), number = ngenes, sort.by = "P")
# Retain the top-50 limma genes from the train and test sets
traindat.fold <- traindat.fold[rownames(top.fold), ]
testdat.fold <- testdat.fold[rownames(top.fold), ]
# STEP 2: select a classifier. Set a counter for the method being tested
l <- 0
# kNN classifiers
for (kk in c(1, 5, 10, 15)) {
# each pass through this loop increments the counter l (1, 2, ... for
# method 1, method 2, ...)
l <- l + 1
# knn needs samples in rows
yhat.knn <- knn(train = t(traindat.fold), test = t(testdat.fold), cl = trainclass.fold,
k = kk)
# Store the prediction error for each kk within this fold
pr.err[i, l] <- mean(testclass.fold != yhat.knn)
} #end of kNN loop
# LDA method. Note that you can change the prior parameter to reflect a
# different proportion of case and control samples. The default is to use
# the class proportions from the training set (a sketch of that default call
# follows right after the CV loop).
m.lda <- lda(x = t(traindat.fold), grouping = trainclass.fold, prior = c(0.5, 0.5))
yhat.lda <- predict(m.lda, newdata = t(testdat.fold))$class
pr.err[i, "LDA"] <- mean(testclass.fold != yhat.lda)
# Logit
glm.dat <- data.frame(t(traindat.fold), group = trainclass.fold)
m.log <- glm(group ~ ., data = glm.dat, family = binomial)
pr.log <- predict(m.log, newdata = data.frame(t(testdat.fold)), type = "response")
pr.cl <- factor(ifelse(pr.log > 1/2, "pos", "neg"), levels = levels(testclass.fold))
pr.err[i, "Logit"] <- mean(pr.cl != testclass.fold)
# SVM
m.svm <- svm(x = t(traindat.fold), y = trainclass.fold, cost = 1, type = "C-classification",
kernel = "linear")
pr.svm <- predict(m.svm, newdata = t(testdat.fold))
pr.err[i, "SVM"] <- mean(pr.svm != testclass.fold)
} #end of CV loop
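As the comment above the lda() call notes, the prior can also be left at MASS's default, the class proportions observed in the training data. A sketch of that variant, run here outside the loop for illustration using the objects left over from the last fold:
m.lda.default <- lda(x = t(traindat.fold), grouping = trainclass.fold)  # prior defaults to observed class proportions
m.lda.default$prior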
##################################################################
# Error Rates
##################################################################
cv.err <- colMeans(pr.err)
# mean - 1 sd (sd of the 6 error rates)
ls <- cv.err - apply(pr.err, 2, sd)
# mean + 1 sd (sd of the 6 error rates)
us <- cv.err + apply(pr.err, 2, sd)
# plot the results
plot(1:nmethod, cv.err, ylim = c(0, 1), xlim = c(1, (nmethod + 0.5)), type = "n",
axes = FALSE, xlab = "Classifier", ylab = "Error rate", main = "6-fold CV Error")
for (j in 1:ncol(pr.err)) points(jitter(rep(j, 6), factor = 2), jitter(pr.err[, j]), cex = 0.8, pch = "X", col = "gray")
for (i in 1:nmethod) lines(c(i, i), c(ls[i], us[i]), lwd = 2, col = "gray")
points(1:nmethod, ls, pch = 19, col = "red")
points(1:nmethod, us, pch = 19, col = "green")
points(1:nmethod, cv.err, pch = 19, cex = 1.5, col = "black")
axis(2)
axis(1, 1:nmethod, colnames(pr.err))
box()
##################################################################
# Testing the selected model
##################################################################
yhat.knn <- knn(train = t(trainDat), test = t(testDat), cl = train.es$LnStatus, k = 10)
# Prediction error on the 20 held-out test samples
pr.errTest <- mean(test.es$LnStatus != yhat.knn)
pr.errTest
[1] 0.55
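To see where this 0.55 test error comes from, a confusion matrix of the 10-NN predictions against the true test labels (assuming yhat.knn from the call above is still in the workspace):
table(predicted = yhat.knn, true = test.es$LnStatus)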
##################################################################
# CMA
##################################################################
featureScores <- GeneSelection(X = t(trainDat), y = train.es$LnStatus, learningsets = splits, method = "limma")
GeneSelection: iteration 1
GeneSelection: iteration 2
GeneSelection: iteration 3
GeneSelection: iteration 4
GeneSelection: iteration 5
GeneSelection: iteration 6
# Compare list of selected genes using:
toplist(featureScores)
top 10 genes for iteration 1
index importance
1 1702 19.88
2 9265 19.28
3 10254 18.84
4 18958 16.55
5 14544 16.53
6 6936 16.49
7 23551 15.05
8 6001 14.92
9 20571 14.67
10 23567 14.58
# We can aggregate the results across the 6 splits.
seliter <- numeric()
for (i in 1:nfold) seliter <- c(seliter, toplist(featureScores, iter = i, top = 10, show = FALSE)$index)
sort(table(seliter), dec = T) # summarize
seliter
9265 1702 6936 10254 23567 14544 19932 21592 21919 23206 23930 18
6 5 4 3 3 2 2 2 2 2 2 1
767 808 944 1174 6001 6938 7183 10292 11052 14084 14378 15493
1 1 1 1 1 1 1 1 1 1 1 1
16094 17847 18958 19068 19238 19410 19562 19821 20571 21369 22328 23069
1 1 1 1 1 1 1 1 1 1 1 1
23504 23551
1 1
# Keep the 20 probes selected most often across the 6 splits
bestprobes <- as.numeric(names(sort(table(seliter), dec = T)))[1:20]
# examine the annotations. I just selected a few columns from the fData of
# the eSet.
fData(dat)[bestprobes, c("Gene Symbol", "Gene Title", "ENTREZ_GENE_ID", "Representative Public ID")]
Gene Symbol
212384_at ATP6V1G2 /// BAT1
1569472_s_at TTC3
208661_s_at TTC3
213593_s_at TRA2A
243751_at
222439_s_at THRAP3
230609_at CLINT1
236196_at
237746_at SFRS11
242562_at DNAJC24
37117_at ARHGAP8 /// PRR5-ARHGAP8
1552283_s_at ZDHHC11
1555920_at CBX3
1556088_at RPAIN
1557193_at
1558747_at SMCHD1
205655_at MDM4
208663_s_at TTC3
208921_s_at SRI
213649_at SRSF7
Gene Title
212384_at ATPase, H+ transporting, lysosomal 13kDa, V1 subunit G2 /// HLA-B associated transcript 1
1569472_s_at tetratricopeptide repeat domain 3
208661_s_at tetratricopeptide repeat domain 3
213593_s_at transformer 2 alpha homolog (Drosophila)
243751_at
222439_s_at thyroid hormone receptor associated protein 3
230609_at clathrin interactor 1
236196_at
237746_at Splicing factor, arginine/serine-rich 11
242562_at DnaJ (Hsp40) homolog, subfamily C, member 24
37117_at Rho GTPase activating protein 8 /// PRR5-ARHGAP8 readthrough
1552283_s_at zinc finger, DHHC-type containing 11
1555920_at Chromobox homolog 3 (HP1 gamma homolog, Drosophila)
1556088_at RPA interacting protein
1557193_at
1558747_at structural maintenance of chromosomes flexible hinge domain containing 1
205655_at Mdm4 p53 binding protein homolog (mouse)
208663_s_at tetratricopeptide repeat domain 3
208921_s_at sorcin
213649_at serine/arginine-rich splicing factor 7
ENTREZ_GENE_ID Representative Public ID
212384_at 534 /// 7919 AI282485
1569472_s_at 7267 BC026260
208661_s_at 7267 AW510696
213593_s_at 29896 AW978896
243751_at AA709148
222439_s_at 9967 BE967048
230609_at 9685 BF510429
236196_at BF939032
237746_at 9295 AI168187
242562_at 120526 AW772288
37117_at 23779 /// 553158 Z83838
1552283_s_at 79844 NM_024786
1555920_at 11335 BU683892
1556088_at 84268 AK098491
1557193_at AI085450
1558747_at 23347 AA336502
205655_at 4194 NM_002393
208663_s_at 7267 AI652848
208921_s_at 6717 L12387
213649_at 6432 AA524053
# Build a single learning set that contains all 96 training-set samples (a 1 x 96 index matrix)
m <- matrix(which(dat$Set == "patient type: training set"), 1)
full.learningset <- new("learningsets", learnmatrix = m, method = "my own", ntrain = 96, iter = 1)
fullFeatureScores <- GeneSelection(X = t(exprs(dat)), learningsets = full.learningset, y = dat$LnStatus, method = "t.test")
GeneSelection: iteration 1
testclassif <- classification(X = t(exprs(dat)), y = dat$LnStatus, learningsets = full.learningset, genesel = fullFeatureScores, nbgene = 100, classifier = pknnCMA, k = 5)
iteration 1
# Evaluation:
tres <- testclassif[[1]]
ftable(tres)
number of missclassifications: 11
missclassification rate: 0.55
sensitivity: 0.545
specificity: 0.333
predicted
true 0 1
0 3 6
1 5 6
roc(tres)
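Beyond the ROC curve, CMA's evaluation() can summarize the same classification result; a brief sketch (which measures you inspect is up to you, these two are just examples):
evaluation(testclassif, measure = "misclassification")
evaluation(testclassif, measure = "auc")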