Step 1: DOWNLOADING PACKAGES TO COMPUTER This step only needs to be completed once.

Loading library

library(Biobase)
## Loading required package: BiocGenerics
## Loading required package: generics
## 
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
##     unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
library(ALL)

data("ALL", package = "ALL")

class(ALL)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"

STEP 2: DATA PRE-PROCESSING

Obtaining the names/ descriptions of covariates. Info on the distribution of the samples across the two main variants: BT variables determines the type of acute lymphoblastic leukemia mol.bio variables describes the cytogenetic abnormality found on each sample.

pD <- phenoData(ALL) 
varMetadata(pD)
##                                                                   labelDescription
## cod                                                                     Patient ID
## diagnosis                                                        Date of diagnosis
## sex                                                          Gender of the patient
## age                                                    Age of the patient at entry
## BT                                      does the patient have B-cell or T-cell ALL
## remission           Complete remission(CR), refractory(REF) or NA. Derived from CR
## CR                                                          Original remisson data
## date.cr                                        Date complete remission if achieved
## t(4;11)             did the patient have t(4;11) translocation. Derived from citog
## t(9;22)             did the patient have t(9;22) translocation. Derived from citog
## cyto.normal                        Was cytogenetic test normal? Derived from citog
## citog             original citogenetics data, deletions or t(4;11), t(9;22) status
## mol.biol                                                         molecular biology
## fusion protein                        which of p190, p210 or p190/210 for bcr/able
## mdr                                                           multi-drug resistant
## kinet                                            ploidy: either diploid or hyperd.
## ccr                                Continuous complete remission? Derived from f.u
## relapse                                                  Relapse? Derived from f.u
## transplant      did the patient receive a bone marrow transplant? Derived from f.u
## f.u                                                       follow up data available
## date last seen                                          date patient was last seen
table(ALL$BT)
## 
##  B B1 B2 B3 B4  T T1 T2 T3 T4 
##  5 19 36 23 12  5  1 15 10  2
table(ALL$mol.biol)
## 
## ALL1/AF4  BCR/ABL E2A/PBX1      NEG   NUP-98  p15/p16 
##       10       37        5       74        1        1
table(ALL$BT, ALL$mol.biol)
##     
##      ALL1/AF4 BCR/ABL E2A/PBX1 NEG NUP-98 p15/p16
##   B         0       2        1   2      0       0
##   B1       10       1        0   8      0       0
##   B2        0      19        0  16      0       1
##   B3        0       8        1  14      0       0
##   B4        0       7        3   2      0       0
##   T         0       0        0   5      0       0
##   T1        0       0        0   1      0       0
##   T2        0       0        0  15      0       0
##   T3        0       0        0   9      1       0
##   T4        0       0        0   2      0       0

Showing first 10 genes and the names of first 5 samples.

featureNames(ALL)[1:10]
##  [1] "1000_at"   "1001_at"   "1002_f_at" "1003_s_at" "1004_at"   "1005_at"  
##  [7] "1006_at"   "1007_s_at" "1008_f_at" "1009_at"
sampleNames(ALL)[1:5]
## [1] "01005" "01010" "03002" "04006" "04007"

Set of cases that will be considered. Samples with specific values of the BT and mol.bio variables.

tgt.cases <- which(ALL$BT %in% levels(ALL$BT)[1:5] &
                     ALL$mol.biol %in% levels(ALL$mol.biol)[1:4])

ALLb <- ALL[, tgt.cases]
ALLb
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 94 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: 01005 01010 ... LAL5 (94 total)
##   varLabels: cod diagnosis ... date last seen (21 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 14684422 16243790 
## Annotation: hgu95av2

Update available levels of factors into new ‘ALLb’ object:

ALLb$BT <- factor(ALLb$BT)
ALLb$mol.biol <- factor(ALLb$mol.biol)

Saving object locally on computer to not repeat preprocessing steps

save(ALLb, file = "myALL.Rdata")

STEP 3: EXPLORING DATASET:

Access gene expression levels matrix:

es <- exprs(ALLb)
dim(es)
## [1] 12625    94
summary(as.vector(es))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.985   4.122   5.469   5.624   6.829  14.045

Using package ‘genefilter’ to graph expression levels:

library(genefilter)
library(ggplot2)

exprVs <- data.frame(exprVal=as.vector(es))

ds <- data.frame(Stat=c("1stQ","Median","3rdQ","Shorth"),
                 Value=c(quantile(exprVs$exprVal,
                                  probs=c(0.25, 0.5, 0.75)),
                         shorth(exprVs$exprVal)),
                 Color=c("red","green","red","yellow"))

ggplot(exprVs,aes(x=exprVal)) + geom_histogram(fill="lightgrey") +
  geom_vline(data=ds,aes(xintercept=Value,color=Color)) +
  geom_text(data=ds,aes(x=Value-0.2,y=0,label=Stat,colour=Color),
            angle=90,hjust="left") +
  xlab("Expression Levels") + guides(colour="none", fill="none")
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

… WRITE SOMETHING HERE…

sapply(levels(ALLb$mol.biol),
       function(x) summary(as.vector(es[, which(ALLb$mol.biol == x)])))
##          ALL1/AF4   BCR/ABL  E2A/PBX1       NEG
## Min.     2.266403  2.194858  2.267802  1.984919
## 1st Qu.  4.140797  4.124414  4.152339  4.111435
## Median   5.454388  5.467834  5.496825  5.469887
## Mean     5.620976  5.627016  5.630445  5.621616
## 3rd Qu.  6.805440  6.832862  6.818587  6.832056
## Max.    14.031235 14.044896 13.812578 13.949564

STEP 4: GENE SELECTION

4.1 simple filters based on distribution properties

rowIQRs <- function(em)
  rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))

library(ggplot2)

dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))

ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
  xlab("Median expression level") + ylab("IQR expression level") +
  ggtitle("Main Characteristics of Genes Expression Levels")

Many genes of low variability (IQR near 0) Low variability means not useful for discriminating amoung different types of mutations

library(genefilter)

resFilter <- nsFilter(ALLb,
                      var.func=IQR,
                      var.cutoff=IQR(as.vector(es))/5, #removes genes with a variability under 1/5 of global IQR
                      feature.exclude="^AFFX")
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## 
## 
resFilter
## $eset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 4025 features, 94 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: 01005 01010 ... LAL5 (94 total)
##   varLabels: cod diagnosis ... date last seen (21 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 14684422 16243790 
## Annotation: hgu95av2 
## 
## $filter.log
## $filter.log$numDupsRemoved
## [1] 2888
## 
## $filter.log$numLowVar
## [1] 4751
## 
## $filter.log$numRemoved.ENTREZID
## [1] 942
## 
## $filter.log$feature.exclude
## [1] 19

Update the ‘ALLb’ and ‘es’ objects with the filtered objects.

ALLb <- resFilter$eset
es <- exprs(ALLb)
dim(es)
## [1] 4025   94

We can see in the output how much the dataset has decreased.

ANOVA FILTERS: comparing means across four groups, one of each of the B-cell ALL we are considering Genes considered useful are given the value TRUE

f <- Anova(ALLb$mol.biol, p = 0.01) # ANOVA filtering
ff <- filterfun(f) # generates a filtering function that can be applied to an expression matrix
selGenes <- genefilter(exprs(ALLb), ff) # creates a vector with as many elements as there are genes in the matrix
sum(selGenes)
## [1] 751

751 genes are considered useful from ANOVA filtering

Now, update the data structure to include only the selected genes

ALLb <- ALLb[selGenes, ]
ALLb
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 751 features, 94 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: 01005 01010 ... LAL5 (94 total)
##   varLabels: cod diagnosis ... date last seen (21 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 14684422 16243790 
## Annotation: hgu95av2

…

es <- exprs(ALLb)
dim(es)
## [1] 751  94

Our new data: Significantly less dense than the other graph.

dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))

ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
  xlab("Median expression level") + ylab("IQR expression level") +
  ggtitle("Distribution Properties of the Selected Genes")

…

4.3: FILTERING USING RANDOM FORESTS

Must continue filtering because there are too many features than observations. Random Forests used to obtain a ranking of genes’ usefulness.

library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
dt <- data.frame(t(es), Mut = ALLb$mol.biol)
dt$Mut <- droplevels(dt$Mut)

set.seed(1234)

rf <- randomForest(Mut ~ ., dt, importance = TRUE) # obtain relevance of each variable

imp <- importance(rf)

rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"], # select column with variables scores measured by estimated mean decrease 
                                decreasing = TRUE)[1:30]] # order in decreasing order of the highest 30 scores, obtaining names

To see the median expression level for the 30 selected genes

sapply(rf.genes, function(g) tapply(dt[, g], dt$Mut, median))
##          X1635_at X37015_at X1674_at X40202_at X40504_at X41470_at X41071_at
## ALL1/AF4 7.302814  3.752649 3.745752  8.550639  3.218079  9.616743  7.698420
## BCR/ABL  8.693082  4.857105 5.833510  9.767293  4.924310  5.205797  6.017967
## E2A/PBX1 7.562676  6.579530 3.808258  7.414635  3.455316  3.931191  6.058185
## NEG      7.324691  3.765741 4.244791  7.655605  3.541651  4.157748  6.573731
##          X37105_at X34699_at X37225_at X36873_at X37981_at X36638_at X1467_at
## ALL1/AF4  6.845719  4.253504  5.220668  7.040593  6.170311  9.811828 3.708985
## BCR/ABL   6.493001  6.315966  3.460902  3.490262  6.882755  8.486446 4.239306
## E2A/PBX1  6.740213  6.102031  7.445655  3.634471  8.080002  6.259730 3.411696
## NEG       6.298859  6.092511  3.387552  3.824670  7.423079  5.856580 3.515020
##          X38323_at X34210_at X40795_at X41274_at X39424_at X33412_at X38994_at
## ALL1/AF4  4.195967  5.641130  3.867134  5.501107  6.896571 10.757286  8.939007
## BCR/ABL   4.866452  9.204237  4.544239  5.261901  8.054597  6.880112  8.760193
## E2A/PBX1  4.317550  8.198781  4.151637  4.888560  7.689701  5.636466  4.741548
## NEG       4.286104  8.791774  3.909532  4.860075  8.044947  5.881145  8.046647
##          X1249_at X36275_at X41237_at X37014_at X1307_at X37951_at X1616_at
## ALL1/AF4 3.582763  3.618819  10.94079  5.596978 3.368915  3.418433 4.059859
## BCR/ABL  4.477659  6.259073  12.11895  5.751943 4.945270  3.881780 3.643486
## E2A/PBX1 3.257649  3.635956  11.35610  5.837242 4.678577  3.461861 6.033233
## NEG      3.791764  3.749953  11.93624  7.029444 4.863930  3.419113 3.747773
##          X32116_at X32434_at
## ALL1/AF4  7.115400  3.317480
## BCR/ABL   7.966959  5.339625
## E2A/PBX1  7.359097  3.668714
## NEG       7.636213  3.226766

…

library(tidyr) # get graph
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
library(dplyr) # summarization package
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, setequal, union
## The following object is masked from 'package:generics':
## 
##     explain
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
d <- gather(dt[,c(rf.genes,"Mut")],Gene,ExprValue,1:length(rf.genes))

dat <- group_by(d,Mut,Gene) %>%
  summarise(med=median(ExprValue), iqr=IQR(ExprValue), .groups = "drop")

ggplot(dat, aes(x=med,y=iqr,color=Mut)) +
  geom_point(size=6) + facet_wrap(~ Gene) +
  labs(x="MEDIAN expression level",y="IQR expression level",color="Mutation")

…

4.4: FILTERING USING CLUSTERING

…

library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:AnnotationDbi':
## 
##     contents
## The following object is masked from 'package:Biobase':
## 
##     contents
## The following objects are masked from 'package:base':
## 
##     format.pval, units
vc <- varclus(t(es)) # function that uses a hierarchical clustering algorithm to cluster variables of a data set.
clus30 <- cutree(vc$hclust, 30) # obtain a clustering formed by 30 groups of variables
table(clus30)
## clus30
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
## 21 16 37 18 44 19 28 38 28 30 35 54 20 47 18 45 17 14 25 18 22 31 28 15  7 15 
## 27 28 29 30 
## 11 18 19 13

We do this under the assumption tat members of the same cluster will be similar to each other and therefore redundant

Generate one set of variables through random sampling from the selected number of clusters (30)

getVarsSet <- function(cluster,nvars=30,seed=NULL,verb=FALSE) { # generates a new set of 30 variables every time function is called
  if (!is.null(seed)) set.seed(seed)
  
  cls <- cutree(cluster,nvars)
  tots <- table(cls)
  vars <- c()
  vars <- sapply(1:nvars,function(clID)
  {
    if (!length(tots[clID])) stop('Empty cluster! (',clID,')')
    x <- sample(1:tots[clID],1)
    names(cls[cls==clID])[x]
  })
  if (verb) structure(vars,clusMemb=cls,clusTots=tots)
  else vars
}

getVarsSet(vc$hclust)
##  [1] "37579_at"   "33371_s_at" "34362_at"   "36008_at"   "39224_at"  
##  [6] "1207_at"    "36403_s_at" "41123_s_at" "2036_s_at"  "37842_at"  
## [11] "36188_at"   "106_at"     "41853_at"   "41715_at"   "41126_at"  
## [16] "763_at"     "39049_at"   "1389_at"    "37015_at"   "41872_at"  
## [21] "32607_at"   "2062_at"    "37304_at"   "33266_at"   "32612_at"  
## [26] "905_at"     "37027_at"   "38081_at"   "39649_at"   "33777_at"
getVarsSet(vc$hclust)
##  [1] "41139_at"   "32585_at"   "31508_at"   "39410_at"   "36089_at"  
##  [6] "33796_at"   "32169_at"   "36577_at"   "32725_at"   "38352_at"  
## [11] "41606_at"   "40480_s_at" "38651_at"   "37039_at"   "37251_s_at"
## [16] "40419_at"   "38014_at"   "317_at"     "33339_g_at" "33305_at"  
## [21] "36203_at"   "37680_at"   "41146_at"   "33429_at"   "37485_at"  
## [26] "40369_f_at" "36142_at"   "38078_at"   "40425_at"   "40454_at"

STEP 5: PREDICTION MODELING

5.1 Using the iris data set to demonstrate bootstrapping

….

Uses libraries performanceEstimation and DMwR2. These no longer exist in the CRAN Archive as of July 2025, so we use a new package called ‘remotes’ to download the legacy version from github.

install.packages("remotes")
remotes::install_github("ltorgo/performanceEstimation")
## Skipping install of 'performanceEstimation' from a github remote, the SHA1 (6262e570) has not changed since last install.
##   Use `force = TRUE` to force installation
remotes::install_github("ltorgo/DMwR2")
## Skipping install of 'DMwR2' from a github remote, the SHA1 (c19cb087) has not changed since last install.
##   Use `force = TRUE` to force installation
library(performanceEstimation)
library(DMwR2)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
data(iris)
exp <- performanceEstimation(
  PredTask(Species ~ ., iris),
  Workflow(learner="rpartXse", predictor.pars=list(type="class")),
  EstimationTask(metrics="acc",method=Bootstrap(nReps=100)))
## 
## 
## ##### PERFORMANCE ESTIMATION USING  BOOTSTRAP  #####
## 
## ** PREDICTIVE TASK :: iris.Species
## 
## ++ MODEL/WORKFLOW :: rpartXse 
## Task for estimating  acc  using
## 100  repetitions of  e0  Bootstrap experiment
##   Run with seed =  1234 
## Iteration :  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99  100
summary(exp)
## 
## == Summary of a  Bootstrap Performance Estimation Experiment ==
## 
## Task for estimating  acc  using
## 100  repetitions of  e0  Bootstrap experiment
##   Run with seed =  1234 
## 
## * Predictive Tasks ::  iris.Species
## * Workflows  ::  rpartXse 
## 
## -> Task:  iris.Species
##   *Workflow: rpartXse 
##                acc
## avg     0.94183656
## std     0.02784701
## med     0.94392034
## iqr     0.03637634
## min     0.86666667
## max     1.00000000
## invalid 0.00000000

5.2: Modeling techniques:

We use 3 different data sets that differ in predictors used Genes selected by ANOVA, Random Forests, and Support Vector Machines (SVM)

Using the package ‘class’ to demonstrate KNN on iris data set:

library(class)
data(iris)

idx <- sample(1:nrow(iris), as.integer(0.7 * nrow(iris))) # 70% of rows to be used for training
tr <- iris[idx, ] # training set
ts <- iris[-idx, ] # test set (everything but training rows)
preds <- knn(tr[, -5], # features used to calculate distance
             ts[, -5], 
             tr[, 5], # the answers of the training data
             k = 3) # 3 nearest neighbors to predict
table(preds, ts[, 5]) # confusion matrix 
##             
## preds        setosa versicolor virginica
##   setosa         14          0         0
##   versicolor      0         15         1
##   virginica       0          2        13

This is a nonstandard interface. We create a function that enables the use of this method in a more standard formula-type interface:

‘stand’ parameter is used to standardize the data before KNN.

kNN <- function(form, train, test, stand = TRUE, stand.stats = NULL, ...) {
  require(class, quietly = TRUE)
  tgtCol <- which(colnames(train) == as.character(form[[2]])) # identifies the index of the target column
  if (stand) {
    if (is.null(stand.stats))
      tmp <- scale(train[, -tgtCol], center = TRUE, scale = TRUE) # calculates mean and stdev. of the training data and scales it if no stats are provided
    else tmp <- scale(train[, -tgtCol], center = stand.stats[[1]], # if stats are provided, uses those values to train data instead
                      scale = stand.stats[[2]])
    train[, -tgtCol] <- tmp # replaces original training features with new scaled versions
    ms <- attr(tmp, "scaled:center") # extracts the means used to scale data
    ss <- attr(tmp, "scaled:scale") # extracts the stdev. used to scale data
    test[, -tgtCol] <- scale(test[, -tgtCol], center = ms, scale = ss) # scales test set
    }
  knn(train[, -tgtCol], test[, -tgtCol], train[, tgtCol], ...) # runs KNN algorithm on scaled data and returns predictions
  }

preds.stand <- kNN(Species ~ ., tr, ts, k = 3) # predicting species and sets k=3
table(preds.stand,ts[, 5]) # generates confusion matrix
##             
## preds.stand  setosa versicolor virginica
##   setosa         14          0         0
##   versicolor      0         15         2
##   virginica       0          2        12
preds.notStand <- kNN(Species ~ ., tr, ts, stand = FALSE, k = 3)
table(preds.notStand, ts[, 5])
##               
## preds.notStand setosa versicolor virginica
##     setosa         14          0         0
##     versicolor      0         15         1
##     virginica       0          2        13

5.3: Comparing Models

Now using our data set, we are comparing 3 models 1. Using the genes from the ANOVA filtering 2. Applyinh the random forest filtering on top of ANOVA result 3. Applying filtering based on ensembles of clustered variables after the ANOVA filtering. Each will be combined with different classification tools

Creating an ensemble of models:

varsEnsemble <- function(tgt,train,test, # name of target variable. training set, test set
                         fs.meth, # a list containing the sets of variable names which is used to generate the predictors of each ensemble
                         baseLearner,blPars, # functions that implements the learner to be used on each member of the ensemble and the learning parameters
                         predictor,predPars, # function used to obtain the predictions of the ensemble for the given test. 
                         verb=FALSE)
  {
  require(Hmisc,quietly=TRUE)
  v <- varclus(as.matrix(train[,-which(colnames(train)==tgt)]))
  varsSets <- lapply(1:fs.meth[[3]],function(x)
    getVarsSet(v$hclust,nvars=fs.meth[[2]]))
  
  preds <- matrix(NA,ncol=length(varsSets),nrow=NROW(test))
  for(v in seq(along=varsSets)) {
    if (baseLearner=='knn')
      preds[,v] <- do.call("kNN",
                           c(list(as.formula(paste(tgt,
                                                   paste(varsSets[[v]],
                                                         collapse='+'),
                                                   sep='~')),
                                  train[,c(tgt,varsSets[[v]])],
                                  test[,c(tgt,varsSets[[v]])]),
                             blPars)
                           )
    else {
      m <- do.call(baseLearner,
                   c(list(as.formula(paste(tgt,
                                           paste(varsSets[[v]],
                                                 collapse='+'),
                                           sep='~')),
                          train[,c(tgt,varsSets[[v]])]),
                     blPars)
                   )
      preds[,v] <- do.call(predictor,
                           c(list(m,test[,c(tgt,varsSets[[v]])]),
                             predPars))
    }
    }
  ps <- apply(preds,1,function(x)
    levels(factor(x))[which.max(table(factor(x)))])
  factor(ps,
         levels=1:nlevels(train[,tgt]),
         labels=levels(train[,tgt]))
  }

Workflow function that takes an algorithm as a parameter ti ryb and evaluate our model pipeline

ALLb.wf <- function(form, train, test,
                    learner, learner.pars=NULL,
                    predictor="predict",predictor.pars=NULL,
                    featSel.meth = "s2",
                    available.fsMethods=list(s1=list("all"), # all features after ANOVA
                                             s2=list('rf',30), # random forest to select top 30 features by mean decrease in accuracy
                                             s3=list('varclus',30,50)), # variable clustering ensemble approach based on 50 models each building 30 predictors
                    .model=FALSE,
                    ...)
  {
  ## The characteristics of the selected feature selection method
  fs.meth <- available.fsMethods[[featSel.meth]]
  
  ## The target variable
  tgt <- as.character(form[[2]])
  tgtCol <- which(colnames(train)==tgt)
  
  ## Anova filtering
  f <- Anova(train[,tgt],p=0.01)
  ff <- filterfun(f)
  genes <- genefilter(t(train[,-tgtCol]),ff)
  genes <- names(genes)[genes]
  train <- train[,c(tgt,genes)]
  test <- test[,c(tgt,genes)]
  tgtCol <- 1
  
  ## Specific filtering
  if (fs.meth[[1]]=='varclus') {
    pred <- varsEnsemble(tgt,train,test,fs.meth,
                         learner,learner.pars,
                         predictor,predictor.pars,
                         list(...))
    } else {
      if (fs.meth[[1]]=='rf') {
        require(randomForest,quietly=TRUE)
        rf <- randomForest(form,train,importance=TRUE)
        imp <- importance(rf)
        rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"],
                                        decreasing = TRUE)[1:fs.meth[[2]]]]
        train <- train[,c(tgt,rf.genes)]
        test <- test[,c(tgt,rf.genes)]
        }
      if (learner == 'knn')
        pred <- kNN(form,train,test,
                    stand.stats=list(rowMedians(t(as.matrix(train[,-tgtCol]))),
                                     rowIQRs(t(as.matrix(train[,-tgtCol])))),
                    ...)
      else {
        model <- do.call(learner,c(list(form,train),learner.pars))
        pred <- do.call(predictor,c(list(model,test),predictor.pars))
      }
      }
  return(list(trues=responseValues(form,test), preds=pred,
              model=if (.model && learner!="knn") model else NULL))
  }

Creating a list that will hold all variants that are considered in estimation experiments:

varslist <- list()
varslist$randomForest <- list(learner.pars=list(ntree=c(500,750,1000),
                                            mtry=c(5,15)),
                          preditor.pars=list(type="response"))
varslist$svm <- list(learner.pars=list(cost=c(1,100),
                                   gamma=c(0.01,0.001,0.0001)))
varslist$knn <- list(learner.pars=list(k=c(3,5,7),
                                   stand=c(TRUE,FALSE)))
varslist$featureSel <- list(featSel.meth=c("s1", "s2", "s3"))
library(performanceEstimation)
library(class)
library(randomForest)
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute
## The following object is masked from 'package:ggplot2':
## 
##     element
## The following object is masked from 'package:generics':
## 
##     interpolate
library(genefilter)

load('myALL.Rdata')
es <- exprs(ALLb)

ALLb <- nsFilter(ALLb,
                 var.func=IQR, var.cutoff=IQR(as.vector(es))/5,
                 feature.exclude="^AFFX")
ALLb <- ALLb$eset

dt <- data.frame(t(exprs(ALLb)), Mut=ALLb$mol.bio)
save(dt, file="myDataDT.Rdata")

nReps = 2 was used for testing and debugging code, but we actually ran the code in the console (not R cell in markdown file for safety) and used nReps = 100. This ran over night, 10 hours total (it may haven taken less than the allotted time but it probably took at least 4 hours based off how long nReps = 2 took). I cannot update the code inside the markdown cell because it would have to rerun for it to save, but all cells below use the nReps = 100.

load("myDataDT.Rdata")
assign("dt", dt, envir=.GlobalEnv)
set.seed(1234)

library(parallelMap)
parallelStartSocket(cpus=2)
parallelExport("dt", "ALLb.wf", "varsEnsemble", "kNN", "getVarsSet")  
parallelLibrary("randomForest", "e1071", "class", "genefilter", 
                "matrixStats", "performanceEstimation",
                "Biobase", "BiocGenerics")

TODO <- c('knn','svm','randomForest')

for(td in TODO) {
  assign(td,
         performanceEstimation(
           PredTask(Mut ~ ., dt, 'ALL', copy=TRUE),
           do.call('workflowVariants',
                   c(list('ALLb.wf', learner=td, varsRootName=td),
                     varslist[[td]],
                     varslist$featureSel
                     )
                   ),
           EstimationTask(metrics="acc", method=Bootstrap(nReps=2)),
           cluster=TRUE
           )
         )
  save(list=td, file=paste(td, 'Rdata', sep='.'))
}

parallelStop()

Load results of the experiments

load("knn.Rdata")
load("svm.Rdata")
load("randomForest.Rdata")

Top 5 of each variant:

rankWorkflows(svm, maxs = TRUE)
## $ALL
## $ALL$acc
##   Workflow  Estimate
## 1  svm.v12 0.8161308
## 2   svm.v8 0.8138529
## 3  svm.v10 0.8132432
## 4   svm.v6 0.8015241
## 5   svm.v7 0.8008629
rankWorkflows(knn, maxs = TRUE)
## $ALL
## $ALL$acc
##   Workflow  Estimate
## 1   knn.v7 0.8078413
## 2   knn.v8 0.8078413
## 3   knn.v9 0.8078413
## 4  knn.v10 0.8078413
## 5  knn.v11 0.8078413
rankWorkflows(randomForest, maxs = TRUE)
## $ALL
## $ALL$acc
##           Workflow  Estimate
## 1  randomForest.v9 0.7866133
## 2  randomForest.v8 0.7844237
## 3  randomForest.v7 0.7813595
## 4 randomForest.v10 0.7787207
## 5 randomForest.v12 0.7780597

Top 10 scores of all trials:

 all.trials <- mergeEstimationRes(svm, knn, randomForest, by ="workflows")

 rankWorkflows(all.trials, top=10, maxs = TRUE)
## $ALL
## $ALL$acc
##    Workflow  Estimate
## 1   svm.v12 0.8161308
## 2    svm.v8 0.8138529
## 3   svm.v10 0.8132432
## 4    knn.v7 0.8078413
## 5    knn.v8 0.8078413
## 6    knn.v9 0.8078413
## 7   knn.v10 0.8078413
## 8   knn.v11 0.8078413
## 9   knn.v12 0.8078413
## 10   svm.v6 0.8015241

Characteristics of the top solution:

getWorkflow("svm.v12", all.trials)
## Workflow Object:
##  Workflow ID       ::  svm.v12 
##  Workflow Function ::  ALLb.wf
##       Parameter values:
##       learner.pars  -> cost=100 gamma=1e-04 
##       learner  -> svm 
##       featSel.meth  -> s2

SVM.v12 uses 30 genes filtered by an ANOVA and random forest (s2 strategy), and uses a SVM parameter setting of a cost of 100 and gamma of 0.0001

top10WFnames <- rankWorkflows(all.trials, top=10,
                              maxs = TRUE)[["ALL"]][["acc"]][,"Workflow"]

sapply(top10WFnames, function(WFid) getWorkflow(WFid,all.trials)@pars$featSel.meth)
## svm.v12  svm.v8 svm.v10  knn.v7  knn.v8  knn.v9 knn.v10 knn.v11 knn.v12  svm.v6 
##    "s2"    "s2"    "s2"    "s2"    "s2"    "s2"    "s2"    "s2"    "s2"    "s1"
plot(subset(all.trials,workflows=top10WFnames))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the performanceEstimation package.
##   Please report the issue at
##   <https://github.com/ltorgo/performanceEstimation/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ps <- pairedComparisons(all.trials, baseline="svm.v12")
## Warning in pairedComparisons(all.trials, baseline = "svm.v12"): With less 2
## tasks the Friedman, Nemenyi and Bonferroni-Dunn tests are not calculated.
ps$acc$WilcoxonSignedRank.test
## , , ALL
## 
##                   MedScore DiffMedScores      p.value
## svm.v12          0.8235294            NA           NA
## svm.v1           0.4058277   0.417701709 3.944995e-18
## svm.v2           0.4058277   0.417701709 3.944995e-18
## svm.v3           0.7245690   0.098960446 3.896526e-13
## svm.v4           0.7419355   0.081593928 9.015880e-11
## svm.v5           0.6764706   0.147058824 1.441511e-16
## svm.v6           0.8000000   0.023529412 6.729801e-03
## svm.v7           0.8060036   0.017525828 1.934278e-06
## svm.v8           0.8153409   0.008188503 6.314458e-01
## svm.v9           0.6984848   0.125044563 4.374660e-17
## svm.v10          0.8153409   0.008188503 6.900373e-01
## svm.v11          0.4027027   0.420826709 5.770927e-18
## svm.v13          0.7285012   0.095028183 9.906675e-15
## svm.v14          0.7917957   0.031733746 1.653512e-04
## svm.v15          0.4428105   0.380718954 5.774248e-18
## svm.v16          0.7894737   0.034055728 6.500782e-04
## svm.v17          0.4027027   0.420826709 5.770594e-18
## svm.v18          0.7533784   0.070151033 2.785619e-11
## knn.v1           0.7647059   0.058823529 4.850979e-09
## knn.v2           0.7647059   0.058823529 4.850979e-09
## knn.v3           0.7647059   0.058823529 4.850979e-09
## knn.v4           0.7647059   0.058823529 4.850979e-09
## knn.v5           0.7647059   0.058823529 4.850979e-09
## knn.v6           0.7647059   0.058823529 4.850979e-09
## knn.v7           0.8116554   0.011874006 2.010675e-01
## knn.v8           0.8116554   0.011874006 2.010675e-01
## knn.v9           0.8116554   0.011874006 2.010675e-01
## knn.v10          0.8116554   0.011874006 2.010675e-01
## knn.v11          0.8116554   0.011874006 2.010675e-01
## knn.v12          0.8116554   0.011874006 2.010675e-01
## knn.v13          0.7393888   0.084140617 1.775823e-11
## knn.v14          0.7386148   0.084914611 5.960646e-13
## knn.v15          0.7101019   0.113427544 2.103474e-14
## knn.v16          0.7795139   0.044015523 1.078240e-05
## knn.v17          0.7728111   0.050718352 1.258618e-07
## knn.v18          0.7647059   0.058823529 1.059726e-09
## randomForest.v1  0.7058824   0.117647059 1.968237e-15
## randomForest.v2  0.7204861   0.103043301 1.298364e-15
## randomForest.v3  0.7108014   0.112728018 1.876365e-15
## randomForest.v4  0.7386148   0.084914611 1.311747e-13
## randomForest.v5  0.7290210   0.094508433 3.849026e-14
## randomForest.v6  0.7302495   0.093279932 4.530103e-14
## randomForest.v7  0.7878788   0.035650624 2.207773e-08
## randomForest.v8  0.7970588   0.026470588 5.994822e-07
## randomForest.v9  0.7970588   0.026470588 1.002769e-06
## randomForest.v10 0.7845644   0.038965018 5.342399e-08
## randomForest.v11 0.7878788   0.035650624 2.484293e-08
## randomForest.v12 0.7825169   0.041012520 2.158400e-08
## randomForest.v13 0.7236111   0.099918301 9.058075e-15
## randomForest.v14 0.7272727   0.096256684 3.873705e-15
## randomForest.v15 0.7272727   0.096256684 2.724379e-15
## randomForest.v16 0.7419355   0.081593928 3.620423e-14
## randomForest.v17 0.7352941   0.088235294 2.023434e-14
## randomForest.v18 0.7352941   0.088235294 4.096730e-14

The confusion matrix ALLb.wf is our user defined workflow function that shows that the actual prediction of the models are.

iteration <- 1
itInfo <- getIterationsInfo(all.trials,workflow="svm.v12",it=iteration)
table(itInfo$trues, itInfo$preds)
##           
##            ALL1/AF4 BCR/ABL E2A/PBX1 NEG
##   ALL1/AF4        3       0        0   0
##   BCR/ABL         0       8        0   1
##   E2A/PBX1        0       0        0   1
##   NEG             0       2        0  13

ALL1/AF4 mutation predictions were correct. Actual mutation BCR/ABL and E2A/PBX1 mutation were predicted as negative once each. Predicted BCR/ABL as a mutation when it was actually negative twice.