Step 1: DOWNLOADING PACKAGES TO COMPUTER This step only needs to be completed once.
Loading library
library(Biobase)
## Loading required package: BiocGenerics
## Loading required package: generics
##
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
##
## as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
## setequal, union
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
## unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(ALL)
data("ALL", package = "ALL")
class(ALL)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"
STEP 2: DATA PRE-PROCESSING
Obtaining the names/ descriptions of covariates. Info on the distribution of the samples across the two main variants: BT variables determines the type of acute lymphoblastic leukemia mol.bio variables describes the cytogenetic abnormality found on each sample.
pD <- phenoData(ALL)
varMetadata(pD)
## labelDescription
## cod Patient ID
## diagnosis Date of diagnosis
## sex Gender of the patient
## age Age of the patient at entry
## BT does the patient have B-cell or T-cell ALL
## remission Complete remission(CR), refractory(REF) or NA. Derived from CR
## CR Original remisson data
## date.cr Date complete remission if achieved
## t(4;11) did the patient have t(4;11) translocation. Derived from citog
## t(9;22) did the patient have t(9;22) translocation. Derived from citog
## cyto.normal Was cytogenetic test normal? Derived from citog
## citog original citogenetics data, deletions or t(4;11), t(9;22) status
## mol.biol molecular biology
## fusion protein which of p190, p210 or p190/210 for bcr/able
## mdr multi-drug resistant
## kinet ploidy: either diploid or hyperd.
## ccr Continuous complete remission? Derived from f.u
## relapse Relapse? Derived from f.u
## transplant did the patient receive a bone marrow transplant? Derived from f.u
## f.u follow up data available
## date last seen date patient was last seen
table(ALL$BT)
##
## B B1 B2 B3 B4 T T1 T2 T3 T4
## 5 19 36 23 12 5 1 15 10 2
table(ALL$mol.biol)
##
## ALL1/AF4 BCR/ABL E2A/PBX1 NEG NUP-98 p15/p16
## 10 37 5 74 1 1
table(ALL$BT, ALL$mol.biol)
##
## ALL1/AF4 BCR/ABL E2A/PBX1 NEG NUP-98 p15/p16
## B 0 2 1 2 0 0
## B1 10 1 0 8 0 0
## B2 0 19 0 16 0 1
## B3 0 8 1 14 0 0
## B4 0 7 3 2 0 0
## T 0 0 0 5 0 0
## T1 0 0 0 1 0 0
## T2 0 0 0 15 0 0
## T3 0 0 0 9 1 0
## T4 0 0 0 2 0 0
Showing first 10 genes and the names of first 5 samples.
featureNames(ALL)[1:10]
## [1] "1000_at" "1001_at" "1002_f_at" "1003_s_at" "1004_at" "1005_at"
## [7] "1006_at" "1007_s_at" "1008_f_at" "1009_at"
sampleNames(ALL)[1:5]
## [1] "01005" "01010" "03002" "04006" "04007"
Set of cases that will be considered. Samples with specific values of the BT and mol.bio variables.
tgt.cases <- which(ALL$BT %in% levels(ALL$BT)[1:5] &
ALL$mol.biol %in% levels(ALL$mol.biol)[1:4])
ALLb <- ALL[, tgt.cases]
ALLb
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 94 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 01005 01010 ... LAL5 (94 total)
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
Update available levels of factors into new ‘ALLb’ object:
ALLb$BT <- factor(ALLb$BT)
ALLb$mol.biol <- factor(ALLb$mol.biol)
Saving object locally on computer to not repeat preprocessing steps
save(ALLb, file = "myALL.Rdata")
STEP 3: EXPLORING DATASET:
Access gene expression levels matrix:
es <- exprs(ALLb)
dim(es)
## [1] 12625 94
summary(as.vector(es))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.985 4.122 5.469 5.624 6.829 14.045
Using package ‘genefilter’ to graph expression levels:
library(genefilter)
library(ggplot2)
exprVs <- data.frame(exprVal=as.vector(es))
ds <- data.frame(Stat=c("1stQ","Median","3rdQ","Shorth"),
Value=c(quantile(exprVs$exprVal,
probs=c(0.25, 0.5, 0.75)),
shorth(exprVs$exprVal)),
Color=c("red","green","red","yellow"))
ggplot(exprVs,aes(x=exprVal)) + geom_histogram(fill="lightgrey") +
geom_vline(data=ds,aes(xintercept=Value,color=Color)) +
geom_text(data=ds,aes(x=Value-0.2,y=0,label=Stat,colour=Color),
angle=90,hjust="left") +
xlab("Expression Levels") + guides(colour="none", fill="none")
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
… WRITE SOMETHING HERE…
sapply(levels(ALLb$mol.biol),
function(x) summary(as.vector(es[, which(ALLb$mol.biol == x)])))
## ALL1/AF4 BCR/ABL E2A/PBX1 NEG
## Min. 2.266403 2.194858 2.267802 1.984919
## 1st Qu. 4.140797 4.124414 4.152339 4.111435
## Median 5.454388 5.467834 5.496825 5.469887
## Mean 5.620976 5.627016 5.630445 5.621616
## 3rd Qu. 6.805440 6.832862 6.818587 6.832056
## Max. 14.031235 14.044896 13.812578 13.949564
STEP 4: GENE SELECTION
4.1 simple filters based on distribution properties
rowIQRs <- function(em)
rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
library(ggplot2)
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
xlab("Median expression level") + ylab("IQR expression level") +
ggtitle("Main Characteristics of Genes Expression Levels")
Many genes of low variability (IQR near 0) Low variability means not useful for discriminating amoung different types of mutations
library(genefilter)
resFilter <- nsFilter(ALLb,
var.func=IQR,
var.cutoff=IQR(as.vector(es))/5, #removes genes with a variability under 1/5 of global IQR
feature.exclude="^AFFX")
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
##
##
resFilter
## $eset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 4025 features, 94 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 01005 01010 ... LAL5 (94 total)
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
##
## $filter.log
## $filter.log$numDupsRemoved
## [1] 2888
##
## $filter.log$numLowVar
## [1] 4751
##
## $filter.log$numRemoved.ENTREZID
## [1] 942
##
## $filter.log$feature.exclude
## [1] 19
Update the ‘ALLb’ and ‘es’ objects with the filtered objects.
ALLb <- resFilter$eset
es <- exprs(ALLb)
dim(es)
## [1] 4025 94
We can see in the output how much the dataset has decreased.
ANOVA FILTERS: comparing means across four groups, one of each of the B-cell ALL we are considering Genes considered useful are given the value TRUE
f <- Anova(ALLb$mol.biol, p = 0.01) # ANOVA filtering
ff <- filterfun(f) # generates a filtering function that can be applied to an expression matrix
selGenes <- genefilter(exprs(ALLb), ff) # creates a vector with as many elements as there are genes in the matrix
sum(selGenes)
## [1] 751
751 genes are considered useful from ANOVA filtering
Now, update the data structure to include only the selected genes
ALLb <- ALLb[selGenes, ]
ALLb
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 751 features, 94 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 01005 01010 ... LAL5 (94 total)
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
…
es <- exprs(ALLb)
dim(es)
## [1] 751 94
Our new data: Significantly less dense than the other graph.
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
xlab("Median expression level") + ylab("IQR expression level") +
ggtitle("Distribution Properties of the Selected Genes")
…
4.3: FILTERING USING RANDOM FORESTS
Must continue filtering because there are too many features than observations. Random Forests used to obtain a ranking of genes’ usefulness.
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:BiocGenerics':
##
## combine
dt <- data.frame(t(es), Mut = ALLb$mol.biol)
dt$Mut <- droplevels(dt$Mut)
set.seed(1234)
rf <- randomForest(Mut ~ ., dt, importance = TRUE) # obtain relevance of each variable
imp <- importance(rf)
rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"], # select column with variables scores measured by estimated mean decrease
decreasing = TRUE)[1:30]] # order in decreasing order of the highest 30 scores, obtaining names
To see the median expression level for the 30 selected genes
sapply(rf.genes, function(g) tapply(dt[, g], dt$Mut, median))
## X1635_at X37015_at X1674_at X40202_at X40504_at X41470_at X41071_at
## ALL1/AF4 7.302814 3.752649 3.745752 8.550639 3.218079 9.616743 7.698420
## BCR/ABL 8.693082 4.857105 5.833510 9.767293 4.924310 5.205797 6.017967
## E2A/PBX1 7.562676 6.579530 3.808258 7.414635 3.455316 3.931191 6.058185
## NEG 7.324691 3.765741 4.244791 7.655605 3.541651 4.157748 6.573731
## X37105_at X34699_at X37225_at X36873_at X37981_at X36638_at X1467_at
## ALL1/AF4 6.845719 4.253504 5.220668 7.040593 6.170311 9.811828 3.708985
## BCR/ABL 6.493001 6.315966 3.460902 3.490262 6.882755 8.486446 4.239306
## E2A/PBX1 6.740213 6.102031 7.445655 3.634471 8.080002 6.259730 3.411696
## NEG 6.298859 6.092511 3.387552 3.824670 7.423079 5.856580 3.515020
## X38323_at X34210_at X40795_at X41274_at X39424_at X33412_at X38994_at
## ALL1/AF4 4.195967 5.641130 3.867134 5.501107 6.896571 10.757286 8.939007
## BCR/ABL 4.866452 9.204237 4.544239 5.261901 8.054597 6.880112 8.760193
## E2A/PBX1 4.317550 8.198781 4.151637 4.888560 7.689701 5.636466 4.741548
## NEG 4.286104 8.791774 3.909532 4.860075 8.044947 5.881145 8.046647
## X1249_at X36275_at X41237_at X37014_at X1307_at X37951_at X1616_at
## ALL1/AF4 3.582763 3.618819 10.94079 5.596978 3.368915 3.418433 4.059859
## BCR/ABL 4.477659 6.259073 12.11895 5.751943 4.945270 3.881780 3.643486
## E2A/PBX1 3.257649 3.635956 11.35610 5.837242 4.678577 3.461861 6.033233
## NEG 3.791764 3.749953 11.93624 7.029444 4.863930 3.419113 3.747773
## X32116_at X32434_at
## ALL1/AF4 7.115400 3.317480
## BCR/ABL 7.966959 5.339625
## E2A/PBX1 7.359097 3.668714
## NEG 7.636213 3.226766
…
library(tidyr) # get graph
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
library(dplyr) # summarization package
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, setequal, union
## The following object is masked from 'package:generics':
##
## explain
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
d <- gather(dt[,c(rf.genes,"Mut")],Gene,ExprValue,1:length(rf.genes))
dat <- group_by(d,Mut,Gene) %>%
summarise(med=median(ExprValue), iqr=IQR(ExprValue), .groups = "drop")
ggplot(dat, aes(x=med,y=iqr,color=Mut)) +
geom_point(size=6) + facet_wrap(~ Gene) +
labs(x="MEDIAN expression level",y="IQR expression level",color="Mutation")
…
4.4: FILTERING USING CLUSTERING
…
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:AnnotationDbi':
##
## contents
## The following object is masked from 'package:Biobase':
##
## contents
## The following objects are masked from 'package:base':
##
## format.pval, units
vc <- varclus(t(es)) # function that uses a hierarchical clustering algorithm to cluster variables of a data set.
clus30 <- cutree(vc$hclust, 30) # obtain a clustering formed by 30 groups of variables
table(clus30)
## clus30
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 21 16 37 18 44 19 28 38 28 30 35 54 20 47 18 45 17 14 25 18 22 31 28 15 7 15
## 27 28 29 30
## 11 18 19 13
We do this under the assumption tat members of the same cluster will be similar to each other and therefore redundant
Generate one set of variables through random sampling from the selected number of clusters (30)
getVarsSet <- function(cluster,nvars=30,seed=NULL,verb=FALSE) { # generates a new set of 30 variables every time function is called
if (!is.null(seed)) set.seed(seed)
cls <- cutree(cluster,nvars)
tots <- table(cls)
vars <- c()
vars <- sapply(1:nvars,function(clID)
{
if (!length(tots[clID])) stop('Empty cluster! (',clID,')')
x <- sample(1:tots[clID],1)
names(cls[cls==clID])[x]
})
if (verb) structure(vars,clusMemb=cls,clusTots=tots)
else vars
}
getVarsSet(vc$hclust)
## [1] "37579_at" "33371_s_at" "34362_at" "36008_at" "39224_at"
## [6] "1207_at" "36403_s_at" "41123_s_at" "2036_s_at" "37842_at"
## [11] "36188_at" "106_at" "41853_at" "41715_at" "41126_at"
## [16] "763_at" "39049_at" "1389_at" "37015_at" "41872_at"
## [21] "32607_at" "2062_at" "37304_at" "33266_at" "32612_at"
## [26] "905_at" "37027_at" "38081_at" "39649_at" "33777_at"
getVarsSet(vc$hclust)
## [1] "41139_at" "32585_at" "31508_at" "39410_at" "36089_at"
## [6] "33796_at" "32169_at" "36577_at" "32725_at" "38352_at"
## [11] "41606_at" "40480_s_at" "38651_at" "37039_at" "37251_s_at"
## [16] "40419_at" "38014_at" "317_at" "33339_g_at" "33305_at"
## [21] "36203_at" "37680_at" "41146_at" "33429_at" "37485_at"
## [26] "40369_f_at" "36142_at" "38078_at" "40425_at" "40454_at"
STEP 5: PREDICTION MODELING
5.1 Using the iris data set to demonstrate bootstrapping
….
Uses libraries performanceEstimation and DMwR2. These no longer exist in the CRAN Archive as of July 2025, so we use a new package called ‘remotes’ to download the legacy version from github.
install.packages("remotes")
remotes::install_github("ltorgo/performanceEstimation")
## Skipping install of 'performanceEstimation' from a github remote, the SHA1 (6262e570) has not changed since last install.
## Use `force = TRUE` to force installation
remotes::install_github("ltorgo/DMwR2")
## Skipping install of 'DMwR2' from a github remote, the SHA1 (c19cb087) has not changed since last install.
## Use `force = TRUE` to force installation
library(performanceEstimation)
library(DMwR2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
data(iris)
exp <- performanceEstimation(
PredTask(Species ~ ., iris),
Workflow(learner="rpartXse", predictor.pars=list(type="class")),
EstimationTask(metrics="acc",method=Bootstrap(nReps=100)))
##
##
## ##### PERFORMANCE ESTIMATION USING BOOTSTRAP #####
##
## ** PREDICTIVE TASK :: iris.Species
##
## ++ MODEL/WORKFLOW :: rpartXse
## Task for estimating acc using
## 100 repetitions of e0 Bootstrap experiment
## Run with seed = 1234
## Iteration : 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
summary(exp)
##
## == Summary of a Bootstrap Performance Estimation Experiment ==
##
## Task for estimating acc using
## 100 repetitions of e0 Bootstrap experiment
## Run with seed = 1234
##
## * Predictive Tasks :: iris.Species
## * Workflows :: rpartXse
##
## -> Task: iris.Species
## *Workflow: rpartXse
## acc
## avg 0.94183656
## std 0.02784701
## med 0.94392034
## iqr 0.03637634
## min 0.86666667
## max 1.00000000
## invalid 0.00000000
5.2: Modeling techniques:
We use 3 different data sets that differ in predictors used Genes selected by ANOVA, Random Forests, and Support Vector Machines (SVM)
Using the package ‘class’ to demonstrate KNN on iris data set:
library(class)
data(iris)
idx <- sample(1:nrow(iris), as.integer(0.7 * nrow(iris))) # 70% of rows to be used for training
tr <- iris[idx, ] # training set
ts <- iris[-idx, ] # test set (everything but training rows)
preds <- knn(tr[, -5], # features used to calculate distance
ts[, -5],
tr[, 5], # the answers of the training data
k = 3) # 3 nearest neighbors to predict
table(preds, ts[, 5]) # confusion matrix
##
## preds setosa versicolor virginica
## setosa 14 0 0
## versicolor 0 15 1
## virginica 0 2 13
This is a nonstandard interface. We create a function that enables the use of this method in a more standard formula-type interface:
‘stand’ parameter is used to standardize the data before KNN.
kNN <- function(form, train, test, stand = TRUE, stand.stats = NULL, ...) {
require(class, quietly = TRUE)
tgtCol <- which(colnames(train) == as.character(form[[2]])) # identifies the index of the target column
if (stand) {
if (is.null(stand.stats))
tmp <- scale(train[, -tgtCol], center = TRUE, scale = TRUE) # calculates mean and stdev. of the training data and scales it if no stats are provided
else tmp <- scale(train[, -tgtCol], center = stand.stats[[1]], # if stats are provided, uses those values to train data instead
scale = stand.stats[[2]])
train[, -tgtCol] <- tmp # replaces original training features with new scaled versions
ms <- attr(tmp, "scaled:center") # extracts the means used to scale data
ss <- attr(tmp, "scaled:scale") # extracts the stdev. used to scale data
test[, -tgtCol] <- scale(test[, -tgtCol], center = ms, scale = ss) # scales test set
}
knn(train[, -tgtCol], test[, -tgtCol], train[, tgtCol], ...) # runs KNN algorithm on scaled data and returns predictions
}
preds.stand <- kNN(Species ~ ., tr, ts, k = 3) # predicting species and sets k=3
table(preds.stand,ts[, 5]) # generates confusion matrix
##
## preds.stand setosa versicolor virginica
## setosa 14 0 0
## versicolor 0 15 2
## virginica 0 2 12
preds.notStand <- kNN(Species ~ ., tr, ts, stand = FALSE, k = 3)
table(preds.notStand, ts[, 5])
##
## preds.notStand setosa versicolor virginica
## setosa 14 0 0
## versicolor 0 15 1
## virginica 0 2 13
5.3: Comparing Models
Now using our data set, we are comparing 3 models 1. Using the genes from the ANOVA filtering 2. Applyinh the random forest filtering on top of ANOVA result 3. Applying filtering based on ensembles of clustered variables after the ANOVA filtering. Each will be combined with different classification tools
Creating an ensemble of models:
varsEnsemble <- function(tgt,train,test, # name of target variable. training set, test set
fs.meth, # a list containing the sets of variable names which is used to generate the predictors of each ensemble
baseLearner,blPars, # functions that implements the learner to be used on each member of the ensemble and the learning parameters
predictor,predPars, # function used to obtain the predictions of the ensemble for the given test.
verb=FALSE)
{
require(Hmisc,quietly=TRUE)
v <- varclus(as.matrix(train[,-which(colnames(train)==tgt)]))
varsSets <- lapply(1:fs.meth[[3]],function(x)
getVarsSet(v$hclust,nvars=fs.meth[[2]]))
preds <- matrix(NA,ncol=length(varsSets),nrow=NROW(test))
for(v in seq(along=varsSets)) {
if (baseLearner=='knn')
preds[,v] <- do.call("kNN",
c(list(as.formula(paste(tgt,
paste(varsSets[[v]],
collapse='+'),
sep='~')),
train[,c(tgt,varsSets[[v]])],
test[,c(tgt,varsSets[[v]])]),
blPars)
)
else {
m <- do.call(baseLearner,
c(list(as.formula(paste(tgt,
paste(varsSets[[v]],
collapse='+'),
sep='~')),
train[,c(tgt,varsSets[[v]])]),
blPars)
)
preds[,v] <- do.call(predictor,
c(list(m,test[,c(tgt,varsSets[[v]])]),
predPars))
}
}
ps <- apply(preds,1,function(x)
levels(factor(x))[which.max(table(factor(x)))])
factor(ps,
levels=1:nlevels(train[,tgt]),
labels=levels(train[,tgt]))
}
Workflow function that takes an algorithm as a parameter ti ryb and evaluate our model pipeline
ALLb.wf <- function(form, train, test,
learner, learner.pars=NULL,
predictor="predict",predictor.pars=NULL,
featSel.meth = "s2",
available.fsMethods=list(s1=list("all"), # all features after ANOVA
s2=list('rf',30), # random forest to select top 30 features by mean decrease in accuracy
s3=list('varclus',30,50)), # variable clustering ensemble approach based on 50 models each building 30 predictors
.model=FALSE,
...)
{
## The characteristics of the selected feature selection method
fs.meth <- available.fsMethods[[featSel.meth]]
## The target variable
tgt <- as.character(form[[2]])
tgtCol <- which(colnames(train)==tgt)
## Anova filtering
f <- Anova(train[,tgt],p=0.01)
ff <- filterfun(f)
genes <- genefilter(t(train[,-tgtCol]),ff)
genes <- names(genes)[genes]
train <- train[,c(tgt,genes)]
test <- test[,c(tgt,genes)]
tgtCol <- 1
## Specific filtering
if (fs.meth[[1]]=='varclus') {
pred <- varsEnsemble(tgt,train,test,fs.meth,
learner,learner.pars,
predictor,predictor.pars,
list(...))
} else {
if (fs.meth[[1]]=='rf') {
require(randomForest,quietly=TRUE)
rf <- randomForest(form,train,importance=TRUE)
imp <- importance(rf)
rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"],
decreasing = TRUE)[1:fs.meth[[2]]]]
train <- train[,c(tgt,rf.genes)]
test <- test[,c(tgt,rf.genes)]
}
if (learner == 'knn')
pred <- kNN(form,train,test,
stand.stats=list(rowMedians(t(as.matrix(train[,-tgtCol]))),
rowIQRs(t(as.matrix(train[,-tgtCol])))),
...)
else {
model <- do.call(learner,c(list(form,train),learner.pars))
pred <- do.call(predictor,c(list(model,test),predictor.pars))
}
}
return(list(trues=responseValues(form,test), preds=pred,
model=if (.model && learner!="knn") model else NULL))
}
Creating a list that will hold all variants that are considered in estimation experiments:
varslist <- list()
varslist$randomForest <- list(learner.pars=list(ntree=c(500,750,1000),
mtry=c(5,15)),
preditor.pars=list(type="response"))
varslist$svm <- list(learner.pars=list(cost=c(1,100),
gamma=c(0.01,0.001,0.0001)))
varslist$knn <- list(learner.pars=list(k=c(3,5,7),
stand=c(TRUE,FALSE)))
varslist$featureSel <- list(featSel.meth=c("s1", "s2", "s3"))
library(performanceEstimation)
library(class)
library(randomForest)
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
##
## impute
## The following object is masked from 'package:ggplot2':
##
## element
## The following object is masked from 'package:generics':
##
## interpolate
library(genefilter)
load('myALL.Rdata')
es <- exprs(ALLb)
ALLb <- nsFilter(ALLb,
var.func=IQR, var.cutoff=IQR(as.vector(es))/5,
feature.exclude="^AFFX")
ALLb <- ALLb$eset
dt <- data.frame(t(exprs(ALLb)), Mut=ALLb$mol.bio)
save(dt, file="myDataDT.Rdata")
nReps = 2 was used for testing and debugging code, but we actually ran the code in the console (not R cell in markdown file for safety) and used nReps = 100. This ran over night, 10 hours total (it may haven taken less than the allotted time but it probably took at least 4 hours based off how long nReps = 2 took). I cannot update the code inside the markdown cell because it would have to rerun for it to save, but all cells below use the nReps = 100.
load("myDataDT.Rdata")
assign("dt", dt, envir=.GlobalEnv)
set.seed(1234)
library(parallelMap)
parallelStartSocket(cpus=2)
parallelExport("dt", "ALLb.wf", "varsEnsemble", "kNN", "getVarsSet")
parallelLibrary("randomForest", "e1071", "class", "genefilter",
"matrixStats", "performanceEstimation",
"Biobase", "BiocGenerics")
TODO <- c('knn','svm','randomForest')
for(td in TODO) {
assign(td,
performanceEstimation(
PredTask(Mut ~ ., dt, 'ALL', copy=TRUE),
do.call('workflowVariants',
c(list('ALLb.wf', learner=td, varsRootName=td),
varslist[[td]],
varslist$featureSel
)
),
EstimationTask(metrics="acc", method=Bootstrap(nReps=2)),
cluster=TRUE
)
)
save(list=td, file=paste(td, 'Rdata', sep='.'))
}
parallelStop()
Load results of the experiments
load("knn.Rdata")
load("svm.Rdata")
load("randomForest.Rdata")
Top 5 of each variant:
rankWorkflows(svm, maxs = TRUE)
## $ALL
## $ALL$acc
## Workflow Estimate
## 1 svm.v12 0.8161308
## 2 svm.v8 0.8138529
## 3 svm.v10 0.8132432
## 4 svm.v6 0.8015241
## 5 svm.v7 0.8008629
rankWorkflows(knn, maxs = TRUE)
## $ALL
## $ALL$acc
## Workflow Estimate
## 1 knn.v7 0.8078413
## 2 knn.v8 0.8078413
## 3 knn.v9 0.8078413
## 4 knn.v10 0.8078413
## 5 knn.v11 0.8078413
rankWorkflows(randomForest, maxs = TRUE)
## $ALL
## $ALL$acc
## Workflow Estimate
## 1 randomForest.v9 0.7866133
## 2 randomForest.v8 0.7844237
## 3 randomForest.v7 0.7813595
## 4 randomForest.v10 0.7787207
## 5 randomForest.v12 0.7780597
Top 10 scores of all trials:
all.trials <- mergeEstimationRes(svm, knn, randomForest, by ="workflows")
rankWorkflows(all.trials, top=10, maxs = TRUE)
## $ALL
## $ALL$acc
## Workflow Estimate
## 1 svm.v12 0.8161308
## 2 svm.v8 0.8138529
## 3 svm.v10 0.8132432
## 4 knn.v7 0.8078413
## 5 knn.v8 0.8078413
## 6 knn.v9 0.8078413
## 7 knn.v10 0.8078413
## 8 knn.v11 0.8078413
## 9 knn.v12 0.8078413
## 10 svm.v6 0.8015241
Characteristics of the top solution:
getWorkflow("svm.v12", all.trials)
## Workflow Object:
## Workflow ID :: svm.v12
## Workflow Function :: ALLb.wf
## Parameter values:
## learner.pars -> cost=100 gamma=1e-04
## learner -> svm
## featSel.meth -> s2
SVM.v12 uses 30 genes filtered by an ANOVA and random forest (s2 strategy), and uses a SVM parameter setting of a cost of 100 and gamma of 0.0001
top10WFnames <- rankWorkflows(all.trials, top=10,
maxs = TRUE)[["ALL"]][["acc"]][,"Workflow"]
sapply(top10WFnames, function(WFid) getWorkflow(WFid,all.trials)@pars$featSel.meth)
## svm.v12 svm.v8 svm.v10 knn.v7 knn.v8 knn.v9 knn.v10 knn.v11 knn.v12 svm.v6
## "s2" "s2" "s2" "s2" "s2" "s2" "s2" "s2" "s2" "s1"
plot(subset(all.trials,workflows=top10WFnames))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the performanceEstimation package.
## Please report the issue at
## <https://github.com/ltorgo/performanceEstimation/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ps <- pairedComparisons(all.trials, baseline="svm.v12")
## Warning in pairedComparisons(all.trials, baseline = "svm.v12"): With less 2
## tasks the Friedman, Nemenyi and Bonferroni-Dunn tests are not calculated.
ps$acc$WilcoxonSignedRank.test
## , , ALL
##
## MedScore DiffMedScores p.value
## svm.v12 0.8235294 NA NA
## svm.v1 0.4058277 0.417701709 3.944995e-18
## svm.v2 0.4058277 0.417701709 3.944995e-18
## svm.v3 0.7245690 0.098960446 3.896526e-13
## svm.v4 0.7419355 0.081593928 9.015880e-11
## svm.v5 0.6764706 0.147058824 1.441511e-16
## svm.v6 0.8000000 0.023529412 6.729801e-03
## svm.v7 0.8060036 0.017525828 1.934278e-06
## svm.v8 0.8153409 0.008188503 6.314458e-01
## svm.v9 0.6984848 0.125044563 4.374660e-17
## svm.v10 0.8153409 0.008188503 6.900373e-01
## svm.v11 0.4027027 0.420826709 5.770927e-18
## svm.v13 0.7285012 0.095028183 9.906675e-15
## svm.v14 0.7917957 0.031733746 1.653512e-04
## svm.v15 0.4428105 0.380718954 5.774248e-18
## svm.v16 0.7894737 0.034055728 6.500782e-04
## svm.v17 0.4027027 0.420826709 5.770594e-18
## svm.v18 0.7533784 0.070151033 2.785619e-11
## knn.v1 0.7647059 0.058823529 4.850979e-09
## knn.v2 0.7647059 0.058823529 4.850979e-09
## knn.v3 0.7647059 0.058823529 4.850979e-09
## knn.v4 0.7647059 0.058823529 4.850979e-09
## knn.v5 0.7647059 0.058823529 4.850979e-09
## knn.v6 0.7647059 0.058823529 4.850979e-09
## knn.v7 0.8116554 0.011874006 2.010675e-01
## knn.v8 0.8116554 0.011874006 2.010675e-01
## knn.v9 0.8116554 0.011874006 2.010675e-01
## knn.v10 0.8116554 0.011874006 2.010675e-01
## knn.v11 0.8116554 0.011874006 2.010675e-01
## knn.v12 0.8116554 0.011874006 2.010675e-01
## knn.v13 0.7393888 0.084140617 1.775823e-11
## knn.v14 0.7386148 0.084914611 5.960646e-13
## knn.v15 0.7101019 0.113427544 2.103474e-14
## knn.v16 0.7795139 0.044015523 1.078240e-05
## knn.v17 0.7728111 0.050718352 1.258618e-07
## knn.v18 0.7647059 0.058823529 1.059726e-09
## randomForest.v1 0.7058824 0.117647059 1.968237e-15
## randomForest.v2 0.7204861 0.103043301 1.298364e-15
## randomForest.v3 0.7108014 0.112728018 1.876365e-15
## randomForest.v4 0.7386148 0.084914611 1.311747e-13
## randomForest.v5 0.7290210 0.094508433 3.849026e-14
## randomForest.v6 0.7302495 0.093279932 4.530103e-14
## randomForest.v7 0.7878788 0.035650624 2.207773e-08
## randomForest.v8 0.7970588 0.026470588 5.994822e-07
## randomForest.v9 0.7970588 0.026470588 1.002769e-06
## randomForest.v10 0.7845644 0.038965018 5.342399e-08
## randomForest.v11 0.7878788 0.035650624 2.484293e-08
## randomForest.v12 0.7825169 0.041012520 2.158400e-08
## randomForest.v13 0.7236111 0.099918301 9.058075e-15
## randomForest.v14 0.7272727 0.096256684 3.873705e-15
## randomForest.v15 0.7272727 0.096256684 2.724379e-15
## randomForest.v16 0.7419355 0.081593928 3.620423e-14
## randomForest.v17 0.7352941 0.088235294 2.023434e-14
## randomForest.v18 0.7352941 0.088235294 4.096730e-14
The confusion matrix ALLb.wf is our user defined workflow function that shows that the actual prediction of the models are.
iteration <- 1
itInfo <- getIterationsInfo(all.trials,workflow="svm.v12",it=iteration)
table(itInfo$trues, itInfo$preds)
##
## ALL1/AF4 BCR/ABL E2A/PBX1 NEG
## ALL1/AF4 3 0 0 0
## BCR/ABL 0 8 0 1
## E2A/PBX1 0 0 0 1
## NEG 0 2 0 13
ALL1/AF4 mutation predictions were correct. Actual mutation BCR/ABL and E2A/PBX1 mutation were predicted as negative once each. Predicted BCR/ABL as a mutation when it was actually negative twice.