We use ALL data in ALL package.The dataset we will use comes from a study on acute lymphoblastic leukemia (Chiarettiet al., 2004; Li, 2009). The data consists of microarray samples from 128 individuals with this type of disease.Our modeling goal is to be able to predict the type of mutation of an individual given its microarray assay
library(Biobase)
library(ALL)
data(ALL)
ALL
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 128 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 01005 01010 ... LAL4 (128 total)
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
We start by obtaining some information on the co-variates associated to each sample.
# name and description of covariates
pD <- phenoData(ALL)
varMetadata(ALL)
## labelDescription
## cod Patient ID
## diagnosis Date of diagnosis
## sex Gender of the patient
## age Age of the patient at entry
## BT does the patient have B-cell or T-cell ALL
## remission Complete remission(CR), refractory(REF) or NA. Derived from CR
## CR Original remisson data
## date.cr Date complete remission if achieved
## t(4;11) did the patient have t(4;11) translocation. Derived from citog
## t(9;22) did the patient have t(9;22) translocation. Derived from citog
## cyto.normal Was cytogenetic test normal? Derived from citog
## citog original citogenetics data, deletions or t(4;11), t(9;22) status
## mol.biol molecular biology
## fusion protein which of p190, p210 or p190/210 for bcr/able
## mdr multi-drug resistant
## kinet ploidy: either diploid or hyperd.
## ccr Continuous complete remission? Derived from f.u
## relapse Relapse? Derived from f.u
## transplant did the patient receive a bone marrow transplant? Derived from f.u
## f.u follow up data available
## date last seen date patient was last seen
# information for main covariate distribution
#BT is determine type of acute lymphoblastic
#mol-biol describe cytogenetic abnormality on each sample
table(ALL$BT)
##
## B B1 B2 B3 B4 T T1 T2 T3 T4
## 5 19 36 23 12 5 1 15 10 2
table(ALL$mol.biol)
##
## ALL1/AF4 BCR/ABL E2A/PBX1 NEG NUP-98 p15/p16
## 10 37 5 74 1 1
table(ALL$BT, ALL$mol.biol)
##
## ALL1/AF4 BCR/ABL E2A/PBX1 NEG NUP-98 p15/p16
## B 0 2 1 2 0 0
## B1 10 1 0 8 0 0
## B2 0 19 0 16 0 1
## B3 0 8 1 14 0 0
## B4 0 7 3 2 0 0
## T 0 0 0 5 0 0
## T1 0 0 0 1 0 0
## T2 0 0 0 15 0 0
## T3 0 0 0 9 1 0
## T4 0 0 0 2 0 0
We can also obtain some information on the genes and samples.
featureNames(ALL)[1:10]
## [1] "1000_at" "1001_at" "1002_f_at" "1003_s_at" "1004_at" "1005_at"
## [7] "1006_at" "1007_s_at" "1008_f_at" "1009_at"
sampleNames(ALL)[1:10]
## [1] "01005" "01010" "03002" "04006" "04007" "04008" "04010" "04016" "06002"
## [10] "08001"
we will focus our analysis of this data on the B-cell ALL cases and in particular on the samples with a subset of the mutations, which will be our target class
tgt.cases <- which(ALL$BT %in% levels(ALL$BT)[1:5] & ALL$mol.biol %in% levels(ALL$mol.biol)[1:4])
ALLb <- ALL[,tgt.cases]
we should update the available levels of these two factors on our new ALLb object.
ALLb$BT <- factor(ALLb$BT)
ALLb$mol.biol <- factor(ALLb$mol.biol)
es <- exprs(ALLb)
dim(es)
## [1] 12625 94
In terms of dimensionality, the main challenge of this problem is the fact that there are far too many variables (12,625) for the number of available cases (94). With these dimensions, most modeling techniques will have a hard time obtaining any meaningful result.
summary(as.vector(es))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.985 4.122 5.469 5.624 6.829 14.045
most expression values are between 4 and 7. A better overview of the distribution of the expression levels can be obtained graphically.
library(genefilter)
library(ggplot2)
exprVs <- data.frame(exprVal=as.vector(es))
ds <- data.frame(Stat=c("1stQ","Median","3rdQ","Shorth"),
Value=c(quantile(exprVs$exprVal,
probs=c(0.25, 0.5, 0.75)),
shorth(exprVs$exprVal)),
Color=c("red","green","red","yellow"))
ggplot(exprVs,aes(x=exprVal)) + geom_histogram(fill="lightgrey") +
geom_vline(data=ds,aes(xintercept=Value,color=Color)) +
geom_text(data=ds,aes(x=Value-0.2,y=0,label=Stat,colour=Color),
angle=90,hjust="left") +
xlab("Expression Levels") + guides(colour="none", fill="none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distributions of the gene expression levels of the samples with a particular mutation are not different from each other.
sapply(levels(ALLb$mol.bio),
function(x) summary(as.vector(es[, which(ALLb$mol.bio == x)])))
## ALL1/AF4 BCR/ABL E2A/PBX1 NEG
## Min. 2.266403 2.194858 2.267802 1.984919
## 1st Qu. 4.140797 4.124414 4.152339 4.111435
## Median 5.454388 5.467834 5.496825 5.469887
## Mean 5.620976 5.627016 5.630445 5.621616
## 3rd Qu. 6.805440 6.832862 6.818587 6.832056
## Max. 14.031235 14.044896 13.812578 13.949564
The first gene filtering methods are based on information concerning the distribution of the expression levels. This type of experimental data usually includes several genes that are not expressed at all or show very small variability.
library(hgu95av2.db)
rowIQRs <- function(em) rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
xlab("Median expression level") + ylab("IQR expression level") +
ggtitle("Main Characteristics of Genes Expression Levels")
We will use a heuristic threshold based on the value of the IQR to eliminate some of the genes with very low variability. Namely, we will remove any genes with a variability that is smaller than 1/5 of the global IQR.
resFilter <- nsFilter(ALLb,var.func=IQR,var.cutoff=IQR(as.vector(es))/5,
feature.exclude="^AFFX")
resFilter
## $eset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 3937 features, 94 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 01005 01010 ... LAL5 (94 total)
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
##
## $filter.log
## $filter.log$numDupsRemoved
## [1] 2856
##
## $filter.log$numLowVar
## [1] 4647
##
## $filter.log$numRemoved.ENTREZID
## [1] 1166
##
## $filter.log$feature.exclude
## [1] 19
As you see, we are left with only 3,943 genes from the initial 12,625. This is a rather significant reduction. Nevertheless, we are still far from a dataset that is manageable by most classification models, given that we only have 94 observations.
ALLb <- resFilter$eset
es <- exprs(ALLb)
dim(es)
## [1] 3937 94
If a gene has a distribution of expression values that is similar across all possible values of the target variable, then it will not be very useful to discriminate among these values. We will compare the mean expression level of genes across the subsets of samples belonging to a certain B-cell ALL mutation.
f <- Anova(ALLb$mol.bio, p = 0.01)
ff <- filterfun(f)
selGenes <- genefilter(exprs(ALLb), ff)
sum(selGenes)
## [1] 745
Next we update our data structures to include only these selected genes.
ALLb <- ALLb[selGenes, ]
ALLb
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 745 features, 94 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 01005 01010 ... LAL5 (94 total)
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
es <- exprs(ALLb)
dim(es)
## [1] 745 94
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
xlab("Median expression level") + ylab("IQR expression level") +
ggtitle("Distribution Properties of the Selected Genes")
##### Filtering using randomforest Random forests can be used to obtain a ranking of the genes as follows
library(randomForest)
dt <- data.frame(t(es), Mut = ALLb$mol.bio)
dt$Mut <- droplevels(dt$Mut)
set.seed(1234)
rf <- randomForest(Mut ~ ., dt, importance = TRUE)
imp <- importance(rf)
rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"],
decreasing = TRUE)[1:30]]
We construct a training set by adding the mutation information to the transpose of the expression matrix. We then use random forest to obtain estimates of the importance of the variables. We select the column with the variable scores measured as the estimated mean decrease in classification accuracy when each variable is removed in turn. We order the values of this score in decreasing order and select the highest 30 of these scores, obtaining the names of the corresponding genes. We may be curious about the expression level distribution of theses 30 genes across the different mutations. We can obtain the median level for these top 30 genes as follows.
sapply(rf.genes, function(g) tapply(dt[, g], dt$Mut, median))
## X1635_at X37015_at X40504_at X1674_at X1467_at X40202_at X1307_at
## ALL1/AF4 7.302814 3.752649 3.218079 3.745752 3.708985 8.550639 3.368915
## BCR/ABL 8.693082 4.857105 4.924310 5.833510 4.239306 9.767293 4.945270
## E2A/PBX1 7.562676 6.579530 3.455316 3.808258 3.411696 7.414635 4.678577
## NEG 7.324691 3.765741 3.541651 4.244791 3.515020 7.655605 4.863930
## X37363_at X41470_at X40795_at X34210_at X41191_at X34699_at X34850_at
## ALL1/AF4 3.910130 9.616743 3.867134 5.641130 6.314058 4.253504 5.426653
## BCR/ABL 5.583184 5.205797 4.544239 9.204237 4.459709 6.315966 6.898979
## E2A/PBX1 3.800260 3.931191 4.151637 8.198781 4.325834 6.102031 5.928574
## NEG 3.926599 4.157748 3.909532 8.791774 4.369366 6.092511 6.327281
## X39837_s_at X39424_at X36873_at X40480_s_at X32434_at X33284_at
## ALL1/AF4 6.633188 6.896571 7.040593 6.414368 3.317480 6.046114
## BCR/ABL 7.374046 8.054597 3.490262 8.208263 5.339625 6.975736
## E2A/PBX1 6.708400 7.689701 3.634471 6.722296 3.668714 5.896895
## NEG 6.878846 8.044947 3.824670 7.362318 3.226766 6.078459
## X37981_at X1134_at X41071_at X35162_s_at X41193_at X35164_at X675_at
## ALL1/AF4 6.170311 7.846189 7.698420 4.398885 5.707761 5.577268 7.824595
## BCR/ABL 6.882755 8.475578 6.017967 4.924553 8.107205 6.493672 10.036177
## E2A/PBX1 8.080002 8.697500 6.058185 4.380962 6.810393 7.406714 8.538763
## NEG 7.423079 8.167493 6.573731 4.236335 7.621437 7.492440 9.118158
## X37225_at X40454_at X37351_at
## ALL1/AF4 5.220668 4.007171 5.919309
## BCR/ABL 3.460902 3.910912 6.959287
## E2A/PBX1 7.445655 7.390283 6.125676
## NEG 3.387552 3.807652 6.255333
We can obtain more detail by graphically inspecting the median and inter-quartile ranges of the expression levels of these genes for the 94 samples.
library(tidyr)
library(dplyr)
d <- gather(dt[,c(rf.genes,"Mut")],Gene,ExprValue,1:length(rf.genes))
dat <- group_by(d,Mut,Gene) %>%
summarise(med=median(ExprValue), iqr=IQR(ExprValue))
ggplot(dat, aes(x=med,y=iqr,color=Mut)) +
geom_point(size=6) + facet_wrap(~ Gene) +
labs(x="MEDIAN expression level",y="IQR expression level",color="Mutation")
##### Filtering Using Feature Clustering Ensembles we use a clustering algorithm to obtain 30 groups of variables that are supposed to be similar. These 30 variable clusters will then be used to obtain an ensemble classification model where m models will be obtained with 30 variables, each one randomly chosen from one of the 30 clusters.We will assume that there is some degree of redundancy on our set of features generated by the ANOVA filter. We will try to model this redundancy by clustering the variables.
library(Hmisc)
vc <- varclus(t(es))
clus30 <- cutree(vc$hclust, 30)
table(clus30)
## clus30
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 20 34 30 22 34 29 19 18 45 52 19 22 21 17 54 15 20 17 18 21 60 19 18 14 24 23
## 27 28 29 30
## 17 15 13 15
The following function facilitates the process by generating one set of variables via randomly sampling from the selected number of clusters.
getVarsSet <- function(cluster,nvars=30,seed=NULL,verb=FALSE) {
if (!is.null(seed)) set.seed(seed)
cls <- cutree(cluster,nvars)
tots <- table(cls)
vars <- c()
vars <- sapply(1:nvars,function(clID)
{
if (!length(tots[clID])) stop('Empty cluster! (',clID,')')
x <- sample(1:tots[clID],1)
names(cls[cls==clID])[x]
})
if (verb) structure(vars,clusMemb=cls,clusTots=tots)
else vars
}
getVarsSet(vc$hclust)
## [1] "38037_at" "38063_at" "35828_at" "187_at" "41715_at"
## [6] "35940_at" "33134_at" "37524_at" "37001_at" "38381_at"
## [11] "1640_at" "38717_at" "38833_at" "35136_at" "40419_at"
## [16] "39351_at" "656_at" "33339_g_at" "36313_at" "38218_at"
## [21] "40167_s_at" "32321_at" "1782_s_at" "32475_at" "33232_at"
## [26] "35643_at" "36131_at" "35369_at" "32725_at" "37781_at"
we will use three different datasets that differ in the predictors that are used. One will have all genes selected by an ANOVA test, while the other two will select 30 of these genes. All datasets will contain 94 cases of B-cell ALL. With the exception of the target variable, all information is numeric. To handle this problem we have selected three different modeling techniques: SVM, Random Forest and knn. We create a small function that enable the use of knn in a more standard formula.
kNN <- function(form, train, test, stand = TRUE, stand.stats = NULL, ...) {
require(class, quietly = TRUE)
tgtCol <- which(colnames(train) == as.character(form[[2]]))
if (stand) {
if (is.null(stand.stats))
tmp <- scale(train[, -tgtCol], center = TRUE, scale = TRUE)
else tmp <- scale(train[, -tgtCol], center = stand.stats[[1]],
scale = stand.stats[[2]])
train[, -tgtCol] <- tmp
ms <- attr(tmp, "scaled:center")
ss <- attr(tmp, "scaled:scale")
test[, -tgtCol] <- scale(test[, -tgtCol], center = ms, scale = ss)
}
knn(train[, -tgtCol], test[, -tgtCol], train[, tgtCol], ...)
}
The following function implements the approach involving creating an ensemble of models, each applied to a different set of predictor variables. This function will be called from within the function that implements our workflow (the solution to the prediction task).
varsEnsemble <- function(tgt,train,test,
fs.meth,
baseLearner,blPars,
predictor,predPars,
verb=FALSE)
{
require(Hmisc,quietly=TRUE)
v <- varclus(as.matrix(train[,-which(colnames(train)==tgt)]))
varsSets <- lapply(1:fs.meth[[3]],function(x)
getVarsSet(v$hclust,nvars=fs.meth[[2]]))
preds <- matrix(NA,ncol=length(varsSets),nrow=NROW(test))
for(v in seq(along=varsSets)) {
if (baseLearner=='knn')
preds[,v] <- do.call("kNN",
c(list(as.formula(paste(tgt,
paste(varsSets[[v]],
collapse='+'),
sep='~')),
train[,c(tgt,varsSets[[v]])],
test[,c(tgt,varsSets[[v]])]),
blPars)
)
else {
m <- do.call(baseLearner,
c(list(as.formula(paste(tgt,
paste(varsSets[[v]],
collapse='+'),
sep='~')),
train[,c(tgt,varsSets[[v]])]),
blPars)
)
preds[,v] <- do.call(predictor,
c(list(m,test[,c(tgt,varsSets[[v]])]),
predPars))
}
}
ps <- apply(preds,1,function(x)
levels(factor(x))[which.max(table(factor(x)))])
factor(ps,
levels=1:nlevels(train[,tgt]),
labels=levels(train[,tgt]))
}
Given the similarity of the tasks to be carried out by each of the classification algorithms, we have created a single user-defined workflow function that will receive as one of the parameters the learner that is to be used.
ALLb.wf <- function(form, train, test,
learner, learner.pars=NULL,
predictor="predict",predictor.pars=NULL,
featSel.meth = "s2",
available.fsMethods=list(s1=list("all"),s2=list('rf',30),
s3=list('varclus',30,50)),
.model=FALSE,
...)
{
## The characteristics of the selected feature selection method
fs.meth <- available.fsMethods[[featSel.meth]]
## The target variable
tgt <- as.character(form[[2]])
tgtCol <- which(colnames(train)==tgt)
## Anova filtering
f <- Anova(train[,tgt],p=0.01)
ff <- filterfun(f)
genes <- genefilter(t(train[,-tgtCol]),ff)
genes <- names(genes)[genes]
train <- train[,c(tgt,genes)]
test <- test[,c(tgt,genes)]
tgtCol <- 1
## Specific filtering
if (fs.meth[[1]]=='varclus') {
pred <- varsEnsemble(tgt,train,test,fs.meth,
learner,learner.pars,
predictor,predictor.pars,
list(...))
} else {
if (fs.meth[[1]]=='rf') {
require(randomForest,quietly=TRUE)
rf <- randomForest(form,train,importance=TRUE)
imp <- importance(rf)
rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"],
decreasing = TRUE)[1:fs.meth[[2]]]]
train <- train[,c(tgt,rf.genes)]
test <- test[,c(tgt,rf.genes)]
}
if (learner == 'knn')
pred <- kNN(form,train,test,
stand.stats=list(rowMedians(t(as.matrix(train[,-tgtCol]))),
rowIQRs(t(as.matrix(train[,-tgtCol])))),
...)
else {
model <- do.call(learner,c(list(form,train),learner.pars))
pred <- do.call(predictor,c(list(model,test),predictor.pars))
}
}
return(list(trues=responseValues(form,test), preds=pred,
model=if (.model && learner!="knn") model else NULL))
}
The following list holds all variants that are to be considered in our estimation experiments:
vars <- list()
vars$randomForest <- list(learner.pars=list(ntree=c(500,750,1000),
mtry=c(5,15)),
preditor.pars=list(type="response"))
vars$svm <- list(learner.pars=list(cost=c(1,100),
gamma=c(0.01,0.001,0.0001)))
vars$knn <- list(learner.pars=list(k=c(3,5,7),
stand=c(TRUE,FALSE)))
vars$featureSel <- list(featSel.meth=c("s1", "s2", "s3"))
The code to run the full experiments is the following
library(performanceEstimation)
library(class)
library(randomForest)
library(e1071)
library(genefilter)
es <- exprs(ALLb)
## simple filtering
ALLb <- nsFilter(ALLb,
var.func=IQR,var.cutoff=IQR(as.vector(es))/5,
feature.exclude="^AFFX")
ALLb <- ALLb$eset
dt <- data.frame(t(exprs(ALLb)),Mut=ALLb$mol.bio)
set.seed(1234)
## The learners to evaluate
TODO <- c('knn','svm','randomForest')
for(td in TODO) {
#browser()
assign(td,
performanceEstimation(
PredTask(Mut ~ .,dt,'ALL'),
do.call('workflowVariants',
c(list('ALLb.wf',learner=td,varsRootName=td),
vars[[td]],
vars$featureSel
)
),
EstimationTask(metrics="acc",method=Bootstrap(nReps=100)),
cluster=TRUE
)
)
save(list=td,file=paste(td,'Rdata',sep='.'))
}
## load results of the exps
load("knn.Rdata")
load("svm.Rdata")
load("randomForest.Rdata")
In order to have an overall perspective of all the workflows tried, we can join the three objects:
all.trials <- mergeEstimationRes(svm, knn, randomForest, by ="workflows")
rankWorkflows(all.trials, top=10, maxs = TRUE)
## $ALL
## $ALL$acc
## Workflow Estimate
## 1 svm.v8 0.8126319
## 2 svm.v12 0.8112365
## 3 knn.v7 0.8084350
## 4 knn.v8 0.8084350
## 5 knn.v9 0.8084350
## 6 knn.v10 0.8084350
## 7 knn.v11 0.8084350
## 8 knn.v12 0.8084350
## 9 svm.v10 0.8064391
## 10 svm.v6 0.8046412
no random forest variant appears in the top 10 solutions. The top score is obtained by a variant of the SVM method. Let us check its characteristics
getWorkflow("svm.v8", all.trials)
## Workflow Object:
## Workflow ID :: svm.v8
## Workflow Function :: ALLb.wf
## Parameter values:
## learner.pars -> cost=100 gamma=0.01
## learner -> svm
## featSel.meth -> s2
top10WFnames <- rankWorkflows(all.trials, top=10,
maxs = TRUE)[["ALL"]][["acc"]][,"Workflow"]
plot(subset(all.trials,workflows=top10WFnames))
here are no statistically significant differences among these 10 workflows
ps <- pairedComparisons(subset(all.trials,workflows=top10WFnames),baseline="svm.v8")
## Warning in pairedComparisons(subset(all.trials, workflows = top10WFnames), :
## With less 2 tasks the Friedman, Nemenyi and Bonferroni-Dunn tests are not
## calculated.
ps$acc$WilcoxonSignedRank.test
## , , ALL
##
## MedScore DiffMedScores p.value
## svm.v8 0.8235294 NA NA
## svm.v12 0.8235294 0.000000000 0.7000463
## knn.v7 0.8169856 0.006543766 0.7941389
## knn.v8 0.8169856 0.006543766 0.7941389
## knn.v9 0.8169856 0.006543766 0.7941389
## knn.v10 0.8169856 0.006543766 0.7941389
## knn.v11 0.8169856 0.006543766 0.7941389
## knn.v12 0.8169856 0.006543766 0.7941389
## svm.v10 0.8086312 0.014898200 0.2240278
## svm.v6 0.8055556 0.017973856 0.4055620