install.packages("BiocManager")
Error in install.packages : Updating loaded packages
BiocManager::install("Biobase")
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
CRAN: https://cran.rstudio.com/
Bioconductor version 3.16 (BiocManager 1.30.20), R 4.2.2 (2022-10-31)
Warning: package(s) not installed when version(s) same as or greater than current; use
`force = TRUE` to re-install: 'Biobase'
BiocManager::install("ALL")
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
CRAN: https://cran.rstudio.com/
Bioconductor version 3.16 (BiocManager 1.30.20), R 4.2.2 (2022-10-31)
Installing package(s) 'ALL'
Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/data/annotation/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/data/annotation/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.gz': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/data/experiment/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/data/experiment/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.gz': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/workflows/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/workflows/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.gz': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/books/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/books/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.gz': HTTP status was '404 Not Found'installing the source package ‘ALL’
trying URL 'https://bioconductor.org/packages/3.16/data/experiment/src/contrib/ALL_1.40.0.tar.gz'
Content type 'application/x-gzip' length 11383132 bytes (10.9 MB)
==================================================
downloaded 10.9 MB
* installing *source* package ‘ALL’ ...
** using staged installation
** R
** data
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (ALL)
The downloaded source packages are in
‘/private/var/folders/v3/vq_nw_j931j4mvyt1xqc2dr80000gn/T/Rtmp3RlWFu/downloaded_packages’
library(Biobase)
library(ALL)
data(ALL)
ALL
ExpressionSet (storageMode: lockedEnvironment)
assayData: 12625 features, 128 samples
element names: exprs
protocolData: none
phenoData
sampleNames: 01005 01010 ... LAL4 (128 total)
varLabels: cod diagnosis ... date last seen (21 total)
varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
pubMedIds: 14684422 16243790
Annotation: hgu95av2
pD <- phenoData(ALL)
varMetadata(pD)
table(ALL$BT)
B B1 B2 B3 B4 T T1 T2 T3 T4
5 19 36 23 12 5 1 15 10 2
table(ALL$mol.biol)
ALL1/AF4 BCR/ABL E2A/PBX1 NEG NUP-98 p15/p16
10 37 5 74 1 1
table(ALL$BT, ALL$mol.bio)
ALL1/AF4 BCR/ABL E2A/PBX1 NEG NUP-98 p15/p16
B 0 2 1 2 0 0
B1 10 1 0 8 0 0
B2 0 19 0 16 0 1
B3 0 8 1 14 0 0
B4 0 7 3 2 0 0
T 0 0 0 5 0 0
T1 0 0 0 1 0 0
T2 0 0 0 15 0 0
T3 0 0 0 9 1 0
T4 0 0 0 2 0 0
featureNames(ALL)[1:10]
[1] "1000_at" "1001_at" "1002_f_at" "1003_s_at" "1004_at"
[6] "1005_at" "1006_at" "1007_s_at" "1008_f_at" "1009_at"
sampleNames(ALL)[1:5]
[1] "01005" "01010" "03002" "04006" "04007"
tgt.cases <- which(ALL$BT %in% levels(ALL$BT)[1:5] &
ALL$mol.bio %in% levels(ALL$mol.bio)[1:4])
ALLb <- ALL[,tgt.cases]
ALLb
ExpressionSet (storageMode: lockedEnvironment)
assayData: 12625 features, 94 samples
element names: exprs
protocolData: none
phenoData
sampleNames: 01005 01010 ... LAL5 (94 total)
varLabels: cod diagnosis ... date last seen (21 total)
varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
pubMedIds: 14684422 16243790
Annotation: hgu95av2
ALLb$BT <- factor(ALLb$BT)
ALLb$mol.bio <- factor(ALLb$mol.bio)
save(ALLb, file = "myALL.Rdata")
es <- exprs(ALLb)
dim(es)
[1] 12625 94
summary(as.vector(es))
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.985 4.122 5.469 5.624 6.829 14.045
BiocManager::install("genefilter")
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
CRAN: https://cran.rstudio.com/
Bioconductor version 3.16 (BiocManager 1.30.20), R 4.2.2 (2022-10-31)
Warning: package(s) not installed when version(s) same as or greater than
current; use `force = TRUE` to re-install: 'genefilter'
#source("http://bioconductor.org/biocLite.R")
library(genefilter)
library(genefilter)
library(ggplot2)
exprVs <- data.frame(exprVal=as.vector(es))
ds <- data.frame(Stat=c("1stQ","Median","3rdQ","Shorth"),
Value=c(quantile(exprVs$exprVal,
probs=c(0.25, 0.5, 0.75)),
shorth(exprVs$exprVal)),
Color=c("red","green","red","yellow"))
ggplot(exprVs,aes(x=exprVal)) + geom_histogram(fill="lightgrey") +
geom_vline(data=ds,aes(xintercept=Value,color=Color)) +
geom_text(data=ds,aes(x=Value-0.2,y=0,label=Stat,colour=Color),
angle=90,hjust="left") +
xlab("Expression Levels") + guides(colour="none", fill="none")

library(genefilter)
library(ggplot2)
exprVs <- data.frame(exprVal=as.vector(es))
ds <- data.frame(Stat=c("1stQ","Median","3rdQ","Shorth"),
Value=c(quantile(exprVs$exprVal, probs=c(0.25, 0.5, 0.75)),
shorth(exprVs$exprVal)),
Color=c("red","green","red","yellow"))
ggplot(exprVs,aes(x=exprVal)) + geom_histogram(fill="lightgrey") +
geom_vline(data=ds,aes(xintercept=Value,color=Color)) +
geom_text(data=ds,aes(x=Value-0.2,y=0,label=Stat,colour=Color),angle=90,hjust="left") +
xlab("Expression Levels") + guides(colour="none", fill="none")

sapply(levels(ALLb$mol.bio),
function(x) summary(as.vector(es[, which(ALLb$mol.bio == x)])))
ALL1/AF4 BCR/ABL E2A/PBX1 NEG
Min. 2.266403 2.194858 2.267802 1.984919
1st Qu. 4.140797 4.124414 4.152339 4.111435
Median 5.454388 5.467834 5.496825 5.469887
Mean 5.620976 5.627016 5.630445 5.621616
3rd Qu. 6.805440 6.832862 6.818587 6.832056
Max. 14.031235 14.044896 13.812578 13.949564
library(hgu95av2.db)
Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: IRanges
Loading required package: S4Vectors
Attaching package: ‘S4Vectors’
The following object is masked from ‘package:xts’:
first
The following objects are masked from ‘package:dplyr’:
first, rename
The following objects are masked from ‘package:base’:
expand.grid, I, unname
Attaching package: ‘IRanges’
The following object is masked from ‘package:sp’:
%over%
The following objects are masked from ‘package:dplyr’:
collapse, desc, slice
Attaching package: ‘AnnotationDbi’
The following object is masked from ‘package:dplyr’:
select
Loading required package: org.Hs.eg.db
rowIQRs <- function(em)
rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
library(ggplot2)
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
xlab("Median expression level") + ylab("IQR expression level") +
ggtitle("Main Characteristics of Genes Expression Levels")

rowIQRs <- function(em) rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
library(ggplot2)
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
xlab("Median expression level") + ylab("IQR expression level") +
ggtitle("Main Characteristics of Genes Expression Levels")

library(genefilter)
resFilter <- nsFilter(ALLb,
var.func=IQR,
var.cutoff=IQR(as.vector(es))/5,
feature.exclude="^AFFX")
resFilter
$eset
ExpressionSet (storageMode: lockedEnvironment)
assayData: 4025 features, 94 samples
element names: exprs
protocolData: none
phenoData
sampleNames: 01005 01010 ... LAL5 (94 total)
varLabels: cod diagnosis ... mol.bio (22 total)
varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
pubMedIds: 14684422 16243790
Annotation: hgu95av2
$filter.log
$filter.log$numDupsRemoved
[1] 2888
$filter.log$numLowVar
[1] 4751
$filter.log$numRemoved.ENTREZID
[1] 942
$filter.log$feature.exclude
[1] 19
ALLb <- resFilter$eset
es <- exprs(ALLb)
dim(es)
[1] 4025 94
f <- Anova(ALLb$mol.bio, p = 0.01)
ff <- filterfun(f)
selGenes <- genefilter(exprs(ALLb), ff)
sum(selGenes)
[1] 751
ALLb <- ALLb[selGenes, ]
ALLb
ExpressionSet (storageMode: lockedEnvironment)
assayData: 751 features, 94 samples
element names: exprs
protocolData: none
phenoData
sampleNames: 01005 01010 ... LAL5 (94 total)
varLabels: cod diagnosis ... mol.bio (22 total)
varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
pubMedIds: 14684422 16243790
Annotation: hgu95av2
es <- exprs(ALLb)
dim(es)
[1] 751 94
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
xlab("Median expression level") + ylab("IQR expression level") +
ggtitle("Distribution Properties of the Selected Genes")

library(randomForest)
dt <- data.frame(t(es), Mut = ALLb$mol.bio)
dt$Mut <- droplevels(dt$Mut)
set.seed(1234)
rf <- randomForest(Mut ~ ., dt, importance = TRUE)
imp <- importance(rf)
rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"],
decreasing = TRUE)[1:30]]
library(tidyr)
Attaching package: ‘tidyr’
The following object is masked from ‘package:S4Vectors’:
expand
library(dplyr)
d <- gather(dt[,c(rf.genes,"Mut")],Gene,ExprValue,1:length(rf.genes))
dat <- group_by(d,Mut,Gene) %>%
summarise(med=median(ExprValue), iqr=IQR(ExprValue))
`summarise()` has grouped output by 'Mut'. You can override using the `.groups` argument.
ggplot(dat, aes(x=med,y=iqr,color=Mut)) +
geom_point(size=6) + facet_wrap(~ Gene) +
labs(x="MEDIAN expression level",y="IQR expression level",color="Mutation")

install.packages('Hmisc')
also installing the dependency ‘htmlTable’
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/htmlTable_2.4.1.tgz'
Content type 'application/x-gzip' length 419727 bytes (409 KB)
==================================================
downloaded 409 KB
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/Hmisc_5.0-1.tgz'
Content type 'application/x-gzip' length 3434580 bytes (3.3 MB)
==================================================
downloaded 3.3 MB
The downloaded binary packages are in
/var/folders/v3/vq_nw_j931j4mvyt1xqc2dr80000gn/T//Rtmp3RlWFu/downloaded_packages
library(Hmisc)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘Hmisc’
The following object is masked from ‘package:AnnotationDbi’:
contents
The following object is masked from ‘package:Biobase’:
contents
The following objects are masked from ‘package:TeachingDemos’:
cnvrt.coords, subplot
The following object is masked from ‘package:e1071’:
impute
The following object is masked from ‘package:quantmod’:
Lag
The following objects are masked from ‘package:dplyr’:
src, summarize
The following objects are masked from ‘package:base’:
format.pval, units
vc <- varclus(t(es))
clus30 <- cutree(vc$hclust, 30)
table(clus30)
clus30
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
21 16 37 18 44 19 28 38 28 30 35 54 20 47 18 45 17 14 25 18 22 31 28
24 25 26 27 28 29 30
15 7 15 11 18 19 13
getVarsSet <- function(cluster,nvars=30,seed=NULL,verb=FALSE) {
if (!is.null(seed)) set.seed(seed)
cls <- cutree(cluster,nvars)
tots <- table(cls)
vars <- c()
vars <- sapply(1:nvars,function(clID)
{
if (!length(tots[clID])) stop('Empty cluster! (',clID,')')
x <- sample(1:tots[clID],1)
names(cls[cls==clID])[x]
})
if (verb) structure(vars,clusMemb=cls,clusTots=tots)
else vars
}
getVarsSet(vc$hclust)
[1] "37579_at" "33371_s_at" "34362_at" "36008_at" "39224_at"
[6] "1207_at" "36403_s_at" "41123_s_at" "2036_s_at" "37842_at"
[11] "36188_at" "106_at" "41853_at" "41715_at" "41126_at"
[16] "763_at" "39049_at" "1389_at" "37015_at" "41872_at"
[21] "32607_at" "2062_at" "37304_at" "33266_at" "32612_at"
[26] "905_at" "37027_at" "38081_at" "39649_at" "33777_at"
getVarsSet(vc$hclust)
[1] "41139_at" "32585_at" "31508_at" "39410_at" "36089_at"
[6] "33796_at" "32169_at" "36577_at" "32725_at" "38352_at"
[11] "41606_at" "40480_s_at" "38651_at" "37039_at" "37251_s_at"
[16] "40419_at" "38014_at" "317_at" "33339_g_at" "33305_at"
[21] "36203_at" "37680_at" "41146_at" "33429_at" "37485_at"
[26] "40369_f_at" "36142_at" "38078_at" "40425_at" "40454_at"
library(performanceEstimation)
library(DMwR2)
data(iris)
exp <- performanceEstimation(
PredTask(Species ~ ., iris),
Workflow(learner="rpartXse", predictor.pars=list(type="class")),
EstimationTask(metrics="acc",method=Bootstrap(nReps=100)))
##### PERFORMANCE ESTIMATION USING BOOTSTRAP #####
** PREDICTIVE TASK :: iris.Species
++ MODEL/WORKFLOW :: rpartXse
Task for estimating acc using
100 repetitions of e0 Bootstrap experiment
Run with seed = 1234
Iteration : 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
summary(exp)
== Summary of a Bootstrap Performance Estimation Experiment ==
Task for estimating acc using
100 repetitions of e0 Bootstrap experiment
Run with seed = 1234
* Predictive Tasks :: iris.Species
* Workflows :: rpartXse
-> Task: iris.Species
*Workflow: rpartXse
acc
avg 0.94183656
std 0.02784701
med 0.94392034
iqr 0.03637634
min 0.86666667
max 1.00000000
invalid 0.00000000
#install.packages('class')
library(class)
data(iris)
idx <- sample(1:nrow(iris), as.integer(0.7 * nrow(iris)))
tr <- iris[idx, ]
ts <- iris[-idx, ]
preds <- knn(tr[, -5], ts[, -5], tr[, 5], k = 3)
table(preds, ts[, 5])
preds setosa versicolor virginica
setosa 15 0 0
versicolor 0 10 3
virginica 0 1 16
kNN <- function(form, train, test, stand = TRUE, stand.stats = NULL, ...) {
require(class, quietly = TRUE)
tgtCol <- which(colnames(train) == as.character(form[[2]]))
if (stand) {
if (is.null(stand.stats))
tmp <- scale(train[, -tgtCol], center = TRUE, scale = TRUE)
else tmp <- scale(train[, -tgtCol], center = stand.stats[[1]],
scale = stand.stats[[2]])
train[, -tgtCol] <- tmp
ms <- attr(tmp, "scaled:center")
ss <- attr(tmp, "scaled:scale")
test[, -tgtCol] <- scale(test[, -tgtCol], center = ms, scale = ss)
}
knn(train[, -tgtCol], test[, -tgtCol], train[, tgtCol], ...)
}
preds.stand <- kNN(Species ~ ., tr, ts, k = 3)
table(preds.stand,ts[, 5])
preds.stand setosa versicolor virginica
setosa 15 0 0
versicolor 0 10 5
virginica 0 1 14
preds.notStand <- kNN(Species ~ ., tr, ts, stand = FALSE, k = 3)
table(preds.notStand, ts[, 5])
preds.notStand setosa versicolor virginica
setosa 15 0 0
versicolor 0 10 3
virginica 0 1 16
varsEnsemble <- function(tgt,train,test,
fs.meth,
baseLearner,blPars,
predictor,predPars,
verb=FALSE)
{
require(Hmisc,quietly=TRUE)
v <- varclus(as.matrix(train[,-which(colnames(train)==tgt)]))
varsSets <- lapply(1:fs.meth[[3]],function(x)
getVarsSet(v$hclust,nvars=fs.meth[[2]]))
preds <- matrix(NA,ncol=length(varsSets),nrow=NROW(test))
for(v in seq(along=varsSets)) {
if (baseLearner=='knn')
preds[,v] <- do.call("kNN",
c(list(as.formula(paste(tgt,
paste(varsSets[[v]],
collapse='+'),
sep='~')),
train[,c(tgt,varsSets[[v]])],
test[,c(tgt,varsSets[[v]])]),
blPars)
)
else {
m <- do.call(baseLearner,
c(list(as.formula(paste(tgt,
paste(varsSets[[v]],
collapse='+'),
sep='~')),
train[,c(tgt,varsSets[[v]])]),
blPars)
)
preds[,v] <- do.call(predictor,
c(list(m,test[,c(tgt,varsSets[[v]])]),
predPars))
}
}
ps <- apply(preds,1,function(x)
levels(factor(x))[which.max(table(factor(x)))])
factor(ps,
levels=1:nlevels(train[,tgt]),
labels=levels(train[,tgt]))
}
ALLb.wf <- function(form, train, test,
learner, learner.pars=NULL,
predictor="predict",predictor.pars=NULL,
featSel.meth = "s2",
available.fsMethods=list(s1=list("all"),s2=list('rf',30),
s3=list('varclus',30,50)),
.model=FALSE,
...)
{
## The characteristics of the selected feature selection method
fs.meth <- available.fsMethods[[featSel.meth]]
## The target variable
tgt <- as.character(form[[2]])
tgtCol <- which(colnames(train)==tgt)
## Anova filtering
f <- Anova(train[,tgt],p=0.01)
ff <- filterfun(f)
genes <- genefilter(t(train[,-tgtCol]),ff)
genes <- names(genes)[genes]
train <- train[,c(tgt,genes)]
test <- test[,c(tgt,genes)]
tgtCol <- 1
## Specific filtering
if (fs.meth[[1]]=='varclus') {
pred <- varsEnsemble(tgt,train,test,fs.meth,
learner,learner.pars,
predictor,predictor.pars,
list(...))
} else {
if (fs.meth[[1]]=='rf') {
require(randomForest,quietly=TRUE)
rf <- randomForest(form,train,importance=TRUE)
imp <- importance(rf)
rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"],
decreasing = TRUE)[1:fs.meth[[2]]]]
train <- train[,c(tgt,rf.genes)]
test <- test[,c(tgt,rf.genes)]
}
if (learner == 'knn')
pred <- kNN(form,train,test,
stand.stats=list(rowMedians(t(as.matrix(train[,-tgtCol]))),
rowIQRs(t(as.matrix(train[,-tgtCol])))),
...)
else {
model <- do.call(learner,c(list(form,train),learner.pars))
pred <- do.call(predictor,c(list(model,test),predictor.pars))
}
}
return(list(trues=responseValues(form,test), preds=pred,
model=if (.model && learner!="knn") model else NULL))
}
vars <- list()
vars$randomForest <- list(learner.pars=list(ntree=c(500,750,1000),
mtry=c(5,15)),
preditor.pars=list(type="response"))
vars$svm <- list(learner.pars=list(cost=c(1,100),
gamma=c(0.01,0.001,0.0001)))
vars$knn <- list(learner.pars=list(k=c(3,5,7),
stand=c(TRUE,FALSE)))
vars$featureSel <- list(featSel.meth=c("s1", "s2", "s3"))
library(performanceEstimation)
library(class)
library(randomForest)
randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.
library(e1071)
library(genefilter)
load('myALL.Rdata') # loading the previously saved object with the data
Warning: cannot open compressed file 'myALL.Rdata', probable reason 'No such file or directory'Error in readChar(con, 5L, useBytes = TRUE) : cannot open the connection
setwd("~/Downloads")
Warning: The working directory was changed to /Users/christianpetersen/Downloads inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
load("myALL.Rdata")
load("knn.Rdata")
load("svm.Rdata")
load("randomForest.Rdata")
rankWorkflows(svm, maxs = TRUE)
$ALL
$ALL$acc
NANA
rankWorkflows(all.trials, top=10, maxs = TRUE)
$ALL
$ALL$acc
NANA
getWorkflow("svm.v8", all.trials)
Workflow Object:
Workflow ID :: svm.v8
Workflow Function :: ALLb.wf
Parameter values:
learner.pars -> cost=100 gamma=0.01
learner -> svm
featSel.meth -> s2
top10WFnames <- rankWorkflows(all.trials, top=10,
maxs = TRUE)[["ALL"]][["acc"]][,"Workflow"]
sapply(top10WFnames, function(WFid) getWorkflow(WFid,all.trials)@pars$featSel.meth)
svm.v8 svm.v12 knn.v7 knn.v8 knn.v9 knn.v10 knn.v11 knn.v12
"s2" "s2" "s2" "s2" "s2" "s2" "s2" "s2"
svm.v10 svm.v6
"s2" "s1"
plot(subset(all.trials,workflows=top10WFnames))

ps <- pairedComparisons(subset(all.trials,workflows=top10WFnames),baseline="svm.v8")
Warning: With less 2 tasks the Friedman, Nemenyi and Bonferroni-Dunn tests are not calculated.
ps$acc$WilcoxonSignedRank.test
, , ALL
MedScore DiffMedScores p.value
svm.v8 0.8235294 NA NA
svm.v12 0.8235294 0.000000000 0.7000463
knn.v7 0.8169856 0.006543766 0.7941389
knn.v8 0.8169856 0.006543766 0.7941389
knn.v9 0.8169856 0.006543766 0.7941389
knn.v10 0.8169856 0.006543766 0.7941389
knn.v11 0.8169856 0.006543766 0.7941389
knn.v12 0.8169856 0.006543766 0.7941389
svm.v10 0.8086312 0.014898200 0.2240278
svm.v6 0.8055556 0.017973856 0.4055620
iteration <- 1 # any number between 1 and 100 in this case
itInfo <- getIterationsInfo(all.trials,workflow="svm.v8",it=iteration)
table(itInfo$trues, itInfo$preds)
ALL1/AF4 BCR/ABL E2A/PBX1 NEG
ALL1/AF4 3 0 0 0
BCR/ABL 0 12 0 3
E2A/PBX1 0 0 0 1
NEG 0 1 1 14
---
title: "Case Study 4"
output: html_notebook
---


```{r}
install.packages("BiocManager")
library(BiocManager)

```

```{r}

```

```{r}
BiocManager::install("Biobase")
BiocManager::install("ALL")
library(Biobase)
library(ALL)
data(ALL)
```

```{r}
ALL
```
```{r}

pD <- phenoData(ALL)
varMetadata(pD)
table(ALL$BT)
table(ALL$mol.biol)
table(ALL$BT, ALL$mol.bio)
```


```{r}
featureNames(ALL)[1:10]
```


```{r}
sampleNames(ALL)[1:5]
```


```{r}
tgt.cases <- which(ALL$BT %in% levels(ALL$BT)[1:5] & 
                   ALL$mol.bio %in% levels(ALL$mol.bio)[1:4])
ALLb <- ALL[,tgt.cases]
ALLb
```


```{r}
ALLb$BT <- factor(ALLb$BT)
ALLb$mol.bio <- factor(ALLb$mol.bio)
```



```{r}

save(ALLb, file = "myALL.Rdata")
```



```{r}
es <- exprs(ALLb)
dim(es)
```
```{r}
summary(as.vector(es))
```


```{r}
#BiocManager::install("genefilter")

library(genefilter)
```

```{r}
library(genefilter)
library(ggplot2)
exprVs <- data.frame(exprVal=as.vector(es))
ds <- data.frame(Stat=c("1stQ","Median","3rdQ","Shorth"),
                 Value=c(quantile(exprVs$exprVal, 
                         probs=c(0.25, 0.5, 0.75)),
                         shorth(exprVs$exprVal)),
                 Color=c("red","green","red","yellow"))
ggplot(exprVs,aes(x=exprVal)) + geom_histogram(fill="lightgrey") + 
    geom_vline(data=ds,aes(xintercept=Value,color=Color)) + 
    geom_text(data=ds,aes(x=Value-0.2,y=0,label=Stat,colour=Color),
              angle=90,hjust="left") +
    xlab("Expression Levels") + guides(colour="none", fill="none") 
```

```{r}
library(genefilter)
library(ggplot2)
exprVs <- data.frame(exprVal=as.vector(es))
ds <- data.frame(Stat=c("1stQ","Median","3rdQ","Shorth"),
                 Value=c(quantile(exprVs$exprVal, probs=c(0.25, 0.5, 0.75)),
                     shorth(exprVs$exprVal)),
                 Color=c("red","green","red","yellow"))
ggplot(exprVs,aes(x=exprVal)) + geom_histogram(fill="lightgrey") + 
    geom_vline(data=ds,aes(xintercept=Value,color=Color)) + 
    geom_text(data=ds,aes(x=Value-0.2,y=0,label=Stat,colour=Color),angle=90,hjust="left") +
    xlab("Expression Levels") + guides(colour="none", fill="none")    
```


```{r}
sapply(levels(ALLb$mol.bio), 
       function(x) summary(as.vector(es[, which(ALLb$mol.bio == x)])))
```


```{r}
BiocManager::install("hgu95av2.db")

library(hgu95av2.db)
```


```{r}
rowIQRs <- function(em) 
    rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
library(ggplot2)
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
    xlab("Median expression level") + ylab("IQR expression level") +
    ggtitle("Main Characteristics of Genes Expression Levels")
```


```{r}
rowIQRs <- function(em) rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
library(ggplot2)
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
    xlab("Median expression level") + ylab("IQR expression level") +
    ggtitle("Main Characteristics of Genes Expression Levels")
```


```{r}
library(genefilter)
resFilter <- nsFilter(ALLb,
                 var.func=IQR,
                 var.cutoff=IQR(as.vector(es))/5, 
                 feature.exclude="^AFFX")
resFilter
```

```{r}
ALLb <- resFilter$eset
es <- exprs(ALLb)
dim(es)

```

```{r}
f <- Anova(ALLb$mol.bio, p = 0.01)
ff <- filterfun(f)
selGenes <- genefilter(exprs(ALLb), ff)
sum(selGenes)
```


```{r}
ALLb <- ALLb[selGenes, ]
ALLb
es <- exprs(ALLb)
dim(es)
```

```{r}
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
    xlab("Median expression level") + ylab("IQR expression level") +
    ggtitle("Distribution Properties of the Selected Genes")
```


```{r}
library(randomForest)
dt <- data.frame(t(es), Mut = ALLb$mol.bio)
dt$Mut <- droplevels(dt$Mut)
set.seed(1234)
rf <- randomForest(Mut ~ ., dt, importance = TRUE)
imp <- importance(rf)
rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"], 
                                decreasing = TRUE)[1:30]]
```

```{r}
library(tidyr)
library(dplyr)
d <- gather(dt[,c(rf.genes,"Mut")],Gene,ExprValue,1:length(rf.genes))
dat <- group_by(d,Mut,Gene) %>% 
    summarise(med=median(ExprValue), iqr=IQR(ExprValue))
ggplot(dat, aes(x=med,y=iqr,color=Mut)) +  
    geom_point(size=6) + facet_wrap(~ Gene) + 
    labs(x="MEDIAN expression level",y="IQR expression level",color="Mutation")
```


```{r}
#install.packages('Hmisc')
library(Hmisc)
vc <- varclus(t(es))
clus30 <- cutree(vc$hclust, 30)
table(clus30)


```


```{r}
getVarsSet <- function(cluster,nvars=30,seed=NULL,verb=FALSE) {
    if (!is.null(seed)) set.seed(seed)
    
    cls <- cutree(cluster,nvars)
    tots <- table(cls)
    vars <- c()
    vars <- sapply(1:nvars,function(clID)
    {
        if (!length(tots[clID])) stop('Empty cluster! (',clID,')')
        x <- sample(1:tots[clID],1)
        names(cls[cls==clID])[x]
    })
    if (verb)  structure(vars,clusMemb=cls,clusTots=tots)
    else       vars
}
getVarsSet(vc$hclust)
getVarsSet(vc$hclust)

```



```{r}
library(performanceEstimation)
library(DMwR2)
data(iris)
exp <- performanceEstimation(
    PredTask(Species ~ ., iris), 
    Workflow(learner="rpartXse", predictor.pars=list(type="class")),
    EstimationTask(metrics="acc",method=Bootstrap(nReps=100)))
```


```{r}
summary(exp)

```


```{r}
#install.packages('class')
library(class)
data(iris)
idx <- sample(1:nrow(iris), as.integer(0.7 * nrow(iris)))
tr <- iris[idx, ]
ts <- iris[-idx, ]
preds <- knn(tr[, -5], ts[, -5], tr[, 5], k = 3)
table(preds, ts[, 5])
```




```{r}
kNN <- function(form, train, test, stand = TRUE, stand.stats = NULL, ...) {
    require(class, quietly = TRUE)
    tgtCol <- which(colnames(train) == as.character(form[[2]]))
    if (stand) {
        if (is.null(stand.stats)) 
            tmp <- scale(train[, -tgtCol], center = TRUE, scale = TRUE)
        else tmp <- scale(train[, -tgtCol], center = stand.stats[[1]], 
                          scale = stand.stats[[2]])
        train[, -tgtCol] <- tmp
        ms <- attr(tmp, "scaled:center")
        ss <- attr(tmp, "scaled:scale")
        test[, -tgtCol] <- scale(test[, -tgtCol], center = ms, scale = ss)
    }
    knn(train[, -tgtCol], test[, -tgtCol], train[, tgtCol], ...)
}

```

```{r}
preds.stand <- kNN(Species ~ ., tr, ts, k = 3)
table(preds.stand,ts[, 5])
preds.notStand <- kNN(Species ~ ., tr, ts, stand = FALSE, k = 3)
table(preds.notStand, ts[, 5]) 

```

```{r}
varsEnsemble <- function(tgt,train,test,
                         fs.meth,
                         baseLearner,blPars,
                         predictor,predPars,
                         verb=FALSE)
{
    require(Hmisc,quietly=TRUE)
    v <- varclus(as.matrix(train[,-which(colnames(train)==tgt)]))
    varsSets <- lapply(1:fs.meth[[3]],function(x)
        getVarsSet(v$hclust,nvars=fs.meth[[2]]))
    
    preds <- matrix(NA,ncol=length(varsSets),nrow=NROW(test))
    for(v in seq(along=varsSets)) {
        if (baseLearner=='knn')
            preds[,v] <- do.call("kNN",
                                 c(list(as.formula(paste(tgt,
                                                 paste(varsSets[[v]],
                                                       collapse='+'),
                                                 sep='~')),
                                        train[,c(tgt,varsSets[[v]])],
                                        test[,c(tgt,varsSets[[v]])]),
                                   blPars)
                                 )
        else {
            m <- do.call(baseLearner,
                         c(list(as.formula(paste(tgt,
                                                 paste(varsSets[[v]],
                                                       collapse='+'),
                                                 sep='~')),
                                train[,c(tgt,varsSets[[v]])]),
                           blPars)
                         )
            preds[,v] <- do.call(predictor,
                                 c(list(m,test[,c(tgt,varsSets[[v]])]),
                                   predPars))
        }
    }

    ps <- apply(preds,1,function(x)
        levels(factor(x))[which.max(table(factor(x)))])
    factor(ps,
           levels=1:nlevels(train[,tgt]),
           labels=levels(train[,tgt]))
}
```

```{r}
ALLb.wf <- function(form, train, test,
                    learner, learner.pars=NULL,
                    predictor="predict",predictor.pars=NULL,
                    featSel.meth = "s2", 
                    available.fsMethods=list(s1=list("all"),s2=list('rf',30),
                                             s3=list('varclus',30,50)),
                    .model=FALSE,
                    ...)
{
    ## The characteristics of the selected feature selection method
    fs.meth <- available.fsMethods[[featSel.meth]] 
    
    ## The target variable
    tgt <- as.character(form[[2]])
    tgtCol <- which(colnames(train)==tgt)

    ## Anova filtering  
    f <- Anova(train[,tgt],p=0.01)
    ff <- filterfun(f)
    genes <- genefilter(t(train[,-tgtCol]),ff)
    genes <- names(genes)[genes]
    train <- train[,c(tgt,genes)]
    test <- test[,c(tgt,genes)]
    tgtCol <- 1

    ## Specific filtering 
    if (fs.meth[[1]]=='varclus') {
      pred <- varsEnsemble(tgt,train,test,fs.meth,
                            learner,learner.pars,
                            predictor,predictor.pars,
                            list(...))

    } else {
      if (fs.meth[[1]]=='rf') {
        require(randomForest,quietly=TRUE)
        rf <- randomForest(form,train,importance=TRUE)
        imp <- importance(rf)
        rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"], 
                                        decreasing = TRUE)[1:fs.meth[[2]]]]
        train <- train[,c(tgt,rf.genes)]
        test <- test[,c(tgt,rf.genes)]
      }

      if (learner == 'knn') 
        pred <- kNN(form,train,test,
                    stand.stats=list(rowMedians(t(as.matrix(train[,-tgtCol]))),
                        rowIQRs(t(as.matrix(train[,-tgtCol])))),
                    ...)
      else {
        model <- do.call(learner,c(list(form,train),learner.pars))
        pred <- do.call(predictor,c(list(model,test),predictor.pars))
      }

    }

    return(list(trues=responseValues(form,test), preds=pred, 
                model=if (.model && learner!="knn") model else NULL))

}
```


```{r}
vars <- list()
vars$randomForest <- list(learner.pars=list(ntree=c(500,750,1000),
                                            mtry=c(5,15)),
                          preditor.pars=list(type="response"))
vars$svm <- list(learner.pars=list(cost=c(1,100),
                                   gamma=c(0.01,0.001,0.0001)))
vars$knn <- list(learner.pars=list(k=c(3,5,7),
                                   stand=c(TRUE,FALSE)))
vars$featureSel <- list(featSel.meth=c("s1", "s2", "s3"))

```

```{r}
library(performanceEstimation)
library(class)
library(randomForest)
library(e1071)
library(genefilter)
load('myALL.Rdata')  # loading the previously saved object with the data

es <- exprs(ALLb)

## simple filtering
ALLb <- nsFilter(ALLb,
                 var.func=IQR,var.cutoff=IQR(as.vector(es))/5, 
                 feature.exclude="^AFFX")
ALLb <- ALLb$eset

## the source dataset after the basic filtering
dt <- data.frame(t(exprs(ALLb)),Mut=ALLb$mol.bio)

set.seed(1234)
## The learners to evaluate
TODO <- c('knn','svm','randomForest')
for(td in TODO) {
    assign(td,
         performanceEstimation(
             PredTask(Mut ~ .,dt,'ALL'),
             do.call('workflowVariants',
                     c(list('ALLb.wf',learner=td,varsRootName=td),
                       vars[[td]],
                       vars$featureSel
                       )
                     ),
             EstimationTask(metrics="acc",method=Bootstrap(nReps=100)),
             cluster=TRUE
         )
         )
    save(list=td,file=paste(td,'Rdata',sep='.'))
}
```

```{r}
setwd("~/Downloads")

load("myALL.Rdata")
load("knn.Rdata")
load("svm.Rdata")
load("randomForest.Rdata")
```

```{r}
rankWorkflows(svm, maxs = TRUE)
```

```{r}
all.trials <- mergeEstimationRes(svm, knn, randomForest, by ="workflows")

```

```{r}
rankWorkflows(all.trials, top=10, maxs = TRUE)

```

```{r}
getWorkflow("svm.v8", all.trials)

```

```{r}
top10WFnames <- rankWorkflows(all.trials, top=10, 
                              maxs = TRUE)[["ALL"]][["acc"]][,"Workflow"]
sapply(top10WFnames, function(WFid) getWorkflow(WFid,all.trials)@pars$featSel.meth)
```


```{r}
plot(subset(all.trials,workflows=top10WFnames))

```

```{r}
ps <- pairedComparisons(subset(all.trials,workflows=top10WFnames),baseline="svm.v8")
ps$acc$WilcoxonSignedRank.test
```

```{r}
iteration <- 1  # any number between 1 and 100 in this case
itInfo <- getIterationsInfo(all.trials,workflow="svm.v8",it=iteration)
table(itInfo$trues, itInfo$preds)
```












































































