install.packages("BiocManager")
Error in install.packages : Updating loaded packages
BiocManager::install("Biobase")
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.rstudio.com/
Bioconductor version 3.16 (BiocManager 1.30.20), R 4.2.2 (2022-10-31)
Warning: package(s) not installed when version(s) same as or greater than current; use
  `force = TRUE` to re-install: 'Biobase'
BiocManager::install("ALL")
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.rstudio.com/
Bioconductor version 3.16 (BiocManager 1.30.20), R 4.2.2 (2022-10-31)
Installing package(s) 'ALL'
Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/data/annotation/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/data/annotation/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.gz': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/data/experiment/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/data/experiment/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.gz': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/workflows/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/workflows/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.gz': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/books/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.rds': HTTP status was '404 Not Found'Warning: downloaded length 0 != reported length 6873Warning: cannot open URL 'https://bioconductor.org/packages/3.16/books/bin/macosx/big-sur-arm64/contrib/4.2/PACKAGES.gz': HTTP status was '404 Not Found'installing the source package ‘ALL’

trying URL 'https://bioconductor.org/packages/3.16/data/experiment/src/contrib/ALL_1.40.0.tar.gz'
Content type 'application/x-gzip' length 11383132 bytes (10.9 MB)
==================================================
downloaded 10.9 MB

* installing *source* package ‘ALL’ ...
** using staged installation
** R
** data
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (ALL)

The downloaded source packages are in
    ‘/private/var/folders/v3/vq_nw_j931j4mvyt1xqc2dr80000gn/T/Rtmp3RlWFu/downloaded_packages’
library(Biobase)
library(ALL)
data(ALL)
ALL
ExpressionSet (storageMode: lockedEnvironment)
assayData: 12625 features, 128 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: 01005 01010 ... LAL4 (128 total)
  varLabels: cod diagnosis ... date last seen (21 total)
  varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
  pubMedIds: 14684422 16243790 
Annotation: hgu95av2 

pD <- phenoData(ALL)
varMetadata(pD)
table(ALL$BT)

 B B1 B2 B3 B4  T T1 T2 T3 T4 
 5 19 36 23 12  5  1 15 10  2 
table(ALL$mol.biol)

ALL1/AF4  BCR/ABL E2A/PBX1      NEG   NUP-98  p15/p16 
      10       37        5       74        1        1 
table(ALL$BT, ALL$mol.bio)
    
     ALL1/AF4 BCR/ABL E2A/PBX1 NEG NUP-98 p15/p16
  B         0       2        1   2      0       0
  B1       10       1        0   8      0       0
  B2        0      19        0  16      0       1
  B3        0       8        1  14      0       0
  B4        0       7        3   2      0       0
  T         0       0        0   5      0       0
  T1        0       0        0   1      0       0
  T2        0       0        0  15      0       0
  T3        0       0        0   9      1       0
  T4        0       0        0   2      0       0
featureNames(ALL)[1:10]
 [1] "1000_at"   "1001_at"   "1002_f_at" "1003_s_at" "1004_at"  
 [6] "1005_at"   "1006_at"   "1007_s_at" "1008_f_at" "1009_at"  
sampleNames(ALL)[1:5]
[1] "01005" "01010" "03002" "04006" "04007"
tgt.cases <- which(ALL$BT %in% levels(ALL$BT)[1:5] & 
                   ALL$mol.bio %in% levels(ALL$mol.bio)[1:4])
ALLb <- ALL[,tgt.cases]
ALLb
ExpressionSet (storageMode: lockedEnvironment)
assayData: 12625 features, 94 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: 01005 01010 ... LAL5 (94 total)
  varLabels: cod diagnosis ... date last seen (21 total)
  varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
  pubMedIds: 14684422 16243790 
Annotation: hgu95av2 
ALLb$BT <- factor(ALLb$BT)
ALLb$mol.bio <- factor(ALLb$mol.bio)

save(ALLb, file = "myALL.Rdata")
es <- exprs(ALLb)
dim(es)
[1] 12625    94
summary(as.vector(es))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.985   4.122   5.469   5.624   6.829  14.045 
BiocManager::install("genefilter")
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.rstudio.com/
Bioconductor version 3.16 (BiocManager 1.30.20), R 4.2.2 (2022-10-31)
Warning: package(s) not installed when version(s) same as or greater than
  current; use `force = TRUE` to re-install: 'genefilter'
#source("http://bioconductor.org/biocLite.R")
library(genefilter)
library(genefilter)
library(ggplot2)
exprVs <- data.frame(exprVal=as.vector(es))
ds <- data.frame(Stat=c("1stQ","Median","3rdQ","Shorth"),
                 Value=c(quantile(exprVs$exprVal, 
                         probs=c(0.25, 0.5, 0.75)),
                         shorth(exprVs$exprVal)),
                 Color=c("red","green","red","yellow"))
ggplot(exprVs,aes(x=exprVal)) + geom_histogram(fill="lightgrey") + 
    geom_vline(data=ds,aes(xintercept=Value,color=Color)) + 
    geom_text(data=ds,aes(x=Value-0.2,y=0,label=Stat,colour=Color),
              angle=90,hjust="left") +
    xlab("Expression Levels") + guides(colour="none", fill="none") 

library(genefilter)
library(ggplot2)
exprVs <- data.frame(exprVal=as.vector(es))
ds <- data.frame(Stat=c("1stQ","Median","3rdQ","Shorth"),
                 Value=c(quantile(exprVs$exprVal, probs=c(0.25, 0.5, 0.75)),
                     shorth(exprVs$exprVal)),
                 Color=c("red","green","red","yellow"))
ggplot(exprVs,aes(x=exprVal)) + geom_histogram(fill="lightgrey") + 
    geom_vline(data=ds,aes(xintercept=Value,color=Color)) + 
    geom_text(data=ds,aes(x=Value-0.2,y=0,label=Stat,colour=Color),angle=90,hjust="left") +
    xlab("Expression Levels") + guides(colour="none", fill="none")    

sapply(levels(ALLb$mol.bio), 
       function(x) summary(as.vector(es[, which(ALLb$mol.bio == x)])))
         ALL1/AF4   BCR/ABL  E2A/PBX1       NEG
Min.     2.266403  2.194858  2.267802  1.984919
1st Qu.  4.140797  4.124414  4.152339  4.111435
Median   5.454388  5.467834  5.496825  5.469887
Mean     5.620976  5.627016  5.630445  5.621616
3rd Qu.  6.805440  6.832862  6.818587  6.832056
Max.    14.031235 14.044896 13.812578 13.949564
library(hgu95av2.db)
Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: IRanges
Loading required package: S4Vectors

Attaching package: ‘S4Vectors’

The following object is masked from ‘package:xts’:

    first

The following objects are masked from ‘package:dplyr’:

    first, rename

The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Attaching package: ‘IRanges’

The following object is masked from ‘package:sp’:

    %over%

The following objects are masked from ‘package:dplyr’:

    collapse, desc, slice


Attaching package: ‘AnnotationDbi’

The following object is masked from ‘package:dplyr’:

    select

Loading required package: org.Hs.eg.db
rowIQRs <- function(em) 
    rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
library(ggplot2)
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
    xlab("Median expression level") + ylab("IQR expression level") +
    ggtitle("Main Characteristics of Genes Expression Levels")

rowIQRs <- function(em) rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
library(ggplot2)
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
    xlab("Median expression level") + ylab("IQR expression level") +
    ggtitle("Main Characteristics of Genes Expression Levels")

library(genefilter)
resFilter <- nsFilter(ALLb,
                 var.func=IQR,
                 var.cutoff=IQR(as.vector(es))/5, 
                 feature.exclude="^AFFX")
resFilter
$eset
ExpressionSet (storageMode: lockedEnvironment)
assayData: 4025 features, 94 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: 01005 01010 ... LAL5 (94 total)
  varLabels: cod diagnosis ... mol.bio (22 total)
  varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
  pubMedIds: 14684422 16243790 
Annotation: hgu95av2 

$filter.log
$filter.log$numDupsRemoved
[1] 2888

$filter.log$numLowVar
[1] 4751

$filter.log$numRemoved.ENTREZID
[1] 942

$filter.log$feature.exclude
[1] 19
ALLb <- resFilter$eset
es <- exprs(ALLb)
dim(es)
[1] 4025   94
f <- Anova(ALLb$mol.bio, p = 0.01)
ff <- filterfun(f)
selGenes <- genefilter(exprs(ALLb), ff)
sum(selGenes)
[1] 751
ALLb <- ALLb[selGenes, ]
ALLb
ExpressionSet (storageMode: lockedEnvironment)
assayData: 751 features, 94 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: 01005 01010 ... LAL5 (94 total)
  varLabels: cod diagnosis ... mol.bio (22 total)
  varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
  pubMedIds: 14684422 16243790 
Annotation: hgu95av2 
es <- exprs(ALLb)
dim(es)
[1] 751  94
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
    xlab("Median expression level") + ylab("IQR expression level") +
    ggtitle("Distribution Properties of the Selected Genes")

library(randomForest)
dt <- data.frame(t(es), Mut = ALLb$mol.bio)
dt$Mut <- droplevels(dt$Mut)
set.seed(1234)
rf <- randomForest(Mut ~ ., dt, importance = TRUE)
imp <- importance(rf)
rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"], 
                                decreasing = TRUE)[1:30]]
library(tidyr)

Attaching package: ‘tidyr’

The following object is masked from ‘package:S4Vectors’:

    expand
library(dplyr)
d <- gather(dt[,c(rf.genes,"Mut")],Gene,ExprValue,1:length(rf.genes))
dat <- group_by(d,Mut,Gene) %>% 
    summarise(med=median(ExprValue), iqr=IQR(ExprValue))
`summarise()` has grouped output by 'Mut'. You can override using the `.groups` argument.
ggplot(dat, aes(x=med,y=iqr,color=Mut)) +  
    geom_point(size=6) + facet_wrap(~ Gene) + 
    labs(x="MEDIAN expression level",y="IQR expression level",color="Mutation")

install.packages('Hmisc')
also installing the dependency ‘htmlTable’

trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/htmlTable_2.4.1.tgz'
Content type 'application/x-gzip' length 419727 bytes (409 KB)
==================================================
downloaded 409 KB

trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/Hmisc_5.0-1.tgz'
Content type 'application/x-gzip' length 3434580 bytes (3.3 MB)
==================================================
downloaded 3.3 MB

The downloaded binary packages are in
    /var/folders/v3/vq_nw_j931j4mvyt1xqc2dr80000gn/T//Rtmp3RlWFu/downloaded_packages
library(Hmisc)
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio

Attaching package: ‘Hmisc’

The following object is masked from ‘package:AnnotationDbi’:

    contents

The following object is masked from ‘package:Biobase’:

    contents

The following objects are masked from ‘package:TeachingDemos’:

    cnvrt.coords, subplot

The following object is masked from ‘package:e1071’:

    impute

The following object is masked from ‘package:quantmod’:

    Lag

The following objects are masked from ‘package:dplyr’:

    src, summarize

The following objects are masked from ‘package:base’:

    format.pval, units
vc <- varclus(t(es))
clus30 <- cutree(vc$hclust, 30)
table(clus30)
clus30
 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 
21 16 37 18 44 19 28 38 28 30 35 54 20 47 18 45 17 14 25 18 22 31 28 
24 25 26 27 28 29 30 
15  7 15 11 18 19 13 
getVarsSet <- function(cluster,nvars=30,seed=NULL,verb=FALSE) {
    if (!is.null(seed)) set.seed(seed)
    
    cls <- cutree(cluster,nvars)
    tots <- table(cls)
    vars <- c()
    vars <- sapply(1:nvars,function(clID)
    {
        if (!length(tots[clID])) stop('Empty cluster! (',clID,')')
        x <- sample(1:tots[clID],1)
        names(cls[cls==clID])[x]
    })
    if (verb)  structure(vars,clusMemb=cls,clusTots=tots)
    else       vars
}
getVarsSet(vc$hclust)
 [1] "37579_at"   "33371_s_at" "34362_at"   "36008_at"   "39224_at"  
 [6] "1207_at"    "36403_s_at" "41123_s_at" "2036_s_at"  "37842_at"  
[11] "36188_at"   "106_at"     "41853_at"   "41715_at"   "41126_at"  
[16] "763_at"     "39049_at"   "1389_at"    "37015_at"   "41872_at"  
[21] "32607_at"   "2062_at"    "37304_at"   "33266_at"   "32612_at"  
[26] "905_at"     "37027_at"   "38081_at"   "39649_at"   "33777_at"  
getVarsSet(vc$hclust)
 [1] "41139_at"   "32585_at"   "31508_at"   "39410_at"   "36089_at"  
 [6] "33796_at"   "32169_at"   "36577_at"   "32725_at"   "38352_at"  
[11] "41606_at"   "40480_s_at" "38651_at"   "37039_at"   "37251_s_at"
[16] "40419_at"   "38014_at"   "317_at"     "33339_g_at" "33305_at"  
[21] "36203_at"   "37680_at"   "41146_at"   "33429_at"   "37485_at"  
[26] "40369_f_at" "36142_at"   "38078_at"   "40425_at"   "40454_at"  
library(performanceEstimation)
library(DMwR2)
data(iris)
exp <- performanceEstimation(
    PredTask(Species ~ ., iris), 
    Workflow(learner="rpartXse", predictor.pars=list(type="class")),
    EstimationTask(metrics="acc",method=Bootstrap(nReps=100)))


##### PERFORMANCE ESTIMATION USING  BOOTSTRAP  #####

** PREDICTIVE TASK :: iris.Species

++ MODEL/WORKFLOW :: rpartXse 
Task for estimating  acc  using
100  repetitions of  e0  Bootstrap experiment
     Run with seed =  1234 
Iteration :  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99  100
summary(exp)

== Summary of a  Bootstrap Performance Estimation Experiment ==

Task for estimating  acc  using
100  repetitions of  e0  Bootstrap experiment
     Run with seed =  1234 

* Predictive Tasks ::  iris.Species
* Workflows  ::  rpartXse 

-> Task:  iris.Species
  *Workflow: rpartXse 
               acc
avg     0.94183656
std     0.02784701
med     0.94392034
iqr     0.03637634
min     0.86666667
max     1.00000000
invalid 0.00000000
#install.packages('class')
library(class)
data(iris)
idx <- sample(1:nrow(iris), as.integer(0.7 * nrow(iris)))
tr <- iris[idx, ]
ts <- iris[-idx, ]
preds <- knn(tr[, -5], ts[, -5], tr[, 5], k = 3)
table(preds, ts[, 5])
            
preds        setosa versicolor virginica
  setosa         15          0         0
  versicolor      0         10         3
  virginica       0          1        16
kNN <- function(form, train, test, stand = TRUE, stand.stats = NULL, ...) {
    require(class, quietly = TRUE)
    tgtCol <- which(colnames(train) == as.character(form[[2]]))
    if (stand) {
        if (is.null(stand.stats)) 
            tmp <- scale(train[, -tgtCol], center = TRUE, scale = TRUE)
        else tmp <- scale(train[, -tgtCol], center = stand.stats[[1]], 
                          scale = stand.stats[[2]])
        train[, -tgtCol] <- tmp
        ms <- attr(tmp, "scaled:center")
        ss <- attr(tmp, "scaled:scale")
        test[, -tgtCol] <- scale(test[, -tgtCol], center = ms, scale = ss)
    }
    knn(train[, -tgtCol], test[, -tgtCol], train[, tgtCol], ...)
}
preds.stand <- kNN(Species ~ ., tr, ts, k = 3)
table(preds.stand,ts[, 5])
            
preds.stand  setosa versicolor virginica
  setosa         15          0         0
  versicolor      0         10         5
  virginica       0          1        14
preds.notStand <- kNN(Species ~ ., tr, ts, stand = FALSE, k = 3)
table(preds.notStand, ts[, 5]) 
              
preds.notStand setosa versicolor virginica
    setosa         15          0         0
    versicolor      0         10         3
    virginica       0          1        16
varsEnsemble <- function(tgt,train,test,
                         fs.meth,
                         baseLearner,blPars,
                         predictor,predPars,
                         verb=FALSE)
{
    require(Hmisc,quietly=TRUE)
    v <- varclus(as.matrix(train[,-which(colnames(train)==tgt)]))
    varsSets <- lapply(1:fs.meth[[3]],function(x)
        getVarsSet(v$hclust,nvars=fs.meth[[2]]))
    
    preds <- matrix(NA,ncol=length(varsSets),nrow=NROW(test))
    for(v in seq(along=varsSets)) {
        if (baseLearner=='knn')
            preds[,v] <- do.call("kNN",
                                 c(list(as.formula(paste(tgt,
                                                 paste(varsSets[[v]],
                                                       collapse='+'),
                                                 sep='~')),
                                        train[,c(tgt,varsSets[[v]])],
                                        test[,c(tgt,varsSets[[v]])]),
                                   blPars)
                                 )
        else {
            m <- do.call(baseLearner,
                         c(list(as.formula(paste(tgt,
                                                 paste(varsSets[[v]],
                                                       collapse='+'),
                                                 sep='~')),
                                train[,c(tgt,varsSets[[v]])]),
                           blPars)
                         )
            preds[,v] <- do.call(predictor,
                                 c(list(m,test[,c(tgt,varsSets[[v]])]),
                                   predPars))
        }
    }

    ps <- apply(preds,1,function(x)
        levels(factor(x))[which.max(table(factor(x)))])
    factor(ps,
           levels=1:nlevels(train[,tgt]),
           labels=levels(train[,tgt]))
}
ALLb.wf <- function(form, train, test,
                    learner, learner.pars=NULL,
                    predictor="predict",predictor.pars=NULL,
                    featSel.meth = "s2", 
                    available.fsMethods=list(s1=list("all"),s2=list('rf',30),
                                             s3=list('varclus',30,50)),
                    .model=FALSE,
                    ...)
{
    ## The characteristics of the selected feature selection method
    fs.meth <- available.fsMethods[[featSel.meth]] 
    
    ## The target variable
    tgt <- as.character(form[[2]])
    tgtCol <- which(colnames(train)==tgt)

    ## Anova filtering  
    f <- Anova(train[,tgt],p=0.01)
    ff <- filterfun(f)
    genes <- genefilter(t(train[,-tgtCol]),ff)
    genes <- names(genes)[genes]
    train <- train[,c(tgt,genes)]
    test <- test[,c(tgt,genes)]
    tgtCol <- 1

    ## Specific filtering 
    if (fs.meth[[1]]=='varclus') {
      pred <- varsEnsemble(tgt,train,test,fs.meth,
                            learner,learner.pars,
                            predictor,predictor.pars,
                            list(...))

    } else {
      if (fs.meth[[1]]=='rf') {
        require(randomForest,quietly=TRUE)
        rf <- randomForest(form,train,importance=TRUE)
        imp <- importance(rf)
        rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"], 
                                        decreasing = TRUE)[1:fs.meth[[2]]]]
        train <- train[,c(tgt,rf.genes)]
        test <- test[,c(tgt,rf.genes)]
      }

      if (learner == 'knn') 
        pred <- kNN(form,train,test,
                    stand.stats=list(rowMedians(t(as.matrix(train[,-tgtCol]))),
                        rowIQRs(t(as.matrix(train[,-tgtCol])))),
                    ...)
      else {
        model <- do.call(learner,c(list(form,train),learner.pars))
        pred <- do.call(predictor,c(list(model,test),predictor.pars))
      }

    }

    return(list(trues=responseValues(form,test), preds=pred, 
                model=if (.model && learner!="knn") model else NULL))

}
vars <- list()
vars$randomForest <- list(learner.pars=list(ntree=c(500,750,1000),
                                            mtry=c(5,15)),
                          preditor.pars=list(type="response"))
vars$svm <- list(learner.pars=list(cost=c(1,100),
                                   gamma=c(0.01,0.001,0.0001)))
vars$knn <- list(learner.pars=list(k=c(3,5,7),
                                   stand=c(TRUE,FALSE)))
vars$featureSel <- list(featSel.meth=c("s1", "s2", "s3"))
library(performanceEstimation)
library(class)
library(randomForest)
randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.
library(e1071)
library(genefilter)
load('myALL.Rdata')  # loading the previously saved object with the data
Warning: cannot open compressed file 'myALL.Rdata', probable reason 'No such file or directory'Error in readChar(con, 5L, useBytes = TRUE) : cannot open the connection
setwd("~/Downloads")
Warning: The working directory was changed to /Users/christianpetersen/Downloads inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
load("myALL.Rdata")
load("knn.Rdata")
load("svm.Rdata")
load("randomForest.Rdata")
rankWorkflows(svm, maxs = TRUE)
$ALL
$ALL$acc
NANA
rankWorkflows(all.trials, top=10, maxs = TRUE)
$ALL
$ALL$acc
NANA
getWorkflow("svm.v8", all.trials)
Workflow Object:
    Workflow ID       ::  svm.v8 
    Workflow Function ::  ALLb.wf
         Parameter values:
         learner.pars  -> cost=100 gamma=0.01 
         learner  -> svm 
         featSel.meth  -> s2 
top10WFnames <- rankWorkflows(all.trials, top=10, 
                              maxs = TRUE)[["ALL"]][["acc"]][,"Workflow"]
sapply(top10WFnames, function(WFid) getWorkflow(WFid,all.trials)@pars$featSel.meth)
 svm.v8 svm.v12  knn.v7  knn.v8  knn.v9 knn.v10 knn.v11 knn.v12 
   "s2"    "s2"    "s2"    "s2"    "s2"    "s2"    "s2"    "s2" 
svm.v10  svm.v6 
   "s2"    "s1" 
plot(subset(all.trials,workflows=top10WFnames))

ps <- pairedComparisons(subset(all.trials,workflows=top10WFnames),baseline="svm.v8")
Warning: With less 2 tasks the Friedman, Nemenyi and Bonferroni-Dunn tests are not calculated.
ps$acc$WilcoxonSignedRank.test
, , ALL

         MedScore DiffMedScores   p.value
svm.v8  0.8235294            NA        NA
svm.v12 0.8235294   0.000000000 0.7000463
knn.v7  0.8169856   0.006543766 0.7941389
knn.v8  0.8169856   0.006543766 0.7941389
knn.v9  0.8169856   0.006543766 0.7941389
knn.v10 0.8169856   0.006543766 0.7941389
knn.v11 0.8169856   0.006543766 0.7941389
knn.v12 0.8169856   0.006543766 0.7941389
svm.v10 0.8086312   0.014898200 0.2240278
svm.v6  0.8055556   0.017973856 0.4055620
iteration <- 1  # any number between 1 and 100 in this case
itInfo <- getIterationsInfo(all.trials,workflow="svm.v8",it=iteration)
table(itInfo$trues, itInfo$preds)
          
           ALL1/AF4 BCR/ABL E2A/PBX1 NEG
  ALL1/AF4        3       0        0   0
  BCR/ABL         0      12        0   3
  E2A/PBX1        0       0        0   1
  NEG             0       1        1  14
---
title: "Case Study 4"
output: html_notebook
---


```{r}
install.packages("BiocManager")
library(BiocManager)

```

```{r}

```

```{r}
BiocManager::install("Biobase")
BiocManager::install("ALL")
library(Biobase)
library(ALL)
data(ALL)
```

```{r}
ALL
```
```{r}

pD <- phenoData(ALL)
varMetadata(pD)
table(ALL$BT)
table(ALL$mol.biol)
table(ALL$BT, ALL$mol.bio)
```


```{r}
featureNames(ALL)[1:10]
```


```{r}
sampleNames(ALL)[1:5]
```


```{r}
tgt.cases <- which(ALL$BT %in% levels(ALL$BT)[1:5] & 
                   ALL$mol.bio %in% levels(ALL$mol.bio)[1:4])
ALLb <- ALL[,tgt.cases]
ALLb
```


```{r}
ALLb$BT <- factor(ALLb$BT)
ALLb$mol.bio <- factor(ALLb$mol.bio)
```



```{r}

save(ALLb, file = "myALL.Rdata")
```



```{r}
es <- exprs(ALLb)
dim(es)
```
```{r}
summary(as.vector(es))
```


```{r}
#BiocManager::install("genefilter")

library(genefilter)
```

```{r}
library(genefilter)
library(ggplot2)
exprVs <- data.frame(exprVal=as.vector(es))
ds <- data.frame(Stat=c("1stQ","Median","3rdQ","Shorth"),
                 Value=c(quantile(exprVs$exprVal, 
                         probs=c(0.25, 0.5, 0.75)),
                         shorth(exprVs$exprVal)),
                 Color=c("red","green","red","yellow"))
ggplot(exprVs,aes(x=exprVal)) + geom_histogram(fill="lightgrey") + 
    geom_vline(data=ds,aes(xintercept=Value,color=Color)) + 
    geom_text(data=ds,aes(x=Value-0.2,y=0,label=Stat,colour=Color),
              angle=90,hjust="left") +
    xlab("Expression Levels") + guides(colour="none", fill="none") 
```

```{r}
library(genefilter)
library(ggplot2)
exprVs <- data.frame(exprVal=as.vector(es))
ds <- data.frame(Stat=c("1stQ","Median","3rdQ","Shorth"),
                 Value=c(quantile(exprVs$exprVal, probs=c(0.25, 0.5, 0.75)),
                     shorth(exprVs$exprVal)),
                 Color=c("red","green","red","yellow"))
ggplot(exprVs,aes(x=exprVal)) + geom_histogram(fill="lightgrey") + 
    geom_vline(data=ds,aes(xintercept=Value,color=Color)) + 
    geom_text(data=ds,aes(x=Value-0.2,y=0,label=Stat,colour=Color),angle=90,hjust="left") +
    xlab("Expression Levels") + guides(colour="none", fill="none")    
```


```{r}
sapply(levels(ALLb$mol.bio), 
       function(x) summary(as.vector(es[, which(ALLb$mol.bio == x)])))
```


```{r}
BiocManager::install("hgu95av2.db")

library(hgu95av2.db)
```


```{r}
rowIQRs <- function(em) 
    rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
library(ggplot2)
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
    xlab("Median expression level") + ylab("IQR expression level") +
    ggtitle("Main Characteristics of Genes Expression Levels")
```


```{r}
rowIQRs <- function(em) rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
library(ggplot2)
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
    xlab("Median expression level") + ylab("IQR expression level") +
    ggtitle("Main Characteristics of Genes Expression Levels")
```


```{r}
library(genefilter)
resFilter <- nsFilter(ALLb,
                 var.func=IQR,
                 var.cutoff=IQR(as.vector(es))/5, 
                 feature.exclude="^AFFX")
resFilter
```

```{r}
ALLb <- resFilter$eset
es <- exprs(ALLb)
dim(es)

```

```{r}
f <- Anova(ALLb$mol.bio, p = 0.01)
ff <- filterfun(f)
selGenes <- genefilter(exprs(ALLb), ff)
sum(selGenes)
```


```{r}
ALLb <- ALLb[selGenes, ]
ALLb
es <- exprs(ALLb)
dim(es)
```

```{r}
dg <- data.frame(rowMed=rowMedians(es), rowIQR=rowIQRs(es))
ggplot(dg,aes(x=rowMed, y=rowIQR)) + geom_point() +
    xlab("Median expression level") + ylab("IQR expression level") +
    ggtitle("Distribution Properties of the Selected Genes")
```


```{r}
library(randomForest)
dt <- data.frame(t(es), Mut = ALLb$mol.bio)
dt$Mut <- droplevels(dt$Mut)
set.seed(1234)
rf <- randomForest(Mut ~ ., dt, importance = TRUE)
imp <- importance(rf)
rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"], 
                                decreasing = TRUE)[1:30]]
```

```{r}
library(tidyr)
library(dplyr)
d <- gather(dt[,c(rf.genes,"Mut")],Gene,ExprValue,1:length(rf.genes))
dat <- group_by(d,Mut,Gene) %>% 
    summarise(med=median(ExprValue), iqr=IQR(ExprValue))
ggplot(dat, aes(x=med,y=iqr,color=Mut)) +  
    geom_point(size=6) + facet_wrap(~ Gene) + 
    labs(x="MEDIAN expression level",y="IQR expression level",color="Mutation")
```


```{r}
#install.packages('Hmisc')
library(Hmisc)
vc <- varclus(t(es))
clus30 <- cutree(vc$hclust, 30)
table(clus30)


```


```{r}
getVarsSet <- function(cluster,nvars=30,seed=NULL,verb=FALSE) {
    if (!is.null(seed)) set.seed(seed)
    
    cls <- cutree(cluster,nvars)
    tots <- table(cls)
    vars <- c()
    vars <- sapply(1:nvars,function(clID)
    {
        if (!length(tots[clID])) stop('Empty cluster! (',clID,')')
        x <- sample(1:tots[clID],1)
        names(cls[cls==clID])[x]
    })
    if (verb)  structure(vars,clusMemb=cls,clusTots=tots)
    else       vars
}
getVarsSet(vc$hclust)
getVarsSet(vc$hclust)

```



```{r}
library(performanceEstimation)
library(DMwR2)
data(iris)
exp <- performanceEstimation(
    PredTask(Species ~ ., iris), 
    Workflow(learner="rpartXse", predictor.pars=list(type="class")),
    EstimationTask(metrics="acc",method=Bootstrap(nReps=100)))
```


```{r}
summary(exp)

```


```{r}
#install.packages('class')
library(class)
data(iris)
idx <- sample(1:nrow(iris), as.integer(0.7 * nrow(iris)))
tr <- iris[idx, ]
ts <- iris[-idx, ]
preds <- knn(tr[, -5], ts[, -5], tr[, 5], k = 3)
table(preds, ts[, 5])
```




```{r}
kNN <- function(form, train, test, stand = TRUE, stand.stats = NULL, ...) {
    require(class, quietly = TRUE)
    tgtCol <- which(colnames(train) == as.character(form[[2]]))
    if (stand) {
        if (is.null(stand.stats)) 
            tmp <- scale(train[, -tgtCol], center = TRUE, scale = TRUE)
        else tmp <- scale(train[, -tgtCol], center = stand.stats[[1]], 
                          scale = stand.stats[[2]])
        train[, -tgtCol] <- tmp
        ms <- attr(tmp, "scaled:center")
        ss <- attr(tmp, "scaled:scale")
        test[, -tgtCol] <- scale(test[, -tgtCol], center = ms, scale = ss)
    }
    knn(train[, -tgtCol], test[, -tgtCol], train[, tgtCol], ...)
}

```

```{r}
preds.stand <- kNN(Species ~ ., tr, ts, k = 3)
table(preds.stand,ts[, 5])
preds.notStand <- kNN(Species ~ ., tr, ts, stand = FALSE, k = 3)
table(preds.notStand, ts[, 5]) 

```

```{r}
varsEnsemble <- function(tgt,train,test,
                         fs.meth,
                         baseLearner,blPars,
                         predictor,predPars,
                         verb=FALSE)
{
    require(Hmisc,quietly=TRUE)
    v <- varclus(as.matrix(train[,-which(colnames(train)==tgt)]))
    varsSets <- lapply(1:fs.meth[[3]],function(x)
        getVarsSet(v$hclust,nvars=fs.meth[[2]]))
    
    preds <- matrix(NA,ncol=length(varsSets),nrow=NROW(test))
    for(v in seq(along=varsSets)) {
        if (baseLearner=='knn')
            preds[,v] <- do.call("kNN",
                                 c(list(as.formula(paste(tgt,
                                                 paste(varsSets[[v]],
                                                       collapse='+'),
                                                 sep='~')),
                                        train[,c(tgt,varsSets[[v]])],
                                        test[,c(tgt,varsSets[[v]])]),
                                   blPars)
                                 )
        else {
            m <- do.call(baseLearner,
                         c(list(as.formula(paste(tgt,
                                                 paste(varsSets[[v]],
                                                       collapse='+'),
                                                 sep='~')),
                                train[,c(tgt,varsSets[[v]])]),
                           blPars)
                         )
            preds[,v] <- do.call(predictor,
                                 c(list(m,test[,c(tgt,varsSets[[v]])]),
                                   predPars))
        }
    }

    ps <- apply(preds,1,function(x)
        levels(factor(x))[which.max(table(factor(x)))])
    factor(ps,
           levels=1:nlevels(train[,tgt]),
           labels=levels(train[,tgt]))
}
```

```{r}
ALLb.wf <- function(form, train, test,
                    learner, learner.pars=NULL,
                    predictor="predict",predictor.pars=NULL,
                    featSel.meth = "s2", 
                    available.fsMethods=list(s1=list("all"),s2=list('rf',30),
                                             s3=list('varclus',30,50)),
                    .model=FALSE,
                    ...)
{
    ## The characteristics of the selected feature selection method
    fs.meth <- available.fsMethods[[featSel.meth]] 
    
    ## The target variable
    tgt <- as.character(form[[2]])
    tgtCol <- which(colnames(train)==tgt)

    ## Anova filtering  
    f <- Anova(train[,tgt],p=0.01)
    ff <- filterfun(f)
    genes <- genefilter(t(train[,-tgtCol]),ff)
    genes <- names(genes)[genes]
    train <- train[,c(tgt,genes)]
    test <- test[,c(tgt,genes)]
    tgtCol <- 1

    ## Specific filtering 
    if (fs.meth[[1]]=='varclus') {
      pred <- varsEnsemble(tgt,train,test,fs.meth,
                            learner,learner.pars,
                            predictor,predictor.pars,
                            list(...))

    } else {
      if (fs.meth[[1]]=='rf') {
        require(randomForest,quietly=TRUE)
        rf <- randomForest(form,train,importance=TRUE)
        imp <- importance(rf)
        rf.genes <- rownames(imp)[order(imp[,"MeanDecreaseAccuracy"], 
                                        decreasing = TRUE)[1:fs.meth[[2]]]]
        train <- train[,c(tgt,rf.genes)]
        test <- test[,c(tgt,rf.genes)]
      }

      if (learner == 'knn') 
        pred <- kNN(form,train,test,
                    stand.stats=list(rowMedians(t(as.matrix(train[,-tgtCol]))),
                        rowIQRs(t(as.matrix(train[,-tgtCol])))),
                    ...)
      else {
        model <- do.call(learner,c(list(form,train),learner.pars))
        pred <- do.call(predictor,c(list(model,test),predictor.pars))
      }

    }

    return(list(trues=responseValues(form,test), preds=pred, 
                model=if (.model && learner!="knn") model else NULL))

}
```


```{r}
vars <- list()
vars$randomForest <- list(learner.pars=list(ntree=c(500,750,1000),
                                            mtry=c(5,15)),
                          preditor.pars=list(type="response"))
vars$svm <- list(learner.pars=list(cost=c(1,100),
                                   gamma=c(0.01,0.001,0.0001)))
vars$knn <- list(learner.pars=list(k=c(3,5,7),
                                   stand=c(TRUE,FALSE)))
vars$featureSel <- list(featSel.meth=c("s1", "s2", "s3"))

```

```{r}
library(performanceEstimation)
library(class)
library(randomForest)
library(e1071)
library(genefilter)
load('myALL.Rdata')  # loading the previously saved object with the data

es <- exprs(ALLb)

## simple filtering
ALLb <- nsFilter(ALLb,
                 var.func=IQR,var.cutoff=IQR(as.vector(es))/5, 
                 feature.exclude="^AFFX")
ALLb <- ALLb$eset

## the source dataset after the basic filtering
dt <- data.frame(t(exprs(ALLb)),Mut=ALLb$mol.bio)

set.seed(1234)
## The learners to evaluate
TODO <- c('knn','svm','randomForest')
for(td in TODO) {
    assign(td,
         performanceEstimation(
             PredTask(Mut ~ .,dt,'ALL'),
             do.call('workflowVariants',
                     c(list('ALLb.wf',learner=td,varsRootName=td),
                       vars[[td]],
                       vars$featureSel
                       )
                     ),
             EstimationTask(metrics="acc",method=Bootstrap(nReps=100)),
             cluster=TRUE
         )
         )
    save(list=td,file=paste(td,'Rdata',sep='.'))
}
```

```{r}
setwd("~/Downloads")

load("myALL.Rdata")
load("knn.Rdata")
load("svm.Rdata")
load("randomForest.Rdata")
```

```{r}
rankWorkflows(svm, maxs = TRUE)
```

```{r}
all.trials <- mergeEstimationRes(svm, knn, randomForest, by ="workflows")

```

```{r}
rankWorkflows(all.trials, top=10, maxs = TRUE)

```

```{r}
getWorkflow("svm.v8", all.trials)

```

```{r}
top10WFnames <- rankWorkflows(all.trials, top=10, 
                              maxs = TRUE)[["ALL"]][["acc"]][,"Workflow"]
sapply(top10WFnames, function(WFid) getWorkflow(WFid,all.trials)@pars$featSel.meth)
```


```{r}
plot(subset(all.trials,workflows=top10WFnames))

```

```{r}
ps <- pairedComparisons(subset(all.trials,workflows=top10WFnames),baseline="svm.v8")
ps$acc$WilcoxonSignedRank.test
```

```{r}
iteration <- 1  # any number between 1 and 100 in this case
itInfo <- getIterationsInfo(all.trials,workflow="svm.v8",it=iteration)
table(itInfo$trues, itInfo$preds)
```












































































