{r ropts_chunk$set(cache=TRUE, echo=TRUE)}

Resources

GEO

GEO overview - http://www.ncbi.nlm.nih.gov/geo/info/overview.html - describes data organization GEO2R - http://www.ncbi.nlm.nih.gov/geo/info/geo2r.html - compare 2 or more samples in a GEO series Querying GEO data sets - http://www.ncbi.nlm.nih.gov/geo/info/qqtutorial.html - a tutorial describing how to query for specific data sets.

CBW - Canadian Bioinformatics Workshops

Bioinformatics for Cancer Genomics (http://bioinformatics.ca/) has a tutorial in R analyzing data from GEO (GSE17172) - only 3 experimental conditions with three technical replicates each.

Initial plan 1. Repeat work in chapter 6-7 of Bioconductor case studies 2. Find a more interesting data set using GEO interface and GEO query and R GEOquery package 3. Try machine learning approach to predict case versus control 4. Try to identify signature related to predictor 5. Something with pathways - example of using machine learning in a predictive modelling (classification/regression) setting

Setting up the work environment

source("http://bioconductor.org/biocLite.R")
## Bioconductor version 3.0 (BiocInstaller 1.16.2), ?biocLite for help
biocLite()
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
biocLite("Biobase")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'Biobase'
library("Biobase")
## Loading required package: BiocGenerics
## Warning: package 'BiocGenerics' was built under R version 3.1.2
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
biocLite("ALL")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'ALL'
library("ALL")
data("ALL")

biocLite("genefilter")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'genefilter'
library("genefilter")
## 
## Attaching package: 'genefilter'
## 
## The following object is masked from 'package:base':
## 
##     anyNA
biocLite("multtest")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'multtest'
library("multtest")

biocLite("hgu95av2.db")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'hgu95av2.db'
library("hgu95av2.db")
## Loading required package: AnnotationDbi
## Warning: package 'AnnotationDbi' was built under R version 3.1.3
## Loading required package: stats4
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 3.1.2
## Loading required package: S4Vectors
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.1.2
## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:GenomeInfoDb':
## 
##     species
## 
## Loading required package: org.Hs.eg.db
## Warning: package 'RSQLite' was built under R version 3.1.2
## Loading required package: DBI
biocLite("bioDist")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'bioDist'
library("bioDist")
## Loading required package: KernSmooth
## Warning: package 'KernSmooth' was built under R version 3.1.2
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
install.packages("RColorBrewer", repos="http://cran.ma.imperial.ac.uk/")
library("RColorBrewer")
## Warning: package 'RColorBrewer' was built under R version 3.1.2
install.packages("randomForest", repos="http://cran.ma.imperial.ac.uk/")
library("randomForest")
## Warning: package 'randomForest' was built under R version 3.1.2
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:Biobase':
## 
##     combine
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
biocLite("MLInterfaces")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'MLInterfaces'
library("MLInterfaces")
## Loading required package: annotate
## Loading required package: XML
## Warning: package 'XML' was built under R version 3.1.2
## 
## Attaching package: 'annotate'
## 
## The following object is masked from 'package:GenomeInfoDb':
## 
##     organism
## 
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.1.2
install.packages("WGCNA", repos="http://cran.ma.imperial.ac.uk/")
#library("WGCNA")

Utility functions for programming

check <- function(x){
  #returns lots of meta-information related to a specified variable
  cat("\nhead:\n", sep=" ");
  print(head(x));
  
  cat("\ntail:\n", sep=" ");
  print(tail(x));
  
  cat("\nstr:\n","\n", sep=" ");
  print(str(x));
  
  cat("\nlength:", length(x), "\n", sep=" ");
  
  cat("\ndim:", dim(x), "\n", sep=" ");
  
  cat("\ntotal NA:", sum(is.na(x)), "\n", sep=" ");
  
  cat("\ntotal NULL:", sum(is.null(x)), "\n", sep=" ");
  
  cat("\ntotal duplicated:", sum(duplicated(x)), "\n", sep=" ");
  
  cat("\nsummary:\n", "\n", sep=" ");
  if (!is.list(x)){
      summary(x);
  }
  else {
    head(summary(x));
  }
}


checkMemory <- function(Units="MB"){
  #returns the memory consumption of variables in the current working environment
  #in the specified units 
  units2divisor = list(KB=1000, MB=1000000, GB=1000000000)
  divisor=as.integer(units2divisor[Units])

  theseVariables <- ls()
  theseSizes <- sapply(theseVariables, function(x) object.size(get(x)))
  theseSizes<-round(as.integer(unname(theseSizes))/divisor, 2)
  
  results <- data.frame(theseVariables, theseSizes, Units, stringsAsFactors=FALSE)
  results <- results[rev(order(theseSizes)),]
  results <- rbind(c("TOTAL", sum(theseSizes), Units), results)
  results
}

Investigating the ALL data set

#investigate ALL
?ALL
class(ALL)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"
dim(ALL)
## Features  Samples 
##    12625      128
str(ALL) #<--- has 7 slots
## Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
##   ..@ experimentData   :Formal class 'MIAME' [package "Biobase"] with 13 slots
##   .. .. ..@ name             : chr "Chiaretti et al."
##   .. .. ..@ lab              : chr "Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Med"| __truncated__
##   .. .. ..@ contact          : chr ""
##   .. .. ..@ title            : chr "Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different respo"| __truncated__
##   .. .. ..@ abstract         : chr "Gene expression profiles were examined in 33 adult patients with T-cell acute lymphocytic leukemia (T-ALL). Nonspecific filteri"| __truncated__
##   .. .. ..@ url              : chr ""
##   .. .. ..@ pubMedIds        : chr [1:2] "14684422" "16243790"
##   .. .. ..@ samples          : list()
##   .. .. ..@ hybridizations   : list()
##   .. .. ..@ normControls     : list()
##   .. .. ..@ preprocessing    : list()
##   .. .. ..@ other            : list()
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 0 0
##   ..@ assayData        :<environment: 0x7f7f9cafba40> 
##   ..@ phenoData        :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  21 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr [1:21] " Patient ID" " Date of diagnosis" " Gender of the patient" " Age of the patient at entry" ...
##   .. .. ..@ data             :'data.frame':  128 obs. of  21 variables:
##   .. .. .. ..$ cod           : chr [1:128] "1005" "1010" "3002" "4006" ...
##   .. .. .. ..$ diagnosis     : chr [1:128] "5/21/1997" "3/29/2000" "6/24/1998" "7/17/1997" ...
##   .. .. .. ..$ sex           : Factor w/ 2 levels "F","M": 2 2 1 2 2 2 1 2 2 2 ...
##   .. .. .. ..$ age           : int [1:128] 53 19 52 38 57 17 18 16 15 40 ...
##   .. .. .. ..$ BT            : Factor w/ 10 levels "B","B1","B2",..: 3 3 5 2 3 2 2 2 3 3 ...
##   .. .. .. ..$ remission     : Factor w/ 2 levels "CR","REF": 1 1 1 1 1 1 1 1 1 1 ...
##   .. .. .. ..$ CR            : chr [1:128] "CR" "CR" "CR" "CR" ...
##   .. .. .. ..$ date.cr       : chr [1:128] "8/6/1997" "6/27/2000" "8/17/1998" "9/8/1997" ...
##   .. .. .. ..$ t(4;11)       : logi [1:128] FALSE FALSE NA TRUE FALSE FALSE ...
##   .. .. .. ..$ t(9;22)       : logi [1:128] TRUE FALSE NA FALSE FALSE FALSE ...
##   .. .. .. ..$ cyto.normal   : logi [1:128] FALSE FALSE NA FALSE FALSE FALSE ...
##   .. .. .. ..$ citog         : chr [1:128] "t(9;22)" "simple alt." NA "t(4;11)" ...
##   .. .. .. ..$ mol.biol      : Factor w/ 6 levels "ALL1/AF4","BCR/ABL",..: 2 4 2 1 4 4 4 4 4 2 ...
##   .. .. .. ..$ fusion protein: Factor w/ 3 levels "p190","p190/p210",..: 3 NA 1 NA NA NA NA NA NA 1 ...
##   .. .. .. ..$ mdr           : Factor w/ 2 levels "NEG","POS": 1 2 1 1 1 1 2 1 1 1 ...
##   .. .. .. ..$ kinet         : Factor w/ 2 levels "dyploid","hyperd.": 1 1 1 1 1 2 2 1 1 NA ...
##   .. .. .. ..$ ccr           : logi [1:128] FALSE FALSE FALSE FALSE FALSE FALSE ...
##   .. .. .. ..$ relapse       : logi [1:128] FALSE TRUE TRUE TRUE TRUE TRUE ...
##   .. .. .. ..$ transplant    : logi [1:128] TRUE FALSE FALSE FALSE FALSE FALSE ...
##   .. .. .. ..$ f.u           : chr [1:128] "BMT / DEATH IN CR" "REL" "REL" "REL" ...
##   .. .. .. ..$ date last seen: chr [1:128] NA "8/28/2000" "10/15/1999" "1/23/1998" ...
##   .. .. ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ featureData      :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  0 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: logi(0) 
##   .. .. ..@ data             :'data.frame':  12625 obs. of  0 variables
##   .. .. ..@ dimLabels        : chr [1:2] "featureNames" "featureColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ annotation       : chr "hgu95av2"
##   ..@ protocolData     :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  0 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr(0) 
##   .. .. ..@ data             :'data.frame':  128 obs. of  0 variables
##   .. .. ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. ..@ .Data:List of 4
##   .. .. .. ..$ : int [1:3] 2 10 0
##   .. .. .. ..$ : int [1:3] 2 5 5
##   .. .. .. ..$ : int [1:3] 1 3 0
##   .. .. .. ..$ : int [1:3] 1 0 0
?ExpressionSet #<--- the contents of the 7 slots are explained here 
####
#READ THIS and see class specific functions - there are methods to retrieve data 
#from each slot and also methods to transform the entire ExpressionSet into a 
#data.frame

# information about assay and sample data
featureNames(ALL)[1:10]
##  [1] "1000_at"   "1001_at"   "1002_f_at" "1003_s_at" "1004_at"  
##  [6] "1005_at"   "1006_at"   "1007_s_at" "1008_f_at" "1009_at"
sampleNames(ALL)[1:5]
## [1] "01005" "01010" "03002" "04006" "04007"
experimentData(ALL)
## Experiment data
##   Experimenter name: Chiaretti et al. 
##   Laboratory: Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Medical School, Boston, MA 02115, USA. 
##   Contact information:  
##   Title: Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different response to therapy and survival. 
##   URL:  
##   PMIDs: 14684422 16243790 
## 
##   Abstract: A 187 word abstract is available. Use 'abstract' method.
# ## subset first 10 gene probe expression results for samples 2, 4, and 10
# the [ ] notation retrieves subsets of the the expression data ALONG WITH all of the meta data
# in the other slots.  so the notation looks like you are dealing with a data frame but that's not true
# what is returned is a data structure of class called ExpressionSet
expressionSet <- ALL[1:10,c(2,4,10)] #
expressionSet
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 10 features, 3 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: 01010 04006 08001
##   varLabels: cod diagnosis ... date last seen (21 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 14684422 16243790 
## Annotation: hgu95av2
str(expressionSet) #an object of the ExpressionSet class  
## Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
##   ..@ experimentData   :Formal class 'MIAME' [package "Biobase"] with 13 slots
##   .. .. ..@ name             : chr "Chiaretti et al."
##   .. .. ..@ lab              : chr "Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Med"| __truncated__
##   .. .. ..@ contact          : chr ""
##   .. .. ..@ title            : chr "Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different respo"| __truncated__
##   .. .. ..@ abstract         : chr "Gene expression profiles were examined in 33 adult patients with T-cell acute lymphocytic leukemia (T-ALL). Nonspecific filteri"| __truncated__
##   .. .. ..@ url              : chr ""
##   .. .. ..@ pubMedIds        : chr [1:2] "14684422" "16243790"
##   .. .. ..@ samples          : list()
##   .. .. ..@ hybridizations   : list()
##   .. .. ..@ normControls     : list()
##   .. .. ..@ preprocessing    : list()
##   .. .. ..@ other            : list()
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 0 0
##   ..@ assayData        :<environment: 0x7f7f9188f180> 
##   ..@ phenoData        :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  21 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr [1:21] " Patient ID" " Date of diagnosis" " Gender of the patient" " Age of the patient at entry" ...
##   .. .. ..@ data             :'data.frame':  3 obs. of  21 variables:
##   .. .. .. ..$ cod           : chr [1:3] "1010" "4006" "8001"
##   .. .. .. ..$ diagnosis     : chr [1:3] "3/29/2000" "7/17/1997" "1/15/1997"
##   .. .. .. ..$ sex           : Factor w/ 2 levels "F","M": 2 2 2
##   .. .. .. ..$ age           : int [1:3] 19 38 40
##   .. .. .. ..$ BT            : Factor w/ 10 levels "B","B1","B2",..: 3 2 3
##   .. .. .. ..$ remission     : Factor w/ 2 levels "CR","REF": 1 1 1
##   .. .. .. ..$ CR            : chr [1:3] "CR" "CR" "CR"
##   .. .. .. ..$ date.cr       : chr [1:3] "6/27/2000" "9/8/1997" "3/26/1997"
##   .. .. .. ..$ t(4;11)       : logi [1:3] FALSE TRUE FALSE
##   .. .. .. ..$ t(9;22)       : logi [1:3] FALSE FALSE FALSE
##   .. .. .. ..$ cyto.normal   : logi [1:3] FALSE FALSE FALSE
##   .. .. .. ..$ citog         : chr [1:3] "simple alt." "t(4;11)" "del(p15)"
##   .. .. .. ..$ mol.biol      : Factor w/ 6 levels "ALL1/AF4","BCR/ABL",..: 4 1 2
##   .. .. .. ..$ fusion protein: Factor w/ 3 levels "p190","p190/p210",..: NA NA 1
##   .. .. .. ..$ mdr           : Factor w/ 2 levels "NEG","POS": 2 1 1
##   .. .. .. ..$ kinet         : Factor w/ 2 levels "dyploid","hyperd.": 1 1 NA
##   .. .. .. ..$ ccr           : logi [1:3] FALSE FALSE FALSE
##   .. .. .. ..$ relapse       : logi [1:3] TRUE TRUE TRUE
##   .. .. .. ..$ transplant    : logi [1:3] FALSE FALSE FALSE
##   .. .. .. ..$ f.u           : chr [1:3] "REL" "REL" "REL"
##   .. .. .. ..$ date last seen: chr [1:3] "8/28/2000" "1/23/1998" "7/11/1997"
##   .. .. ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ featureData      :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  0 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr(0) 
##   .. .. ..@ data             :'data.frame':  10 obs. of  0 variables
##   .. .. ..@ dimLabels        : chr [1:2] "featureNames" "featureColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ annotation       : chr "hgu95av2"
##   ..@ protocolData     :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  0 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr(0) 
##   .. .. ..@ data             :'data.frame':  3 obs. of  0 variables
##   .. .. ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. ..@ .Data:List of 4
##   .. .. .. ..$ : int [1:3] 2 10 0
##   .. .. .. ..$ : int [1:3] 2 5 5
##   .. .. .. ..$ : int [1:3] 1 3 0
##   .. .. .. ..$ : int [1:3] 1 0 0
# named features and their expression levels
subset <- ALL[c("1000_at","1001_at"),] #names of features retrieved using featureNames()
exprs(subset)
##            01005    01010    03002    04006    04007    04008    04010
## 1000_at 7.597323 7.479445 7.567593 7.384684 7.905312 7.065914 7.474537
## 1001_at 5.046194 4.932537 4.799294 4.922627 4.844565 5.147762 5.122518
##            04016    06002    08001    08011    08012    08018    08024
## 1000_at 7.536119 7.183331 7.735545 7.591498 7.824284 7.231814 7.879988
## 1001_at 5.016132 5.288943 4.633217 4.583148 4.685951 5.059300 4.830464
##            09008    09017    11005    12006    12007    12012    12019
## 1000_at 7.891793 7.756734 7.640012 7.759599 7.678636 7.464285 7.652719
## 1001_at 5.999496 4.987595 4.967288 4.770481 5.456332 4.785863 5.175609
##            12026    14016    15001    15004    15005    16004    16009
## 1000_at 7.501591 7.570417 7.331509 7.366208 7.455451 7.328875 7.297313
## 1001_at 5.188992 5.258312 4.627955 4.733495 5.125098 5.332775 5.215707
##            19005    20002    22009    22010    22011    22013    24001
## 1000_at 7.563561 7.541133 8.016818 7.862181 7.702580 7.412003 7.916169
## 1001_at 4.858392 4.964424 5.216252 5.135825 4.802946 5.222676 4.790170
##            24005    24008    24010    24011    24017    24018    24019
## 1000_at 7.595848 7.296349 7.506236 7.144425 7.513972 7.815971 7.406135
## 1001_at 4.804743 5.002518 4.218220 5.228892 5.264158 4.899316 4.791335
##            24022    25003    25006    26001    26003    26005    26008
## 1000_at 7.300980 7.845054 7.651229 7.376930 7.663977 7.250353 7.663612
## 1001_at 5.177703 5.250315 4.896195 5.123546 5.078104 4.945670 5.124591
##            27003    27004    28001    28003    28005    28006    28007
## 1000_at 7.329996 7.360754 7.035203 7.705260 7.551734 7.538601 7.501531
## 1001_at 5.438098 4.757900 5.005279 5.009705 4.944978 4.511194 4.888814
##            28019    28021    28023    28024    28028    28031    28032
## 1000_at 7.116676 7.107979 7.427808 6.549926 7.514761 7.377215 6.973861
## 1001_at 5.275964 4.865566 5.057619 5.185277 4.788468 4.778381 4.970430
##            28035    28036    28037    28042    28043    28044    28047
## 1000_at 7.227516 7.407561 7.158049 7.235291 7.589310 7.988476 7.362458
## 1001_at 6.408157 5.042222 5.431469 4.686293 4.851805 4.894379 4.843868
##            30001    31007    31011    33005    36001    36002    37013
## 1000_at 7.508667 7.147843 7.651676 7.486432 7.759074 7.473427 7.627685
## 1001_at 5.587029 4.943857 4.741654 4.642628 4.962544 4.953122 5.358236
##            43001    43004    43007    43012    48001    49006    57001
## 1000_at 7.577529 7.600206 7.776844 7.585928 7.450666 7.004613 7.195206
## 1001_at 5.054157 4.879037 4.949908 5.057530 4.960382 4.836905 4.744006
##            62001    62002    62003    63001    64001    64002    65005
## 1000_at 7.407351 7.756195 7.913324 7.270997 7.694588 7.583071 7.609538
## 1001_at 4.930312 5.238937 5.074681 4.513671 4.928159 4.804083 4.715693
##            68001    68003    84004     LAL5    01003    01007    02020
## 1000_at 7.324502 7.545120 7.679603 7.604093 7.240252 7.676749 7.934247
## 1001_at 5.379102 4.650231 4.795495 4.988922 5.224752 5.129002 5.667907
##            04018    09002    10005    11002    12008    15006    16002
## 1000_at 7.874448 7.404271 7.775253 7.771891 7.355677 7.388882 7.589734
## 1001_at 5.005420 5.127949 4.423445 4.476761 5.461252 5.330129 4.836986
##            16007    17003    18001    19002    19008    19014    19017
## 1000_at 7.675929 7.662426 7.584008 7.840099 7.164922 7.843162 7.695714
## 1001_at 4.959669 5.743215 4.674920 5.208166 4.554529 5.718569 4.498515
##            20005    24006    26009    28008    28009    31015    37001
## 1000_at 7.520867 7.836577 7.470524 7.520806 7.646947 7.727560 7.849455
## 1001_at 5.135697 5.129836 5.213340 4.690815 4.902946 4.866731 4.959450
##            43006    43015    44001    49004    56007    64005    65003
## 1000_at 7.960842 8.188617 7.399999 7.813474 7.816922 7.913249 7.800199
## 1001_at 4.537677 5.154500 5.071885 4.874525 4.788699 5.403640 5.443827
##            83001     LAL4
## 1000_at 8.030047 7.702217
## 1001_at 5.178633 5.029670
# subset using properties in phenoData
str(expressionSet)
## Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
##   ..@ experimentData   :Formal class 'MIAME' [package "Biobase"] with 13 slots
##   .. .. ..@ name             : chr "Chiaretti et al."
##   .. .. ..@ lab              : chr "Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Med"| __truncated__
##   .. .. ..@ contact          : chr ""
##   .. .. ..@ title            : chr "Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different respo"| __truncated__
##   .. .. ..@ abstract         : chr "Gene expression profiles were examined in 33 adult patients with T-cell acute lymphocytic leukemia (T-ALL). Nonspecific filteri"| __truncated__
##   .. .. ..@ url              : chr ""
##   .. .. ..@ pubMedIds        : chr [1:2] "14684422" "16243790"
##   .. .. ..@ samples          : list()
##   .. .. ..@ hybridizations   : list()
##   .. .. ..@ normControls     : list()
##   .. .. ..@ preprocessing    : list()
##   .. .. ..@ other            : list()
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 0 0
##   ..@ assayData        :<environment: 0x7f7f9188f180> 
##   ..@ phenoData        :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  21 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr [1:21] " Patient ID" " Date of diagnosis" " Gender of the patient" " Age of the patient at entry" ...
##   .. .. ..@ data             :'data.frame':  3 obs. of  21 variables:
##   .. .. .. ..$ cod           : chr [1:3] "1010" "4006" "8001"
##   .. .. .. ..$ diagnosis     : chr [1:3] "3/29/2000" "7/17/1997" "1/15/1997"
##   .. .. .. ..$ sex           : Factor w/ 2 levels "F","M": 2 2 2
##   .. .. .. ..$ age           : int [1:3] 19 38 40
##   .. .. .. ..$ BT            : Factor w/ 10 levels "B","B1","B2",..: 3 2 3
##   .. .. .. ..$ remission     : Factor w/ 2 levels "CR","REF": 1 1 1
##   .. .. .. ..$ CR            : chr [1:3] "CR" "CR" "CR"
##   .. .. .. ..$ date.cr       : chr [1:3] "6/27/2000" "9/8/1997" "3/26/1997"
##   .. .. .. ..$ t(4;11)       : logi [1:3] FALSE TRUE FALSE
##   .. .. .. ..$ t(9;22)       : logi [1:3] FALSE FALSE FALSE
##   .. .. .. ..$ cyto.normal   : logi [1:3] FALSE FALSE FALSE
##   .. .. .. ..$ citog         : chr [1:3] "simple alt." "t(4;11)" "del(p15)"
##   .. .. .. ..$ mol.biol      : Factor w/ 6 levels "ALL1/AF4","BCR/ABL",..: 4 1 2
##   .. .. .. ..$ fusion protein: Factor w/ 3 levels "p190","p190/p210",..: NA NA 1
##   .. .. .. ..$ mdr           : Factor w/ 2 levels "NEG","POS": 2 1 1
##   .. .. .. ..$ kinet         : Factor w/ 2 levels "dyploid","hyperd.": 1 1 NA
##   .. .. .. ..$ ccr           : logi [1:3] FALSE FALSE FALSE
##   .. .. .. ..$ relapse       : logi [1:3] TRUE TRUE TRUE
##   .. .. .. ..$ transplant    : logi [1:3] FALSE FALSE FALSE
##   .. .. .. ..$ f.u           : chr [1:3] "REL" "REL" "REL"
##   .. .. .. ..$ date last seen: chr [1:3] "8/28/2000" "1/23/1998" "7/11/1997"
##   .. .. ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ featureData      :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  0 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr(0) 
##   .. .. ..@ data             :'data.frame':  10 obs. of  0 variables
##   .. .. ..@ dimLabels        : chr [1:2] "featureNames" "featureColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ annotation       : chr "hgu95av2"
##   ..@ protocolData     :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  0 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr(0) 
##   .. .. ..@ data             :'data.frame':  3 obs. of  0 variables
##   .. .. ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. ..@ .Data:List of 4
##   .. .. .. ..$ : int [1:3] 2 10 0
##   .. .. .. ..$ : int [1:3] 2 5 5
##   .. .. .. ..$ : int [1:3] 1 3 0
##   .. .. .. ..$ : int [1:3] 1 0 0
expressionSet$sex #<--one of the phenoData columns
## [1] M M M
## Levels: F M
maleExpressionSet <- expressionSet$sex == "M"

#class specific methods to access data in an object of the ExpressionSet class
assayData(ALL)
## <environment: 0x7f7f9cafba40>
sampleNames(ALL)
##   [1] "01005" "01010" "03002" "04006" "04007" "04008" "04010" "04016"
##   [9] "06002" "08001" "08011" "08012" "08018" "08024" "09008" "09017"
##  [17] "11005" "12006" "12007" "12012" "12019" "12026" "14016" "15001"
##  [25] "15004" "15005" "16004" "16009" "19005" "20002" "22009" "22010"
##  [33] "22011" "22013" "24001" "24005" "24008" "24010" "24011" "24017"
##  [41] "24018" "24019" "24022" "25003" "25006" "26001" "26003" "26005"
##  [49] "26008" "27003" "27004" "28001" "28003" "28005" "28006" "28007"
##  [57] "28019" "28021" "28023" "28024" "28028" "28031" "28032" "28035"
##  [65] "28036" "28037" "28042" "28043" "28044" "28047" "30001" "31007"
##  [73] "31011" "33005" "36001" "36002" "37013" "43001" "43004" "43007"
##  [81] "43012" "48001" "49006" "57001" "62001" "62002" "62003" "63001"
##  [89] "64001" "64002" "65005" "68001" "68003" "84004" "LAL5"  "01003"
##  [97] "01007" "02020" "04018" "09002" "10005" "11002" "12008" "15006"
## [105] "16002" "16007" "17003" "18001" "19002" "19008" "19014" "19017"
## [113] "20005" "24006" "26009" "28008" "28009" "31015" "37001" "43006"
## [121] "43015" "44001" "49004" "56007" "64005" "65003" "83001" "LAL4"
featureNames(ALL)[1:10]
##  [1] "1000_at"   "1001_at"   "1002_f_at" "1003_s_at" "1004_at"  
##  [6] "1005_at"   "1006_at"   "1007_s_at" "1008_f_at" "1009_at"
dims(ALL)
##          exprs
## Features 12625
## Samples    128
phenoData(ALL)
## An object of class 'AnnotatedDataFrame'
##   sampleNames: 01005 01010 ... LAL4 (128 total)
##   varLabels: cod diagnosis ... date last seen (21 total)
##   varMetadata: labelDescription
varLabels(ALL)
##  [1] "cod"            "diagnosis"      "sex"            "age"           
##  [5] "BT"             "remission"      "CR"             "date.cr"       
##  [9] "t(4;11)"        "t(9;22)"        "cyto.normal"    "citog"         
## [13] "mol.biol"       "fusion protein" "mdr"            "kinet"         
## [17] "ccr"            "relapse"        "transplant"     "f.u"           
## [21] "date last seen"
varMetadata(ALL)
##                                                                   labelDescription
## cod                                                                     Patient ID
## diagnosis                                                        Date of diagnosis
## sex                                                          Gender of the patient
## age                                                    Age of the patient at entry
## BT                                      does the patient have B-cell or T-cell ALL
## remission           Complete remission(CR), refractory(REF) or NA. Derived from CR
## CR                                                          Original remisson data
## date.cr                                        Date complete remission if achieved
## t(4;11)             did the patient have t(4;11) translocation. Derived from citog
## t(9;22)             did the patient have t(9;22) translocation. Derived from citog
## cyto.normal                        Was cytogenetic test normal? Derived from citog
## citog             original citogenetics data, deletions or t(4;11), t(9;22) status
## mol.biol                                                         molecular biology
## fusion protein                        which of p190, p210 or p190/210 for bcr/able
## mdr                                                           multi-drug resistant
## kinet                                            ploidy: either diploid or hyperd.
## ccr                                Continuous complete remission? Derived from f.u
## relapse                                                  Relapse? Derived from f.u
## transplant      did the patient receive a bone marrow transplant? Derived from f.u
## f.u                                                       follow up data available
## date last seen                                          date patient was last seen
pData(ALL)[1:5,] #<--show actual phenotypic data
##        cod diagnosis sex age BT remission CR   date.cr t(4;11) t(9;22)
## 01005 1005 5/21/1997   M  53 B2        CR CR  8/6/1997   FALSE    TRUE
## 01010 1010 3/29/2000   M  19 B2        CR CR 6/27/2000   FALSE   FALSE
## 03002 3002 6/24/1998   F  52 B4        CR CR 8/17/1998      NA      NA
## 04006 4006 7/17/1997   M  38 B1        CR CR  9/8/1997    TRUE   FALSE
## 04007 4007 7/22/1997   M  57 B2        CR CR 9/17/1997   FALSE   FALSE
##       cyto.normal       citog mol.biol fusion protein mdr   kinet   ccr
## 01005       FALSE     t(9;22)  BCR/ABL           p210 NEG dyploid FALSE
## 01010       FALSE simple alt.      NEG           <NA> POS dyploid FALSE
## 03002          NA        <NA>  BCR/ABL           p190 NEG dyploid FALSE
## 04006       FALSE     t(4;11) ALL1/AF4           <NA> NEG dyploid FALSE
## 04007       FALSE     del(6q)      NEG           <NA> NEG dyploid FALSE
##       relapse transplant               f.u date last seen
## 01005   FALSE       TRUE BMT / DEATH IN CR           <NA>
## 01010    TRUE      FALSE               REL      8/28/2000
## 03002    TRUE      FALSE               REL     10/15/1999
## 04006    TRUE      FALSE               REL      1/23/1998
## 04007    TRUE      FALSE               REL      11/4/1997
experimentData(ALL)
## Experiment data
##   Experimenter name: Chiaretti et al. 
##   Laboratory: Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Medical School, Boston, MA 02115, USA. 
##   Contact information:  
##   Title: Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different response to therapy and survival. 
##   URL:  
##   PMIDs: 14684422 16243790 
## 
##   Abstract: A 187 word abstract is available. Use 'abstract' method.
pubMedIds(ALL)
## [1] "14684422" "16243790"
abstract(ALL)
## [1] "Gene expression profiles were examined in 33 adult patients with T-cell acute lymphocytic leukemia (T-ALL). Nonspecific filtering criteria identified 313 genes differentially expressed in the leukemic cells. Hierarchical clustering of samples identified 2 groups that reflected the degree of T-cell differentiation but was not associated with clinical outcome. Comparison between refractory patients and those who responded to induction chemotherapy identified a single gene, interleukin 8 (IL-8), that was highly expressed in refractory T-ALL cells and a set of 30 genes that was highly expressed in leukemic cells from patients who achieved complete remission. We next identified 19 genes that were differentially expressed in T-ALL cells from patients who either had a relapse or remained in continuous complete remission. A model based on the expression of 3 of these genes was predictive of duration of remission. The 3-gene model was validated on a further set of T-ALL samples from 18 additional patients treated on the same clinical protocol. This study demonstrates that gene expression profiling can identify a limited number of genes that are predictive of response to induction therapy and remission duration in adult patients with T-ALL."
annotation(ALL)
## [1] "hgu95av2"
protocolData(ALL)
## An object of class 'AnnotatedDataFrame': none
#$ notation dereferences columns in the data slot of the phenoData slot of the ExpressionSet 
#str(ALL)
ALL$BT
##   [1] B2 B2 B4 B1 B2 B1 B1 B1 B2 B2 B3 B3 B3 B2 B3 B  B2 B3 B2 B3 B2 B2 B2
##  [24] B1 B1 B2 B1 B2 B1 B2 B  B  B2 B2 B2 B1 B2 B2 B2 B2 B2 B4 B4 B2 B2 B2
##  [47] B4 B2 B1 B2 B2 B3 B4 B3 B3 B3 B4 B3 B3 B1 B1 B1 B1 B3 B3 B3 B3 B3 B3
##  [70] B3 B3 B1 B3 B1 B4 B2 B2 B1 B3 B4 B4 B2 B2 B3 B4 B4 B4 B1 B2 B2 B2 B1
##  [93] B2 B  B  T  T3 T2 T2 T3 T2 T  T4 T2 T3 T3 T  T2 T3 T2 T2 T2 T1 T4 T 
## [116] T2 T3 T2 T2 T2 T2 T3 T3 T3 T2 T3 T2 T 
## Levels: B B1 B2 B3 B4 T T1 T2 T3 T4
#@ notation can be used to dereference a single slot at a time of the ExpressionSet
ALL@experimentData
## Experiment data
##   Experimenter name: Chiaretti et al. 
##   Laboratory: Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Medical School, Boston, MA 02115, USA. 
##   Contact information:  
##   Title: Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different response to therapy and survival. 
##   URL:  
##   PMIDs: 14684422 16243790 
## 
##   Abstract: A 187 word abstract is available. Use 'abstract' method.
#1 slot - experimentDaa
ALL@experimentData #<--- a description of the experiment
## Experiment data
##   Experimenter name: Chiaretti et al. 
##   Laboratory: Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Medical School, Boston, MA 02115, USA. 
##   Contact information:  
##   Title: Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different response to therapy and survival. 
##   URL:  
##   PMIDs: 14684422 16243790 
## 
##   Abstract: A 187 word abstract is available. Use 'abstract' method.
abstract(ALL)
## [1] "Gene expression profiles were examined in 33 adult patients with T-cell acute lymphocytic leukemia (T-ALL). Nonspecific filtering criteria identified 313 genes differentially expressed in the leukemic cells. Hierarchical clustering of samples identified 2 groups that reflected the degree of T-cell differentiation but was not associated with clinical outcome. Comparison between refractory patients and those who responded to induction chemotherapy identified a single gene, interleukin 8 (IL-8), that was highly expressed in refractory T-ALL cells and a set of 30 genes that was highly expressed in leukemic cells from patients who achieved complete remission. We next identified 19 genes that were differentially expressed in T-ALL cells from patients who either had a relapse or remained in continuous complete remission. A model based on the expression of 3 of these genes was predictive of duration of remission. The 3-gene model was validated on a further set of T-ALL samples from 18 additional patients treated on the same clinical protocol. This study demonstrates that gene expression profiling can identify a limited number of genes that are predictive of response to induction therapy and remission duration in adult patients with T-ALL."
#2 slot - assayData
ALL@assayData #<--- an environment
## <environment: 0x7f7f9cafba40>
#this is where the actual expression data lives
#use exprs() method to access
theData<-exprs(ALL)
str(theData)
##  num [1:12625, 1:128] 7.6 5.05 3.9 5.9 5.93 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:12625] "1000_at" "1001_at" "1002_f_at" "1003_s_at" ...
##   ..$ : chr [1:128] "01005" "01010" "03002" "04006" ...
#3 slot - phenoData - all the metadata about each patient
str(ALL@phenoData)
## Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   ..@ varMetadata      :'data.frame':    21 obs. of  1 variable:
##   .. ..$ labelDescription: chr [1:21] " Patient ID" " Date of diagnosis" " Gender of the patient" " Age of the patient at entry" ...
##   ..@ data             :'data.frame':    128 obs. of  21 variables:
##   .. ..$ cod           : chr [1:128] "1005" "1010" "3002" "4006" ...
##   .. ..$ diagnosis     : chr [1:128] "5/21/1997" "3/29/2000" "6/24/1998" "7/17/1997" ...
##   .. ..$ sex           : Factor w/ 2 levels "F","M": 2 2 1 2 2 2 1 2 2 2 ...
##   .. ..$ age           : int [1:128] 53 19 52 38 57 17 18 16 15 40 ...
##   .. ..$ BT            : Factor w/ 10 levels "B","B1","B2",..: 3 3 5 2 3 2 2 2 3 3 ...
##   .. ..$ remission     : Factor w/ 2 levels "CR","REF": 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..$ CR            : chr [1:128] "CR" "CR" "CR" "CR" ...
##   .. ..$ date.cr       : chr [1:128] "8/6/1997" "6/27/2000" "8/17/1998" "9/8/1997" ...
##   .. ..$ t(4;11)       : logi [1:128] FALSE FALSE NA TRUE FALSE FALSE ...
##   .. ..$ t(9;22)       : logi [1:128] TRUE FALSE NA FALSE FALSE FALSE ...
##   .. ..$ cyto.normal   : logi [1:128] FALSE FALSE NA FALSE FALSE FALSE ...
##   .. ..$ citog         : chr [1:128] "t(9;22)" "simple alt." NA "t(4;11)" ...
##   .. ..$ mol.biol      : Factor w/ 6 levels "ALL1/AF4","BCR/ABL",..: 2 4 2 1 4 4 4 4 4 2 ...
##   .. ..$ fusion protein: Factor w/ 3 levels "p190","p190/p210",..: 3 NA 1 NA NA NA NA NA NA 1 ...
##   .. ..$ mdr           : Factor w/ 2 levels "NEG","POS": 1 2 1 1 1 1 2 1 1 1 ...
##   .. ..$ kinet         : Factor w/ 2 levels "dyploid","hyperd.": 1 1 1 1 1 2 2 1 1 NA ...
##   .. ..$ ccr           : logi [1:128] FALSE FALSE FALSE FALSE FALSE FALSE ...
##   .. ..$ relapse       : logi [1:128] FALSE TRUE TRUE TRUE TRUE TRUE ...
##   .. ..$ transplant    : logi [1:128] TRUE FALSE FALSE FALSE FALSE FALSE ...
##   .. ..$ f.u           : chr [1:128] "BMT / DEATH IN CR" "REL" "REL" "REL" ...
##   .. ..$ date last seen: chr [1:128] NA "8/28/2000" "10/15/1999" "1/23/1998" ...
##   ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. ..@ .Data:List of 1
##   .. .. .. ..$ : int [1:3] 1 1 0
#4 slot - featureData
str(ALL@featureData)
## Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   ..@ varMetadata      :'data.frame':    0 obs. of  1 variable:
##   .. ..$ labelDescription: logi(0) 
##   ..@ data             :'data.frame':    12625 obs. of  0 variables
##   ..@ dimLabels        : chr [1:2] "featureNames" "featureColumns"
##   ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. ..@ .Data:List of 1
##   .. .. .. ..$ : int [1:3] 1 1 0
#5 slot - annotation
ALL@annotation #<-- [1] "hgu95av2"
## [1] "hgu95av2"
#Affymetrix Human Genome U95 Set annotation data (chip hgu95av2)
#http://www.bioconductor.org/packages/release/data/annotation/html/hgu95av2.html

#6 slot - protocolData
  str(ALL@protocolData)
## Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   ..@ varMetadata      :'data.frame':    0 obs. of  1 variable:
##   .. ..$ labelDescription: chr(0) 
##   ..@ data             :'data.frame':    128 obs. of  0 variables
##   ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
##   .. .. ..@ .Data:List of 1
##   .. .. .. ..$ : int [1:3] 1 1 0
##
#convert the ALL data set into a data frame
#Coerce objects of ExpressionSet-class to data.frame by transposing the expression matrix and concatenating phenoData
#this format might be more amenable to standard machine learning packages
##
ALL.df <- as(ALL, "data.frame")
#str(ALL.df)
length(colnames(ALL.df)) #12646
## [1] 12646
head(colnames(ALL.df))
## [1] "X1000_at"   "X1001_at"   "X1002_f_at" "X1003_s_at" "X1004_at"  
## [6] "X1005_at"
#[1] "X1000_at"   "X1001_at"   "X1002_f_at" "X1003_s_at" "X1004_at"   "X1005_at"
tail(colnames(ALL.df))
## [1] "kinet"          "ccr"            "relapse"        "transplant"    
## [5] "f.u"            "date.last.seen"
#[1] "kinet"          "ccr"            "relapse"        "transplant"     "f.u"            "date.last.seen"

Chapter 6 of Bioconductor Case Studies: Easy Differntial Expression

Construction of the data set is first. ?ALL The ALL data set (Acute Lymphoblastic Leukemia Data) is microarray data from 128 individuals that has ben normalized using rma. Two features (covariates of the data) are used to select a sybset of the data for further analysis The BT feature describes the type and stge of the disease. We will select individuals with B-cell type leukemia The mol.biol feature describes the “molecular biology” of the disease for the individual. This feature really describes the presence or absence of some cytogenetic variation. In this case, we select the subset of individuals with BCR/ABL translocations or no cytogenetic abnormalities at all (NEG)

?ALL
head(ALL$BT)
## [1] B2 B2 B4 B1 B2 B1
## Levels: B B1 B2 B3 B4 T T1 T2 T3 T4
bcell = grep("^B", as.character(ALL$BT)) 
#bcell is a vector indices of individuals with B-cell type leukemia
head(bcell)
## [1] 1 2 3 4 5 6
moltyp = which(as.character(ALL$mol.biol) %in% c("NEG", "BCR/ABL")) 
#moltype is a vector of indices of individuals with BCR/ABL translocations or no cytogentic abnormalities (NEG)
head(moltyp)
## [1] 1 2 3 5 6 7
ALL_bcrneg = ALL[, intersect(bcell, moltyp)] 
#get a list of individuals meeting both criteria for our data subset 
# ALL_bcrneg is an ExpressionSet

#this next line simply refactors the mol.biol feature which will now have only two levels
ALL_bcrneg$mol.biol = factor(ALL_bcrneg$mol.biol)
str(ALL_bcrneg$mol.biol)
##  Factor w/ 2 levels "BCR/ABL","NEG": 1 2 1 2 2 2 2 2 1 1 ...

Feature selection

This next section removes data for probes whose variation is independent of the sample. First, we calculate the standard deviation of each probe across all samples in the set. The distribution of standard deviations is non-normal (see the histogram below). We want to filter out probes that have little variation (regardless of the individual’s phenotype). To do this, we select the probes that have standard deviations above the “shorth” The “shorth” is the shortest interval that coverss half the values in a vector x. The “shorth” function in the genefilter package, returns the mean of the x values that lie in the shorth. This is used as a robust estimator of location. By removing probes that are likely to be non-informaive, we reduce the number of results that will be eliminated by multiple hypothesis correction methods.

library("genefilter")
?genefilter #<- a package for filtering gene expression probe data
expressionData <- exprs(ALL_bcrneg)
str(expressionData) #<-matrix of 12625 probes by 79 samples (individuals) 
##  num [1:12625, 1:79] 7.6 5.05 3.9 5.9 5.93 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:12625] "1000_at" "1001_at" "1002_f_at" "1003_s_at" ...
##   ..$ : chr [1:79] "01005" "01010" "03002" "04007" ...
sds = rowSds(expressionData) #<-the standard deviation of each row (rowSds is from genefilter package)
sds[1:5]
##   1000_at   1001_at 1002_f_at 1003_s_at   1004_at 
## 0.2580000 0.3126742 0.1881744 0.2601203 0.2630441
sds_alternative <- apply(expressionData, 1, sd) #alternative way of coding the same thing
sds_alternative[1:5]
##   1000_at   1001_at 1002_f_at 1003_s_at   1004_at 
## 0.2580000 0.3126742 0.1881744 0.2601203 0.2630441
#plot the distribution of the standard deviations of the probes
hist(sds, breaks=50, col="blue", xlab="standard deviation")
sh = shorth(sds)
sh
## [1] 0.2423124
abline(v=sh, col="red", lwd=3, lty=2)

#make a new data set that eliminates all probes that are below sh
ALLsfilt = ALL_bcrneg[sds>=sh, ]
dim(exprs(ALLsfilt))
## [1] 8812   79
#and look at the new distribution of standard deviations
sds_new <- rowSds(exprs(ALLsfilt))
hist(sds_new, breaks=50, col="blue", xlab="standard deviation")
sh_new = shorth(sds_new)
sh_new
## [1] 0.3064567
abline(v=sh, col="red", lwd=3, lty=2)

# alternative methods of filtering - not done
# 1. discard all probes with consistantly low expression values
# 2. remove non-specific probes using the nsFilter function from the Category package - see application in Chapter 1 

#explain shorth - not done
x<-rnorm(n=1000,mean=0,sd=1)
shorth_x<-shorth(x)
shorth_x
## [1] 0.03602156

Testing for differential expression

The rowttests function from the genefilter package is used to test for differential expression between two different phenotypes in our data set. Recall that the mol.biol feature indicates whether individuals in the data set have a BCR/ABL translocation or not. The table function is used to count the number of samples (individuals) in the data set with these phenotypes. There are 37 with the translocation and 42 without. The function call rowttests(ALLsfilt, "mol.biol") examines each probe in our filtered data set for significant differences in expression between these two phenotypes.

table(ALLsfilt$mol.biol)
## 
## BCR/ABL     NEG 
##      37      42
tt = rowttests(ALLsfilt, "mol.biol")
str(tt)
## 'data.frame':    8812 obs. of  3 variables:
##  $ statistic: num  0.737 0.453 -0.215 -0.533 -1.72 ...
##  $ dm       : num  0.043 0.0321 -0.0127 -0.0318 -0.4273 ...
##  $ p.value  : num  0.4637 0.652 0.8302 0.5954 0.0894 ...
##  - attr(*, "df")= num 77
names(tt)
## [1] "statistic" "dm"        "p.value"
hist(tt$p.value, breaks=50, col="blue", main="Histogram of all p-values (filtered data set)", xlab="p-value")

#looking at just those p-values less than 0.05
hist(tt$p.value[tt$p.value < 0.05], breaks=50, col="blue", main="Histogram of all p-values (filtered data set)", xlab="p-value")

sum(tt$p.value < 0.05) #<--1155
## [1] 1155
length(tt$p.value) #<--8812
## [1] 8812
0.05 * length(tt$p.value) #<---440
## [1] 440.6
#we can go back and repeat the same analysis for the data that was filtered away from this set (to see if we have removed an undue number of probes that have significant differences of expression between phenotypes)
ALLsrest = ALL_bcrneg[sds<sh, ]
ttrest = rowttests(ALLsrest, "mol.biol")
hist(ttrest$p.value, breaks=50, col="blue", main="Histogram of all p-values (data removed by filter step)", xlab="p-value")

hist(ttrest$p.value[ttrest$p.value < 0.05], breaks=50, col="blue", main="Histogram of all p-values (data removed by filter step)", xlab="p-value")

sum(ttrest$p.value < 0.05) #<--so only 84 probe results were removed and its likely that many of these would not have survived multiple-hypothesis correction anyway
## [1] 84

Multiple-hypothesis testing correction

We have done 8812 t-tests. If we were testing at the 0.05 significance level, we would expect, by chance, 8812*0.05=440.6 results to be “significant” or, more correctly, we would expect to “reject the NULL hypothesis 440 times” even if we were looking at a normally distributed data set where we knew that there were no difference. We need to correct for the fact that we have performed multiple hypothesis tests to determine if the number of significant results we observed (1115 p-values were less than 0.05) is really more than what we would expect by chance.

The Benjamini and Hochberg procedure uses the false-discovery rate (FDR) to recalculate p-values taking into account multiple hypothesis testing.

p_adjust <- p.adjust(tt$p.value, method="BH")
num_significant <- sum(tt$p.value < 0.05)
num_significant #<-- 1155
## [1] 1155
num_significant_adj <- sum(p_adjust < 0.05)
num_significant_adj #<--201
## [1] 201
#alternatively, use the multtest function from the multtest bioconductor package
#this function works directly on the expression matrix and returns a sorted list
library("multtest")
str(tt)
## 'data.frame':    8812 obs. of  3 variables:
##  $ statistic: num  0.737 0.453 -0.215 -0.533 -1.72 ...
##  $ dm       : num  0.043 0.0321 -0.0127 -0.0318 -0.4273 ...
##  $ p.value  : num  0.4637 0.652 0.8302 0.5954 0.0894 ...
##  - attr(*, "df")= num 77
mt = mt.rawp2adjp(tt$p.value, proc="BH")
str(mt)
## List of 4
##  $ adjp   : num [1:8812, 1:2] 3.76e-14 4.79e-13 2.45e-10 1.28e-09 5.27e-09 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:2] "rawp" "BH"
##  $ index  : int [1:8812] 498 6778 497 529 7326 4781 7128 1639 4790 6866 ...
##  $ h0.ABH : NULL
##  $ h0.TSBH: NULL
mt$adjp[1:10,1]
##  [1] 3.762489e-14 4.791997e-13 2.445693e-10 1.280912e-09 5.265146e-09
##  [6] 2.740257e-08 2.785038e-08 1.536188e-07 2.600957e-07 4.741431e-07
sum(mt$adjp[,1] < 0.05) #<--1155
## [1] 1155
sum(mt$adjp[,2] < 0.05) #<--201
## [1] 201
#finally, retrieve a list of genes corresponding to the 10 most significant differentially expressed probe sets
g = featureNames(ALLsfilt)[mt$index[1:10]]
str(g)
##  chr [1:10] "1636_g_at" "39730_at" "1635_at" "1674_at" ...
library("hgu95av2.db")
links(hgu95av2SYMBOL[g])
##      probe_id  symbol
## 1     1635_at    ABL1
## 2   1636_g_at    ABL1
## 3     1674_at    YES1
## 4    32434_at  MARCKS
## 5    37015_at ALDH1A1
## 6    37027_at   AHNAK
## 7    39730_at    ABL1
## 8  39837_s_at  ZNF467
## 9    40202_at    KLF9
## 10   40504_at    PON2

Each of the differentially expressed probes can be visualized across the individuals in the data set.

mb = ALLsfilt$mol.biol #<--vector of values indicating molecular biology of each sample
y = exprs(ALLsfilt)[g[1],] #<--expression values for probe across all individuals
ord = order(mb) #<--to order the data points by molecular biology
plot(y[ord], pch=c(1,16)[mb[ord]], col=c("black", "red")[mb[ord]], main=g[1], ylab=expression(log[2]~intensity), xlab="samples")

Machine learning - Chapter 9 notes

This section uses the same ALL data subset as above: individuals all have B-cell type leukemias and either have or do not have a BCL/ABL translocation (mol.biol is either “BCL/ABL” or “NEG” respectively).

We first begin by determining the number of samples we have in each mol.biol class.

table(ALL_bcrneg$mol.biol)
## 
## BCR/ABL     NEG 
##      37      42
#BCR/ABL     NEG 
#     37      42 

Filtering data

Next, the nsFilter function from the genefilter package is used to remove non-informative probes. By default this function will also removes control probes on Affymetrix arrays (identifed by their AFFX prefix). We also exclude genes without Entrez Gene identifiers, and we select the top 25% of genes on the basis of variability across samples. nsFilter takes as input a data object of class ExpressionSet and returns the same class type.

?nsFilter
class(ALL_bcrneg)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"
ALLfilt_bcrneg = nsFilter(ALL_bcrneg, var.cutoff=0.75)$eset
class(ALLfilt_bcrneg)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"

Data standardization

The data are first centred and scaled using median and IQR. This method is more robust to non-normal distributions outliers and the typical method of centering and scaling that uses mean and standard deviation. See http://en.wikipedia.org/wiki/Robust_measures_of_scale.

# rowIQRs is a helper fuction that calculates inter-quartile ranges (IQR) for each of the probes (rows) in an expression data set.
# eSet is the expression data from the assayData slot of the ExpressionSet class.  These data are retrieved using the expr() function.
# rowQ (from the Biobase package) retrieves the requested quantile for each row of a matrix of an ExpressionSet
rowIQRs = function(eSet) {
  numSamp = ncol(eSet)
  lowQ = rowQ(eSet, floor(0.25 * numSamp))
  upQ = rowQ(eSet, ceiling(0.75 * numSamp))
  upQ - lowQ
}

# standardize is a helper fuction that centres and scales expression data across samples
# by subtracting the median for the probe value and then dividing by the IQR
standardize = function(x) (x - rowMedians(x)) / rowIQRs(x)

#this line standardizes our data set
exprs(ALLfilt_bcrneg) = standardize(exprs(ALLfilt_bcrneg))

#

Selecting a distance measure

Tools is R to calculate distance: stats package: dist cluster package: daisy bioDist package: Kullback-Leibler distance mutual information distance Euclidean distance Manhattan distance correlation distance (using Pearson, Spearman, or Kendall’s tau)

#not done

Visualizing euclidean distance between samples

Distance between samples in the expression matrix. Since this matrix is rows x columns = probes x individuals, we want to call the dist function on the transpose of this matrix (individuals x probes). The resulting distance matrix will have rows x columns = individuals x individuals with euclidean distances for each pair of individuals.

The distance matrix is visualized using the heatmap() function. The RColorBrewer package is used to construct a color pallette for the visualization.

#calculate euclidean distances between samples
eucD = dist(t(exprs(ALLfilt_bcrneg)))
str(eucD)
## Class 'dist'  atomic [1:3081] 52 37.3 38 42.2 54.3 ...
##   ..- attr(*, "Size")= int 79
##   ..- attr(*, "Labels")= chr [1:79] "01005" "01010" "03002" "04007" ...
##   ..- attr(*, "Diag")= logi FALSE
##   ..- attr(*, "Upper")= logi FALSE
##   ..- attr(*, "method")= chr "euclidean"
##   ..- attr(*, "call")= language dist(x = t(exprs(ALLfilt_bcrneg)))
eucM = as.matrix(eucD)
str(eucM)
##  num [1:79, 1:79] 0 52 37.3 38 42.2 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:79] "01005" "01010" "03002" "04007" ...
##   ..$ : chr [1:79] "01005" "01010" "03002" "04007" ...
dim(eucM)
## [1] 79 79
library("RColorBrewer")
hmcol = colorRampPalette(brewer.pal(10, "RdBu"))(256)
hmcol = rev(hmcol) #<---so that red and blue correspond to high and low values respectively 
# - dark blue indicates samples that are identical 
# - dark red indicate samples that are highly dissimilar
heatmap(eucM, sym=TRUE, col=hmcol, distfun=as.dist)

#repeat the above steps using the Spearman correlation distance metric
library("bioDist")
spD = spearman.dist(ALLfilt_bcrneg)
str(spD)
## Class 'dist'  atomic [1:3081] 0.705 0.872 0.743 0.846 0.686 ...
##   ..- attr(*, "Size")= int 79
##   ..- attr(*, "Labels")= chr [1:79] "01005" "01010" "03002" "04007" ...
##   ..- attr(*, "Diag")= logi FALSE
##   ..- attr(*, "Upper")= logi FALSE
##   ..- attr(*, "methods")= chr "spearman"
## spD@Size (79*79 - 79)/2=3081
attr(spD, "Size")
## [1] 79
#[1] 79
spM = as.matrix(spD)
heatmap(spM, sym=TRUE, col=hmcol, distfun=function(x) as.dist(x))

Using the MLInterfaces package for machine learning

MLInterfaces provides a consistant interface to a number of machine learning packages in R (much like carret but adapted specifically for Bioconductor containers)

library("MLInterfaces")

#explore MLInterfaces packages
library(help=MLInterfaces)
?MLearn
#openVignette() #<==== not done, go through these tutorials for MLearn

# #code from the MLearn help page
# library("MASS")
# data(crabs)
# dim(crabs)
# head(crabs)
# table(crabs$sex)
# 
# #selecting a training and testing set
# set.seed(1234)
# #create a training set (indices)
# kp = sample(1:200, size=120) #<--random sample of indices in the data set (used for the trainInd in calls below)
# 
# #a random forest example
# rf1 = MLearn(sp~CW+RW, data=crabs, randomForestI, kp, ntree=600 )
# rf1 #<-- the resulting model
# str(rf1) #<---it's data structure
# ?testScores(rf1) #<---get a description of the output data structure and its methods
# rf1@testOutcomes #<---the actual labels for the outcome variable for the test set
# rf1@testPredictions #<---the predicted labels for the outcome variable for the test set
# rf1@testScores #<---individual scores for each class for each sample in the test data
# confuMat(rf1) #<--the confusion matrix for the test set
# show(rf1)
# testScores(rf1)
# #many other methods to retrieve predictions, scores for test set and train set
# 
# #neural net
# nn1 = MLearn(sp~CW+RW, data=crabs, nnetI, kp, size=3, decay=.01,
#     trace=FALSE )
# nn1
# RObject(nn1)
# 
# #k-nearest neighbors
# knn1 = MLearn(sp~CW+RW, data=crabs, knnI(k=3,l=2), kp)
# knn1
# names(RObject(knn1))
# 
# 
# dlda1 = MLearn(sp~CW+RW, data=crabs, dldaI, kp )
# dlda1
# names(RObject(dlda1))
# 
# #linear discriminant analysis
# lda1 = MLearn(sp~CW+RW, data=crabs, ldaI, kp )
# lda1
# names(RObject(lda1))

Construct a training set and a test set

The training set is balanced between samples that have the BCR/ABL translocation phenotype (BCR/ABL) and those that don’t (NEG).

#create sample indices for the training and test sets
set.seed(1234)
Negs = which(ALLfilt_bcrneg$mol.biol == "NEG")
Bcr = which(ALLfilt_bcrneg$mol.biol == "BCR/ABL")
S1 = sample(Negs, 20, replace=FALSE)
S2 = sample(Bcr, 20, replace = FALSE)
TrainInd = c(S1, S2)
TestInd = setdiff(1:79, TrainInd)

Use KNN to predict the presence or absence of the BCL/ABL translocation

kans = MLearn( mol.biol ~ ., data=ALLfilt_bcrneg, knnI(k=1,l=0), TrainInd)
kans.cm <- confuMat(kans)
kans.cm
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      16   1
##   NEG           4  18
#          predicted
# given     BCR/ABL NEG
#   BCR/ABL      16   1
#   NEG           4  18
str(kans.cm)
##  int [1:2, 1:2] 16 4 1 18
##  - attr(*, "dimnames")=List of 2
##   ..$ given    : chr [1:2] "BCR/ABL" "NEG"
##   ..$ predicted: chr [1:2] "BCR/ABL" "NEG"

Use diagonal linear discriminant analysis to predict the presence or absence of the BCL/ABL translocation

dldans = MLearn( mol.biol ~ ., ALLfilt_bcrneg, dldaI, TrainInd)
dldans.cm <- confuMat(dldans)
dldans.cm
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      13   4
##   NEG           2  20
#          predicted
# given     BCR/ABL NEG
#   BCR/ABL      13   4
#   NEG           2  20

Use linear discriminant analysis to predict the presence or absence of the BCL/ABL translocation

ldaans = MLearn( mol.biol ~ ., ALLfilt_bcrneg, ldaI, TrainInd)
## Warning in lda.default(x, grouping, ...): variables are collinear
ldans.cm <- confuMat(ldaans)
ldans.cm
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      13   4
##   NEG           2  20
#          predicted
# given     BCR/ABL NEG
#   BCR/ABL      13   4
#   NEG           2  20

Feature selection using the t-test

Select gene probes that are likely to be indicative of the presence or absence of the BCL/ABL translocation. Use the t-test to do this. This selection is done on only the training set - doing feature selection on the entire data set would lead to an overestimated performance.

The learning methods used above are repeated using the new data set containing only the selected features.

#compute the t-tests on the training set
Traintt = rowttests(ALLfilt_bcrneg[, TrainInd], "mol.biol")
# then sort them from largest to smallest
ordTT = order(abs(Traintt$statistic), decreasing=TRUE)
#and then obtain the names of the 50 that have the largest observed test statistics.
fNtt = featureNames(ALLfilt_bcrneg)[ordTT[1:50]]

#select a subset of the data based on the selected features
BNf = ALLfilt_bcrneg[fNtt,] #BNf means Best N features ?

#redo knn
kans.bf = MLearn( mol.biol ~ ., data=BNf, knnI(k=1,l=0), TrainInd)
kans.bf.cm <- confuMat(kans.bf)
kans.bf.cm
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      14   3
##   NEG           1  21
#          predicted
# given     BCR/ABL NEG
#   BCR/ABL      14   3
#   NEG           1  21

#redo diagonal linear discriminant analysis
dldans.bf = MLearn( mol.biol ~ ., data=BNf, dldaI, TrainInd)
dldans.bf.cm <- confuMat(dldans.bf)
dldans.bf.cm
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      13   4
##   NEG           1  21
#          predicted
# given     BCR/ABL NEG
#   BCR/ABL      13   4
#   NEG           1  21

ldaans.bf = MLearn( mol.biol ~ ., data=BNf, ldaI, TrainInd)
## Warning in lda.default(x, grouping, ...): variables are collinear
ldans.bf.cm <- confuMat(ldaans.bf)
ldans.bf.cm
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      14   3
##   NEG           3  19
#          predicted
# given     BCR/ABL NEG
#   BCR/ABL      14   3
#   NEG           3  19

Cross-validation with MLInterface

Cross-validation can be employed in the MLInterfaces package by specifying xvalSpec parameters that are passed to the MLearn function. The xvalSpec allows you to specify a cross-validation type, a partition function, the number of partitions, and optionally a function that helps to select features in each subset.

There are only three cross-validations types supported: LOO=leave-one-out, LOG=leave-out-group and NOTEST=use the entire data set for training. There is no built-in supprot for boot-strapping.

Finally, the possibility for feature selection is built into the cross-validation object - see the fs.Fun parameter. Three basic methods for feature selection are provided based on variation between two classes detected using a T-test (fs.absT), fs.probT and fs.topVariance. It is possible to define your own feature-selection helper functions following these examples.

?xvalSpec

#limit the size of the data set to include only the first 1000 probes
#this is strictly for practical reasons due to the computational time required of cross-validation schemes
BNx = ALLfilt_bcrneg[1:1000,]
#str(BNx)

#perform leave-one-out (LOO) cross validation for knn
#note that in this case, a cross-validation capability is built into the knn.cvI method
?knn.cvI
knnXval1 = MLearn(mol.biol~., data=BNx, knn.cvI(k=1, l=0), trainInd=1:ncol(BNx))

#perform leave-one-out (LOO) cross validation for knn - does the same as the above but using the xvalSpec functionality of MLearn - the code is slower than the above

#knnXval1 = MLearn(mol.biol~., data=BNx, knnI(k=1, l=0), xvalSpec("LOO"))
#look at the results
knnXval1
## MLInterfaces classification output container
## The call was:
## MLearn(formula = mol.biol ~ ., data = BNx, .method = knn.cvI(k = 1, 
##     l = 0), trainInd = 1:ncol(BNx))
## Predicted outcome distribution for test set:
## 
## BCR/ABL     NEG 
##      47      32 
## Summary of scores on test set (use testScores() method for details):
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
knnXval1.cm <- confuMat(knnXval1)
knnXval1.cm
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      31   6
##   NEG          16  26
sum(knnXval1.cm)
## [1] 79
#compare this to the non-cv method used above
kans2 = MLearn( mol.biol ~ ., data=BNx, knnI(k=1,l=0), TrainInd)
kans2.cm <- confuMat(kans2)
kans2.cm
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      16   1
##   NEG           4  18
sum(kans2.cm)
## [1] 39
#the results of the two are not directly  comparable
#need helper functions to calculate precision, recall etc. - are these not in MLearn
?precision
#ah yes, here they are - but you have to cast the confusion matrix provided by confuMat to a table (strange)
?confuMat
cm <- as.table(knnXval1.cm)
precision(cm)
##   BCR/ABL       NEG 
## 0.8378378 0.6190476
recall(cm)
##   BCR/ABL       NEG 
## 0.6595745 0.8125000
acc(cm)
## [1] 0.721519
cm <- as.table(kans2.cm)
precision(cm)
##   BCR/ABL       NEG 
## 0.9411765 0.8181818
recall(cm)
##   BCR/ABL       NEG 
## 0.8000000 0.9473684
acc(cm)
## [1] 0.8717949
# so in this case, the cross-validation method gives a much more conservative assessment of performance - expected
# always use cross-validation


#feature selection can be combined with cross-validation by providing a helper function to the cross-validation specification   
?xvalSpec
?fs.absT
#fs.absT #to see code
lk3f1 = MLearn(mol.biol~., data=BNx, knnI(k=1), xvalSpec("LOO", fsFun=fs.absT(50)))
lk3f1.cm <- confuMat(lk3f1)
cm <- as.table(lk3f1.cm)
precision(cm)
##   BCR/ABL       NEG 
## 0.8918919 0.8333333
recall(cm)
##   BCR/ABL       NEG 
## 0.8250000 0.8974359
acc(cm)
## [1] 0.8607595
#ok, comparable accuracy as before, but probably not large enough sample mto see any differences

Trying out different values of k in k-nearest neighbours to assess best value

knnXval2 = MLearn(mol.biol~., data=BNx, knn.cvI(k=2, l=0), trainInd=1:ncol(BNx))
confuMat(knnXval2)
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      26  11
##   NEG          19  23
acc(as.table(confuMat(knnXval2)))
## [1] 0.6202532
knnXval3 = MLearn(mol.biol~., data=BNx, knn.cvI(k=3, l=0), trainInd=1:ncol(BNx))
confuMat(knnXval3)
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      33   4
##   NEG          19  23
acc(as.table(confuMat(knnXval3)))
## [1] 0.7088608
knnXval5 = MLearn(mol.biol~., data=BNx, knn.cvI(k=5, l=0), trainInd=1:ncol(BNx))
confuMat(knnXval5)
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      35   2
##   NEG          20  22
acc(as.table(confuMat(knnXval5)))
## [1] 0.721519
knnXval10 = MLearn(mol.biol~., data=BNx, knn.cvI(k=10, l=0), trainInd=1:ncol(BNx))
confuMat(knnXval10)
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      33   4
##   NEG          20  22
acc(as.table(confuMat(knnXval10)))
## [1] 0.6962025

Random forest with MLearn

library("randomForest")
set.seed(123)
#the guideline rule for mtry is the sqrt of the number of features
dim(ALLfilt_bcrneg)
## Features  Samples 
##     2160       79
sqrt(2160)
## [1] 46.4758
rf1 = MLearn( mol.biol~., data=ALLfilt_bcrneg, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
confuMat(rf1)
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      13   4
##   NEG           0  22
acc(as.table(confuMat(rf1)))
## [1] 0.8974359
#but much smaller values of mtry can be used - potentially with comparable results.
rf2 = MLearn( mol.biol~., data=ALLfilt_bcrneg, randomForestI, TrainInd, ntree=1000, mtry=10, importance=TRUE)
confuMat(rf2)
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      13   4
##   NEG           2  20
acc(as.table(confuMat(rf2)))
## [1] 0.8461538
#the preformance is almost the same in both cases
#the random forest performance is better than the KNN solution

Examining importance of features from a random forest model

The importance parameter of the random forest method allows us to examine which features had the largest impact on performance. Data about feture importance is retrieved using the getVarImp function of the MLInterfaces package

opar = par(no.readonly=TRUE, mar=c(7,5,4,2)) #opar=original parameters - keep these so they can be restored
par(las=2) #<-- style of the axis label - parallel to the axis
?getVarImp #<-- from the MLInterfaces package
impV1 = getVarImp(rf1)
plot(impV1, n=15, plat="hgu95av2", toktype="SYMBOL") #<-- implementation of plot allows direct mapping of gene symbols

par(opar) #<-- restore the original graphics parameters

par(las=2, mar=c(7,5,4,2))
impV2 = getVarImp(rf2)
plot(impV2, n=15, plat="hgu95av2", toktype="SYMBOL")

par(opar)

What next?

There are other Chapters (all of them) in Bioconductor Case Studies that are worth going through later. Most notably

Chapter 7. A more detailed treatment of differential expression analysis Chapter 10. Unsupervised machine learning (clustering) Chapter 13. GSEA Chapter 14. Hypergeometric tests for GSEA

But first I want to look at trying to collapse probes into pathways and then using this as a data set for ML. The aggregate function in R is a likely candidate and this would require first adding an extra column indicating the pathway/complex that the probe belongs to and on which to aggregate. Someone has almost certainly already done this so lets code with Google:

Google(“collapse affymetrix probes into pathways”)

and found this

Strategies for aggregating gene expression data: The collapseRows R function - http://www.biomedcentral.com/1471-2105/12/322

This is from 2011 and is distributed as the WGCNA package. Lets go take a look.

The WGCNA package for collapseRows

biocLite("preprocessCore")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'preprocessCore'
## 
## The downloaded binary packages are in
##  /var/folders/hx/07bqmsdn1fn95ld5btyqm_hc0000gn/T//Rtmp9RPDBY/downloaded_packages
#install.packages("WGCNA")
#Weighted Correlation Network Analysis
library("WGCNA")
## Warning: package 'WGCNA' was built under R version 3.1.2
## Loading required package: dynamicTreeCut
## Warning: package 'dynamicTreeCut' was built under R version 3.1.2
## Loading required package: fastcluster
## Warning: package 'fastcluster' was built under R version 3.1.2
## 
## Attaching package: 'fastcluster'
## 
## The following object is masked from 'package:stats':
## 
##     hclust
## ==========================================================================
## *
## *  Package WGCNA 1.42 loaded.
## *
## *    Important note: It appears that your system supports multi-threading,
## *    but it is not enabled within WGCNA in R. 
## *    To allow multi-threading within WGCNA with all available cores, use 
## *
## *          allowWGCNAThreads()
## *
## *    within R. Use disableWGCNAThreads() to disable threading if necessary.
## *    Alternatively, set the following environment variable on your system:
## *
## *          ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## *    for example 
## *
## *          ALLOW_WGCNA_THREADS=4
## *
## *    To set the environment variable in linux bash shell, type 
## *
## *           export ALLOW_WGCNA_THREADS=4
## *
## *     before running R. Other operating systems or shells will
## *     have a similar command to achieve the same aim.
## *
## ==========================================================================
## 
## Attaching package: 'WGCNA'
## 
## The following object is masked from 'package:IRanges':
## 
##     cor
## 
## The following object is masked from 'package:stats':
## 
##     cor
help(package=WGCNA)
?collapseRows


#first we need to retrieve just the expression data, transpose and cast to a data frame - this took several minutes to figure out because of the $%^$&&* poor documentation
eData <- as.data.frame(t(exprs(ALLfilt_bcrneg)))

#finally, add the outcome variable separately
learnThis <- ALLfilt_bcrneg$mol.biol
newColNames <- c("learnThis", colnames(eData)) 
eData <- cbind(learnThis, eData) 
colnames(eData) <- newColNames

colnames(eData)[1:10]
##  [1] "learnThis"  "41654_at"   "35430_at"   "38924_s_at" "36023_at"  
##  [6] "266_s_at"   "37569_at"   "33047_at"   "39428_at"   "36515_at"
#str(eData)
length(learnThis)
## [1] 79
dim(eData)
## [1]   79 2161
#make sure this is acceptable as input to MLearn
###################
# hours of fun
# the next several hours is spent finding out that the probe names which begin with numbers
# create a problem for random forest and so they must be appended with letters
##################
# length(learnThis)
# dim(eData)
# rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
# #Error in eval(expr, envir, enclos) : object '41654_at' not found
# #why?????
# dim(eData)
# eData[,c(1:5, 2161)]
# str(colnames(eData))
# sum(is.na(eData)) #0
# colnames(eData) <- c(1:2160,"outcome")
# 
# first lets try a positive control - repeat something i know works
# library("MASS")
# data(crabs)
# set.seed(1234)
# kp = sample(1:200, size=120)
# rf1 = MLearn(sp~CW+RW, data=crabs, randomForestI, kp, ntree=600 )
# rf1
# 
# #try this
# attach(eData)
# # The following object is masked _by_ .GlobalEnv:
# # 
# #     outcome
# 
# #make sure this is acceptable as input to MLearn
# rf1e = MLearn( outcome~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
# # Error in randomForest.default(m, y, ...) : 
# #   length of response must be the same as predictors
# detach(eData)
# 
# 
# eData <- t(crabs)
# eData <- t(eData)
# eData <- as.data.frame(eData, stringsAsFactors=FALSE)
# learnThis <- eData$sp
# dim(crabs)[2]
# eData <- cbind(learnThis, eData[,2:dim(crabs)[2]])
# rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
# # Error in randomForest.default(m, y, ...) : 
# #   Need at least two classes to do classification.
# str(eData)
# 
# 
# #################
# 
# #first we need to retrieve just the expression data, transpose and cast to a data frame - this took several minutes to figure out because of the $%^$&&* poor documentation
# newColNames<-rownames(ALLfilt_bcrneg)
# newRowNames<-colnames(ALLfilt_bcrneg)
# str(exprs(ALLfilt_bcrneg)[1:10,1:10])
# eData<-as.data.frame(t(exprs(ALLfilt_bcrneg)))
# colnames(eData)<-newColNames
# rownames(eData)<-newRowNames
# str(eData[1:10,1:10])
# 
# 
# eData <- as.data.frame(t(exprs(ALLfilt_bcrneg)))
# 
# #finally, add the outcome variable separately
# learnThis <- ALLfilt_bcrneg$mol.biol
# newColNames <- c("learnThis", colnames(eData)) 
# eData <- cbind(learnThis, eData) 
# colnames(eData) <- newColNames
# 
# colnames(eData)
# str(eData)
# length(learnThis)
# dim(eData)
# 
# 
# #make sure this is acceptable as input to MLearn
# 
# length(learnThis)
# dim(eData)
# rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
# #....Error in eval(expr, envir, enclos) : object '41654_at' not found
# 
# #next try making up data and see if i can reproduce the error 
# outcome<-sample(c("black","white"), 100, replace=TRUE)
# train<-matrix(data=runif(10000, 0, 1), nrow=100)
# dim(train)
# train.df <- as.data.frame(train)
# ##using alphabetic colnames works
# colnames(train.df) <- apply(train.df,2,function(x){paste(sample(c(letters),5), sep = "", collapse="")})
# ##but introducing numbers or underscores will cause problems for random_forest and the error
# #Error in eval(expr, envir, enclos) : object '8o2fq' not found
# #will be returned for the first column name with a number or _ at the beginning
# colnames(train.df) <- apply(train.df,2,function(x){paste(sample(c(letters,1:10),5), sep = "", collapse="")})
# colnames(train.df) <- apply(train.df,2,function(x){paste(sample(c(letters,"_"),5), sep = "", collapse="")})
# colnames(train.df)
# dim(train.df)
# str(train.df)
# train.df<-cbind(outcome, train.df)
# str(train.df[1:10,1:10])
# #colnames(train.df)<-c("outcome", as.character(1:100))
# 
# 
# rownames(train.df)<-c(as.character(1:100))
# str(train.df)
# 
# y<-sample(1:100, 20)
# rftest<- MLearn(outcome~., data=train.df, randomForestI, y, ntree=1000, mtry=55, importance=TRUE)
# rftest.cm<-confuMat(rftest)
# acc(as.table(rftest.cm))
# #ok - so the problem seems to be solved





########

#first we need to retrieve just the expression data, transpose and cast to a data frame - this took several minutes to figure out because of the $%^$&&* poor documentation
eData <- as.data.frame(t(exprs(ALLfilt_bcrneg)))

###
###<==== must append something to beginning of probe names so they do not begin with a number
newColNames <- paste("AAA", colnames(eData), sep="", collapse=NULL) 
####
####


#finally, add the outcome variable separately
learnThis <- ALLfilt_bcrneg$mol.biol
newColNames <- c("learnThis", newColNames)
eData <- cbind(learnThis, eData) 
colnames(eData) <- newColNames

colnames(eData)[1:10]
##  [1] "learnThis"     "AAA41654_at"   "AAA35430_at"   "AAA38924_s_at"
##  [5] "AAA36023_at"   "AAA266_s_at"   "AAA37569_at"   "AAA33047_at"  
##  [9] "AAA39428_at"   "AAA36515_at"
str(eData)
## 'data.frame':    79 obs. of  2161 variables:
##  $ learnThis    : Factor w/ 2 levels "BCR/ABL","NEG": 1 2 1 2 2 2 2 2 1 1 ...
##  $ AAA41654_at  : num  0.302 -0.333 -0.269 -0.195 -0.968 ...
##  $ AAA35430_at  : num  0.168 -1.564 -0.488 -0.257 -0.675 ...
##  $ AAA38924_s_at: num  0.2552 -0.4167 0.4683 0.7595 0.0545 ...
##  $ AAA36023_at  : num  0.3659 -0.9778 0.0377 0.5133 -1.1639 ...
##  $ AAA266_s_at  : num  0.353 -0.3864 0.2118 0.3164 -0.0454 ...
##  $ AAA37569_at  : num  -0.0336 -0.2993 -0.0016 0.086 -0.4045 ...
##  $ AAA33047_at  : num  0.191 -0.53 0.378 0.131 -0.662 ...
##  $ AAA39428_at  : num  0.552 -0.852 0.215 0.166 0.382 ...
##  $ AAA36515_at  : num  -0.0319 -0.837 -0.0167 0.5539 -0.5017 ...
##  $ AAA32589_at  : num  0.2711 -0.7963 -1.1034 0.0659 -0.2122 ...
##  $ AAA38843_at  : num  1.3 -1.88 -1.2 1.13 0.7 ...
##  $ AAA39134_at  : num  -0.4043 0.2827 0.0683 0.0285 1.5156 ...
##  $ AAA41233_at  : num  0.596 -0.777 1.229 1.152 0.433 ...
##  $ AAA32130_at  : num  0.0782 0.1309 0.0692 0.4793 -0.1468 ...
##  $ AAA34878_at  : num  0.9765 -0.6601 0.5408 1.0579 -0.0318 ...
##  $ AAA34814_at  : num  0.175 -0.898 -0.823 0.747 -0.202 ...
##  $ AAA1933_g_at : num  -0.5035 -0.2458 0.9783 -0.0657 -0.9887 ...
##  $ AAA32163_f_at: num  -0.2164 0.4552 0.2242 -0.1617 0.0198 ...
##  $ AAA40427_at  : num  -0.934 0.608 -0.227 0.116 0.361 ...
##  $ AAA34372_at  : num  -0.5635 0.121 0.2703 -0.0281 -0.139 ...
##  $ AAA35831_at  : num  -0.0263 -0.422 0.0849 2.4236 -0.2785 ...
##  $ AAA34325_at  : num  -0.0765 1.1914 -0.0514 0.0866 1.4476 ...
##  $ AAA38392_at  : num  -0.386 -0.376 0.66 0.41 -0.65 ...
##  $ AAA35810_at  : num  0.2775 -0.2718 0.786 -0.0479 -1.7225 ...
##  $ AAA39043_at  : num  -0.5537 0.3854 -0.2388 -0.0692 -1.2403 ...
##  $ AAA35271_at  : num  -0.019 0.3004 1.4439 0.5096 -0.0302 ...
##  $ AAA35734_at  : num  0.1807 -0.735 0.9578 0.0378 -0.303 ...
##  $ AAA40990_at  : num  -0.407 -0.136 0.863 1.273 -0.489 ...
##  $ AAA40712_at  : num  -0.0461 0.6305 0.4847 0.9942 1.406 ...
##  $ AAA41202_s_at: num  0.2739 -0.7677 0.3301 -0.0441 -0.8925 ...
##  $ AAA39829_at  : num  -0.361 0.608 0.54 -0.339 0.212 ...
##  $ AAA33291_at  : num  -0.522 0.388 -0.36 0.406 0.162 ...
##  $ AAA41750_at  : num  0.474 -0.586 -1.148 0.664 -0.378 ...
##  $ AAA1468_at   : num  0.465 -0.365 -0.807 0.511 -0.453 ...
##  $ AAA41742_s_at: num  0.332 0.171 0.523 1.122 0.984 ...
##  $ AAA41724_at  : num  -0.5444 1.2145 0.1716 0.0582 0.0828 ...
##  $ AAA33849_at  : num  1 -0.109 1.447 0.876 1.422 ...
##  $ AAA38573_at  : num  0.557 -0.574 0.615 0.752 -0.227 ...
##  $ AAA40631_at  : num  0.0194 -0.5644 1.0923 0.9533 0.2914 ...
##  $ AAA36506_at  : num  0.17 -1.308 0.445 0.534 0.398 ...
##  $ AAA41133_at  : num  -0.4587 0.2464 0.0439 0.5585 0.02 ...
##  $ AAA32223_at  : num  0.18 -1.454 -0.116 0.554 -0.792 ...
##  $ AAA32193_at  : num  -0.0586 -0.3474 1.2351 -0.4959 -0.948 ...
##  $ AAA33710_at  : num  -0.266 -0.896 1.079 0.537 0.42 ...
##  $ AAA32803_at  : num  0.611 -1.2287 -0.0406 0.723 -0.2909 ...
##  $ AAA32804_at  : num  0.451 -0.841 1.064 0.76 0.121 ...
##  $ AAA37542_at  : num  0 -0.387 0.505 0.965 2.353 ...
##  $ AAA1942_s_at : num  -0.3899 0.0057 -0.6981 0.4313 -0.7233 ...
##  $ AAA34998_at  : num  -0.746 -0.91 -0.287 0.385 -0.847 ...
##  $ AAA35970_g_at: num  0.1518 -0.7169 -0.0208 0.7091 -1.063 ...
##  $ AAA31851_at  : num  0.9391 -1.2726 0.1419 1.2747 0.0421 ...
##  $ AAA40203_at  : num  0.337 0.13 0.93 0.143 0.77 ...
##  $ AAA1207_at   : num  0.0606 0.3911 0.0991 1.3855 -0.7271 ...
##  $ AAA38568_at  : num  -0.2925 -0.2522 0.3156 2.0414 -0.0944 ...
##  $ AAA149_at    : num  0.0278 -1.43 -0.6234 0.6257 0.3028 ...
##  $ AAA33247_at  : num  0.0183 -0.7447 0.5192 0 0.7644 ...
##  $ AAA40196_at  : num  -0.103 -0.53 0.212 0.367 -0.758 ...
##  $ AAA34961_at  : num  -0.0759 -0.1188 0.9586 1.6728 -0.8327 ...
##  $ AAA38359_at  : num  -0.479 -0.515 -0.392 1.201 -0.783 ...
##  $ AAA39792_at  : num  0.234 -1.014 -0.323 0.713 0 ...
##  $ AAA35140_at  : num  0.0474 -0.6076 0.3419 0.7936 0.1206 ...
##  $ AAA40063_at  : num  0.158 -0.803 0.252 0.993 -0.671 ...
##  $ AAA32173_at  : num  -0.655 0.353 -0.483 0.1 -0.964 ...
##  $ AAA41737_at  : num  0.482 -0.451 -0.209 1.103 0.745 ...
##  $ AAA38767_at  : num  0.368 -1.204 0.775 1.916 0.449 ...
##  $ AAA33700_at  : num  0.5541 -0.8779 0.981 0.0479 -0.3282 ...
##  $ AAA2031_s_at : num  0.509 0.204 0.959 0.447 -0.112 ...
##  $ AAA32961_at  : num  1.015 -0.613 0.143 0.869 0.324 ...
##  $ AAA35151_at  : num  -0.5431 1.0672 0.3853 0.2683 -0.0736 ...
##  $ AAA33847_s_at: num  0.734 -0.688 0.17 1.565 -0.159 ...
##  $ AAA34268_at  : num  -0.209 1.283 0.377 -0.626 -0.364 ...
##  $ AAA39221_at  : num  0 -0.5154 0.1736 0.1524 -0.0686 ...
##  $ AAA33351_at  : num  1.032 -0.539 0.246 0.464 -0.174 ...
##  $ AAA32832_at  : num  0.419 -1.511 -0.391 0.354 0.655 ...
##  $ AAA38070_at  : num  0 -0.737 0.638 -0.472 -1.482 ...
##  $ AAA33791_at  : num  0.154 -0.706 -0.187 1.206 -0.189 ...
##  $ AAA34544_at  : num  1.2526 -0.0239 0.1342 0.926 1.0894 ...
##  $ AAA35166_at  : num  -0.688 -0.653 0.236 0.239 0.443 ...
##  $ AAA1797_at   : num  0.931 0.677 -0.509 0.961 0.214 ...
##  $ AAA834_at    : num  -0.294 -0.248 -0.21 1.062 0.979 ...
##  $ AAA1599_at   : num  -0.0931 -0.321 -0.3019 0.6173 -0.1053 ...
##  $ AAA37955_at  : num  -0.4849 0.0339 -0.3365 -0.1529 -1.0689 ...
##  $ AAA37964_at  : num  0.1823 -1.5958 0.0742 0.7164 -1.3492 ...
##  $ AAA36825_at  : num  -0.0385 0.0405 0.4315 0.7139 -1.2051 ...
##  $ AAA41526_at  : num  -0.3742 0.6121 -0.198 -0.0442 0.8871 ...
##  $ AAA33113_at  : num  0.096 -2.372 1.054 1.54 -0.461 ...
##  $ AAA32272_at  : num  0.4965 -1.2341 0.0371 0.7363 -1.3649 ...
##  $ AAA429_f_at  : num  -0.232 0.198 0.924 -0.682 0.751 ...
##  $ AAA33678_i_at: num  -0.2169 0.0015 1.1055 -0.6857 0.5163 ...
##  $ AAA38241_at  : num  -0.3709 0.7107 -0.0985 0.0782 -0.6484 ...
##  $ AAA38518_at  : num  0.3503 -0.5535 -0.0664 0.6618 -0.9554 ...
##  $ AAA41187_at  : num  0.587 -0.212 0.896 0.768 0.257 ...
##  $ AAA33267_at  : num  0.0695 -0.8214 0.5734 0.0477 -1.5886 ...
##  $ AAA36933_at  : num  0.855 -0.165 0.725 0.726 0.805 ...
##  $ AAA40041_at  : num  0.491 -0.914 0.318 0.997 0.745 ...
##  $ AAA32607_at  : num  1.0713 -0.2841 0.0262 -0.0388 0.5067 ...
##  $ AAA41745_at  : num  -0.2493 0.1612 0.3265 0.9085 -0.0403 ...
##  $ AAA38353_at  : num  0.563 -1.391 -0.088 0.199 0.182 ...
##   [list output truncated]
length(learnThis)
## [1] 79
dim(eData)
## [1]   79 2161
#make sure this is acceptable as input to MLearn
length(learnThis)
## [1] 79
dim(eData)
## [1]   79 2161
rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
#....
rf1e.cm<-confuMat(rf1e)
acc(as.table(rf1e.cm))
## [1] 0.8974359
#so now lets go back and first try collapsing the rows in the expression data set
tmp <- exprs(ALLfilt_bcrneg)
#make a small test set
tmp2<-tmp[1:6,1:6]
tmp2
##                  01005      01010       03002       04007       04008
## 41654_at    0.30223298 -0.3331243 -0.26938075 -0.19468856 -0.96768116
## 35430_at    0.16821060 -1.5636539 -0.48766759 -0.25706581 -0.67450101
## 38924_s_at  0.25522567 -0.4166590  0.46829547  0.75954472  0.05449203
## 36023_at    0.36592774 -0.9777903  0.03769396  0.51333134 -1.16386312
## 266_s_at    0.35301246 -0.3863641  0.21180977  0.31643576 -0.04539891
## 37569_at   -0.03356995 -0.2993378 -0.00160298  0.08596511 -0.40454587
##                  04010
## 41654_at    0.88593546
## 35430_at   -1.02849426
## 38924_s_at -0.93322046
## 36023_at   -0.94238529
## 266_s_at    0.52930562
## 37569_at   -0.08933493
#prepare row names and group names
rowID<-rownames(tmp2)
rowID
## [1] "41654_at"   "35430_at"   "38924_s_at" "36023_at"   "266_s_at"  
## [6] "37569_at"
rowGroup<-c("g1", "g1", "g2", "g2", "g2", "g3")
rowGroup
## [1] "g1" "g1" "g2" "g2" "g2" "g3"
#collapse the rows and examine the results
collapseRows.results<-collapseRows(datET=tmp2, rowGroup=rowGroup, rowID=rowID)
tmp2Collapsed<-collapseRows.results$datETcollapsed
tmp2Collapsed #<====resulting object is a matrix with group names as new row names 
##          01005      01010       03002       04007       04008       04010
## g1  0.30223298 -0.3331243 -0.26938075 -0.19468856 -0.96768116  0.88593546
## g2  0.35301246 -0.3863641  0.21180977  0.31643576 -0.04539891  0.52930562
## g3 -0.03356995 -0.2993378 -0.00160298  0.08596511 -0.40454587 -0.08933493
str(tmp2Collapsed)
##  num [1:3, 1:6] 0.3022 0.353 -0.0336 -0.3331 -0.3864 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:3] "g1" "g2" "g3"
##   ..$ : chr [1:6] "01005" "01010" "03002" "04007" ...
#the function also returns meta-information about the groups
collapseRows.results$group2row
##    group selectedRowID
## g1 "g1"  "41654_at"   
## g2 "g2"  "266_s_at"   
## g3 "g3"  "37569_at"
collapseRows.results$selectedRow
##   41654_at   35430_at 38924_s_at   36023_at   266_s_at   37569_at 
##       TRUE      FALSE      FALSE      FALSE       TRUE       TRUE
#now there are multiple ways to collapse rows including user-defined methods
#when method="function", then methodFunction must be specified
#other values for mehtod could be:
# "MaxMean" (default) or 
# "MinMean" = choose the row with the highest or lowest mean value, respectively. 
# "maxRowVariance" = choose the row with the highest variance (across the columns of datET). 
# "absMaxMean" or "absMinMean" = choose the row with the highest or lowest mean absolute value. 
# "ME" = choose the eigenrow (first principal component of the rows in each group). Note that with this method option, connectivityBasedCollapsing is automatically set to FALSE. 
# "Average" = for each column, take the average value of the rows in each group
# "function" = use this method for a user-input function (see the description of the argument "methodFunction"). 
# Note: if method="ME", "Average" or "function", the output parameters "group2row" and "selectedRow" are not informative.
collapseRows.results<-collapseRows(datET=tmp2, rowGroup=rowGroup, rowID=rowID, method="function", methodFunction=colMeans)
## Comment: make sure methodFunction takes a matrix as input.
## ...Ignore previous comment.  Function completed properly!
tmp2Collapsed<-collapseRows.results$datETcollapsed
tmp2Collapsed #<====resulting object is a matrix with group names as new row names 
##          01005      01010       03002       04007      04008       04010
## g1  0.23522179 -0.9483891 -0.37852417 -0.22587719 -0.8210911 -0.07127940
## g2  0.32472195 -0.5936045  0.23926640  0.52977060 -0.3849233 -0.44876671
## g3 -0.03356995 -0.2993378 -0.00160298  0.08596511 -0.4045459 -0.08933493
str(tmp2Collapsed)
##  num [1:3, 1:6] 0.2352 0.3247 -0.0336 -0.9484 -0.5936 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:3] "g1" "g2" "g3"
##   ..$ : chr [1:6] "01005" "01010" "03002" "04007" ...
#since a function is used to generate a new value for each column, meta-information about the groups is meaningless
collapseRows.results$group2row
##      group selectedRowID
## [1,] "g1"  "function.g1"
## [2,] "g2"  "function.g2"
## [3,] "g3"  "function.g3"
collapseRows.results$selectedRow
##   41654_at   35430_at 38924_s_at   36023_at   266_s_at   37569_at 
##      FALSE      FALSE      FALSE      FALSE      FALSE      FALSE
#try selecting the absolute maximum value for each column
#first have to provide a function that will return the maximum absolute value for each column in a collapse group
myCollapse<-function(someMatrix)apply(someMatrix, 2, function(x){ return(x[which.max(abs(x))])})
collapseRows.results<-collapseRows(datET=tmp2, rowGroup=rowGroup, rowID=rowID, method="function", methodFunction=myCollapse)
## Comment: make sure methodFunction takes a matrix as input.
## ...Ignore previous comment.  Function completed properly!
tmp2Collapsed<-collapseRows.results$datETcollapsed
tmp2Collapsed #<====resulting object is a matrix with group names as new row names 
##          01005      01010       03002       04007      04008       04010
## g1  0.30223298 -1.5636539 -0.48766759 -0.25706581 -0.9676812 -1.02849426
## g2  0.36592774 -0.9777903  0.46829547  0.75954472 -1.1638631 -0.94238529
## g3 -0.03356995 -0.2993378 -0.00160298  0.08596511 -0.4045459 -0.08933493
str(tmp2Collapsed)
##  num [1:3, 1:6] 0.3022 0.3659 -0.0336 -1.5637 -0.9778 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:3] "g1" "g2" "g3"
##   ..$ : chr [1:6] "01005" "01010" "03002" "04007" ...
#since a function is used to generate a new value for each column, meta-information about the groups is meaningless
collapseRows.results$group2row
##      group selectedRowID
## [1,] "g1"  "function.g1"
## [2,] "g2"  "function.g2"
## [3,] "g3"  "function.g3"
collapseRows.results$selectedRow
##   41654_at   35430_at 38924_s_at   36023_at   266_s_at   37569_at 
##      FALSE      FALSE      FALSE      FALSE      FALSE      FALSE
####
# finally,put it all together
#
# collapse whole expression set by gene id and selecting max row for each
tmp <- exprs(ALLfilt_bcrneg)
rowID<-rownames(tmp)#prepare row names and group names
ps_eg = toTable(hgu95av2ENTREZID) #probe_id to Entrez Gene ids
ind<-match(rowID, ps_eg[,1]) #find matching entries of rowIDs in ps_eg table
geneid <- ps_eg[ind,2] #retrieve corresponding gene id for each rowID - all are unique so collapsing wont actually do anything
check(geneid)#chekc that all rowIDs were mapped to a gene id
## 
## head:
## [1] "100"       "10001"     "10006"     "100129361" "100133941" "10016"    
## 
## tail:
## [1] "9976" "998"  "9984" "9987" "9988" "9989"
## 
## str:
##  
##  chr [1:2160] "100" "10001" "10006" "100129361" "100133941" ...
## NULL
## 
## length: 2160 
## 
## dim: 
## 
## total NA: 0 
## 
## total NULL: 0 
## 
## total duplicated: 0 
## 
## summary:
## 
##    Length     Class      Mode 
##      2160 character character
rowGroup<-geneid
myCollapse<-function(someMatrix)apply(someMatrix, 2, function(x){ return(x[which.max(abs(x))])}) #define a collapse function
collapseRows.results<-collapseRows(datET=tmp, rowGroup=rowGroup, rowID=rowID, method="function", methodFunction=myCollapse)#collapse
## Comment: make sure methodFunction takes a matrix as input.
## ...Ignore previous comment.  Function completed properly!
tmpCollapsed<-collapseRows.results$datETcollapsed  
  
  
# prepare expression set for ml
eData <- as.data.frame(t(tmpCollapsed))#transpose and cast as a data frame
newColNames <- paste("geneid", colnames(eData), sep="", collapse=NULL) #fix column names
learnThis <- ALLfilt_bcrneg$mol.biol ##finally, add the outcome variable separately
newColNames <- c("learnThis", newColNames)
eData <- cbind(learnThis, eData) 
colnames(eData) <- newColNames

# do the random forest thang
length(learnThis)
## [1] 79
dim(eData)
## [1]   79 2161
rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
#....
rf1e.cm<-confuMat(rf1e)
rf1e.cm
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      13   4
##   NEG           0  22
acc(as.table(rf1e.cm))
## [1] 0.8974359

An aside on retrieving functional annotation data

How do I map probe ids to gene names/ids and ids to pathways and complexes

This section should eventually be moved to Chapter 8 notes.

###
#using the annotation environment - probe id to entrez gene id
###
?hgu95av
## No documentation for 'hgu95av' in specified packages and libraries:
## you could try '??hgu95av'
?hgu95av2ENTREZID
EG = as.character(hgu95av2ENTREZID[featureNames(ALL)])
dim(ALL) #12625 x 128
## Features  Samples 
##    12625      128
length(EG) #11543 therefore, not all probe ids map to a gene id
## [1] 11543
EG[1:5]
##   1000_at   1001_at 1002_f_at 1003_s_at   1004_at 
##    "5595"    "7075"    "1557"     "643"     "643"
sum(is.na(EG)) #0 
## [1] 0
#so EG is a dictionary of probe names 2 geneids that can be accessed like this 
thisProbe <- "40812_at"
EG[thisProbe]
## 40812_at 
##   "6526"
#we can also do this to get an ordinary table
ps_eg = toTable(hgu95av2ENTREZID)
head(ps_eg)
##    probe_id gene_id
## 1   1000_at    5595
## 2   1001_at    7075
## 3 1002_f_at    1557
## 4 1003_s_at     643
## 5   1004_at     643
## 6   1005_at    1843
#how many genes map to one probe, two probes, three probes...etc.
 table(table(EG))
## 
##    1    2    3    4    5    6    7    8 
## 6586 1443  458  104   22   16    5    5
#    1    2    3    4    5    6    7    8 
# 6586 1443  458  104   22   16    5    5
#The inner call to table counts for each EntrezGene ID the number of probe sets
#that are mapped to it. The outer one tabulates how often each count is seen, one, two, three, . . . times.
#so there are 6586 genes that map to one probe, there are 1443 that map to 2 probes etc...

###
#gene ids to chromosomes
###
ps_chr = toTable(hgu95av2CHR)
head(ps_chr) #<-- probe id to chromosome
##    probe_id chromosome
## 1   1000_at         16
## 2   1001_at          1
## 3 1002_f_at         10
## 4 1003_s_at         11
## 5   1004_at         11
## 6   1005_at          5
ps_eg = toTable(hgu95av2ENTREZID)
head(ps_eg)  #<-- probe id to entrez gene
##    probe_id gene_id
## 1   1000_at    5595
## 2   1001_at    7075
## 3 1002_f_at    1557
## 4 1003_s_at     643
## 5   1004_at     643
## 6   1005_at    1843
chr = merge(ps_chr, ps_eg)
head(chr) #<--probe id to chromosome to gene id
##    probe_id chromosome gene_id
## 1  100_g_at         14    5875
## 2   1000_at         16    5595
## 3   1001_at          1    7075
## 4 1002_f_at         10    1557
## 5 1003_s_at         11     643
## 6   1004_at         11     643
chr = unique(chr[, colnames(chr)!="probe_id"])
head(chr)
##   chromosome gene_id
## 1         14    5875
## 2         16    5595
## 3          1    7075
## 4         10    1557
## 5         11     643
## 7          5    1843
###
#gene ids to GO
###
#relevant packages
#GOStats
#annotate
#topGO
#goTools
# The induced GO graph for a collection of genes is the graph that results
# from taking the union of all GO terms annotated to the genes and also all
# their parent terms.
# as.list(GOMFCHILDREN["GO:0008094"]) #<--get direct descendants of term in MF ontology
# as.list(GOMFOFFSPRING["GO:0008094"])#<--get all    descendants
# as.list(GOMFANCESTOR["GO:0008094"]) #<--get all    parents
# as.list(GOMFPARENTS["GO:0008094"])  #<--get direct parents


# hyperGTest
# The function hyperGTest will compute the Hypergeometric p-values for overrepresentation
# of genes at all GO terms in the induced GO graph.
# Because of the way that genes are annotated at GO terms, there are concerns with how one might
# address the multiple testing issues that arise. Both the topGO and GOstats
# packages implement a form of conditional testing that is designed to address some
# of these concern.
# library("GOstats")
# #To perform the test we firrst define the universe of genes and create a parameter object to set the analysis options.
# affyUniverse = featureNames(ALLfilt_af4bcr)
# uniId = hgu95av2ENTREZID[affyUniverse]
# entrezUniverse = unique(as.character(uniId))
# params = new("GOHyperGParams", geneIds=EGsub, universeGeneIds=entrezUniverse, annotation="hgu95av2", ontology="BP", pvalueCutoff=0.001, conditional=FALSE, testDirection="over")
# mfhyper = hyperGTest(params)
# hist(pvalues(mfhyper), breaks=50, col="mistyrose")

?hgu95av2GO
#Maps between manufacturer IDs and Gene Ontology (GO) IDs
# hgu95av2GO is an R object that provides mappings between manufacturer identifiers and the GO identifiers that they are directly associated with. This mapping and its reverse mapping (hgu95av2GO2PROBE) do NOT associate the child terms from the GO ontology with the gene. Only the directly evidenced terms are represented here.
x <- hgu95av2GO
head(x)
## GO submap for chip hgu95av2 (object of class "ProbeGo3AnnDbBimap")
# Get the manufacturer identifiers that are mapped to a GO ID
mapped_genes <- mappedkeys(x)
head(mapped_genes)
## [1] "1000_at"   "1001_at"   "1002_f_at" "1003_s_at" "1004_at"   "1005_at"
# Convert to a list
xx <- as.list(x[mapped_genes])
#head(xx)
#xx[1]
class(xx)
## [1] "list"
#str(xx)
if(length(xx) > 0) {
        # Try the first one
        got <- xx[[1]]           
        got[[1]][["GOID"]]
        got[[1]][["Ontology"]]
        got[[1]][["Evidence"]]
}
## [1] "TAS"
hgu95av2GO.table <- toTable(hgu95av2GO)
head(hgu95av2GO.table)
##   probe_id      go_id Evidence Ontology
## 1  1000_at GO:0000165      TAS       BP
## 2  1000_at GO:0000186      TAS       BP
## 3  1000_at GO:0000187      TAS       BP
## 4  1000_at GO:0002224      TAS       BP
## 5  1000_at GO:0002755      TAS       BP
## 6  1000_at GO:0002756      TAS       BP
table(hgu95av2GO.table$probe_id)[1:5]
## 
##  100_g_at   1000_at   1001_at 1002_f_at 1003_s_at 
##         7        83        15        23        14
table(table(hgu95av2GO.table$probe_id))
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 167 177 295 332 405 466 461 511 520 470 509 469 413 435 414 383 345 319 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
## 290 291 211 224 196 168 207 139 137 117 134 130 131  74  84  89  82  69 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##  66  54  75  52  52  69  41  54  42  47  38  23  41  39  27  30  32  18 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##  36  26  19  25   6  24  19  18  20  15  20  19  17  13   5  22  21  12 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##  13   9   4   5  11  14   1   7   9   8  15   9   2   5   9   5   4   4 
##  91  92  93  94  96  97  98  99 100 101 103 104 105 106 108 111 112 113 
##   5   3   4   9   2   4   4   3   1   3   7   1   2   3   4   2   1   3 
## 114 115 124 126 128 130 131 136 140 143 144 147 150 159 184 197 
##   5   4   1   1   4   2   5   8   1   3   5   3   2   1   3   1
#table(hgu95av2GO.table$probe_id, hgu95av2GO.table$Ontology)[1:5]


##
# Summary
##

#probe_id to Entrez Gene ids
ps_eg = toTable(hgu95av2ENTREZID)

#probe_id to GO mappings
hgu95av2GO.table <- toTable(hgu95av2GO)

Retrieve a list of protein complexes annotated by GO and the Entrez Gene Ids for the proteins in them.

# there is an interesting example here of how the order of loading libraries can matter
# the "select" function from the AnnotatioDb package stopped working with error
# Error in select(GO.db, keys = "GO:1990237", columns = c("GOID", "TERM",  : 
#   unused arguments (keys = "GO:1990237", columns = c("GOID", "TERM", "ONTOLOGY"), keytype = "GOID")
# 
# I say "stopped working" because the call was composed correctly and had worked before.  The above cryptic message doesnt tell you that 
# its not trying to call the "select" function you think you are trying to call.  In the case above, the phrase "unused arguments"
# should be the tip-off that you need to check this
# try this
# > select
# function (obj) 
# UseMethod("select")
# <bytecode: 0x10e0389c0>
# <environment: namespace:MASS>
# 
# the problem was solved by doing this...
# detach("package:MASS", unload=TRUE)  

library("AnnotationDbi")
library(GO.db)
## 
#detach("package:AnnotationDbi", unload=TRUE)
#detach("package:GO.db", unload=TRUE)

gcon = GO_dbconn()

#a list of very simple search terms to retrieve complex like collections
searchTerms <- c("complex", "some", "mere")


#retrieve goids and their details related to the search terms
allGoidDetails <- NULL
complexGoid <- NULL
allComplexGoid <- NULL
for (searchTerm in searchTerms) {
  query <- paste("select go_id from go_term where term like '%", searchTerm, "%'", sep = "")
  theseGoids <- as.character(dbGetQuery(gcon, query )[[1]]) 
  goidDetails <- select(GO.db, keys=theseGoids, columns = c("GOID", "TERM", "ONTOLOGY"), keytype="GOID")
  goidDetails <- goidDetails[goidDetails$ONTOLOGY == "CC",]
  allGoidDetails <- unique(rbind(allGoidDetails, goidDetails))
  allComplexGoid <- unique(c(allComplexGoid, goidDetails$GOID))
}

##spot-check
check(allComplexGoid)
## 
## head:
## [1] "GO:0000015" "GO:0000109" "GO:0000110" "GO:0000111" "GO:0000112"
## [6] "GO:0000113"
## 
## tail:
## [1] "GO:0030017" "GO:0033583" "GO:0035996" "GO:0035997" "GO:0043034"
## [6] "GO:0061468"
## 
## str:
##  
##  chr [1:1874] "GO:0000015" "GO:0000109" "GO:0000110" ...
## NULL
## 
## length: 1874 
## 
## dim: 
## 
## total NA: 0 
## 
## total NULL: 0 
## 
## total duplicated: 0 
## 
## summary:
## 
##    Length     Class      Mode 
##      1874 character character
check(allGoidDetails)
## 
## head:
##         GOID                                        TERM ONTOLOGY
## 1 GO:0000015           phosphopyruvate hydratase complex       CC
## 2 GO:0000109          nucleotide-excision repair complex       CC
## 3 GO:0000110 nucleotide-excision repair factor 1 complex       CC
## 4 GO:0000111 nucleotide-excision repair factor 2 complex       CC
## 5 GO:0000112 nucleotide-excision repair factor 3 complex       CC
## 6 GO:0000113 nucleotide-excision repair factor 4 complex       CC
## 
## tail:
##            GOID                            TERM ONTOLOGY
## 5310 GO:0030017                       sarcomere       CC
## 8010 GO:0033583             rhabdomere membrane       CC
## 8710 GO:0035996          rhabdomere microvillus       CC
## 8810 GO:0035997 rhabdomere microvillus membrane       CC
## 9010 GO:0043034                       costamere       CC
## 1058 GO:0061468                       karyomere       CC
## 
## str:
##  
## 'data.frame':    1874 obs. of  3 variables:
##  $ GOID    : chr  "GO:0000015" "GO:0000109" "GO:0000110" "GO:0000111" ...
##  $ TERM    : chr  "phosphopyruvate hydratase complex" "nucleotide-excision repair complex" "nucleotide-excision repair factor 1 complex" "nucleotide-excision repair factor 2 complex" ...
##  $ ONTOLOGY: chr  "CC" "CC" "CC" "CC" ...
## NULL
## 
## length: 3 
## 
## dim: 1874 3 
## 
## total NA: 0 
## 
## total NULL: 0 
## 
## total duplicated: 0 
## 
## summary:
## 
##      GOID                 TERM               ONTOLOGY          
##  "Length:1874       " "Length:1874       " "Length:1874       "
##  "Class :character  " "Class :character  " "Class :character  "
##  "Mode  :character  " "Mode  :character  " "Mode  :character  "
head(allGoidDetails[grep("some", allGoidDetails$TERM),])
##           GOID                                TERM ONTOLOGY
## 22  GO:0000176     nuclear exosome (RNase complex)       CC
## 23  GO:0000177 cytoplasmic exosome (RNase complex)       CC
## 24  GO:0000178             exosome (RNase complex)       CC
## 56  GO:0000502                  proteasome complex       CC
## 215 GO:0005839             proteasome core complex       CC
## 315 GO:0008537        proteasome activator complex       CC
#now retrieve all human geneids annotated with these goids
library(org.Hs.eg.db)
##columns(org.Hs.eg.db)
theseComplexGeneDetails <- select(org.Hs.eg.db, keys=allComplexGoid, columns=c("ENTREZID", "GO"), keytype="GO")
## Warning in .generateExtraRows(tab, keys, jointype): 'select' resulted in
## 1:many mapping between keys and return rows
#remove NAs (duplicate gene ids are expected with different goids)
theseComplexGeneDetails <- theseComplexGeneDetails[!is.na(theseComplexGeneDetails$ENTREZID),]
theseComplexGeneDetails <- unique(theseComplexGeneDetails)
check(theseComplexGeneDetails$ENTREZID) #no NULL
## 
## head:
## [1] "2023"   "2026"   "2027"   "387712" "1161"   "2067"  
## 
## tail:
## [1] "9456"   "23336"  "23676"  "79026"  "113146" "202333"
## 
## str:
##  
##  chr [1:9612] "2023" "2026" "2027" "387712" "1161" "2067" ...
## NULL
## 
## length: 9612 
## 
## dim: 
## 
## total NA: 0 
## 
## total NULL: 0 
## 
## total duplicated: 3328 
## 
## summary:
## 
##    Length     Class      Mode 
##      9612 character character
check(theseComplexGeneDetails)
## 
## head:
##           GO EVIDENCE ONTOLOGY ENTREZID
## 1 GO:0000015      IEA       CC     2023
## 2 GO:0000015      IEA       CC     2026
## 3 GO:0000015      IEA       CC     2027
## 4 GO:0000015      IEA       CC   387712
## 5 GO:0000109      IDA       CC     1161
## 6 GO:0000109      IDA       CC     2067
## 
## tail:
##               GO EVIDENCE ONTOLOGY ENTREZID
## 10836 GO:0043034      IEA       CC     9456
## 10837 GO:0043034      IDA       CC    23336
## 10838 GO:0043034      IEA       CC    23676
## 10839 GO:0043034      ISS       CC    79026
## 10840 GO:0043034      ISS       CC   113146
## 10841 GO:0043034      IEA       CC   202333
## 
## str:
##  
## 'data.frame':    9612 obs. of  4 variables:
##  $ GO      : chr  "GO:0000015" "GO:0000015" "GO:0000015" "GO:0000015" ...
##  $ EVIDENCE: chr  "IEA" "IEA" "IEA" "IEA" ...
##  $ ONTOLOGY: chr  "CC" "CC" "CC" "CC" ...
##  $ ENTREZID: chr  "2023" "2026" "2027" "387712" ...
## NULL
## 
## length: 4 
## 
## dim: 9612 4 
## 
## total NA: 0 
## 
## total NULL: 0 
## 
## total duplicated: 0 
## 
## summary:
## 
##       GO                EVIDENCE             ONTOLOGY          
##  "Length:9612       " "Length:9612       " "Length:9612       "
##  "Class :character  " "Class :character  " "Class :character  "
##  "Mode  :character  " "Mode  :character  " "Mode  :character  "
##    ENTREZID          
##  "Length:9612       "
##  "Class :character  "
##  "Mode  :character  "
##table evidence type - at this point, evidence is not used to filter complexes
table(theseComplexGeneDetails$EVIDENCE)
## 
##  IBA   IC  IDA  IEA  IMP  IPI  ISS  NAS  TAS 
##  162   33 6086 1871   49   60  507  276  568
##IBA   IC  IDA  IEA  IMP  IPI  ISS  NAS  RCA  TAS 
##122   32 2933 1846   33   51  456  321    1  589 


#convert results to a list format
tmp<- theseComplexGeneDetails[,c(1,4)]
tmp$ENTREZID <- as.integer(tmp$ENTREZID)
gocomplex2geneids.l <- unstack(tmp, form=ENTREZID ~ GO)
#check(gocomplex2geneids.l) # 577 members in list, 18 duplicates ok 
#remove complexes with only one member
theseLengths <- sapply(gocomplex2geneids.l, length) 
gocomplex2geneids.l <- gocomplex2geneids.l[(theseLengths > 1)]
#check(gocomplex2geneids.l)#522 membrs in list , 3 duplicates

#set attributes before storing these 
attr(gocomplex2geneids.l, "title") <- "GOComplexes"
attr(gocomplex2geneids.l, "descr") <- "GO complexes are a collection of gene sets where each set is defined by a term from the cellular component gene ontology  (CC GO) that has the string 'complex', 'ome', or 'mere' in it and is used to annotate  two or more human genes."

Putting it all together

First I will collapse by gene ids as a control (should make little difference).

####
# finally,put it all together
#
# collapse whole expression set by gene id and selecting max row for each
tmp <- exprs(ALLfilt_bcrneg)
rowID<-rownames(tmp)#prepare row names and group names
ps_eg = toTable(hgu95av2ENTREZID) #probe_id to Entrez Gene ids
ind<-match(rowID, ps_eg[,1]) #find matching entries of rowIDs in ps_eg table
geneid <- ps_eg[ind,2] #retrieve corresponding gene id for each rowID - all are unique so collapsing wont actually do anything
check(geneid)#chekc that all rowIDs were mapped to a gene id
## 
## head:
## [1] "100"       "10001"     "10006"     "100129361" "100133941" "10016"    
## 
## tail:
## [1] "9976" "998"  "9984" "9987" "9988" "9989"
## 
## str:
##  
##  chr [1:2160] "100" "10001" "10006" "100129361" "100133941" ...
## NULL
## 
## length: 2160 
## 
## dim: 
## 
## total NA: 0 
## 
## total NULL: 0 
## 
## total duplicated: 0 
## 
## summary:
## 
##    Length     Class      Mode 
##      2160 character character
rowGroup<-geneid
myCollapse<-function(someMatrix)apply(someMatrix, 2, function(x){ return(x[which.max(abs(x))])}) #define a collapse function
collapseRows.results<-collapseRows(datET=tmp, rowGroup=rowGroup, rowID=rowID, method="function", methodFunction=myCollapse)#collapse
## Comment: make sure methodFunction takes a matrix as input.
## ...Ignore previous comment.  Function completed properly!
tmpCollapsed<-collapseRows.results$datETcollapsed  
  
  
# prepare expression set for ml
eData <- as.data.frame(t(tmpCollapsed))#transpose and cast as a data frame
newColNames <- paste("geneid", colnames(eData), sep="", collapse=NULL) #fix column names
learnThis <- ALLfilt_bcrneg$mol.biol ##finally, add the outcome variable separately
newColNames <- c("learnThis", newColNames)
eData <- cbind(learnThis, eData) 
colnames(eData) <- newColNames

# do the random forest thang
length(learnThis)
## [1] 79
dim(eData)
## [1]   79 2161
rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
#....
rf1e.cm<-confuMat(rf1e)
rf1e.cm
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      13   4
##   NEG           0  22
acc(as.table(rf1e.cm))
## [1] 0.8974359
#          predicted
# given     BCR/ABL NEG
#   BCR/ABL      13   4
#   NEG           0  22
# > acc(as.table(rf1e.cm))
# [1] 0.8974359

Collapsing by complex

Next try collapsing into complexes before running random forest

# collapse whole expression set by complex id and selecting max row for each
tmp <- exprs(ALLfilt_bcrneg)
rowID<-rownames(tmp)#prepare row names and group names
ps_eg = toTable(hgu95av2ENTREZID) #probe_id to Entrez Gene ids
ind<-match(rowID, ps_eg[,1]) #find matching entries of rowIDs in ps_eg table
geneid <- ps_eg[ind,2] #retrieve corresponding gene id for each rowID - all are unique so collapsing wont actually do anything
check(geneid)#chekc that all rowIDs were mapped to a gene id
## 
## head:
## [1] "100"       "10001"     "10006"     "100129361" "100133941" "10016"    
## 
## tail:
## [1] "9976" "998"  "9984" "9987" "9988" "9989"
## 
## str:
##  
##  chr [1:2160] "100" "10001" "10006" "100129361" "100133941" ...
## NULL
## 
## length: 2160 
## 
## dim: 
## 
## total NA: 0 
## 
## total NULL: 0 
## 
## total duplicated: 0 
## 
## summary:
## 
##    Length     Class      Mode 
##      2160 character character
#theseComplexGeneDetails
#convert geneid to complexid
ind<-match(geneid,theseComplexGeneDetails$ENTREZID)
#for every case where ind exists, map to correpsonding complex id
complexId<-theseComplexGeneDetails[ind,1]
#use complexIds as grouping ids where they exist for a probe, 
rowGroup<-complexId
#...otherwise use the probe id as a group id
rowGroup[is.na(complexId)]<-rowID[is.na(complexId)]
check(rowGroup) #0 NA, 851 duplicates - good
## 
## head:
## [1] "GO:0005764" "GO:0016592" "GO:0031209" "36023_at"   "266_s_at"  
## [6] "GO:0005768"
## 
## tail:
## [1] "40698_at"   "GO:0070062" "GO:0000346" "GO:0005681" "41808_at"  
## [6] "GO:0030289"
## 
## str:
##  
##  chr [1:2160] "GO:0005764" "GO:0016592" "GO:0031209" ...
## NULL
## 
## length: 2160 
## 
## dim: 
## 
## total NA: 0 
## 
## total NULL: 0 
## 
## total duplicated: 851 
## 
## summary:
## 
##    Length     Class      Mode 
##      2160 character character
myCollapse<-function(someMatrix)apply(someMatrix, 2, function(x){ return(x[which.max(abs(x))])}) #define a collapse function
collapseRows.results<-collapseRows(datET=tmp, rowGroup=rowGroup, rowID=rowID, method="function", methodFunction=myCollapse)#collapse
## Comment: make sure methodFunction takes a matrix as input.
## ...Ignore previous comment.  Function completed properly!
tmpCollapsed<-collapseRows.results$datETcollapsed  
dim(tmp)  
## [1] 2160   79
dim(tmpCollapsed)  
## [1] 1309   79
# > dim(tmp)
# [1] 2160   79
# > dim(tmpCollapsed)
# [1] 1309   79


# prepare expression set for ml
eData <- as.data.frame(t(tmpCollapsed))#transpose and cast as a data frame
newColNames <- paste("AAA", colnames(eData), sep="", collapse=NULL) #fix column names
#fix column names - need to replace :
newColNames<-gsub(":", "", newColNames)
learnThis <- ALLfilt_bcrneg$mol.biol ##finally, add the outcome variable separately
newColNames <- c("learnThis", newColNames)
eData <- cbind(learnThis, eData) 
colnames(eData) <- newColNames

# do the random forest thang
length(learnThis)
## [1] 79
dim(eData)
## [1]   79 1310
rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
#....
rf1e.cm<-confuMat(rf1e)
rf1e.cm
##          predicted
## given     BCR/ABL NEG
##   BCR/ABL      13   4
##   NEG           0  22
acc(as.table(rf1e.cm))
## [1] 0.8974359
#          predicted
# given     BCR/ABL NEG
#   BCR/ABL      13   4
#   NEG           0  22
# > acc(as.table(rf1e.cm))
# [1] 0.8974359

#ok - makes no diff to performance - good - lets see whats important

Examining importance of features from a random forest model

The importance parameter of the random forest method allows us to examine which features had the largest impact on performance. Data about feture importance is retrieved using the getVarImp function of the MLInterfaces package

opar = par(no.readonly=TRUE, mar=c(7,7,4,2)) #opar=original parameters - keep these so they can be restored
par(las=2) #<-- style of the axis label - parallel to the axis
?getVarImp #<-- from the MLInterfaces package
impV1 = getVarImp(rf1e)
plot(impV1, n=15, plat="hgu95av2", toktype="SYMBOL") #<-- implementation of plot allows direct mapping of gene symbols

par(opar) #<-- restore the original graphics parameters

#some of the top hits are
#http://amigo.geneontology.org/amigo/term/GO:0034673

Session info

sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-apple-darwin13.1.0 (64-bit)
## 
## locale:
## [1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] GO.db_3.0.0          WGCNA_1.42           fastcluster_1.1.16  
##  [4] dynamicTreeCut_1.62  MLInterfaces_1.46.0  cluster_2.0.1       
##  [7] annotate_1.44.0      XML_3.98-1.1         randomForest_4.6-10 
## [10] RColorBrewer_1.1-2   bioDist_1.38.0       KernSmooth_2.23-14  
## [13] hgu95av2.db_3.0.0    org.Hs.eg.db_3.0.0   RSQLite_1.0.0       
## [16] DBI_0.3.1            AnnotationDbi_1.28.2 GenomeInfoDb_1.2.4  
## [19] IRanges_2.0.1        S4Vectors_0.4.0      multtest_2.22.0     
## [22] genefilter_1.48.1    ALL_1.7.1            Biobase_2.26.0      
## [25] BiocGenerics_0.12.1  BiocInstaller_1.16.2
## 
## loaded via a namespace (and not attached):
##  [1] acepack_1.3-3.3       class_7.3-12          codetools_0.2-11     
##  [4] colorspace_1.2-6      digest_0.6.8          doParallel_1.0.8     
##  [7] evaluate_0.5.5        foreach_1.4.2         foreign_0.8-63       
## [10] formatR_1.0           Formula_1.2-0         gdata_2.13.3         
## [13] ggplot2_1.0.1         grid_3.1.1            gtable_0.1.2         
## [16] gtools_3.4.1          Hmisc_3.15-0          htmltools_0.2.6      
## [19] impute_1.40.0         iterators_1.0.7       knitr_1.9            
## [22] lattice_0.20-30       latticeExtra_0.6-26   MASS_7.3-40          
## [25] matrixStats_0.14.0    munsell_0.4.2         nnet_7.3-9           
## [28] pls_2.4-3             plyr_1.8.1            preprocessCore_1.28.0
## [31] proto_0.3-10          Rcpp_0.11.5           rda_1.0.2-2          
## [34] reshape_0.8.5         reshape2_1.4.1        rmarkdown_0.5.1      
## [37] rpart_4.1-9           scales_0.2.4          sfsmisc_1.0-27       
## [40] splines_3.1.1         stringr_0.6.2         survival_2.38-1      
## [43] tools_3.1.1           xtable_1.7-4          yaml_2.1.13

Save session

save.image(file="analysis.RData")
#load(file="analysis.Rdata")