{r ropts_chunk$set(cache=TRUE, echo=TRUE)}
GEO overview - http://www.ncbi.nlm.nih.gov/geo/info/overview.html - describes data organization GEO2R - http://www.ncbi.nlm.nih.gov/geo/info/geo2r.html - compare 2 or more samples in a GEO series Querying GEO data sets - http://www.ncbi.nlm.nih.gov/geo/info/qqtutorial.html - a tutorial describing how to query for specific data sets.
Array workflows - http://www.bioconductor.org/help/workflows/arrays/ Bioconductor case studies - http://www.bioconductor.org/help/publications/books/bioconductor-case-studies/ http://www-huber.embl.de/pub/pdf/HahneHuberGentlemanFalcon2008.pdf p.85, p.91, p.123, p.139 http://www.bioconductor.org/help/publications/books/bioconductor-case-studies/web-supplement/ GEOquery - http://www.bioconductor.org/packages/release/bioc/html/GEOquery.html
Bioinformatics for Cancer Genomics (http://bioinformatics.ca/) has a tutorial in R analyzing data from GEO (GSE17172) - only 3 experimental conditions with three technical replicates each.
Initial plan 1. Repeat work in chapter 6-7 of Bioconductor case studies 2. Find a more interesting data set using GEO interface and GEO query and R GEOquery package 3. Try machine learning approach to predict case versus control 4. Try to identify signature related to predictor 5. Something with pathways - example of using machine learning in a predictive modelling (classification/regression) setting
source("http://bioconductor.org/biocLite.R")
## Bioconductor version 3.0 (BiocInstaller 1.16.2), ?biocLite for help
biocLite()
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
biocLite("Biobase")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'Biobase'
library("Biobase")
## Loading required package: BiocGenerics
## Warning: package 'BiocGenerics' was built under R version 3.1.2
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist, unsplit
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
biocLite("ALL")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'ALL'
library("ALL")
data("ALL")
biocLite("genefilter")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'genefilter'
library("genefilter")
##
## Attaching package: 'genefilter'
##
## The following object is masked from 'package:base':
##
## anyNA
biocLite("multtest")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'multtest'
library("multtest")
biocLite("hgu95av2.db")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'hgu95av2.db'
library("hgu95av2.db")
## Loading required package: AnnotationDbi
## Warning: package 'AnnotationDbi' was built under R version 3.1.3
## Loading required package: stats4
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 3.1.2
## Loading required package: S4Vectors
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.1.2
##
## Attaching package: 'AnnotationDbi'
##
## The following object is masked from 'package:GenomeInfoDb':
##
## species
##
## Loading required package: org.Hs.eg.db
## Warning: package 'RSQLite' was built under R version 3.1.2
## Loading required package: DBI
biocLite("bioDist")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'bioDist'
library("bioDist")
## Loading required package: KernSmooth
## Warning: package 'KernSmooth' was built under R version 3.1.2
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
install.packages("RColorBrewer", repos="http://cran.ma.imperial.ac.uk/")
library("RColorBrewer")
## Warning: package 'RColorBrewer' was built under R version 3.1.2
install.packages("randomForest", repos="http://cran.ma.imperial.ac.uk/")
library("randomForest")
## Warning: package 'randomForest' was built under R version 3.1.2
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:Biobase':
##
## combine
##
## The following object is masked from 'package:BiocGenerics':
##
## combine
biocLite("MLInterfaces")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'MLInterfaces'
library("MLInterfaces")
## Loading required package: annotate
## Loading required package: XML
## Warning: package 'XML' was built under R version 3.1.2
##
## Attaching package: 'annotate'
##
## The following object is masked from 'package:GenomeInfoDb':
##
## organism
##
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.1.2
install.packages("WGCNA", repos="http://cran.ma.imperial.ac.uk/")
#library("WGCNA")
check <- function(x){
#returns lots of meta-information related to a specified variable
cat("\nhead:\n", sep=" ");
print(head(x));
cat("\ntail:\n", sep=" ");
print(tail(x));
cat("\nstr:\n","\n", sep=" ");
print(str(x));
cat("\nlength:", length(x), "\n", sep=" ");
cat("\ndim:", dim(x), "\n", sep=" ");
cat("\ntotal NA:", sum(is.na(x)), "\n", sep=" ");
cat("\ntotal NULL:", sum(is.null(x)), "\n", sep=" ");
cat("\ntotal duplicated:", sum(duplicated(x)), "\n", sep=" ");
cat("\nsummary:\n", "\n", sep=" ");
if (!is.list(x)){
summary(x);
}
else {
head(summary(x));
}
}
checkMemory <- function(Units="MB"){
#returns the memory consumption of variables in the current working environment
#in the specified units
units2divisor = list(KB=1000, MB=1000000, GB=1000000000)
divisor=as.integer(units2divisor[Units])
theseVariables <- ls()
theseSizes <- sapply(theseVariables, function(x) object.size(get(x)))
theseSizes<-round(as.integer(unname(theseSizes))/divisor, 2)
results <- data.frame(theseVariables, theseSizes, Units, stringsAsFactors=FALSE)
results <- results[rev(order(theseSizes)),]
results <- rbind(c("TOTAL", sum(theseSizes), Units), results)
results
}
#investigate ALL
?ALL
class(ALL)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"
dim(ALL)
## Features Samples
## 12625 128
str(ALL) #<--- has 7 slots
## Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
## ..@ experimentData :Formal class 'MIAME' [package "Biobase"] with 13 slots
## .. .. ..@ name : chr "Chiaretti et al."
## .. .. ..@ lab : chr "Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Med"| __truncated__
## .. .. ..@ contact : chr ""
## .. .. ..@ title : chr "Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different respo"| __truncated__
## .. .. ..@ abstract : chr "Gene expression profiles were examined in 33 adult patients with T-cell acute lymphocytic leukemia (T-ALL). Nonspecific filteri"| __truncated__
## .. .. ..@ url : chr ""
## .. .. ..@ pubMedIds : chr [1:2] "14684422" "16243790"
## .. .. ..@ samples : list()
## .. .. ..@ hybridizations : list()
## .. .. ..@ normControls : list()
## .. .. ..@ preprocessing : list()
## .. .. ..@ other : list()
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 0 0
## ..@ assayData :<environment: 0x7f7f9cafba40>
## ..@ phenoData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 21 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr [1:21] " Patient ID" " Date of diagnosis" " Gender of the patient" " Age of the patient at entry" ...
## .. .. ..@ data :'data.frame': 128 obs. of 21 variables:
## .. .. .. ..$ cod : chr [1:128] "1005" "1010" "3002" "4006" ...
## .. .. .. ..$ diagnosis : chr [1:128] "5/21/1997" "3/29/2000" "6/24/1998" "7/17/1997" ...
## .. .. .. ..$ sex : Factor w/ 2 levels "F","M": 2 2 1 2 2 2 1 2 2 2 ...
## .. .. .. ..$ age : int [1:128] 53 19 52 38 57 17 18 16 15 40 ...
## .. .. .. ..$ BT : Factor w/ 10 levels "B","B1","B2",..: 3 3 5 2 3 2 2 2 3 3 ...
## .. .. .. ..$ remission : Factor w/ 2 levels "CR","REF": 1 1 1 1 1 1 1 1 1 1 ...
## .. .. .. ..$ CR : chr [1:128] "CR" "CR" "CR" "CR" ...
## .. .. .. ..$ date.cr : chr [1:128] "8/6/1997" "6/27/2000" "8/17/1998" "9/8/1997" ...
## .. .. .. ..$ t(4;11) : logi [1:128] FALSE FALSE NA TRUE FALSE FALSE ...
## .. .. .. ..$ t(9;22) : logi [1:128] TRUE FALSE NA FALSE FALSE FALSE ...
## .. .. .. ..$ cyto.normal : logi [1:128] FALSE FALSE NA FALSE FALSE FALSE ...
## .. .. .. ..$ citog : chr [1:128] "t(9;22)" "simple alt." NA "t(4;11)" ...
## .. .. .. ..$ mol.biol : Factor w/ 6 levels "ALL1/AF4","BCR/ABL",..: 2 4 2 1 4 4 4 4 4 2 ...
## .. .. .. ..$ fusion protein: Factor w/ 3 levels "p190","p190/p210",..: 3 NA 1 NA NA NA NA NA NA 1 ...
## .. .. .. ..$ mdr : Factor w/ 2 levels "NEG","POS": 1 2 1 1 1 1 2 1 1 1 ...
## .. .. .. ..$ kinet : Factor w/ 2 levels "dyploid","hyperd.": 1 1 1 1 1 2 2 1 1 NA ...
## .. .. .. ..$ ccr : logi [1:128] FALSE FALSE FALSE FALSE FALSE FALSE ...
## .. .. .. ..$ relapse : logi [1:128] FALSE TRUE TRUE TRUE TRUE TRUE ...
## .. .. .. ..$ transplant : logi [1:128] TRUE FALSE FALSE FALSE FALSE FALSE ...
## .. .. .. ..$ f.u : chr [1:128] "BMT / DEATH IN CR" "REL" "REL" "REL" ...
## .. .. .. ..$ date last seen: chr [1:128] NA "8/28/2000" "10/15/1999" "1/23/1998" ...
## .. .. ..@ dimLabels : chr [1:2] "sampleNames" "sampleColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ featureData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 0 obs. of 1 variable:
## .. .. .. ..$ labelDescription: logi(0)
## .. .. ..@ data :'data.frame': 12625 obs. of 0 variables
## .. .. ..@ dimLabels : chr [1:2] "featureNames" "featureColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ annotation : chr "hgu95av2"
## ..@ protocolData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 0 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr(0)
## .. .. ..@ data :'data.frame': 128 obs. of 0 variables
## .. .. ..@ dimLabels : chr [1:2] "sampleNames" "sampleColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. ..@ .Data:List of 4
## .. .. .. ..$ : int [1:3] 2 10 0
## .. .. .. ..$ : int [1:3] 2 5 5
## .. .. .. ..$ : int [1:3] 1 3 0
## .. .. .. ..$ : int [1:3] 1 0 0
?ExpressionSet #<--- the contents of the 7 slots are explained here
####
#READ THIS and see class specific functions - there are methods to retrieve data
#from each slot and also methods to transform the entire ExpressionSet into a
#data.frame
# information about assay and sample data
featureNames(ALL)[1:10]
## [1] "1000_at" "1001_at" "1002_f_at" "1003_s_at" "1004_at"
## [6] "1005_at" "1006_at" "1007_s_at" "1008_f_at" "1009_at"
sampleNames(ALL)[1:5]
## [1] "01005" "01010" "03002" "04006" "04007"
experimentData(ALL)
## Experiment data
## Experimenter name: Chiaretti et al.
## Laboratory: Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Medical School, Boston, MA 02115, USA.
## Contact information:
## Title: Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different response to therapy and survival.
## URL:
## PMIDs: 14684422 16243790
##
## Abstract: A 187 word abstract is available. Use 'abstract' method.
# ## subset first 10 gene probe expression results for samples 2, 4, and 10
# the [ ] notation retrieves subsets of the the expression data ALONG WITH all of the meta data
# in the other slots. so the notation looks like you are dealing with a data frame but that's not true
# what is returned is a data structure of class called ExpressionSet
expressionSet <- ALL[1:10,c(2,4,10)] #
expressionSet
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 10 features, 3 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 01010 04006 08001
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 14684422 16243790
## Annotation: hgu95av2
str(expressionSet) #an object of the ExpressionSet class
## Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
## ..@ experimentData :Formal class 'MIAME' [package "Biobase"] with 13 slots
## .. .. ..@ name : chr "Chiaretti et al."
## .. .. ..@ lab : chr "Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Med"| __truncated__
## .. .. ..@ contact : chr ""
## .. .. ..@ title : chr "Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different respo"| __truncated__
## .. .. ..@ abstract : chr "Gene expression profiles were examined in 33 adult patients with T-cell acute lymphocytic leukemia (T-ALL). Nonspecific filteri"| __truncated__
## .. .. ..@ url : chr ""
## .. .. ..@ pubMedIds : chr [1:2] "14684422" "16243790"
## .. .. ..@ samples : list()
## .. .. ..@ hybridizations : list()
## .. .. ..@ normControls : list()
## .. .. ..@ preprocessing : list()
## .. .. ..@ other : list()
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 0 0
## ..@ assayData :<environment: 0x7f7f9188f180>
## ..@ phenoData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 21 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr [1:21] " Patient ID" " Date of diagnosis" " Gender of the patient" " Age of the patient at entry" ...
## .. .. ..@ data :'data.frame': 3 obs. of 21 variables:
## .. .. .. ..$ cod : chr [1:3] "1010" "4006" "8001"
## .. .. .. ..$ diagnosis : chr [1:3] "3/29/2000" "7/17/1997" "1/15/1997"
## .. .. .. ..$ sex : Factor w/ 2 levels "F","M": 2 2 2
## .. .. .. ..$ age : int [1:3] 19 38 40
## .. .. .. ..$ BT : Factor w/ 10 levels "B","B1","B2",..: 3 2 3
## .. .. .. ..$ remission : Factor w/ 2 levels "CR","REF": 1 1 1
## .. .. .. ..$ CR : chr [1:3] "CR" "CR" "CR"
## .. .. .. ..$ date.cr : chr [1:3] "6/27/2000" "9/8/1997" "3/26/1997"
## .. .. .. ..$ t(4;11) : logi [1:3] FALSE TRUE FALSE
## .. .. .. ..$ t(9;22) : logi [1:3] FALSE FALSE FALSE
## .. .. .. ..$ cyto.normal : logi [1:3] FALSE FALSE FALSE
## .. .. .. ..$ citog : chr [1:3] "simple alt." "t(4;11)" "del(p15)"
## .. .. .. ..$ mol.biol : Factor w/ 6 levels "ALL1/AF4","BCR/ABL",..: 4 1 2
## .. .. .. ..$ fusion protein: Factor w/ 3 levels "p190","p190/p210",..: NA NA 1
## .. .. .. ..$ mdr : Factor w/ 2 levels "NEG","POS": 2 1 1
## .. .. .. ..$ kinet : Factor w/ 2 levels "dyploid","hyperd.": 1 1 NA
## .. .. .. ..$ ccr : logi [1:3] FALSE FALSE FALSE
## .. .. .. ..$ relapse : logi [1:3] TRUE TRUE TRUE
## .. .. .. ..$ transplant : logi [1:3] FALSE FALSE FALSE
## .. .. .. ..$ f.u : chr [1:3] "REL" "REL" "REL"
## .. .. .. ..$ date last seen: chr [1:3] "8/28/2000" "1/23/1998" "7/11/1997"
## .. .. ..@ dimLabels : chr [1:2] "sampleNames" "sampleColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ featureData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 0 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr(0)
## .. .. ..@ data :'data.frame': 10 obs. of 0 variables
## .. .. ..@ dimLabels : chr [1:2] "featureNames" "featureColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ annotation : chr "hgu95av2"
## ..@ protocolData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 0 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr(0)
## .. .. ..@ data :'data.frame': 3 obs. of 0 variables
## .. .. ..@ dimLabels : chr [1:2] "sampleNames" "sampleColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. ..@ .Data:List of 4
## .. .. .. ..$ : int [1:3] 2 10 0
## .. .. .. ..$ : int [1:3] 2 5 5
## .. .. .. ..$ : int [1:3] 1 3 0
## .. .. .. ..$ : int [1:3] 1 0 0
# named features and their expression levels
subset <- ALL[c("1000_at","1001_at"),] #names of features retrieved using featureNames()
exprs(subset)
## 01005 01010 03002 04006 04007 04008 04010
## 1000_at 7.597323 7.479445 7.567593 7.384684 7.905312 7.065914 7.474537
## 1001_at 5.046194 4.932537 4.799294 4.922627 4.844565 5.147762 5.122518
## 04016 06002 08001 08011 08012 08018 08024
## 1000_at 7.536119 7.183331 7.735545 7.591498 7.824284 7.231814 7.879988
## 1001_at 5.016132 5.288943 4.633217 4.583148 4.685951 5.059300 4.830464
## 09008 09017 11005 12006 12007 12012 12019
## 1000_at 7.891793 7.756734 7.640012 7.759599 7.678636 7.464285 7.652719
## 1001_at 5.999496 4.987595 4.967288 4.770481 5.456332 4.785863 5.175609
## 12026 14016 15001 15004 15005 16004 16009
## 1000_at 7.501591 7.570417 7.331509 7.366208 7.455451 7.328875 7.297313
## 1001_at 5.188992 5.258312 4.627955 4.733495 5.125098 5.332775 5.215707
## 19005 20002 22009 22010 22011 22013 24001
## 1000_at 7.563561 7.541133 8.016818 7.862181 7.702580 7.412003 7.916169
## 1001_at 4.858392 4.964424 5.216252 5.135825 4.802946 5.222676 4.790170
## 24005 24008 24010 24011 24017 24018 24019
## 1000_at 7.595848 7.296349 7.506236 7.144425 7.513972 7.815971 7.406135
## 1001_at 4.804743 5.002518 4.218220 5.228892 5.264158 4.899316 4.791335
## 24022 25003 25006 26001 26003 26005 26008
## 1000_at 7.300980 7.845054 7.651229 7.376930 7.663977 7.250353 7.663612
## 1001_at 5.177703 5.250315 4.896195 5.123546 5.078104 4.945670 5.124591
## 27003 27004 28001 28003 28005 28006 28007
## 1000_at 7.329996 7.360754 7.035203 7.705260 7.551734 7.538601 7.501531
## 1001_at 5.438098 4.757900 5.005279 5.009705 4.944978 4.511194 4.888814
## 28019 28021 28023 28024 28028 28031 28032
## 1000_at 7.116676 7.107979 7.427808 6.549926 7.514761 7.377215 6.973861
## 1001_at 5.275964 4.865566 5.057619 5.185277 4.788468 4.778381 4.970430
## 28035 28036 28037 28042 28043 28044 28047
## 1000_at 7.227516 7.407561 7.158049 7.235291 7.589310 7.988476 7.362458
## 1001_at 6.408157 5.042222 5.431469 4.686293 4.851805 4.894379 4.843868
## 30001 31007 31011 33005 36001 36002 37013
## 1000_at 7.508667 7.147843 7.651676 7.486432 7.759074 7.473427 7.627685
## 1001_at 5.587029 4.943857 4.741654 4.642628 4.962544 4.953122 5.358236
## 43001 43004 43007 43012 48001 49006 57001
## 1000_at 7.577529 7.600206 7.776844 7.585928 7.450666 7.004613 7.195206
## 1001_at 5.054157 4.879037 4.949908 5.057530 4.960382 4.836905 4.744006
## 62001 62002 62003 63001 64001 64002 65005
## 1000_at 7.407351 7.756195 7.913324 7.270997 7.694588 7.583071 7.609538
## 1001_at 4.930312 5.238937 5.074681 4.513671 4.928159 4.804083 4.715693
## 68001 68003 84004 LAL5 01003 01007 02020
## 1000_at 7.324502 7.545120 7.679603 7.604093 7.240252 7.676749 7.934247
## 1001_at 5.379102 4.650231 4.795495 4.988922 5.224752 5.129002 5.667907
## 04018 09002 10005 11002 12008 15006 16002
## 1000_at 7.874448 7.404271 7.775253 7.771891 7.355677 7.388882 7.589734
## 1001_at 5.005420 5.127949 4.423445 4.476761 5.461252 5.330129 4.836986
## 16007 17003 18001 19002 19008 19014 19017
## 1000_at 7.675929 7.662426 7.584008 7.840099 7.164922 7.843162 7.695714
## 1001_at 4.959669 5.743215 4.674920 5.208166 4.554529 5.718569 4.498515
## 20005 24006 26009 28008 28009 31015 37001
## 1000_at 7.520867 7.836577 7.470524 7.520806 7.646947 7.727560 7.849455
## 1001_at 5.135697 5.129836 5.213340 4.690815 4.902946 4.866731 4.959450
## 43006 43015 44001 49004 56007 64005 65003
## 1000_at 7.960842 8.188617 7.399999 7.813474 7.816922 7.913249 7.800199
## 1001_at 4.537677 5.154500 5.071885 4.874525 4.788699 5.403640 5.443827
## 83001 LAL4
## 1000_at 8.030047 7.702217
## 1001_at 5.178633 5.029670
# subset using properties in phenoData
str(expressionSet)
## Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
## ..@ experimentData :Formal class 'MIAME' [package "Biobase"] with 13 slots
## .. .. ..@ name : chr "Chiaretti et al."
## .. .. ..@ lab : chr "Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Med"| __truncated__
## .. .. ..@ contact : chr ""
## .. .. ..@ title : chr "Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different respo"| __truncated__
## .. .. ..@ abstract : chr "Gene expression profiles were examined in 33 adult patients with T-cell acute lymphocytic leukemia (T-ALL). Nonspecific filteri"| __truncated__
## .. .. ..@ url : chr ""
## .. .. ..@ pubMedIds : chr [1:2] "14684422" "16243790"
## .. .. ..@ samples : list()
## .. .. ..@ hybridizations : list()
## .. .. ..@ normControls : list()
## .. .. ..@ preprocessing : list()
## .. .. ..@ other : list()
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 0 0
## ..@ assayData :<environment: 0x7f7f9188f180>
## ..@ phenoData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 21 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr [1:21] " Patient ID" " Date of diagnosis" " Gender of the patient" " Age of the patient at entry" ...
## .. .. ..@ data :'data.frame': 3 obs. of 21 variables:
## .. .. .. ..$ cod : chr [1:3] "1010" "4006" "8001"
## .. .. .. ..$ diagnosis : chr [1:3] "3/29/2000" "7/17/1997" "1/15/1997"
## .. .. .. ..$ sex : Factor w/ 2 levels "F","M": 2 2 2
## .. .. .. ..$ age : int [1:3] 19 38 40
## .. .. .. ..$ BT : Factor w/ 10 levels "B","B1","B2",..: 3 2 3
## .. .. .. ..$ remission : Factor w/ 2 levels "CR","REF": 1 1 1
## .. .. .. ..$ CR : chr [1:3] "CR" "CR" "CR"
## .. .. .. ..$ date.cr : chr [1:3] "6/27/2000" "9/8/1997" "3/26/1997"
## .. .. .. ..$ t(4;11) : logi [1:3] FALSE TRUE FALSE
## .. .. .. ..$ t(9;22) : logi [1:3] FALSE FALSE FALSE
## .. .. .. ..$ cyto.normal : logi [1:3] FALSE FALSE FALSE
## .. .. .. ..$ citog : chr [1:3] "simple alt." "t(4;11)" "del(p15)"
## .. .. .. ..$ mol.biol : Factor w/ 6 levels "ALL1/AF4","BCR/ABL",..: 4 1 2
## .. .. .. ..$ fusion protein: Factor w/ 3 levels "p190","p190/p210",..: NA NA 1
## .. .. .. ..$ mdr : Factor w/ 2 levels "NEG","POS": 2 1 1
## .. .. .. ..$ kinet : Factor w/ 2 levels "dyploid","hyperd.": 1 1 NA
## .. .. .. ..$ ccr : logi [1:3] FALSE FALSE FALSE
## .. .. .. ..$ relapse : logi [1:3] TRUE TRUE TRUE
## .. .. .. ..$ transplant : logi [1:3] FALSE FALSE FALSE
## .. .. .. ..$ f.u : chr [1:3] "REL" "REL" "REL"
## .. .. .. ..$ date last seen: chr [1:3] "8/28/2000" "1/23/1998" "7/11/1997"
## .. .. ..@ dimLabels : chr [1:2] "sampleNames" "sampleColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ featureData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 0 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr(0)
## .. .. ..@ data :'data.frame': 10 obs. of 0 variables
## .. .. ..@ dimLabels : chr [1:2] "featureNames" "featureColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ annotation : chr "hgu95av2"
## ..@ protocolData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 0 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr(0)
## .. .. ..@ data :'data.frame': 3 obs. of 0 variables
## .. .. ..@ dimLabels : chr [1:2] "sampleNames" "sampleColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. ..@ .Data:List of 4
## .. .. .. ..$ : int [1:3] 2 10 0
## .. .. .. ..$ : int [1:3] 2 5 5
## .. .. .. ..$ : int [1:3] 1 3 0
## .. .. .. ..$ : int [1:3] 1 0 0
expressionSet$sex #<--one of the phenoData columns
## [1] M M M
## Levels: F M
maleExpressionSet <- expressionSet$sex == "M"
#class specific methods to access data in an object of the ExpressionSet class
assayData(ALL)
## <environment: 0x7f7f9cafba40>
sampleNames(ALL)
## [1] "01005" "01010" "03002" "04006" "04007" "04008" "04010" "04016"
## [9] "06002" "08001" "08011" "08012" "08018" "08024" "09008" "09017"
## [17] "11005" "12006" "12007" "12012" "12019" "12026" "14016" "15001"
## [25] "15004" "15005" "16004" "16009" "19005" "20002" "22009" "22010"
## [33] "22011" "22013" "24001" "24005" "24008" "24010" "24011" "24017"
## [41] "24018" "24019" "24022" "25003" "25006" "26001" "26003" "26005"
## [49] "26008" "27003" "27004" "28001" "28003" "28005" "28006" "28007"
## [57] "28019" "28021" "28023" "28024" "28028" "28031" "28032" "28035"
## [65] "28036" "28037" "28042" "28043" "28044" "28047" "30001" "31007"
## [73] "31011" "33005" "36001" "36002" "37013" "43001" "43004" "43007"
## [81] "43012" "48001" "49006" "57001" "62001" "62002" "62003" "63001"
## [89] "64001" "64002" "65005" "68001" "68003" "84004" "LAL5" "01003"
## [97] "01007" "02020" "04018" "09002" "10005" "11002" "12008" "15006"
## [105] "16002" "16007" "17003" "18001" "19002" "19008" "19014" "19017"
## [113] "20005" "24006" "26009" "28008" "28009" "31015" "37001" "43006"
## [121] "43015" "44001" "49004" "56007" "64005" "65003" "83001" "LAL4"
featureNames(ALL)[1:10]
## [1] "1000_at" "1001_at" "1002_f_at" "1003_s_at" "1004_at"
## [6] "1005_at" "1006_at" "1007_s_at" "1008_f_at" "1009_at"
dims(ALL)
## exprs
## Features 12625
## Samples 128
phenoData(ALL)
## An object of class 'AnnotatedDataFrame'
## sampleNames: 01005 01010 ... LAL4 (128 total)
## varLabels: cod diagnosis ... date last seen (21 total)
## varMetadata: labelDescription
varLabels(ALL)
## [1] "cod" "diagnosis" "sex" "age"
## [5] "BT" "remission" "CR" "date.cr"
## [9] "t(4;11)" "t(9;22)" "cyto.normal" "citog"
## [13] "mol.biol" "fusion protein" "mdr" "kinet"
## [17] "ccr" "relapse" "transplant" "f.u"
## [21] "date last seen"
varMetadata(ALL)
## labelDescription
## cod Patient ID
## diagnosis Date of diagnosis
## sex Gender of the patient
## age Age of the patient at entry
## BT does the patient have B-cell or T-cell ALL
## remission Complete remission(CR), refractory(REF) or NA. Derived from CR
## CR Original remisson data
## date.cr Date complete remission if achieved
## t(4;11) did the patient have t(4;11) translocation. Derived from citog
## t(9;22) did the patient have t(9;22) translocation. Derived from citog
## cyto.normal Was cytogenetic test normal? Derived from citog
## citog original citogenetics data, deletions or t(4;11), t(9;22) status
## mol.biol molecular biology
## fusion protein which of p190, p210 or p190/210 for bcr/able
## mdr multi-drug resistant
## kinet ploidy: either diploid or hyperd.
## ccr Continuous complete remission? Derived from f.u
## relapse Relapse? Derived from f.u
## transplant did the patient receive a bone marrow transplant? Derived from f.u
## f.u follow up data available
## date last seen date patient was last seen
pData(ALL)[1:5,] #<--show actual phenotypic data
## cod diagnosis sex age BT remission CR date.cr t(4;11) t(9;22)
## 01005 1005 5/21/1997 M 53 B2 CR CR 8/6/1997 FALSE TRUE
## 01010 1010 3/29/2000 M 19 B2 CR CR 6/27/2000 FALSE FALSE
## 03002 3002 6/24/1998 F 52 B4 CR CR 8/17/1998 NA NA
## 04006 4006 7/17/1997 M 38 B1 CR CR 9/8/1997 TRUE FALSE
## 04007 4007 7/22/1997 M 57 B2 CR CR 9/17/1997 FALSE FALSE
## cyto.normal citog mol.biol fusion protein mdr kinet ccr
## 01005 FALSE t(9;22) BCR/ABL p210 NEG dyploid FALSE
## 01010 FALSE simple alt. NEG <NA> POS dyploid FALSE
## 03002 NA <NA> BCR/ABL p190 NEG dyploid FALSE
## 04006 FALSE t(4;11) ALL1/AF4 <NA> NEG dyploid FALSE
## 04007 FALSE del(6q) NEG <NA> NEG dyploid FALSE
## relapse transplant f.u date last seen
## 01005 FALSE TRUE BMT / DEATH IN CR <NA>
## 01010 TRUE FALSE REL 8/28/2000
## 03002 TRUE FALSE REL 10/15/1999
## 04006 TRUE FALSE REL 1/23/1998
## 04007 TRUE FALSE REL 11/4/1997
experimentData(ALL)
## Experiment data
## Experimenter name: Chiaretti et al.
## Laboratory: Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Medical School, Boston, MA 02115, USA.
## Contact information:
## Title: Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different response to therapy and survival.
## URL:
## PMIDs: 14684422 16243790
##
## Abstract: A 187 word abstract is available. Use 'abstract' method.
pubMedIds(ALL)
## [1] "14684422" "16243790"
abstract(ALL)
## [1] "Gene expression profiles were examined in 33 adult patients with T-cell acute lymphocytic leukemia (T-ALL). Nonspecific filtering criteria identified 313 genes differentially expressed in the leukemic cells. Hierarchical clustering of samples identified 2 groups that reflected the degree of T-cell differentiation but was not associated with clinical outcome. Comparison between refractory patients and those who responded to induction chemotherapy identified a single gene, interleukin 8 (IL-8), that was highly expressed in refractory T-ALL cells and a set of 30 genes that was highly expressed in leukemic cells from patients who achieved complete remission. We next identified 19 genes that were differentially expressed in T-ALL cells from patients who either had a relapse or remained in continuous complete remission. A model based on the expression of 3 of these genes was predictive of duration of remission. The 3-gene model was validated on a further set of T-ALL samples from 18 additional patients treated on the same clinical protocol. This study demonstrates that gene expression profiling can identify a limited number of genes that are predictive of response to induction therapy and remission duration in adult patients with T-ALL."
annotation(ALL)
## [1] "hgu95av2"
protocolData(ALL)
## An object of class 'AnnotatedDataFrame': none
#$ notation dereferences columns in the data slot of the phenoData slot of the ExpressionSet
#str(ALL)
ALL$BT
## [1] B2 B2 B4 B1 B2 B1 B1 B1 B2 B2 B3 B3 B3 B2 B3 B B2 B3 B2 B3 B2 B2 B2
## [24] B1 B1 B2 B1 B2 B1 B2 B B B2 B2 B2 B1 B2 B2 B2 B2 B2 B4 B4 B2 B2 B2
## [47] B4 B2 B1 B2 B2 B3 B4 B3 B3 B3 B4 B3 B3 B1 B1 B1 B1 B3 B3 B3 B3 B3 B3
## [70] B3 B3 B1 B3 B1 B4 B2 B2 B1 B3 B4 B4 B2 B2 B3 B4 B4 B4 B1 B2 B2 B2 B1
## [93] B2 B B T T3 T2 T2 T3 T2 T T4 T2 T3 T3 T T2 T3 T2 T2 T2 T1 T4 T
## [116] T2 T3 T2 T2 T2 T2 T3 T3 T3 T2 T3 T2 T
## Levels: B B1 B2 B3 B4 T T1 T2 T3 T4
#@ notation can be used to dereference a single slot at a time of the ExpressionSet
ALL@experimentData
## Experiment data
## Experimenter name: Chiaretti et al.
## Laboratory: Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Medical School, Boston, MA 02115, USA.
## Contact information:
## Title: Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different response to therapy and survival.
## URL:
## PMIDs: 14684422 16243790
##
## Abstract: A 187 word abstract is available. Use 'abstract' method.
#1 slot - experimentDaa
ALL@experimentData #<--- a description of the experiment
## Experiment data
## Experimenter name: Chiaretti et al.
## Laboratory: Department of Medical Oncology, Dana-Farber Cancer Institute, Department of Medicine, Brigham and Women's Hospital, Harvard Medical School, Boston, MA 02115, USA.
## Contact information:
## Title: Gene expression profile of adult T-cell acute lymphocytic leukemia identifies distinct subsets of patients with different response to therapy and survival.
## URL:
## PMIDs: 14684422 16243790
##
## Abstract: A 187 word abstract is available. Use 'abstract' method.
abstract(ALL)
## [1] "Gene expression profiles were examined in 33 adult patients with T-cell acute lymphocytic leukemia (T-ALL). Nonspecific filtering criteria identified 313 genes differentially expressed in the leukemic cells. Hierarchical clustering of samples identified 2 groups that reflected the degree of T-cell differentiation but was not associated with clinical outcome. Comparison between refractory patients and those who responded to induction chemotherapy identified a single gene, interleukin 8 (IL-8), that was highly expressed in refractory T-ALL cells and a set of 30 genes that was highly expressed in leukemic cells from patients who achieved complete remission. We next identified 19 genes that were differentially expressed in T-ALL cells from patients who either had a relapse or remained in continuous complete remission. A model based on the expression of 3 of these genes was predictive of duration of remission. The 3-gene model was validated on a further set of T-ALL samples from 18 additional patients treated on the same clinical protocol. This study demonstrates that gene expression profiling can identify a limited number of genes that are predictive of response to induction therapy and remission duration in adult patients with T-ALL."
#2 slot - assayData
ALL@assayData #<--- an environment
## <environment: 0x7f7f9cafba40>
#this is where the actual expression data lives
#use exprs() method to access
theData<-exprs(ALL)
str(theData)
## num [1:12625, 1:128] 7.6 5.05 3.9 5.9 5.93 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:12625] "1000_at" "1001_at" "1002_f_at" "1003_s_at" ...
## ..$ : chr [1:128] "01005" "01010" "03002" "04006" ...
#3 slot - phenoData - all the metadata about each patient
str(ALL@phenoData)
## Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## ..@ varMetadata :'data.frame': 21 obs. of 1 variable:
## .. ..$ labelDescription: chr [1:21] " Patient ID" " Date of diagnosis" " Gender of the patient" " Age of the patient at entry" ...
## ..@ data :'data.frame': 128 obs. of 21 variables:
## .. ..$ cod : chr [1:128] "1005" "1010" "3002" "4006" ...
## .. ..$ diagnosis : chr [1:128] "5/21/1997" "3/29/2000" "6/24/1998" "7/17/1997" ...
## .. ..$ sex : Factor w/ 2 levels "F","M": 2 2 1 2 2 2 1 2 2 2 ...
## .. ..$ age : int [1:128] 53 19 52 38 57 17 18 16 15 40 ...
## .. ..$ BT : Factor w/ 10 levels "B","B1","B2",..: 3 3 5 2 3 2 2 2 3 3 ...
## .. ..$ remission : Factor w/ 2 levels "CR","REF": 1 1 1 1 1 1 1 1 1 1 ...
## .. ..$ CR : chr [1:128] "CR" "CR" "CR" "CR" ...
## .. ..$ date.cr : chr [1:128] "8/6/1997" "6/27/2000" "8/17/1998" "9/8/1997" ...
## .. ..$ t(4;11) : logi [1:128] FALSE FALSE NA TRUE FALSE FALSE ...
## .. ..$ t(9;22) : logi [1:128] TRUE FALSE NA FALSE FALSE FALSE ...
## .. ..$ cyto.normal : logi [1:128] FALSE FALSE NA FALSE FALSE FALSE ...
## .. ..$ citog : chr [1:128] "t(9;22)" "simple alt." NA "t(4;11)" ...
## .. ..$ mol.biol : Factor w/ 6 levels "ALL1/AF4","BCR/ABL",..: 2 4 2 1 4 4 4 4 4 2 ...
## .. ..$ fusion protein: Factor w/ 3 levels "p190","p190/p210",..: 3 NA 1 NA NA NA NA NA NA 1 ...
## .. ..$ mdr : Factor w/ 2 levels "NEG","POS": 1 2 1 1 1 1 2 1 1 1 ...
## .. ..$ kinet : Factor w/ 2 levels "dyploid","hyperd.": 1 1 1 1 1 2 2 1 1 NA ...
## .. ..$ ccr : logi [1:128] FALSE FALSE FALSE FALSE FALSE FALSE ...
## .. ..$ relapse : logi [1:128] FALSE TRUE TRUE TRUE TRUE TRUE ...
## .. ..$ transplant : logi [1:128] TRUE FALSE FALSE FALSE FALSE FALSE ...
## .. ..$ f.u : chr [1:128] "BMT / DEATH IN CR" "REL" "REL" "REL" ...
## .. ..$ date last seen: chr [1:128] NA "8/28/2000" "10/15/1999" "1/23/1998" ...
## ..@ dimLabels : chr [1:2] "sampleNames" "sampleColumns"
## ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. ..@ .Data:List of 1
## .. .. .. ..$ : int [1:3] 1 1 0
#4 slot - featureData
str(ALL@featureData)
## Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## ..@ varMetadata :'data.frame': 0 obs. of 1 variable:
## .. ..$ labelDescription: logi(0)
## ..@ data :'data.frame': 12625 obs. of 0 variables
## ..@ dimLabels : chr [1:2] "featureNames" "featureColumns"
## ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. ..@ .Data:List of 1
## .. .. .. ..$ : int [1:3] 1 1 0
#5 slot - annotation
ALL@annotation #<-- [1] "hgu95av2"
## [1] "hgu95av2"
#Affymetrix Human Genome U95 Set annotation data (chip hgu95av2)
#http://www.bioconductor.org/packages/release/data/annotation/html/hgu95av2.html
#6 slot - protocolData
str(ALL@protocolData)
## Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## ..@ varMetadata :'data.frame': 0 obs. of 1 variable:
## .. ..$ labelDescription: chr(0)
## ..@ data :'data.frame': 128 obs. of 0 variables
## ..@ dimLabels : chr [1:2] "sampleNames" "sampleColumns"
## ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. ..@ .Data:List of 1
## .. .. .. ..$ : int [1:3] 1 1 0
##
#convert the ALL data set into a data frame
#Coerce objects of ExpressionSet-class to data.frame by transposing the expression matrix and concatenating phenoData
#this format might be more amenable to standard machine learning packages
##
ALL.df <- as(ALL, "data.frame")
#str(ALL.df)
length(colnames(ALL.df)) #12646
## [1] 12646
head(colnames(ALL.df))
## [1] "X1000_at" "X1001_at" "X1002_f_at" "X1003_s_at" "X1004_at"
## [6] "X1005_at"
#[1] "X1000_at" "X1001_at" "X1002_f_at" "X1003_s_at" "X1004_at" "X1005_at"
tail(colnames(ALL.df))
## [1] "kinet" "ccr" "relapse" "transplant"
## [5] "f.u" "date.last.seen"
#[1] "kinet" "ccr" "relapse" "transplant" "f.u" "date.last.seen"
Construction of the data set is first. ?ALL The ALL data set (Acute Lymphoblastic Leukemia Data) is microarray data from 128 individuals that has ben normalized using rma. Two features (covariates of the data) are used to select a sybset of the data for further analysis The BT feature describes the type and stge of the disease. We will select individuals with B-cell type leukemia The mol.biol feature describes the “molecular biology” of the disease for the individual. This feature really describes the presence or absence of some cytogenetic variation. In this case, we select the subset of individuals with BCR/ABL translocations or no cytogenetic abnormalities at all (NEG)
?ALL
head(ALL$BT)
## [1] B2 B2 B4 B1 B2 B1
## Levels: B B1 B2 B3 B4 T T1 T2 T3 T4
bcell = grep("^B", as.character(ALL$BT))
#bcell is a vector indices of individuals with B-cell type leukemia
head(bcell)
## [1] 1 2 3 4 5 6
moltyp = which(as.character(ALL$mol.biol) %in% c("NEG", "BCR/ABL"))
#moltype is a vector of indices of individuals with BCR/ABL translocations or no cytogentic abnormalities (NEG)
head(moltyp)
## [1] 1 2 3 5 6 7
ALL_bcrneg = ALL[, intersect(bcell, moltyp)]
#get a list of individuals meeting both criteria for our data subset
# ALL_bcrneg is an ExpressionSet
#this next line simply refactors the mol.biol feature which will now have only two levels
ALL_bcrneg$mol.biol = factor(ALL_bcrneg$mol.biol)
str(ALL_bcrneg$mol.biol)
## Factor w/ 2 levels "BCR/ABL","NEG": 1 2 1 2 2 2 2 2 1 1 ...
This next section removes data for probes whose variation is independent of the sample. First, we calculate the standard deviation of each probe across all samples in the set. The distribution of standard deviations is non-normal (see the histogram below). We want to filter out probes that have little variation (regardless of the individual’s phenotype). To do this, we select the probes that have standard deviations above the “shorth” The “shorth” is the shortest interval that coverss half the values in a vector x. The “shorth” function in the genefilter package, returns the mean of the x values that lie in the shorth. This is used as a robust estimator of location. By removing probes that are likely to be non-informaive, we reduce the number of results that will be eliminated by multiple hypothesis correction methods.
library("genefilter")
?genefilter #<- a package for filtering gene expression probe data
expressionData <- exprs(ALL_bcrneg)
str(expressionData) #<-matrix of 12625 probes by 79 samples (individuals)
## num [1:12625, 1:79] 7.6 5.05 3.9 5.9 5.93 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:12625] "1000_at" "1001_at" "1002_f_at" "1003_s_at" ...
## ..$ : chr [1:79] "01005" "01010" "03002" "04007" ...
sds = rowSds(expressionData) #<-the standard deviation of each row (rowSds is from genefilter package)
sds[1:5]
## 1000_at 1001_at 1002_f_at 1003_s_at 1004_at
## 0.2580000 0.3126742 0.1881744 0.2601203 0.2630441
sds_alternative <- apply(expressionData, 1, sd) #alternative way of coding the same thing
sds_alternative[1:5]
## 1000_at 1001_at 1002_f_at 1003_s_at 1004_at
## 0.2580000 0.3126742 0.1881744 0.2601203 0.2630441
#plot the distribution of the standard deviations of the probes
hist(sds, breaks=50, col="blue", xlab="standard deviation")
sh = shorth(sds)
sh
## [1] 0.2423124
abline(v=sh, col="red", lwd=3, lty=2)
#make a new data set that eliminates all probes that are below sh
ALLsfilt = ALL_bcrneg[sds>=sh, ]
dim(exprs(ALLsfilt))
## [1] 8812 79
#and look at the new distribution of standard deviations
sds_new <- rowSds(exprs(ALLsfilt))
hist(sds_new, breaks=50, col="blue", xlab="standard deviation")
sh_new = shorth(sds_new)
sh_new
## [1] 0.3064567
abline(v=sh, col="red", lwd=3, lty=2)
# alternative methods of filtering - not done
# 1. discard all probes with consistantly low expression values
# 2. remove non-specific probes using the nsFilter function from the Category package - see application in Chapter 1
#explain shorth - not done
x<-rnorm(n=1000,mean=0,sd=1)
shorth_x<-shorth(x)
shorth_x
## [1] 0.03602156
The rowttests function from the genefilter package is used to test for differential expression between two different phenotypes in our data set. Recall that the mol.biol feature indicates whether individuals in the data set have a BCR/ABL translocation or not. The table function is used to count the number of samples (individuals) in the data set with these phenotypes. There are 37 with the translocation and 42 without. The function call rowttests(ALLsfilt, "mol.biol") examines each probe in our filtered data set for significant differences in expression between these two phenotypes.
table(ALLsfilt$mol.biol)
##
## BCR/ABL NEG
## 37 42
tt = rowttests(ALLsfilt, "mol.biol")
str(tt)
## 'data.frame': 8812 obs. of 3 variables:
## $ statistic: num 0.737 0.453 -0.215 -0.533 -1.72 ...
## $ dm : num 0.043 0.0321 -0.0127 -0.0318 -0.4273 ...
## $ p.value : num 0.4637 0.652 0.8302 0.5954 0.0894 ...
## - attr(*, "df")= num 77
names(tt)
## [1] "statistic" "dm" "p.value"
hist(tt$p.value, breaks=50, col="blue", main="Histogram of all p-values (filtered data set)", xlab="p-value")
#looking at just those p-values less than 0.05
hist(tt$p.value[tt$p.value < 0.05], breaks=50, col="blue", main="Histogram of all p-values (filtered data set)", xlab="p-value")
sum(tt$p.value < 0.05) #<--1155
## [1] 1155
length(tt$p.value) #<--8812
## [1] 8812
0.05 * length(tt$p.value) #<---440
## [1] 440.6
#we can go back and repeat the same analysis for the data that was filtered away from this set (to see if we have removed an undue number of probes that have significant differences of expression between phenotypes)
ALLsrest = ALL_bcrneg[sds<sh, ]
ttrest = rowttests(ALLsrest, "mol.biol")
hist(ttrest$p.value, breaks=50, col="blue", main="Histogram of all p-values (data removed by filter step)", xlab="p-value")
hist(ttrest$p.value[ttrest$p.value < 0.05], breaks=50, col="blue", main="Histogram of all p-values (data removed by filter step)", xlab="p-value")
sum(ttrest$p.value < 0.05) #<--so only 84 probe results were removed and its likely that many of these would not have survived multiple-hypothesis correction anyway
## [1] 84
We have done 8812 t-tests. If we were testing at the 0.05 significance level, we would expect, by chance, 8812*0.05=440.6 results to be “significant” or, more correctly, we would expect to “reject the NULL hypothesis 440 times” even if we were looking at a normally distributed data set where we knew that there were no difference. We need to correct for the fact that we have performed multiple hypothesis tests to determine if the number of significant results we observed (1115 p-values were less than 0.05) is really more than what we would expect by chance.
The Benjamini and Hochberg procedure uses the false-discovery rate (FDR) to recalculate p-values taking into account multiple hypothesis testing.
p_adjust <- p.adjust(tt$p.value, method="BH")
num_significant <- sum(tt$p.value < 0.05)
num_significant #<-- 1155
## [1] 1155
num_significant_adj <- sum(p_adjust < 0.05)
num_significant_adj #<--201
## [1] 201
#alternatively, use the multtest function from the multtest bioconductor package
#this function works directly on the expression matrix and returns a sorted list
library("multtest")
str(tt)
## 'data.frame': 8812 obs. of 3 variables:
## $ statistic: num 0.737 0.453 -0.215 -0.533 -1.72 ...
## $ dm : num 0.043 0.0321 -0.0127 -0.0318 -0.4273 ...
## $ p.value : num 0.4637 0.652 0.8302 0.5954 0.0894 ...
## - attr(*, "df")= num 77
mt = mt.rawp2adjp(tt$p.value, proc="BH")
str(mt)
## List of 4
## $ adjp : num [1:8812, 1:2] 3.76e-14 4.79e-13 2.45e-10 1.28e-09 5.27e-09 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:2] "rawp" "BH"
## $ index : int [1:8812] 498 6778 497 529 7326 4781 7128 1639 4790 6866 ...
## $ h0.ABH : NULL
## $ h0.TSBH: NULL
mt$adjp[1:10,1]
## [1] 3.762489e-14 4.791997e-13 2.445693e-10 1.280912e-09 5.265146e-09
## [6] 2.740257e-08 2.785038e-08 1.536188e-07 2.600957e-07 4.741431e-07
sum(mt$adjp[,1] < 0.05) #<--1155
## [1] 1155
sum(mt$adjp[,2] < 0.05) #<--201
## [1] 201
#finally, retrieve a list of genes corresponding to the 10 most significant differentially expressed probe sets
g = featureNames(ALLsfilt)[mt$index[1:10]]
str(g)
## chr [1:10] "1636_g_at" "39730_at" "1635_at" "1674_at" ...
library("hgu95av2.db")
links(hgu95av2SYMBOL[g])
## probe_id symbol
## 1 1635_at ABL1
## 2 1636_g_at ABL1
## 3 1674_at YES1
## 4 32434_at MARCKS
## 5 37015_at ALDH1A1
## 6 37027_at AHNAK
## 7 39730_at ABL1
## 8 39837_s_at ZNF467
## 9 40202_at KLF9
## 10 40504_at PON2
mb = ALLsfilt$mol.biol #<--vector of values indicating molecular biology of each sample
y = exprs(ALLsfilt)[g[1],] #<--expression values for probe across all individuals
ord = order(mb) #<--to order the data points by molecular biology
plot(y[ord], pch=c(1,16)[mb[ord]], col=c("black", "red")[mb[ord]], main=g[1], ylab=expression(log[2]~intensity), xlab="samples")
This section uses the same ALL data subset as above: individuals all have B-cell type leukemias and either have or do not have a BCL/ABL translocation (mol.biol is either “BCL/ABL” or “NEG” respectively).
We first begin by determining the number of samples we have in each mol.biol class.
table(ALL_bcrneg$mol.biol)
##
## BCR/ABL NEG
## 37 42
#BCR/ABL NEG
# 37 42
Next, the nsFilter function from the genefilter package is used to remove non-informative probes. By default this function will also removes control probes on Affymetrix arrays (identifed by their AFFX prefix). We also exclude genes without Entrez Gene identifiers, and we select the top 25% of genes on the basis of variability across samples. nsFilter takes as input a data object of class ExpressionSet and returns the same class type.
?nsFilter
class(ALL_bcrneg)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"
ALLfilt_bcrneg = nsFilter(ALL_bcrneg, var.cutoff=0.75)$eset
class(ALLfilt_bcrneg)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"
The data are first centred and scaled using median and IQR. This method is more robust to non-normal distributions outliers and the typical method of centering and scaling that uses mean and standard deviation. See http://en.wikipedia.org/wiki/Robust_measures_of_scale.
# rowIQRs is a helper fuction that calculates inter-quartile ranges (IQR) for each of the probes (rows) in an expression data set.
# eSet is the expression data from the assayData slot of the ExpressionSet class. These data are retrieved using the expr() function.
# rowQ (from the Biobase package) retrieves the requested quantile for each row of a matrix of an ExpressionSet
rowIQRs = function(eSet) {
numSamp = ncol(eSet)
lowQ = rowQ(eSet, floor(0.25 * numSamp))
upQ = rowQ(eSet, ceiling(0.75 * numSamp))
upQ - lowQ
}
# standardize is a helper fuction that centres and scales expression data across samples
# by subtracting the median for the probe value and then dividing by the IQR
standardize = function(x) (x - rowMedians(x)) / rowIQRs(x)
#this line standardizes our data set
exprs(ALLfilt_bcrneg) = standardize(exprs(ALLfilt_bcrneg))
#
Tools is R to calculate distance: stats package: dist cluster package: daisy bioDist package: Kullback-Leibler distance mutual information distance Euclidean distance Manhattan distance correlation distance (using Pearson, Spearman, or Kendall’s tau)
#not done
Distance between samples in the expression matrix. Since this matrix is rows x columns = probes x individuals, we want to call the dist function on the transpose of this matrix (individuals x probes). The resulting distance matrix will have rows x columns = individuals x individuals with euclidean distances for each pair of individuals.
The distance matrix is visualized using the heatmap() function. The RColorBrewer package is used to construct a color pallette for the visualization.
#calculate euclidean distances between samples
eucD = dist(t(exprs(ALLfilt_bcrneg)))
str(eucD)
## Class 'dist' atomic [1:3081] 52 37.3 38 42.2 54.3 ...
## ..- attr(*, "Size")= int 79
## ..- attr(*, "Labels")= chr [1:79] "01005" "01010" "03002" "04007" ...
## ..- attr(*, "Diag")= logi FALSE
## ..- attr(*, "Upper")= logi FALSE
## ..- attr(*, "method")= chr "euclidean"
## ..- attr(*, "call")= language dist(x = t(exprs(ALLfilt_bcrneg)))
eucM = as.matrix(eucD)
str(eucM)
## num [1:79, 1:79] 0 52 37.3 38 42.2 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:79] "01005" "01010" "03002" "04007" ...
## ..$ : chr [1:79] "01005" "01010" "03002" "04007" ...
dim(eucM)
## [1] 79 79
library("RColorBrewer")
hmcol = colorRampPalette(brewer.pal(10, "RdBu"))(256)
hmcol = rev(hmcol) #<---so that red and blue correspond to high and low values respectively
# - dark blue indicates samples that are identical
# - dark red indicate samples that are highly dissimilar
heatmap(eucM, sym=TRUE, col=hmcol, distfun=as.dist)
#repeat the above steps using the Spearman correlation distance metric
library("bioDist")
spD = spearman.dist(ALLfilt_bcrneg)
str(spD)
## Class 'dist' atomic [1:3081] 0.705 0.872 0.743 0.846 0.686 ...
## ..- attr(*, "Size")= int 79
## ..- attr(*, "Labels")= chr [1:79] "01005" "01010" "03002" "04007" ...
## ..- attr(*, "Diag")= logi FALSE
## ..- attr(*, "Upper")= logi FALSE
## ..- attr(*, "methods")= chr "spearman"
## spD@Size (79*79 - 79)/2=3081
attr(spD, "Size")
## [1] 79
#[1] 79
spM = as.matrix(spD)
heatmap(spM, sym=TRUE, col=hmcol, distfun=function(x) as.dist(x))
MLInterfaces provides a consistant interface to a number of machine learning packages in R (much like carret but adapted specifically for Bioconductor containers)
library("MLInterfaces")
#explore MLInterfaces packages
library(help=MLInterfaces)
?MLearn
#openVignette() #<==== not done, go through these tutorials for MLearn
# #code from the MLearn help page
# library("MASS")
# data(crabs)
# dim(crabs)
# head(crabs)
# table(crabs$sex)
#
# #selecting a training and testing set
# set.seed(1234)
# #create a training set (indices)
# kp = sample(1:200, size=120) #<--random sample of indices in the data set (used for the trainInd in calls below)
#
# #a random forest example
# rf1 = MLearn(sp~CW+RW, data=crabs, randomForestI, kp, ntree=600 )
# rf1 #<-- the resulting model
# str(rf1) #<---it's data structure
# ?testScores(rf1) #<---get a description of the output data structure and its methods
# rf1@testOutcomes #<---the actual labels for the outcome variable for the test set
# rf1@testPredictions #<---the predicted labels for the outcome variable for the test set
# rf1@testScores #<---individual scores for each class for each sample in the test data
# confuMat(rf1) #<--the confusion matrix for the test set
# show(rf1)
# testScores(rf1)
# #many other methods to retrieve predictions, scores for test set and train set
#
# #neural net
# nn1 = MLearn(sp~CW+RW, data=crabs, nnetI, kp, size=3, decay=.01,
# trace=FALSE )
# nn1
# RObject(nn1)
#
# #k-nearest neighbors
# knn1 = MLearn(sp~CW+RW, data=crabs, knnI(k=3,l=2), kp)
# knn1
# names(RObject(knn1))
#
#
# dlda1 = MLearn(sp~CW+RW, data=crabs, dldaI, kp )
# dlda1
# names(RObject(dlda1))
#
# #linear discriminant analysis
# lda1 = MLearn(sp~CW+RW, data=crabs, ldaI, kp )
# lda1
# names(RObject(lda1))
The training set is balanced between samples that have the BCR/ABL translocation phenotype (BCR/ABL) and those that don’t (NEG).
#create sample indices for the training and test sets
set.seed(1234)
Negs = which(ALLfilt_bcrneg$mol.biol == "NEG")
Bcr = which(ALLfilt_bcrneg$mol.biol == "BCR/ABL")
S1 = sample(Negs, 20, replace=FALSE)
S2 = sample(Bcr, 20, replace = FALSE)
TrainInd = c(S1, S2)
TestInd = setdiff(1:79, TrainInd)
kans = MLearn( mol.biol ~ ., data=ALLfilt_bcrneg, knnI(k=1,l=0), TrainInd)
kans.cm <- confuMat(kans)
kans.cm
## predicted
## given BCR/ABL NEG
## BCR/ABL 16 1
## NEG 4 18
# predicted
# given BCR/ABL NEG
# BCR/ABL 16 1
# NEG 4 18
str(kans.cm)
## int [1:2, 1:2] 16 4 1 18
## - attr(*, "dimnames")=List of 2
## ..$ given : chr [1:2] "BCR/ABL" "NEG"
## ..$ predicted: chr [1:2] "BCR/ABL" "NEG"
dldans = MLearn( mol.biol ~ ., ALLfilt_bcrneg, dldaI, TrainInd)
dldans.cm <- confuMat(dldans)
dldans.cm
## predicted
## given BCR/ABL NEG
## BCR/ABL 13 4
## NEG 2 20
# predicted
# given BCR/ABL NEG
# BCR/ABL 13 4
# NEG 2 20
ldaans = MLearn( mol.biol ~ ., ALLfilt_bcrneg, ldaI, TrainInd)
## Warning in lda.default(x, grouping, ...): variables are collinear
ldans.cm <- confuMat(ldaans)
ldans.cm
## predicted
## given BCR/ABL NEG
## BCR/ABL 13 4
## NEG 2 20
# predicted
# given BCR/ABL NEG
# BCR/ABL 13 4
# NEG 2 20
Select gene probes that are likely to be indicative of the presence or absence of the BCL/ABL translocation. Use the t-test to do this. This selection is done on only the training set - doing feature selection on the entire data set would lead to an overestimated performance.
The learning methods used above are repeated using the new data set containing only the selected features.
#compute the t-tests on the training set
Traintt = rowttests(ALLfilt_bcrneg[, TrainInd], "mol.biol")
# then sort them from largest to smallest
ordTT = order(abs(Traintt$statistic), decreasing=TRUE)
#and then obtain the names of the 50 that have the largest observed test statistics.
fNtt = featureNames(ALLfilt_bcrneg)[ordTT[1:50]]
#select a subset of the data based on the selected features
BNf = ALLfilt_bcrneg[fNtt,] #BNf means Best N features ?
#redo knn
kans.bf = MLearn( mol.biol ~ ., data=BNf, knnI(k=1,l=0), TrainInd)
kans.bf.cm <- confuMat(kans.bf)
kans.bf.cm
## predicted
## given BCR/ABL NEG
## BCR/ABL 14 3
## NEG 1 21
# predicted
# given BCR/ABL NEG
# BCR/ABL 14 3
# NEG 1 21
#redo diagonal linear discriminant analysis
dldans.bf = MLearn( mol.biol ~ ., data=BNf, dldaI, TrainInd)
dldans.bf.cm <- confuMat(dldans.bf)
dldans.bf.cm
## predicted
## given BCR/ABL NEG
## BCR/ABL 13 4
## NEG 1 21
# predicted
# given BCR/ABL NEG
# BCR/ABL 13 4
# NEG 1 21
ldaans.bf = MLearn( mol.biol ~ ., data=BNf, ldaI, TrainInd)
## Warning in lda.default(x, grouping, ...): variables are collinear
ldans.bf.cm <- confuMat(ldaans.bf)
ldans.bf.cm
## predicted
## given BCR/ABL NEG
## BCR/ABL 14 3
## NEG 3 19
# predicted
# given BCR/ABL NEG
# BCR/ABL 14 3
# NEG 3 19
Cross-validation can be employed in the MLInterfaces package by specifying xvalSpec parameters that are passed to the MLearn function. The xvalSpec allows you to specify a cross-validation type, a partition function, the number of partitions, and optionally a function that helps to select features in each subset.
There are only three cross-validations types supported: LOO=leave-one-out, LOG=leave-out-group and NOTEST=use the entire data set for training. There is no built-in supprot for boot-strapping.
Finally, the possibility for feature selection is built into the cross-validation object - see the fs.Fun parameter. Three basic methods for feature selection are provided based on variation between two classes detected using a T-test (fs.absT), fs.probT and fs.topVariance. It is possible to define your own feature-selection helper functions following these examples.
?xvalSpec
#limit the size of the data set to include only the first 1000 probes
#this is strictly for practical reasons due to the computational time required of cross-validation schemes
BNx = ALLfilt_bcrneg[1:1000,]
#str(BNx)
#perform leave-one-out (LOO) cross validation for knn
#note that in this case, a cross-validation capability is built into the knn.cvI method
?knn.cvI
knnXval1 = MLearn(mol.biol~., data=BNx, knn.cvI(k=1, l=0), trainInd=1:ncol(BNx))
#perform leave-one-out (LOO) cross validation for knn - does the same as the above but using the xvalSpec functionality of MLearn - the code is slower than the above
#knnXval1 = MLearn(mol.biol~., data=BNx, knnI(k=1, l=0), xvalSpec("LOO"))
#look at the results
knnXval1
## MLInterfaces classification output container
## The call was:
## MLearn(formula = mol.biol ~ ., data = BNx, .method = knn.cvI(k = 1,
## l = 0), trainInd = 1:ncol(BNx))
## Predicted outcome distribution for test set:
##
## BCR/ABL NEG
## 47 32
## Summary of scores on test set (use testScores() method for details):
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
knnXval1.cm <- confuMat(knnXval1)
knnXval1.cm
## predicted
## given BCR/ABL NEG
## BCR/ABL 31 6
## NEG 16 26
sum(knnXval1.cm)
## [1] 79
#compare this to the non-cv method used above
kans2 = MLearn( mol.biol ~ ., data=BNx, knnI(k=1,l=0), TrainInd)
kans2.cm <- confuMat(kans2)
kans2.cm
## predicted
## given BCR/ABL NEG
## BCR/ABL 16 1
## NEG 4 18
sum(kans2.cm)
## [1] 39
#the results of the two are not directly comparable
#need helper functions to calculate precision, recall etc. - are these not in MLearn
?precision
#ah yes, here they are - but you have to cast the confusion matrix provided by confuMat to a table (strange)
?confuMat
cm <- as.table(knnXval1.cm)
precision(cm)
## BCR/ABL NEG
## 0.8378378 0.6190476
recall(cm)
## BCR/ABL NEG
## 0.6595745 0.8125000
acc(cm)
## [1] 0.721519
cm <- as.table(kans2.cm)
precision(cm)
## BCR/ABL NEG
## 0.9411765 0.8181818
recall(cm)
## BCR/ABL NEG
## 0.8000000 0.9473684
acc(cm)
## [1] 0.8717949
# so in this case, the cross-validation method gives a much more conservative assessment of performance - expected
# always use cross-validation
#feature selection can be combined with cross-validation by providing a helper function to the cross-validation specification
?xvalSpec
?fs.absT
#fs.absT #to see code
lk3f1 = MLearn(mol.biol~., data=BNx, knnI(k=1), xvalSpec("LOO", fsFun=fs.absT(50)))
lk3f1.cm <- confuMat(lk3f1)
cm <- as.table(lk3f1.cm)
precision(cm)
## BCR/ABL NEG
## 0.8918919 0.8333333
recall(cm)
## BCR/ABL NEG
## 0.8250000 0.8974359
acc(cm)
## [1] 0.8607595
#ok, comparable accuracy as before, but probably not large enough sample mto see any differences
knnXval2 = MLearn(mol.biol~., data=BNx, knn.cvI(k=2, l=0), trainInd=1:ncol(BNx))
confuMat(knnXval2)
## predicted
## given BCR/ABL NEG
## BCR/ABL 26 11
## NEG 19 23
acc(as.table(confuMat(knnXval2)))
## [1] 0.6202532
knnXval3 = MLearn(mol.biol~., data=BNx, knn.cvI(k=3, l=0), trainInd=1:ncol(BNx))
confuMat(knnXval3)
## predicted
## given BCR/ABL NEG
## BCR/ABL 33 4
## NEG 19 23
acc(as.table(confuMat(knnXval3)))
## [1] 0.7088608
knnXval5 = MLearn(mol.biol~., data=BNx, knn.cvI(k=5, l=0), trainInd=1:ncol(BNx))
confuMat(knnXval5)
## predicted
## given BCR/ABL NEG
## BCR/ABL 35 2
## NEG 20 22
acc(as.table(confuMat(knnXval5)))
## [1] 0.721519
knnXval10 = MLearn(mol.biol~., data=BNx, knn.cvI(k=10, l=0), trainInd=1:ncol(BNx))
confuMat(knnXval10)
## predicted
## given BCR/ABL NEG
## BCR/ABL 33 4
## NEG 20 22
acc(as.table(confuMat(knnXval10)))
## [1] 0.6962025
library("randomForest")
set.seed(123)
#the guideline rule for mtry is the sqrt of the number of features
dim(ALLfilt_bcrneg)
## Features Samples
## 2160 79
sqrt(2160)
## [1] 46.4758
rf1 = MLearn( mol.biol~., data=ALLfilt_bcrneg, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
confuMat(rf1)
## predicted
## given BCR/ABL NEG
## BCR/ABL 13 4
## NEG 0 22
acc(as.table(confuMat(rf1)))
## [1] 0.8974359
#but much smaller values of mtry can be used - potentially with comparable results.
rf2 = MLearn( mol.biol~., data=ALLfilt_bcrneg, randomForestI, TrainInd, ntree=1000, mtry=10, importance=TRUE)
confuMat(rf2)
## predicted
## given BCR/ABL NEG
## BCR/ABL 13 4
## NEG 2 20
acc(as.table(confuMat(rf2)))
## [1] 0.8461538
#the preformance is almost the same in both cases
#the random forest performance is better than the KNN solution
The importance parameter of the random forest method allows us to examine which features had the largest impact on performance. Data about feture importance is retrieved using the getVarImp function of the MLInterfaces package
opar = par(no.readonly=TRUE, mar=c(7,5,4,2)) #opar=original parameters - keep these so they can be restored
par(las=2) #<-- style of the axis label - parallel to the axis
?getVarImp #<-- from the MLInterfaces package
impV1 = getVarImp(rf1)
plot(impV1, n=15, plat="hgu95av2", toktype="SYMBOL") #<-- implementation of plot allows direct mapping of gene symbols
par(opar) #<-- restore the original graphics parameters
par(las=2, mar=c(7,5,4,2))
impV2 = getVarImp(rf2)
plot(impV2, n=15, plat="hgu95av2", toktype="SYMBOL")
par(opar)
There are other Chapters (all of them) in Bioconductor Case Studies that are worth going through later. Most notably
Chapter 7. A more detailed treatment of differential expression analysis Chapter 10. Unsupervised machine learning (clustering) Chapter 13. GSEA Chapter 14. Hypergeometric tests for GSEA
But first I want to look at trying to collapse probes into pathways and then using this as a data set for ML. The aggregate function in R is a likely candidate and this would require first adding an extra column indicating the pathway/complex that the probe belongs to and on which to aggregate. Someone has almost certainly already done this so lets code with Google:
Google(“collapse affymetrix probes into pathways”)
and found this
Strategies for aggregating gene expression data: The collapseRows R function - http://www.biomedcentral.com/1471-2105/12/322
This is from 2011 and is distributed as the WGCNA package. Lets go take a look.
biocLite("preprocessCore")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 3.0 (BiocInstaller 1.16.2), R version 3.1.1.
## Installing package(s) 'preprocessCore'
##
## The downloaded binary packages are in
## /var/folders/hx/07bqmsdn1fn95ld5btyqm_hc0000gn/T//Rtmp9RPDBY/downloaded_packages
#install.packages("WGCNA")
#Weighted Correlation Network Analysis
library("WGCNA")
## Warning: package 'WGCNA' was built under R version 3.1.2
## Loading required package: dynamicTreeCut
## Warning: package 'dynamicTreeCut' was built under R version 3.1.2
## Loading required package: fastcluster
## Warning: package 'fastcluster' was built under R version 3.1.2
##
## Attaching package: 'fastcluster'
##
## The following object is masked from 'package:stats':
##
## hclust
## ==========================================================================
## *
## * Package WGCNA 1.42 loaded.
## *
## * Important note: It appears that your system supports multi-threading,
## * but it is not enabled within WGCNA in R.
## * To allow multi-threading within WGCNA with all available cores, use
## *
## * allowWGCNAThreads()
## *
## * within R. Use disableWGCNAThreads() to disable threading if necessary.
## * Alternatively, set the following environment variable on your system:
## *
## * ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## * for example
## *
## * ALLOW_WGCNA_THREADS=4
## *
## * To set the environment variable in linux bash shell, type
## *
## * export ALLOW_WGCNA_THREADS=4
## *
## * before running R. Other operating systems or shells will
## * have a similar command to achieve the same aim.
## *
## ==========================================================================
##
## Attaching package: 'WGCNA'
##
## The following object is masked from 'package:IRanges':
##
## cor
##
## The following object is masked from 'package:stats':
##
## cor
help(package=WGCNA)
?collapseRows
#first we need to retrieve just the expression data, transpose and cast to a data frame - this took several minutes to figure out because of the $%^$&&* poor documentation
eData <- as.data.frame(t(exprs(ALLfilt_bcrneg)))
#finally, add the outcome variable separately
learnThis <- ALLfilt_bcrneg$mol.biol
newColNames <- c("learnThis", colnames(eData))
eData <- cbind(learnThis, eData)
colnames(eData) <- newColNames
colnames(eData)[1:10]
## [1] "learnThis" "41654_at" "35430_at" "38924_s_at" "36023_at"
## [6] "266_s_at" "37569_at" "33047_at" "39428_at" "36515_at"
#str(eData)
length(learnThis)
## [1] 79
dim(eData)
## [1] 79 2161
#make sure this is acceptable as input to MLearn
###################
# hours of fun
# the next several hours is spent finding out that the probe names which begin with numbers
# create a problem for random forest and so they must be appended with letters
##################
# length(learnThis)
# dim(eData)
# rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
# #Error in eval(expr, envir, enclos) : object '41654_at' not found
# #why?????
# dim(eData)
# eData[,c(1:5, 2161)]
# str(colnames(eData))
# sum(is.na(eData)) #0
# colnames(eData) <- c(1:2160,"outcome")
#
# first lets try a positive control - repeat something i know works
# library("MASS")
# data(crabs)
# set.seed(1234)
# kp = sample(1:200, size=120)
# rf1 = MLearn(sp~CW+RW, data=crabs, randomForestI, kp, ntree=600 )
# rf1
#
# #try this
# attach(eData)
# # The following object is masked _by_ .GlobalEnv:
# #
# # outcome
#
# #make sure this is acceptable as input to MLearn
# rf1e = MLearn( outcome~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
# # Error in randomForest.default(m, y, ...) :
# # length of response must be the same as predictors
# detach(eData)
#
#
# eData <- t(crabs)
# eData <- t(eData)
# eData <- as.data.frame(eData, stringsAsFactors=FALSE)
# learnThis <- eData$sp
# dim(crabs)[2]
# eData <- cbind(learnThis, eData[,2:dim(crabs)[2]])
# rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
# # Error in randomForest.default(m, y, ...) :
# # Need at least two classes to do classification.
# str(eData)
#
#
# #################
#
# #first we need to retrieve just the expression data, transpose and cast to a data frame - this took several minutes to figure out because of the $%^$&&* poor documentation
# newColNames<-rownames(ALLfilt_bcrneg)
# newRowNames<-colnames(ALLfilt_bcrneg)
# str(exprs(ALLfilt_bcrneg)[1:10,1:10])
# eData<-as.data.frame(t(exprs(ALLfilt_bcrneg)))
# colnames(eData)<-newColNames
# rownames(eData)<-newRowNames
# str(eData[1:10,1:10])
#
#
# eData <- as.data.frame(t(exprs(ALLfilt_bcrneg)))
#
# #finally, add the outcome variable separately
# learnThis <- ALLfilt_bcrneg$mol.biol
# newColNames <- c("learnThis", colnames(eData))
# eData <- cbind(learnThis, eData)
# colnames(eData) <- newColNames
#
# colnames(eData)
# str(eData)
# length(learnThis)
# dim(eData)
#
#
# #make sure this is acceptable as input to MLearn
#
# length(learnThis)
# dim(eData)
# rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
# #....Error in eval(expr, envir, enclos) : object '41654_at' not found
#
# #next try making up data and see if i can reproduce the error
# outcome<-sample(c("black","white"), 100, replace=TRUE)
# train<-matrix(data=runif(10000, 0, 1), nrow=100)
# dim(train)
# train.df <- as.data.frame(train)
# ##using alphabetic colnames works
# colnames(train.df) <- apply(train.df,2,function(x){paste(sample(c(letters),5), sep = "", collapse="")})
# ##but introducing numbers or underscores will cause problems for random_forest and the error
# #Error in eval(expr, envir, enclos) : object '8o2fq' not found
# #will be returned for the first column name with a number or _ at the beginning
# colnames(train.df) <- apply(train.df,2,function(x){paste(sample(c(letters,1:10),5), sep = "", collapse="")})
# colnames(train.df) <- apply(train.df,2,function(x){paste(sample(c(letters,"_"),5), sep = "", collapse="")})
# colnames(train.df)
# dim(train.df)
# str(train.df)
# train.df<-cbind(outcome, train.df)
# str(train.df[1:10,1:10])
# #colnames(train.df)<-c("outcome", as.character(1:100))
#
#
# rownames(train.df)<-c(as.character(1:100))
# str(train.df)
#
# y<-sample(1:100, 20)
# rftest<- MLearn(outcome~., data=train.df, randomForestI, y, ntree=1000, mtry=55, importance=TRUE)
# rftest.cm<-confuMat(rftest)
# acc(as.table(rftest.cm))
# #ok - so the problem seems to be solved
########
#first we need to retrieve just the expression data, transpose and cast to a data frame - this took several minutes to figure out because of the $%^$&&* poor documentation
eData <- as.data.frame(t(exprs(ALLfilt_bcrneg)))
###
###<==== must append something to beginning of probe names so they do not begin with a number
newColNames <- paste("AAA", colnames(eData), sep="", collapse=NULL)
####
####
#finally, add the outcome variable separately
learnThis <- ALLfilt_bcrneg$mol.biol
newColNames <- c("learnThis", newColNames)
eData <- cbind(learnThis, eData)
colnames(eData) <- newColNames
colnames(eData)[1:10]
## [1] "learnThis" "AAA41654_at" "AAA35430_at" "AAA38924_s_at"
## [5] "AAA36023_at" "AAA266_s_at" "AAA37569_at" "AAA33047_at"
## [9] "AAA39428_at" "AAA36515_at"
str(eData)
## 'data.frame': 79 obs. of 2161 variables:
## $ learnThis : Factor w/ 2 levels "BCR/ABL","NEG": 1 2 1 2 2 2 2 2 1 1 ...
## $ AAA41654_at : num 0.302 -0.333 -0.269 -0.195 -0.968 ...
## $ AAA35430_at : num 0.168 -1.564 -0.488 -0.257 -0.675 ...
## $ AAA38924_s_at: num 0.2552 -0.4167 0.4683 0.7595 0.0545 ...
## $ AAA36023_at : num 0.3659 -0.9778 0.0377 0.5133 -1.1639 ...
## $ AAA266_s_at : num 0.353 -0.3864 0.2118 0.3164 -0.0454 ...
## $ AAA37569_at : num -0.0336 -0.2993 -0.0016 0.086 -0.4045 ...
## $ AAA33047_at : num 0.191 -0.53 0.378 0.131 -0.662 ...
## $ AAA39428_at : num 0.552 -0.852 0.215 0.166 0.382 ...
## $ AAA36515_at : num -0.0319 -0.837 -0.0167 0.5539 -0.5017 ...
## $ AAA32589_at : num 0.2711 -0.7963 -1.1034 0.0659 -0.2122 ...
## $ AAA38843_at : num 1.3 -1.88 -1.2 1.13 0.7 ...
## $ AAA39134_at : num -0.4043 0.2827 0.0683 0.0285 1.5156 ...
## $ AAA41233_at : num 0.596 -0.777 1.229 1.152 0.433 ...
## $ AAA32130_at : num 0.0782 0.1309 0.0692 0.4793 -0.1468 ...
## $ AAA34878_at : num 0.9765 -0.6601 0.5408 1.0579 -0.0318 ...
## $ AAA34814_at : num 0.175 -0.898 -0.823 0.747 -0.202 ...
## $ AAA1933_g_at : num -0.5035 -0.2458 0.9783 -0.0657 -0.9887 ...
## $ AAA32163_f_at: num -0.2164 0.4552 0.2242 -0.1617 0.0198 ...
## $ AAA40427_at : num -0.934 0.608 -0.227 0.116 0.361 ...
## $ AAA34372_at : num -0.5635 0.121 0.2703 -0.0281 -0.139 ...
## $ AAA35831_at : num -0.0263 -0.422 0.0849 2.4236 -0.2785 ...
## $ AAA34325_at : num -0.0765 1.1914 -0.0514 0.0866 1.4476 ...
## $ AAA38392_at : num -0.386 -0.376 0.66 0.41 -0.65 ...
## $ AAA35810_at : num 0.2775 -0.2718 0.786 -0.0479 -1.7225 ...
## $ AAA39043_at : num -0.5537 0.3854 -0.2388 -0.0692 -1.2403 ...
## $ AAA35271_at : num -0.019 0.3004 1.4439 0.5096 -0.0302 ...
## $ AAA35734_at : num 0.1807 -0.735 0.9578 0.0378 -0.303 ...
## $ AAA40990_at : num -0.407 -0.136 0.863 1.273 -0.489 ...
## $ AAA40712_at : num -0.0461 0.6305 0.4847 0.9942 1.406 ...
## $ AAA41202_s_at: num 0.2739 -0.7677 0.3301 -0.0441 -0.8925 ...
## $ AAA39829_at : num -0.361 0.608 0.54 -0.339 0.212 ...
## $ AAA33291_at : num -0.522 0.388 -0.36 0.406 0.162 ...
## $ AAA41750_at : num 0.474 -0.586 -1.148 0.664 -0.378 ...
## $ AAA1468_at : num 0.465 -0.365 -0.807 0.511 -0.453 ...
## $ AAA41742_s_at: num 0.332 0.171 0.523 1.122 0.984 ...
## $ AAA41724_at : num -0.5444 1.2145 0.1716 0.0582 0.0828 ...
## $ AAA33849_at : num 1 -0.109 1.447 0.876 1.422 ...
## $ AAA38573_at : num 0.557 -0.574 0.615 0.752 -0.227 ...
## $ AAA40631_at : num 0.0194 -0.5644 1.0923 0.9533 0.2914 ...
## $ AAA36506_at : num 0.17 -1.308 0.445 0.534 0.398 ...
## $ AAA41133_at : num -0.4587 0.2464 0.0439 0.5585 0.02 ...
## $ AAA32223_at : num 0.18 -1.454 -0.116 0.554 -0.792 ...
## $ AAA32193_at : num -0.0586 -0.3474 1.2351 -0.4959 -0.948 ...
## $ AAA33710_at : num -0.266 -0.896 1.079 0.537 0.42 ...
## $ AAA32803_at : num 0.611 -1.2287 -0.0406 0.723 -0.2909 ...
## $ AAA32804_at : num 0.451 -0.841 1.064 0.76 0.121 ...
## $ AAA37542_at : num 0 -0.387 0.505 0.965 2.353 ...
## $ AAA1942_s_at : num -0.3899 0.0057 -0.6981 0.4313 -0.7233 ...
## $ AAA34998_at : num -0.746 -0.91 -0.287 0.385 -0.847 ...
## $ AAA35970_g_at: num 0.1518 -0.7169 -0.0208 0.7091 -1.063 ...
## $ AAA31851_at : num 0.9391 -1.2726 0.1419 1.2747 0.0421 ...
## $ AAA40203_at : num 0.337 0.13 0.93 0.143 0.77 ...
## $ AAA1207_at : num 0.0606 0.3911 0.0991 1.3855 -0.7271 ...
## $ AAA38568_at : num -0.2925 -0.2522 0.3156 2.0414 -0.0944 ...
## $ AAA149_at : num 0.0278 -1.43 -0.6234 0.6257 0.3028 ...
## $ AAA33247_at : num 0.0183 -0.7447 0.5192 0 0.7644 ...
## $ AAA40196_at : num -0.103 -0.53 0.212 0.367 -0.758 ...
## $ AAA34961_at : num -0.0759 -0.1188 0.9586 1.6728 -0.8327 ...
## $ AAA38359_at : num -0.479 -0.515 -0.392 1.201 -0.783 ...
## $ AAA39792_at : num 0.234 -1.014 -0.323 0.713 0 ...
## $ AAA35140_at : num 0.0474 -0.6076 0.3419 0.7936 0.1206 ...
## $ AAA40063_at : num 0.158 -0.803 0.252 0.993 -0.671 ...
## $ AAA32173_at : num -0.655 0.353 -0.483 0.1 -0.964 ...
## $ AAA41737_at : num 0.482 -0.451 -0.209 1.103 0.745 ...
## $ AAA38767_at : num 0.368 -1.204 0.775 1.916 0.449 ...
## $ AAA33700_at : num 0.5541 -0.8779 0.981 0.0479 -0.3282 ...
## $ AAA2031_s_at : num 0.509 0.204 0.959 0.447 -0.112 ...
## $ AAA32961_at : num 1.015 -0.613 0.143 0.869 0.324 ...
## $ AAA35151_at : num -0.5431 1.0672 0.3853 0.2683 -0.0736 ...
## $ AAA33847_s_at: num 0.734 -0.688 0.17 1.565 -0.159 ...
## $ AAA34268_at : num -0.209 1.283 0.377 -0.626 -0.364 ...
## $ AAA39221_at : num 0 -0.5154 0.1736 0.1524 -0.0686 ...
## $ AAA33351_at : num 1.032 -0.539 0.246 0.464 -0.174 ...
## $ AAA32832_at : num 0.419 -1.511 -0.391 0.354 0.655 ...
## $ AAA38070_at : num 0 -0.737 0.638 -0.472 -1.482 ...
## $ AAA33791_at : num 0.154 -0.706 -0.187 1.206 -0.189 ...
## $ AAA34544_at : num 1.2526 -0.0239 0.1342 0.926 1.0894 ...
## $ AAA35166_at : num -0.688 -0.653 0.236 0.239 0.443 ...
## $ AAA1797_at : num 0.931 0.677 -0.509 0.961 0.214 ...
## $ AAA834_at : num -0.294 -0.248 -0.21 1.062 0.979 ...
## $ AAA1599_at : num -0.0931 -0.321 -0.3019 0.6173 -0.1053 ...
## $ AAA37955_at : num -0.4849 0.0339 -0.3365 -0.1529 -1.0689 ...
## $ AAA37964_at : num 0.1823 -1.5958 0.0742 0.7164 -1.3492 ...
## $ AAA36825_at : num -0.0385 0.0405 0.4315 0.7139 -1.2051 ...
## $ AAA41526_at : num -0.3742 0.6121 -0.198 -0.0442 0.8871 ...
## $ AAA33113_at : num 0.096 -2.372 1.054 1.54 -0.461 ...
## $ AAA32272_at : num 0.4965 -1.2341 0.0371 0.7363 -1.3649 ...
## $ AAA429_f_at : num -0.232 0.198 0.924 -0.682 0.751 ...
## $ AAA33678_i_at: num -0.2169 0.0015 1.1055 -0.6857 0.5163 ...
## $ AAA38241_at : num -0.3709 0.7107 -0.0985 0.0782 -0.6484 ...
## $ AAA38518_at : num 0.3503 -0.5535 -0.0664 0.6618 -0.9554 ...
## $ AAA41187_at : num 0.587 -0.212 0.896 0.768 0.257 ...
## $ AAA33267_at : num 0.0695 -0.8214 0.5734 0.0477 -1.5886 ...
## $ AAA36933_at : num 0.855 -0.165 0.725 0.726 0.805 ...
## $ AAA40041_at : num 0.491 -0.914 0.318 0.997 0.745 ...
## $ AAA32607_at : num 1.0713 -0.2841 0.0262 -0.0388 0.5067 ...
## $ AAA41745_at : num -0.2493 0.1612 0.3265 0.9085 -0.0403 ...
## $ AAA38353_at : num 0.563 -1.391 -0.088 0.199 0.182 ...
## [list output truncated]
length(learnThis)
## [1] 79
dim(eData)
## [1] 79 2161
#make sure this is acceptable as input to MLearn
length(learnThis)
## [1] 79
dim(eData)
## [1] 79 2161
rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
#....
rf1e.cm<-confuMat(rf1e)
acc(as.table(rf1e.cm))
## [1] 0.8974359
#so now lets go back and first try collapsing the rows in the expression data set
tmp <- exprs(ALLfilt_bcrneg)
#make a small test set
tmp2<-tmp[1:6,1:6]
tmp2
## 01005 01010 03002 04007 04008
## 41654_at 0.30223298 -0.3331243 -0.26938075 -0.19468856 -0.96768116
## 35430_at 0.16821060 -1.5636539 -0.48766759 -0.25706581 -0.67450101
## 38924_s_at 0.25522567 -0.4166590 0.46829547 0.75954472 0.05449203
## 36023_at 0.36592774 -0.9777903 0.03769396 0.51333134 -1.16386312
## 266_s_at 0.35301246 -0.3863641 0.21180977 0.31643576 -0.04539891
## 37569_at -0.03356995 -0.2993378 -0.00160298 0.08596511 -0.40454587
## 04010
## 41654_at 0.88593546
## 35430_at -1.02849426
## 38924_s_at -0.93322046
## 36023_at -0.94238529
## 266_s_at 0.52930562
## 37569_at -0.08933493
#prepare row names and group names
rowID<-rownames(tmp2)
rowID
## [1] "41654_at" "35430_at" "38924_s_at" "36023_at" "266_s_at"
## [6] "37569_at"
rowGroup<-c("g1", "g1", "g2", "g2", "g2", "g3")
rowGroup
## [1] "g1" "g1" "g2" "g2" "g2" "g3"
#collapse the rows and examine the results
collapseRows.results<-collapseRows(datET=tmp2, rowGroup=rowGroup, rowID=rowID)
tmp2Collapsed<-collapseRows.results$datETcollapsed
tmp2Collapsed #<====resulting object is a matrix with group names as new row names
## 01005 01010 03002 04007 04008 04010
## g1 0.30223298 -0.3331243 -0.26938075 -0.19468856 -0.96768116 0.88593546
## g2 0.35301246 -0.3863641 0.21180977 0.31643576 -0.04539891 0.52930562
## g3 -0.03356995 -0.2993378 -0.00160298 0.08596511 -0.40454587 -0.08933493
str(tmp2Collapsed)
## num [1:3, 1:6] 0.3022 0.353 -0.0336 -0.3331 -0.3864 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:3] "g1" "g2" "g3"
## ..$ : chr [1:6] "01005" "01010" "03002" "04007" ...
#the function also returns meta-information about the groups
collapseRows.results$group2row
## group selectedRowID
## g1 "g1" "41654_at"
## g2 "g2" "266_s_at"
## g3 "g3" "37569_at"
collapseRows.results$selectedRow
## 41654_at 35430_at 38924_s_at 36023_at 266_s_at 37569_at
## TRUE FALSE FALSE FALSE TRUE TRUE
#now there are multiple ways to collapse rows including user-defined methods
#when method="function", then methodFunction must be specified
#other values for mehtod could be:
# "MaxMean" (default) or
# "MinMean" = choose the row with the highest or lowest mean value, respectively.
# "maxRowVariance" = choose the row with the highest variance (across the columns of datET).
# "absMaxMean" or "absMinMean" = choose the row with the highest or lowest mean absolute value.
# "ME" = choose the eigenrow (first principal component of the rows in each group). Note that with this method option, connectivityBasedCollapsing is automatically set to FALSE.
# "Average" = for each column, take the average value of the rows in each group
# "function" = use this method for a user-input function (see the description of the argument "methodFunction").
# Note: if method="ME", "Average" or "function", the output parameters "group2row" and "selectedRow" are not informative.
collapseRows.results<-collapseRows(datET=tmp2, rowGroup=rowGroup, rowID=rowID, method="function", methodFunction=colMeans)
## Comment: make sure methodFunction takes a matrix as input.
## ...Ignore previous comment. Function completed properly!
tmp2Collapsed<-collapseRows.results$datETcollapsed
tmp2Collapsed #<====resulting object is a matrix with group names as new row names
## 01005 01010 03002 04007 04008 04010
## g1 0.23522179 -0.9483891 -0.37852417 -0.22587719 -0.8210911 -0.07127940
## g2 0.32472195 -0.5936045 0.23926640 0.52977060 -0.3849233 -0.44876671
## g3 -0.03356995 -0.2993378 -0.00160298 0.08596511 -0.4045459 -0.08933493
str(tmp2Collapsed)
## num [1:3, 1:6] 0.2352 0.3247 -0.0336 -0.9484 -0.5936 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:3] "g1" "g2" "g3"
## ..$ : chr [1:6] "01005" "01010" "03002" "04007" ...
#since a function is used to generate a new value for each column, meta-information about the groups is meaningless
collapseRows.results$group2row
## group selectedRowID
## [1,] "g1" "function.g1"
## [2,] "g2" "function.g2"
## [3,] "g3" "function.g3"
collapseRows.results$selectedRow
## 41654_at 35430_at 38924_s_at 36023_at 266_s_at 37569_at
## FALSE FALSE FALSE FALSE FALSE FALSE
#try selecting the absolute maximum value for each column
#first have to provide a function that will return the maximum absolute value for each column in a collapse group
myCollapse<-function(someMatrix)apply(someMatrix, 2, function(x){ return(x[which.max(abs(x))])})
collapseRows.results<-collapseRows(datET=tmp2, rowGroup=rowGroup, rowID=rowID, method="function", methodFunction=myCollapse)
## Comment: make sure methodFunction takes a matrix as input.
## ...Ignore previous comment. Function completed properly!
tmp2Collapsed<-collapseRows.results$datETcollapsed
tmp2Collapsed #<====resulting object is a matrix with group names as new row names
## 01005 01010 03002 04007 04008 04010
## g1 0.30223298 -1.5636539 -0.48766759 -0.25706581 -0.9676812 -1.02849426
## g2 0.36592774 -0.9777903 0.46829547 0.75954472 -1.1638631 -0.94238529
## g3 -0.03356995 -0.2993378 -0.00160298 0.08596511 -0.4045459 -0.08933493
str(tmp2Collapsed)
## num [1:3, 1:6] 0.3022 0.3659 -0.0336 -1.5637 -0.9778 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:3] "g1" "g2" "g3"
## ..$ : chr [1:6] "01005" "01010" "03002" "04007" ...
#since a function is used to generate a new value for each column, meta-information about the groups is meaningless
collapseRows.results$group2row
## group selectedRowID
## [1,] "g1" "function.g1"
## [2,] "g2" "function.g2"
## [3,] "g3" "function.g3"
collapseRows.results$selectedRow
## 41654_at 35430_at 38924_s_at 36023_at 266_s_at 37569_at
## FALSE FALSE FALSE FALSE FALSE FALSE
####
# finally,put it all together
#
# collapse whole expression set by gene id and selecting max row for each
tmp <- exprs(ALLfilt_bcrneg)
rowID<-rownames(tmp)#prepare row names and group names
ps_eg = toTable(hgu95av2ENTREZID) #probe_id to Entrez Gene ids
ind<-match(rowID, ps_eg[,1]) #find matching entries of rowIDs in ps_eg table
geneid <- ps_eg[ind,2] #retrieve corresponding gene id for each rowID - all are unique so collapsing wont actually do anything
check(geneid)#chekc that all rowIDs were mapped to a gene id
##
## head:
## [1] "100" "10001" "10006" "100129361" "100133941" "10016"
##
## tail:
## [1] "9976" "998" "9984" "9987" "9988" "9989"
##
## str:
##
## chr [1:2160] "100" "10001" "10006" "100129361" "100133941" ...
## NULL
##
## length: 2160
##
## dim:
##
## total NA: 0
##
## total NULL: 0
##
## total duplicated: 0
##
## summary:
##
## Length Class Mode
## 2160 character character
rowGroup<-geneid
myCollapse<-function(someMatrix)apply(someMatrix, 2, function(x){ return(x[which.max(abs(x))])}) #define a collapse function
collapseRows.results<-collapseRows(datET=tmp, rowGroup=rowGroup, rowID=rowID, method="function", methodFunction=myCollapse)#collapse
## Comment: make sure methodFunction takes a matrix as input.
## ...Ignore previous comment. Function completed properly!
tmpCollapsed<-collapseRows.results$datETcollapsed
# prepare expression set for ml
eData <- as.data.frame(t(tmpCollapsed))#transpose and cast as a data frame
newColNames <- paste("geneid", colnames(eData), sep="", collapse=NULL) #fix column names
learnThis <- ALLfilt_bcrneg$mol.biol ##finally, add the outcome variable separately
newColNames <- c("learnThis", newColNames)
eData <- cbind(learnThis, eData)
colnames(eData) <- newColNames
# do the random forest thang
length(learnThis)
## [1] 79
dim(eData)
## [1] 79 2161
rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
#....
rf1e.cm<-confuMat(rf1e)
rf1e.cm
## predicted
## given BCR/ABL NEG
## BCR/ABL 13 4
## NEG 0 22
acc(as.table(rf1e.cm))
## [1] 0.8974359
How do I map probe ids to gene names/ids and ids to pathways and complexes
This section should eventually be moved to Chapter 8 notes.
###
#using the annotation environment - probe id to entrez gene id
###
?hgu95av
## No documentation for 'hgu95av' in specified packages and libraries:
## you could try '??hgu95av'
?hgu95av2ENTREZID
EG = as.character(hgu95av2ENTREZID[featureNames(ALL)])
dim(ALL) #12625 x 128
## Features Samples
## 12625 128
length(EG) #11543 therefore, not all probe ids map to a gene id
## [1] 11543
EG[1:5]
## 1000_at 1001_at 1002_f_at 1003_s_at 1004_at
## "5595" "7075" "1557" "643" "643"
sum(is.na(EG)) #0
## [1] 0
#so EG is a dictionary of probe names 2 geneids that can be accessed like this
thisProbe <- "40812_at"
EG[thisProbe]
## 40812_at
## "6526"
#we can also do this to get an ordinary table
ps_eg = toTable(hgu95av2ENTREZID)
head(ps_eg)
## probe_id gene_id
## 1 1000_at 5595
## 2 1001_at 7075
## 3 1002_f_at 1557
## 4 1003_s_at 643
## 5 1004_at 643
## 6 1005_at 1843
#how many genes map to one probe, two probes, three probes...etc.
table(table(EG))
##
## 1 2 3 4 5 6 7 8
## 6586 1443 458 104 22 16 5 5
# 1 2 3 4 5 6 7 8
# 6586 1443 458 104 22 16 5 5
#The inner call to table counts for each EntrezGene ID the number of probe sets
#that are mapped to it. The outer one tabulates how often each count is seen, one, two, three, . . . times.
#so there are 6586 genes that map to one probe, there are 1443 that map to 2 probes etc...
###
#gene ids to chromosomes
###
ps_chr = toTable(hgu95av2CHR)
head(ps_chr) #<-- probe id to chromosome
## probe_id chromosome
## 1 1000_at 16
## 2 1001_at 1
## 3 1002_f_at 10
## 4 1003_s_at 11
## 5 1004_at 11
## 6 1005_at 5
ps_eg = toTable(hgu95av2ENTREZID)
head(ps_eg) #<-- probe id to entrez gene
## probe_id gene_id
## 1 1000_at 5595
## 2 1001_at 7075
## 3 1002_f_at 1557
## 4 1003_s_at 643
## 5 1004_at 643
## 6 1005_at 1843
chr = merge(ps_chr, ps_eg)
head(chr) #<--probe id to chromosome to gene id
## probe_id chromosome gene_id
## 1 100_g_at 14 5875
## 2 1000_at 16 5595
## 3 1001_at 1 7075
## 4 1002_f_at 10 1557
## 5 1003_s_at 11 643
## 6 1004_at 11 643
chr = unique(chr[, colnames(chr)!="probe_id"])
head(chr)
## chromosome gene_id
## 1 14 5875
## 2 16 5595
## 3 1 7075
## 4 10 1557
## 5 11 643
## 7 5 1843
###
#gene ids to GO
###
#relevant packages
#GOStats
#annotate
#topGO
#goTools
# The induced GO graph for a collection of genes is the graph that results
# from taking the union of all GO terms annotated to the genes and also all
# their parent terms.
# as.list(GOMFCHILDREN["GO:0008094"]) #<--get direct descendants of term in MF ontology
# as.list(GOMFOFFSPRING["GO:0008094"])#<--get all descendants
# as.list(GOMFANCESTOR["GO:0008094"]) #<--get all parents
# as.list(GOMFPARENTS["GO:0008094"]) #<--get direct parents
# hyperGTest
# The function hyperGTest will compute the Hypergeometric p-values for overrepresentation
# of genes at all GO terms in the induced GO graph.
# Because of the way that genes are annotated at GO terms, there are concerns with how one might
# address the multiple testing issues that arise. Both the topGO and GOstats
# packages implement a form of conditional testing that is designed to address some
# of these concern.
# library("GOstats")
# #To perform the test we firrst define the universe of genes and create a parameter object to set the analysis options.
# affyUniverse = featureNames(ALLfilt_af4bcr)
# uniId = hgu95av2ENTREZID[affyUniverse]
# entrezUniverse = unique(as.character(uniId))
# params = new("GOHyperGParams", geneIds=EGsub, universeGeneIds=entrezUniverse, annotation="hgu95av2", ontology="BP", pvalueCutoff=0.001, conditional=FALSE, testDirection="over")
# mfhyper = hyperGTest(params)
# hist(pvalues(mfhyper), breaks=50, col="mistyrose")
?hgu95av2GO
#Maps between manufacturer IDs and Gene Ontology (GO) IDs
# hgu95av2GO is an R object that provides mappings between manufacturer identifiers and the GO identifiers that they are directly associated with. This mapping and its reverse mapping (hgu95av2GO2PROBE) do NOT associate the child terms from the GO ontology with the gene. Only the directly evidenced terms are represented here.
x <- hgu95av2GO
head(x)
## GO submap for chip hgu95av2 (object of class "ProbeGo3AnnDbBimap")
# Get the manufacturer identifiers that are mapped to a GO ID
mapped_genes <- mappedkeys(x)
head(mapped_genes)
## [1] "1000_at" "1001_at" "1002_f_at" "1003_s_at" "1004_at" "1005_at"
# Convert to a list
xx <- as.list(x[mapped_genes])
#head(xx)
#xx[1]
class(xx)
## [1] "list"
#str(xx)
if(length(xx) > 0) {
# Try the first one
got <- xx[[1]]
got[[1]][["GOID"]]
got[[1]][["Ontology"]]
got[[1]][["Evidence"]]
}
## [1] "TAS"
hgu95av2GO.table <- toTable(hgu95av2GO)
head(hgu95av2GO.table)
## probe_id go_id Evidence Ontology
## 1 1000_at GO:0000165 TAS BP
## 2 1000_at GO:0000186 TAS BP
## 3 1000_at GO:0000187 TAS BP
## 4 1000_at GO:0002224 TAS BP
## 5 1000_at GO:0002755 TAS BP
## 6 1000_at GO:0002756 TAS BP
table(hgu95av2GO.table$probe_id)[1:5]
##
## 100_g_at 1000_at 1001_at 1002_f_at 1003_s_at
## 7 83 15 23 14
table(table(hgu95av2GO.table$probe_id))
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 167 177 295 332 405 466 461 511 520 470 509 469 413 435 414 383 345 319
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 290 291 211 224 196 168 207 139 137 117 134 130 131 74 84 89 82 69
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 66 54 75 52 52 69 41 54 42 47 38 23 41 39 27 30 32 18
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 36 26 19 25 6 24 19 18 20 15 20 19 17 13 5 22 21 12
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 13 9 4 5 11 14 1 7 9 8 15 9 2 5 9 5 4 4
## 91 92 93 94 96 97 98 99 100 101 103 104 105 106 108 111 112 113
## 5 3 4 9 2 4 4 3 1 3 7 1 2 3 4 2 1 3
## 114 115 124 126 128 130 131 136 140 143 144 147 150 159 184 197
## 5 4 1 1 4 2 5 8 1 3 5 3 2 1 3 1
#table(hgu95av2GO.table$probe_id, hgu95av2GO.table$Ontology)[1:5]
##
# Summary
##
#probe_id to Entrez Gene ids
ps_eg = toTable(hgu95av2ENTREZID)
#probe_id to GO mappings
hgu95av2GO.table <- toTable(hgu95av2GO)
# there is an interesting example here of how the order of loading libraries can matter
# the "select" function from the AnnotatioDb package stopped working with error
# Error in select(GO.db, keys = "GO:1990237", columns = c("GOID", "TERM", :
# unused arguments (keys = "GO:1990237", columns = c("GOID", "TERM", "ONTOLOGY"), keytype = "GOID")
#
# I say "stopped working" because the call was composed correctly and had worked before. The above cryptic message doesnt tell you that
# its not trying to call the "select" function you think you are trying to call. In the case above, the phrase "unused arguments"
# should be the tip-off that you need to check this
# try this
# > select
# function (obj)
# UseMethod("select")
# <bytecode: 0x10e0389c0>
# <environment: namespace:MASS>
#
# the problem was solved by doing this...
# detach("package:MASS", unload=TRUE)
library("AnnotationDbi")
library(GO.db)
##
#detach("package:AnnotationDbi", unload=TRUE)
#detach("package:GO.db", unload=TRUE)
gcon = GO_dbconn()
#a list of very simple search terms to retrieve complex like collections
searchTerms <- c("complex", "some", "mere")
#retrieve goids and their details related to the search terms
allGoidDetails <- NULL
complexGoid <- NULL
allComplexGoid <- NULL
for (searchTerm in searchTerms) {
query <- paste("select go_id from go_term where term like '%", searchTerm, "%'", sep = "")
theseGoids <- as.character(dbGetQuery(gcon, query )[[1]])
goidDetails <- select(GO.db, keys=theseGoids, columns = c("GOID", "TERM", "ONTOLOGY"), keytype="GOID")
goidDetails <- goidDetails[goidDetails$ONTOLOGY == "CC",]
allGoidDetails <- unique(rbind(allGoidDetails, goidDetails))
allComplexGoid <- unique(c(allComplexGoid, goidDetails$GOID))
}
##spot-check
check(allComplexGoid)
##
## head:
## [1] "GO:0000015" "GO:0000109" "GO:0000110" "GO:0000111" "GO:0000112"
## [6] "GO:0000113"
##
## tail:
## [1] "GO:0030017" "GO:0033583" "GO:0035996" "GO:0035997" "GO:0043034"
## [6] "GO:0061468"
##
## str:
##
## chr [1:1874] "GO:0000015" "GO:0000109" "GO:0000110" ...
## NULL
##
## length: 1874
##
## dim:
##
## total NA: 0
##
## total NULL: 0
##
## total duplicated: 0
##
## summary:
##
## Length Class Mode
## 1874 character character
check(allGoidDetails)
##
## head:
## GOID TERM ONTOLOGY
## 1 GO:0000015 phosphopyruvate hydratase complex CC
## 2 GO:0000109 nucleotide-excision repair complex CC
## 3 GO:0000110 nucleotide-excision repair factor 1 complex CC
## 4 GO:0000111 nucleotide-excision repair factor 2 complex CC
## 5 GO:0000112 nucleotide-excision repair factor 3 complex CC
## 6 GO:0000113 nucleotide-excision repair factor 4 complex CC
##
## tail:
## GOID TERM ONTOLOGY
## 5310 GO:0030017 sarcomere CC
## 8010 GO:0033583 rhabdomere membrane CC
## 8710 GO:0035996 rhabdomere microvillus CC
## 8810 GO:0035997 rhabdomere microvillus membrane CC
## 9010 GO:0043034 costamere CC
## 1058 GO:0061468 karyomere CC
##
## str:
##
## 'data.frame': 1874 obs. of 3 variables:
## $ GOID : chr "GO:0000015" "GO:0000109" "GO:0000110" "GO:0000111" ...
## $ TERM : chr "phosphopyruvate hydratase complex" "nucleotide-excision repair complex" "nucleotide-excision repair factor 1 complex" "nucleotide-excision repair factor 2 complex" ...
## $ ONTOLOGY: chr "CC" "CC" "CC" "CC" ...
## NULL
##
## length: 3
##
## dim: 1874 3
##
## total NA: 0
##
## total NULL: 0
##
## total duplicated: 0
##
## summary:
##
## GOID TERM ONTOLOGY
## "Length:1874 " "Length:1874 " "Length:1874 "
## "Class :character " "Class :character " "Class :character "
## "Mode :character " "Mode :character " "Mode :character "
head(allGoidDetails[grep("some", allGoidDetails$TERM),])
## GOID TERM ONTOLOGY
## 22 GO:0000176 nuclear exosome (RNase complex) CC
## 23 GO:0000177 cytoplasmic exosome (RNase complex) CC
## 24 GO:0000178 exosome (RNase complex) CC
## 56 GO:0000502 proteasome complex CC
## 215 GO:0005839 proteasome core complex CC
## 315 GO:0008537 proteasome activator complex CC
#now retrieve all human geneids annotated with these goids
library(org.Hs.eg.db)
##columns(org.Hs.eg.db)
theseComplexGeneDetails <- select(org.Hs.eg.db, keys=allComplexGoid, columns=c("ENTREZID", "GO"), keytype="GO")
## Warning in .generateExtraRows(tab, keys, jointype): 'select' resulted in
## 1:many mapping between keys and return rows
#remove NAs (duplicate gene ids are expected with different goids)
theseComplexGeneDetails <- theseComplexGeneDetails[!is.na(theseComplexGeneDetails$ENTREZID),]
theseComplexGeneDetails <- unique(theseComplexGeneDetails)
check(theseComplexGeneDetails$ENTREZID) #no NULL
##
## head:
## [1] "2023" "2026" "2027" "387712" "1161" "2067"
##
## tail:
## [1] "9456" "23336" "23676" "79026" "113146" "202333"
##
## str:
##
## chr [1:9612] "2023" "2026" "2027" "387712" "1161" "2067" ...
## NULL
##
## length: 9612
##
## dim:
##
## total NA: 0
##
## total NULL: 0
##
## total duplicated: 3328
##
## summary:
##
## Length Class Mode
## 9612 character character
check(theseComplexGeneDetails)
##
## head:
## GO EVIDENCE ONTOLOGY ENTREZID
## 1 GO:0000015 IEA CC 2023
## 2 GO:0000015 IEA CC 2026
## 3 GO:0000015 IEA CC 2027
## 4 GO:0000015 IEA CC 387712
## 5 GO:0000109 IDA CC 1161
## 6 GO:0000109 IDA CC 2067
##
## tail:
## GO EVIDENCE ONTOLOGY ENTREZID
## 10836 GO:0043034 IEA CC 9456
## 10837 GO:0043034 IDA CC 23336
## 10838 GO:0043034 IEA CC 23676
## 10839 GO:0043034 ISS CC 79026
## 10840 GO:0043034 ISS CC 113146
## 10841 GO:0043034 IEA CC 202333
##
## str:
##
## 'data.frame': 9612 obs. of 4 variables:
## $ GO : chr "GO:0000015" "GO:0000015" "GO:0000015" "GO:0000015" ...
## $ EVIDENCE: chr "IEA" "IEA" "IEA" "IEA" ...
## $ ONTOLOGY: chr "CC" "CC" "CC" "CC" ...
## $ ENTREZID: chr "2023" "2026" "2027" "387712" ...
## NULL
##
## length: 4
##
## dim: 9612 4
##
## total NA: 0
##
## total NULL: 0
##
## total duplicated: 0
##
## summary:
##
## GO EVIDENCE ONTOLOGY
## "Length:9612 " "Length:9612 " "Length:9612 "
## "Class :character " "Class :character " "Class :character "
## "Mode :character " "Mode :character " "Mode :character "
## ENTREZID
## "Length:9612 "
## "Class :character "
## "Mode :character "
##table evidence type - at this point, evidence is not used to filter complexes
table(theseComplexGeneDetails$EVIDENCE)
##
## IBA IC IDA IEA IMP IPI ISS NAS TAS
## 162 33 6086 1871 49 60 507 276 568
##IBA IC IDA IEA IMP IPI ISS NAS RCA TAS
##122 32 2933 1846 33 51 456 321 1 589
#convert results to a list format
tmp<- theseComplexGeneDetails[,c(1,4)]
tmp$ENTREZID <- as.integer(tmp$ENTREZID)
gocomplex2geneids.l <- unstack(tmp, form=ENTREZID ~ GO)
#check(gocomplex2geneids.l) # 577 members in list, 18 duplicates ok
#remove complexes with only one member
theseLengths <- sapply(gocomplex2geneids.l, length)
gocomplex2geneids.l <- gocomplex2geneids.l[(theseLengths > 1)]
#check(gocomplex2geneids.l)#522 membrs in list , 3 duplicates
#set attributes before storing these
attr(gocomplex2geneids.l, "title") <- "GOComplexes"
attr(gocomplex2geneids.l, "descr") <- "GO complexes are a collection of gene sets where each set is defined by a term from the cellular component gene ontology (CC GO) that has the string 'complex', 'ome', or 'mere' in it and is used to annotate two or more human genes."
First I will collapse by gene ids as a control (should make little difference).
####
# finally,put it all together
#
# collapse whole expression set by gene id and selecting max row for each
tmp <- exprs(ALLfilt_bcrneg)
rowID<-rownames(tmp)#prepare row names and group names
ps_eg = toTable(hgu95av2ENTREZID) #probe_id to Entrez Gene ids
ind<-match(rowID, ps_eg[,1]) #find matching entries of rowIDs in ps_eg table
geneid <- ps_eg[ind,2] #retrieve corresponding gene id for each rowID - all are unique so collapsing wont actually do anything
check(geneid)#chekc that all rowIDs were mapped to a gene id
##
## head:
## [1] "100" "10001" "10006" "100129361" "100133941" "10016"
##
## tail:
## [1] "9976" "998" "9984" "9987" "9988" "9989"
##
## str:
##
## chr [1:2160] "100" "10001" "10006" "100129361" "100133941" ...
## NULL
##
## length: 2160
##
## dim:
##
## total NA: 0
##
## total NULL: 0
##
## total duplicated: 0
##
## summary:
##
## Length Class Mode
## 2160 character character
rowGroup<-geneid
myCollapse<-function(someMatrix)apply(someMatrix, 2, function(x){ return(x[which.max(abs(x))])}) #define a collapse function
collapseRows.results<-collapseRows(datET=tmp, rowGroup=rowGroup, rowID=rowID, method="function", methodFunction=myCollapse)#collapse
## Comment: make sure methodFunction takes a matrix as input.
## ...Ignore previous comment. Function completed properly!
tmpCollapsed<-collapseRows.results$datETcollapsed
# prepare expression set for ml
eData <- as.data.frame(t(tmpCollapsed))#transpose and cast as a data frame
newColNames <- paste("geneid", colnames(eData), sep="", collapse=NULL) #fix column names
learnThis <- ALLfilt_bcrneg$mol.biol ##finally, add the outcome variable separately
newColNames <- c("learnThis", newColNames)
eData <- cbind(learnThis, eData)
colnames(eData) <- newColNames
# do the random forest thang
length(learnThis)
## [1] 79
dim(eData)
## [1] 79 2161
rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
#....
rf1e.cm<-confuMat(rf1e)
rf1e.cm
## predicted
## given BCR/ABL NEG
## BCR/ABL 13 4
## NEG 0 22
acc(as.table(rf1e.cm))
## [1] 0.8974359
# predicted
# given BCR/ABL NEG
# BCR/ABL 13 4
# NEG 0 22
# > acc(as.table(rf1e.cm))
# [1] 0.8974359
Next try collapsing into complexes before running random forest
# collapse whole expression set by complex id and selecting max row for each
tmp <- exprs(ALLfilt_bcrneg)
rowID<-rownames(tmp)#prepare row names and group names
ps_eg = toTable(hgu95av2ENTREZID) #probe_id to Entrez Gene ids
ind<-match(rowID, ps_eg[,1]) #find matching entries of rowIDs in ps_eg table
geneid <- ps_eg[ind,2] #retrieve corresponding gene id for each rowID - all are unique so collapsing wont actually do anything
check(geneid)#chekc that all rowIDs were mapped to a gene id
##
## head:
## [1] "100" "10001" "10006" "100129361" "100133941" "10016"
##
## tail:
## [1] "9976" "998" "9984" "9987" "9988" "9989"
##
## str:
##
## chr [1:2160] "100" "10001" "10006" "100129361" "100133941" ...
## NULL
##
## length: 2160
##
## dim:
##
## total NA: 0
##
## total NULL: 0
##
## total duplicated: 0
##
## summary:
##
## Length Class Mode
## 2160 character character
#theseComplexGeneDetails
#convert geneid to complexid
ind<-match(geneid,theseComplexGeneDetails$ENTREZID)
#for every case where ind exists, map to correpsonding complex id
complexId<-theseComplexGeneDetails[ind,1]
#use complexIds as grouping ids where they exist for a probe,
rowGroup<-complexId
#...otherwise use the probe id as a group id
rowGroup[is.na(complexId)]<-rowID[is.na(complexId)]
check(rowGroup) #0 NA, 851 duplicates - good
##
## head:
## [1] "GO:0005764" "GO:0016592" "GO:0031209" "36023_at" "266_s_at"
## [6] "GO:0005768"
##
## tail:
## [1] "40698_at" "GO:0070062" "GO:0000346" "GO:0005681" "41808_at"
## [6] "GO:0030289"
##
## str:
##
## chr [1:2160] "GO:0005764" "GO:0016592" "GO:0031209" ...
## NULL
##
## length: 2160
##
## dim:
##
## total NA: 0
##
## total NULL: 0
##
## total duplicated: 851
##
## summary:
##
## Length Class Mode
## 2160 character character
myCollapse<-function(someMatrix)apply(someMatrix, 2, function(x){ return(x[which.max(abs(x))])}) #define a collapse function
collapseRows.results<-collapseRows(datET=tmp, rowGroup=rowGroup, rowID=rowID, method="function", methodFunction=myCollapse)#collapse
## Comment: make sure methodFunction takes a matrix as input.
## ...Ignore previous comment. Function completed properly!
tmpCollapsed<-collapseRows.results$datETcollapsed
dim(tmp)
## [1] 2160 79
dim(tmpCollapsed)
## [1] 1309 79
# > dim(tmp)
# [1] 2160 79
# > dim(tmpCollapsed)
# [1] 1309 79
# prepare expression set for ml
eData <- as.data.frame(t(tmpCollapsed))#transpose and cast as a data frame
newColNames <- paste("AAA", colnames(eData), sep="", collapse=NULL) #fix column names
#fix column names - need to replace :
newColNames<-gsub(":", "", newColNames)
learnThis <- ALLfilt_bcrneg$mol.biol ##finally, add the outcome variable separately
newColNames <- c("learnThis", newColNames)
eData <- cbind(learnThis, eData)
colnames(eData) <- newColNames
# do the random forest thang
length(learnThis)
## [1] 79
dim(eData)
## [1] 79 1310
rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
#....
rf1e.cm<-confuMat(rf1e)
rf1e.cm
## predicted
## given BCR/ABL NEG
## BCR/ABL 13 4
## NEG 0 22
acc(as.table(rf1e.cm))
## [1] 0.8974359
# predicted
# given BCR/ABL NEG
# BCR/ABL 13 4
# NEG 0 22
# > acc(as.table(rf1e.cm))
# [1] 0.8974359
#ok - makes no diff to performance - good - lets see whats important
The importance parameter of the random forest method allows us to examine which features had the largest impact on performance. Data about feture importance is retrieved using the getVarImp function of the MLInterfaces package
opar = par(no.readonly=TRUE, mar=c(7,7,4,2)) #opar=original parameters - keep these so they can be restored
par(las=2) #<-- style of the axis label - parallel to the axis
?getVarImp #<-- from the MLInterfaces package
impV1 = getVarImp(rf1e)
plot(impV1, n=15, plat="hgu95av2", toktype="SYMBOL") #<-- implementation of plot allows direct mapping of gene symbols
par(opar) #<-- restore the original graphics parameters
#some of the top hits are
#http://amigo.geneontology.org/amigo/term/GO:0034673
Session info
sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-apple-darwin13.1.0 (64-bit)
##
## locale:
## [1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] GO.db_3.0.0 WGCNA_1.42 fastcluster_1.1.16
## [4] dynamicTreeCut_1.62 MLInterfaces_1.46.0 cluster_2.0.1
## [7] annotate_1.44.0 XML_3.98-1.1 randomForest_4.6-10
## [10] RColorBrewer_1.1-2 bioDist_1.38.0 KernSmooth_2.23-14
## [13] hgu95av2.db_3.0.0 org.Hs.eg.db_3.0.0 RSQLite_1.0.0
## [16] DBI_0.3.1 AnnotationDbi_1.28.2 GenomeInfoDb_1.2.4
## [19] IRanges_2.0.1 S4Vectors_0.4.0 multtest_2.22.0
## [22] genefilter_1.48.1 ALL_1.7.1 Biobase_2.26.0
## [25] BiocGenerics_0.12.1 BiocInstaller_1.16.2
##
## loaded via a namespace (and not attached):
## [1] acepack_1.3-3.3 class_7.3-12 codetools_0.2-11
## [4] colorspace_1.2-6 digest_0.6.8 doParallel_1.0.8
## [7] evaluate_0.5.5 foreach_1.4.2 foreign_0.8-63
## [10] formatR_1.0 Formula_1.2-0 gdata_2.13.3
## [13] ggplot2_1.0.1 grid_3.1.1 gtable_0.1.2
## [16] gtools_3.4.1 Hmisc_3.15-0 htmltools_0.2.6
## [19] impute_1.40.0 iterators_1.0.7 knitr_1.9
## [22] lattice_0.20-30 latticeExtra_0.6-26 MASS_7.3-40
## [25] matrixStats_0.14.0 munsell_0.4.2 nnet_7.3-9
## [28] pls_2.4-3 plyr_1.8.1 preprocessCore_1.28.0
## [31] proto_0.3-10 Rcpp_0.11.5 rda_1.0.2-2
## [34] reshape_0.8.5 reshape2_1.4.1 rmarkdown_0.5.1
## [37] rpart_4.1-9 scales_0.2.4 sfsmisc_1.0-27
## [40] splines_3.1.1 stringr_0.6.2 survival_2.38-1
## [43] tools_3.1.1 xtable_1.7-4 yaml_2.1.13
Save session
save.image(file="analysis.RData")
#load(file="analysis.Rdata")