Ian Donaldson
Tuesday, March 24th, 2015
40mins presentation outlining their skills and expertise relevant to the position.
If possible this should contain at least one example of using machine learning in a predictive modelling (classification/regression) setting.
Bioconductor case studies
(Hahne, Huber, Gentleman, Falcon)
Chapter 8 - Supervised machine learning
Then my own code
biocLite("ALL")
library(ALL)
data(ALL)
?ALL
The data consist of microarrays from 128 different individuals with acute lymphoblastic leukemia (ALL). A number of additional covariates are available. The data have been normalized (using rma) and it is the jointly normalized data that are available here. The data are presented in the form of an exprSet object.
load(file="analysis.Rdata")
#ls()
bcell = grep("^B", as.character(ALL$BT))
#bcell is a vector indices of individuals with B-cell type leukemia
moltyp = which(as.character(ALL$mol.biol) %in% c("NEG", "BCR/ABL"))
#moltype is a vector of indices of individuals with BCR/ABL translocations or no cytogentic abnormalities (NEG)
ALL_bcrneg = ALL[, intersect(bcell, moltyp)]
#get a list of individuals meeting both criteria for our data subset
# ALL_bcrneg is an ExpressionSet
exprs(ALL_bcrneg)[1:3,1:3]
01005 01010 03002
1000_at 7.597323 7.479445 7.567593
1001_at 5.046194 4.932537 4.799294
1002_f_at 3.900466 4.208155 3.886169
Remove probe data with little information.
Using standard deviations and “shorth”
The “shorth” is the shortest interval that covers half the values in a vector x.
genefilter package
robust estimator of location good for non-normal data
has implications for results left after multiple hypothesis correction
#library("genefilter")
?genefilter #<- a package for filtering gene expression probe data
expressionData <- exprs(ALL_bcrneg)
str(expressionData) #<-matrix of 12625 probes by 79 samples (individuals)
sds = rowSds(expressionData) #<-the standard deviation of each row (rowSds is from genefilter package)
sh = shorth(sds)
sds_alternative <- apply(expressionData, 1, sd) #alternative way of coding the same thing
sds_alternative[1:5]
rowttests(ALLsfilt, "mol.biol")
Most of these won't survive multiple-hypothesis testing anyway, but a few may bear further examination.
we need to correct for doing so many tests
Benjamini and Hochberg procedure uses the false-discovery rate (FDR) to recalculate p-values
p_adjust <- p.adjust(tt$p.value, method="BH")
num_significant <- sum(tt$p.value < 0.05)
num_significant #<-- 1155
#functions in the multtest package give identical results
probe_id symbol
1 1635_at ABL1
2 1636_g_at ABL1
3 1674_at YES1
4 32434_at MARCKS
5 37015_at ALDH1A1
6 37027_at AHNAK
7 39730_at ABL1
8 39837_s_at ZNF467
9 40202_at KLF9
10 40504_at PON2
What does significant mean anyway?
Such a ranked list might be used as input for further analysis including
Predict the presence or absence of a BCL/ABL translocation given an expression profile.
table(ALL_bcrneg$mol.biol)
BCR/ABL NEG
37 42
ALLfilt_bcrneg = nsFilter(ALL_bcrneg, var.cutoff=0.75)$eset`
#calculate euclidean distances between samples
eucD = dist(t(exprs(ALLfilt_bcrneg)))
#create sample indices for the training and test sets
set.seed(1234)
Negs = which(ALLfilt_bcrneg$mol.biol == "NEG")
Bcr = which(ALLfilt_bcrneg$mol.biol == "BCR/ABL")
S1 = sample(Negs, 20, replace=FALSE)
S2 = sample(Bcr, 20, replace = FALSE)
TrainInd = c(S1, S2)
TestInd = setdiff(1:79, TrainInd)
Use KNN to predict the presence or absence of the BCL/ABL translocation
kans = MLearn( mol.biol ~ ., data=ALLfilt_bcrneg, knnI(k=1,l=0), TrainInd)
kans.cm <- confuMat(kans)
kans.cm
predicted
given BCR/ABL NEG
BCR/ABL 16 1
NEG 4 18
cm <- as.table(kans.cm)
precision(cm)
BCR/ABL NEG
0.9411765 0.8181818
recall(cm)
BCR/ABL NEG
0.8000000 0.9473684
acc(cm)
[1] 0.8717949
#redo knn
#library("MLInterfaces")
kans.bf = MLearn( mol.biol ~ ., data=BNf, knnI(k=1,l=0), TrainInd)
kans.bf.cm <- confuMat(kans.bf)
kans.bf.cm
predicted
given BCR/ABL NEG
BCR/ABL 14 3
NEG 1 21
cm<-as.table(kans.bf.cm)
precision(cm)
BCR/ABL NEG
0.8235294 0.9545455
recall(cm)
BCR/ABL NEG
0.9333333 0.8750000
acc(cm)
[1] 0.8974359
set.seed(123)
#the guideline rule for mtry is the sqrt of the number of features
dim(ALLfilt_bcrneg)
sqrt(2160)
rf1 = MLearn( mol.biol~., data=ALLfilt_bcrneg, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
confuMat(rf1)
acc(as.table(confuMat(rf1)))
confuMat(rf1)
predicted
given BCR/ABL NEG
BCR/ABL 13 4
NEG 0 22
acc(as.table(confuMat(rf1)))
[1] 0.8974359
Cross-validation is built into this method and the results are more interpretable.
importance parameter of the random forest method allows us to examine which features had the largest impact on performance
data about feature importance is retrieved using the getVarImp function of the MLInterfaces package
opar = par(no.readonly=TRUE, mar=c(7,5,4,2)) #opar=original parameters - keep these so they can be restored
par(las=2) #<-- style of the axis label - parallel to the axis
?getVarImp #<-- from the MLInterfaces package
impV1 = getVarImp(rf1)
plot(impV1, n=15, plat="hgu95av2", toktype="SYMBOL") #<-- implementation of plot allows direct mapping of gene symbols
par(opar) #<-- restore the original graphics parameters
Retrieve complexes from GO
library("AnnotationDbi")
library(GO.db)
searchTerms <- c("complex", "some", "mere")
#...
query <- paste("select go_id from go_term where term like '%", searchTerm, "%'", sep = "")
#...
allGoidDetails[c(1:3), c(2,3)]
TERM ONTOLOGY
1 phosphopyruvate hydratase complex CC
2 nucleotide-excision repair complex CC
3 nucleotide-excision repair factor 1 complex CC
# ...
myCollapse<-function(someMatrix)apply(someMatrix, 2, function(x){ return(x[which.max(abs(x))])}) #define a collapse function
#
collapseRows.results<-collapseRows(datET=tmp, rowGroup=rowGroup, rowID=rowID, method="function", methodFunction=myCollapse)#collapse
This was the hard part
# prepare expression set for ml
eData <- as.data.frame(t(tmpCollapsed))#transpose and cast as a data frame
newColNames <- paste("AAA", colnames(eData), sep="", collapse=NULL) #fix column names
#fix column names - need to replace :
newColNames<-gsub(":", "", newColNames)
learnThis <- ALLfilt_bcrneg$mol.biol ##finally, add the outcome variable separately
newColNames <- c("learnThis", newColNames)
eData <- cbind(learnThis, eData)
colnames(eData) <- newColNames
rf1e = MLearn( learnThis~., data=eData, randomForestI, TrainInd, ntree=1000, mtry=55, importance=TRUE)
rf1e.cm<-confuMat(rf1e)
rf1e.cm
predicted
given BCR/ABL NEG
BCR/ABL 13 4
NEG 0 22
acc(as.table(rf1e.cm))
[1] 0.8974359