We will follow the sample analysis in:

K. S. Pollard and M. J. van der Laan. “Cluster Analysis of Genomic Data” in Gentleman, R., Carey, V.J., et al. Bioinformatics and Computational Biology Solutions Using R and Bioconductor. Springer, 2005. http://cbcb.umd.edu/~hcorrada/CFG/readings/Solutions_ch13.pdf

We will use the Golub leukemia microarray data.

To install missing packages.

source("http://bioconductor.org/biocLite.R")
biocLite("hopach")
library(hopach)
## Loading required package: cluster
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist, unsplit
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
data(golub)
dim(golub)
## [1] 3051   38
golub.labels = c(rep("ALL", 27), rep("AML", 11))
golub.labels
##  [1] "ALL" "ALL" "ALL" "ALL" "ALL" "ALL" "ALL" "ALL" "ALL" "ALL" "ALL"
## [12] "ALL" "ALL" "ALL" "ALL" "ALL" "ALL" "ALL" "ALL" "ALL" "ALL" "ALL"
## [23] "ALL" "ALL" "ALL" "ALL" "ALL" "AML" "AML" "AML" "AML" "AML" "AML"
## [34] "AML" "AML" "AML" "AML" "AML"
rownames(golub) <- golub.gnames[,3]
colnames(golub) <- golub.labels

Select features with high variance. In a proper study, you would use a statistical method to select differentially expressed genes (like limma).

vars<-apply(golub,1,var)
subset<-vars>quantile(vars,(nrow(golub)-100)/nrow(golub))
golub.subset<-golub[subset,]
gnames.subset<-golub.gnames[subset,]

Pre-compute gene distances.

gene.dist <- distancematrix(golub.subset, d = "cosangle")

Run hopach to cluster features using the distances we computed.

gene.hobj <- hopach(golub.subset, dmat = gene.dist)
gene.hobj$clust$k
## [1] 27
table(gene.hobj$clust$sizes)
## 
##  1  2  3  4  8 15 38 
## 15  4  4  1  1  1  1

Pick a good value of k, and run PAM on the same data. The hopach package changed the defaults, so you need to extract the distance matrix from the gene.dist objects.

bestk <- silcheck(dissvector(as.matrix(gene.dist)), diss = TRUE)[1]
pamobj <- pam(dissvector(as.matrix(gene.dist)), k = bestk,
diss = TRUE)
round(pamobj$clusinfo, 2)
##      size max_diss av_diss diameter separation
## [1,]    4     0.08    0.05     0.15       0.49
## [2,]   14     0.55    0.31     0.95       0.18
## [3,]   29     0.68    0.40     1.20       0.25
## [4,]   13     0.87    0.39     1.12       0.19
## [5,]   17     0.61    0.35     1.05       0.10
## [6,]   11     0.79    0.32     0.98       0.25
## [7,]   12     0.73    0.32     1.15       0.10

Pam finds 7 clusters.

How about other algorithms?

heatmap(golub.subset)

How stable are the hopach clusters?

bobj<-boothopach(golub.subset,gene.hobj,B=100)
bootplot(bobj,gene.hobj,ord="bootp",main="Golub AML/ALL Data (1999)",showclusters=FALSE)

Clustering the samples.

array.hobj <- hopach(t(golub.subset), d = "euclid")
array.hobj$clust$k
## [1] 17
dplot(distancematrix(t(golub.subset), d = "euclid"), array.hobj, labels = golub.labels, main = "Golub AML/ALL: Array Clustering")