- Clustering
- Book chapter 8 "Basic Machine Learning", Clustering section
- Batch Effects
- Book chapter 9 "Batch Effects"
March 17, 2016
Similarity and Dissimilarity have fewer yet:
Recall the tissue gene expression dataset:
##biocLite("genomicsclass/tissuesGeneExpression") library(tissuesGeneExpression) data(tissuesGeneExpression) dim(e) ##gene expression data
## [1] 22215 189
table(tissue) ##tissue[i] corresponds to e[,i]
## tissue ## cerebellum colon endometrium hippocampus kidney liver ## 38 34 15 31 39 26 ## placenta ## 6
d.euclidian <- dist( t(e) )
d.pearson <- as.dist( 1 - cor(e) )
** Using default hclust()
settings ("complete" linkage)
** "single" linkage (uncommon)
Note that many equivalent re-orderings exist:
?reorder.dendrogram
hclust()
default, Euclidian distancehclust()
default, 1 - Pearson Correlation distanceUsing the above cutoff, how do clusters overlap with the actual tissues?
hclusters <- cutree(hc.pearson, h=0.125) table(true=tissue, cluster=hclusters)
## cluster ## true 1 2 3 4 5 6 7 8 9 10 11 12 13 ## cerebellum 0 0 0 0 31 0 0 2 0 0 5 0 0 ## colon 0 0 0 0 0 34 0 0 0 0 0 0 0 ## endometrium 0 0 0 0 0 0 0 0 0 15 0 0 0 ## hippocampus 0 0 29 2 0 0 0 0 0 0 0 0 0 ## kidney 19 18 0 0 0 0 0 2 0 0 0 0 0 ## liver 0 0 0 0 0 0 24 0 2 0 0 0 0 ## placenta 0 0 0 0 0 0 0 0 0 0 0 2 4
Alternatively, specifying the number of clusters:
hclusters <- cutree(hc.pearson, k=length(unique(tissue)) + 1 ) table(true=tissue, cluster=hclusters)
## cluster ## true 1 2 3 4 5 6 7 8 ## cerebellum 0 0 36 0 0 0 2 0 ## colon 0 0 0 0 34 0 0 0 ## endometrium 15 0 0 0 0 0 0 0 ## hippocampus 0 0 29 2 0 0 0 0 ## kidney 19 18 0 0 0 0 2 0 ## liver 0 0 0 0 0 24 2 0 ## placenta 0 0 0 0 0 0 0 6
set.seed(1); km <- kmeans(t(e), centers=8) table(Kmeans=km$cluster, Hclust=hclusters.euclidian)
## Hclust ## Kmeans 1 2 3 4 5 6 7 8 ## 1 0 0 0 34 0 0 0 0 ## 2 37 0 0 0 0 0 0 0 ## 3 0 0 0 0 0 0 5 0 ## 4 0 12 19 0 0 0 0 0 ## 5 0 0 0 0 0 6 0 0 ## 6 0 0 31 0 0 0 0 0 ## 7 0 0 0 0 24 0 0 0 ## 8 15 0 0 0 0 0 0 6
library(cluster)
[1] Koren, O. et al. A guide to enterotypes across the human body: meta-analysis of microbial community structures in human microbiome datasets. PLoS Comput. Biol. 9, e1002863 (2013).
Exercises: Look at ?pheatmap::pheatmap
, try example("pheatmap::pheatmap")
?cluster::pam
, ?pamr::pamr.cv
)?NMF::nmf
)library(Biobase) library(genefilter) library(GSE5859) ## BiocInstaller::biocLite("genomicsclass/GSE5859") data(GSE5859) geneExpression = exprs(e) sampleInfo = pData(e)
head(table(sampleInfo$date))
## ## 2002-10-31 2002-11-15 2002-11-19 2002-11-21 2002-11-22 2002-11-27 ## 2 2 2 5 4 2
year <- as.integer(format(sampleInfo$date,"%y") ) year <- year - min(year) month = as.integer( format(sampleInfo$date,"%m") ) + 12*year table(year, sampleInfo$ethnicity)
## ## year ASN CEU HAN ## 0 0 32 0 ## 1 0 54 0 ## 2 0 13 0 ## 3 80 3 0 ## 4 2 0 24
pc <- princomp(geneExpression, cor=TRUE) boxplot(pc$loadings[, 2] ~ month, varwidth=TRUE, notch=TRUE, main="PC2 vs. month", xlab="Month", ylab="PC2")
A starting point for a color palette:
RColorBrewer::display.brewer.all(n=3, colorblindFriendly = TRUE)
Interpolate one color per month on a quantitative palette:
col3 <- c(RColorBrewer::brewer.pal(n=3, "Greys")[2:3], "black") MYcols <- colorRampPalette(col3, space="Lab")(length(unique(month)))
d <- as.dist(1 - cor(geneExpression)) mds <- cmdscale(d) plot(mds, col=MYcols[as.integer(factor(month))], main="MDS shaded by batch")
hcclass <- cutree(hclust(d), k=5) table(hcclass, year)
## year ## hcclass 0 1 2 3 4 ## 1 0 2 8 1 0 ## 2 25 43 0 2 0 ## 3 6 0 0 0 0 ## 4 1 7 0 0 0 ## 5 0 2 5 80 26