Exercise 7

An example from HSAUR: more about "kmeans" clustering

library(HSAUR)
## Loading required package: tools
library(scatterplot3d)
library(cluster)
library(gplots)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
library(MASS)
library(marray)
## Loading required package: limma

Loading data

data(planets)

The dataset has 101 individuals (planets) and three columns describing their properties: mass, period and eccentricity. We will try to cluster this dataset. First, plot the log-transformed data in a three-dimensional space, with one axis per property.

scatterplot3d(log(planets$mass), log(planets$period), log(planets$eccen), type = "h", 
    angle = 55, pch = 16, y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1, 
    scale.y = 0.7)

(Plot: 3D scatterplot of the log-transformed planet data)

Transforming data

# scale each variable by its range so that all three are comparable
rge <- apply(planets, 2, max) - apply(planets, 2, min)
planet.dat <- sweep(planets, 2, rge, FUN = "/")

The sweep() function is useful here. In general, one might know the number of clusters before clustering; for example, there would be two clusters in a cancer-versus-control experiment. In this case, however, we do not know how many clusters are optimal. The idea is therefore to cluster into 2 to N groups and inspect the within-groups sum of squares for each choice. For example:

n <- nrow(planet.dat)
wss <- rep(0, 10)
# with a single cluster, the within-groups sum of squares is the total variance
wss[1] <- (n - 1) * sum(apply(planet.dat, 2, var))
# for 2 to 10 clusters, sum the within-cluster sums of squares from kmeans()
for (i in 2:10) {
    wss[i] <- sum(kmeans(planet.dat, centers = i)$withinss)
}

Note that if we choose N equal to the number of data points, the sum of squared errors is necessarily zero: the result is extremely overfitted and meaningless, so we cannot select a large N. The idea is to pick the number of clusters at the point where the within-groups sum of squares stops decreasing dramatically. Unfortunately, in our case this plot does not give a very clear clue about the optimal number of clusters; we will cluster into 3 groups.

Clustering

planet_kmeans3 <- kmeans(planet.dat, centers = 3)

How many individuals (planets) are in each cluster?

table(planet_kmeans3$cluster)
## 
##  1  2  3 
## 14 34 53
plot(1:10, wss, type = "b", xlab = "Number of groups", ylab = "Within groups sum of squares")

(Plot: within-groups sum of squares versus number of clusters)

Write your own function to calculate the mean value of mass, period and eccentricity in each cluster.

for (i in sort(unique(planet_kmeans3$cluster))) {
    print(apply(planet.dat[which(i == planet_kmeans3$cluster), ], 2, mean))
}
##   mass period  eccen 
## 0.6056 0.3161 0.3954 
##   mass period  eccen 
## 0.1678 0.1150 0.5344 
##    mass  period   eccen 
## 0.09576 0.07984 0.13155

Compare your own code with code in block 1. Do you have a more efficient way?
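One possible answer (a sketch, assuming planet.dat and planet_kmeans3 from above): aggregate() computes the per-cluster column means in a single call, and kmeans() itself already stores the cluster means in its centers component.

# per-cluster column means in one call
aggregate(planet.dat, by = list(cluster = planet_kmeans3$cluster), FUN = mean)

# kmeans() already returns the same information
planet_kmeans3$centers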

Distance measurement

Write your own function for the following distances:

euclidean
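A minimal sketch (the function name euclidean is my own choice; x and y are numeric vectors of equal length):

euclidean <- function(x, y) {
    # square root of the summed squared coordinate differences
    sqrt(sum((x - y)^2))
}
euclidean(c(5, 4, 3), c(1, 1, 1))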

maximum (Chebyshev distance)
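A sketch for the maximum (Chebyshev) distance, i.e. the largest absolute coordinate difference (the name chebyshev is my own):

chebyshev <- function(x, y) {
    max(abs(x - y))
}
chebyshev(c(5, 4, 3), c(1, 1, 1))
# compare with dist(rbind(x, y), method = "maximum")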

manhattan

# sum of the absolute coordinate differences
manhattan <- function(x, y) {
    len <- 0
    for (i in 1:length(x)) {
        len <- len + abs(x[i] - y[i])
    }
    return(len)
}
x <- c(5, 4, 3)
y <- c(1, 1, 1)

manhattan(x, y)
## [1] 9

Canberra
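A sketch using one common definition, sum(|x_i - y_i| / (|x_i| + |y_i|)). R's dist() omits terms with zero numerator and denominator; this sketch does not handle that case.

canberra <- function(x, y) {
    sum(abs(x - y) / (abs(x) + abs(y)))
}
canberra(c(5, 4, 3), c(1, 1, 1))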

minkowski
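A sketch of the Minkowski distance of order p (p = 1 gives the manhattan distance, p = 2 the euclidean distance):

minkowski <- function(x, y, p = 2) {
    sum(abs(x - y)^p)^(1 / p)
}
minkowski(c(5, 4, 3), c(1, 1, 1), p = 3)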

Spearman correlation coefficients
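One common way to turn a correlation into a distance is 1 - correlation; a sketch (spearman_dist is my own name):

spearman_dist <- function(x, y) {
    1 - cor(x, y, method = "spearman")
}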

Pearson correlation coefficients
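Analogously for Pearson (pearson_dist is my own name):

pearson_dist <- function(x, y) {
    1 - cor(x, y, method = "pearson")
}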

Breast Cancer Data

I could not find a built-in way to color the leaves of a dendrogram, but there is a function for this purpose:

source("ftp://129.187.44.58/share/chen/R_Functions/plothclust.R")

Check the source code of plothclust to see an example of how to use this function. Load the breast cancer data used in module 3.

load("~/Dropbox/Uni/Master/HT_Course/Module_3_HypothesisTesting/breastCancerMAINZ_module2.RData")

Calculate the combinations


n <- 76
k <- 2
Bino.Koeffizient <- (factorial(n))/(factorial(k) * factorial(n - k))
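The built-in choose() computes the same binomial coefficient directly, without forming the large factorials:

choose(76, 2)
## [1] 2850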

Find differentially expressed genes

# extract the p-value from an htest object
givesP <- function(x) {
    x$p.value
}
# two-sample t test between columns 1-38 and 39-78 of each gene
t_Parted <- function(x) {
    t.test(x[1:38], x[39:78], var.equal = T)
}

result <- apply(breast_expr, 1, t_Parted)
x <- lapply(result, givesP)

pvalue <- unlist(x)
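A more compact alternative (a sketch; pvalue_alt is just an illustrative name, using the same column split as above):

pvalue_alt <- apply(breast_expr, 1, function(x) {
    t.test(x[1:38], x[39:78], var.equal = TRUE)$p.value
})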



fdr_v <- p.adjust(pvalue, method = "fdr")
fdrFrame <- data.frame(fdr = fdr_v, p = pvalue)
# two columns are needed so that we can sort the frame later

Make the Subset



fdrFrame <- subset(x = fdrFrame, subset = fdr < 0.01)

Sort by FDR


fdrFrameSort <- fdrFrame[with(fdrFrame, order(fdr)), ]

fdrFrameSort100 <- fdrFrameSort[1:100, ]
fdrFrameSort100$p <- NULL

Now get the original expression data for these genes

breast100 <- breast_expr[which(rownames(breast_expr) %in% rownames(fdrFrameSort100)), 
    ]
single = hclust(dist(breast100, method = "euclidean"), method = "single")

plot(single)

(Plot: dendrogram from single-linkage clustering of the FDR-filtered data)



singledend = as.dendrogram(single)

fac = as.factor(erStatus)
mybarcolor = fac
hmcol <- maPalette(low = "red", high = "green", mid = "black", k = 100)

# levels(mybarcolor) = c('lightblue', 'darkblue', 'darkorange',
# 'darkorange4')

heatmap.2(as.matrix(t(breast100)), col = hmcol, trace = "none", key = TRUE, 
    density.info = "none", labRow = NA, dendrogram = "col")

(Plot: heatmap of the FDR-filtered data with column dendrogram)

Compare different agglomeration methods for clustering (using the function hclust). Which method separates ER-positive and ER-negative patients best?
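One possible way to compare them (a sketch, assuming breast100 and erStatus from above, and that the columns of breast100 are the patients): cut each tree into two groups and tabulate the groups against ER status.

d <- dist(t(breast100), method = "euclidean")  # cluster patients, hence t()
for (m in c("single", "complete", "average")) {
    groups <- cutree(hclust(d, method = m), k = 2)
    cat("\n", m, "\n")
    print(table(groups, erStatus))
}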

Specific filtering of the data (filtering based on p-value): find the differentially expressed genes (t test, FDR < 0.01), then cluster the patients based on these genes with different distance measurements and agglomeration methods. Try the distance measures listed above.

Unspecific filtering of the data (filtering based on standard deviation): retain genes with a standard deviation > 1.5 across all patients, then cluster using the same distance measurements.

sd_all <- apply(breast_expr, 1, sd)
breastSD <- which(sd_all > 2.5)
breastSD.df <- breast_expr[which(rownames(breast_expr) %in% names(breastSD)), 
    ]

single = hclust(dist(breastSD.df, method = "euclidean"), method = "single")

plot(single)

(Plot: dendrogram from single-linkage clustering of the SD-filtered data)



singledend = as.dendrogram(single)

fac = as.factor(erStatus)
mybarcolor = fac
hmcol <- maPalette(low = "red", high = "green", mid = "black", k = 100)

# levels(mybarcolor) = c('lightblue', 'darkblue', 'darkorange',
# 'darkorange4')

heatmap.2(as.matrix(t(breastSD.df)), col = hmcol, trace = "none", key = TRUE, 
    density.info = "none", labRow = NA, dendrogram = "col")

(Plot: heatmap of the SD-filtered data with column dendrogram)

Compare this result with the clustering of the specifically filtered genes (filtered by p-value or FDR) and discuss.

Optional: PCA on the specifically filtered data: perform PCA on the genes selected by the specific (p-value/FDR) filter.

Plot patient on PC1 vs. PC2, PC1 vs. PC3 and PC2 vs. PC3.
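A sketch of this optional PCA step (assuming breast100 from above with genes in rows, so the patients must be transposed into rows; colouring by erStatus is my own choice):

pca <- prcomp(t(breast100), scale. = TRUE)
for (p in list(c(1, 2), c(1, 3), c(2, 3))) {
    plot(pca$x[, p[1]], pca$x[, p[2]],
         col = as.integer(as.factor(erStatus)), pch = 16,
         xlab = paste0("PC", p[1]), ylab = paste0("PC", p[2]))
}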

Compare the result with the clustering result (Euclidean distance and complete agglomeration method).

Combining the results of clustering and PCA, can you identify sub-clusters within the ER-positive or ER-negative patients?

What could be the reasons for the misclassified patients?