Clustering with Data Mining


0. About our data

  • The gene information tells how many copies a person has. Each observation is drawn from 6 probes, denoted by the variables P4075, P4076, P4077, P4078, P4079, and P4080.
  • The data contain observations for 405 genes (people?).

1. Let’s check what our data look like (linearity, normality, etc.)

We have 6 variables, and each appears to have 2 missing values (NA).

cnp6 <- read.csv("./cnp6.csv")
head(cnp6)
  RowNames p4075 p4076 p4077 p4078 p4079 p4080
1   BA0485  1.26  1.07  1.31  0.78  0.97  1.11
2   BA0493  1.07  0.64  0.81  1.13  1.08  1.48
3   BA0517  0.93  0.89  1.13  1.02  1.04  1.23
4   BA0523  0.93  0.96  1.16  0.82  0.84  1.17
5   BA0543  1.17  0.87  0.74  0.99  0.96  0.86
6   BA0559  0.72  0.87  1.07  0.99  1.08  1.14
summary(cnp6)
    RowNames       p4075           p4076           p4077      
 BA0227 :  1   Min.   :0.440   Min.   :0.560   Min.   :0.590  
 BA0229 :  1   1st Qu.:0.950   1st Qu.:0.970   1st Qu.:0.980  
 BA0267 :  1   Median :1.040   Median :1.090   Median :1.120  
 BA0269 :  1   Mean   :1.057   Mean   :1.102   Mean   :1.136  
 BA0271 :  1   3rd Qu.:1.150   3rd Qu.:1.210   3rd Qu.:1.270  
 BA0273 :  1   Max.   :2.150   Max.   :2.010   Max.   :2.240  
 (Other):399   NA's   :2       NA's   :2       NA's   :2      
     p4078           p4079            p4080      
 Min.   :0.650   Min.   :0.6600   Min.   :0.660  
 1st Qu.:0.930   1st Qu.:0.8900   1st Qu.:0.980  
 Median :1.000   Median :0.9800   Median :1.090  
 Mean   :1.011   Mean   :0.9888   Mean   :1.111  
 3rd Qu.:1.080   3rd Qu.:1.0700   3rd Qu.:1.230  
 Max.   :1.580   Max.   :1.7200   Max.   :1.820  
 NA's   :2       NA's   :2        NA's   :2      
str(cnp6)
'data.frame':   405 obs. of  7 variables:
 $ RowNames: Factor w/ 405 levels "BA0227","BA0229",..: 39 40 41 42 43 44 45 46 275 276 ...
 $ p4075   : num  1.26 1.07 0.93 0.93 1.17 0.72 0.94 0.92 0.78 0.96 ...
 $ p4076   : num  1.07 0.64 0.89 0.96 0.87 0.87 0.86 0.95 1.27 0.85 ...
 $ p4077   : num  1.31 0.81 1.13 1.16 0.74 1.07 1.26 1.03 0.93 0.94 ...
 $ p4078   : num  0.78 1.13 1.02 0.82 0.99 0.99 1.06 1.03 0.78 1.17 ...
 $ p4079   : num  0.97 1.08 1.04 0.84 0.96 1.08 1.03 1 0.86 1.06 ...
 $ p4080   : num  1.11 1.48 1.23 1.17 0.86 1.14 1.17 1.18 0.98 0.89 ...

Let’s remove missing values

# Count how many missing values each variable has, using a function.
counting.missing.values <- function(x) {
  count.na <- rep(NA, length(x) - 1)
  for (i in 1:(length(x) - 1)) {  # loop over the probe columns, skipping RowNames
    count.na[i] <- sum(is.na(x[, i + 1]))
  }
  print(count.na)
}

counting.missing.values(cnp6)
[1] 2 2 2 2 2 2
# Let's remove the missing values.
cnp6.narm <- na.omit(cnp6)
dim(cnp6.narm)
[1] 403   7
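
For reference, the same per-column count of missing values can be obtained in one line with colSums(); this is just an alternative sketch, not the approach used above.

colSums(is.na(cnp6[, -1]))  # NAs per probe column, excluding RowNames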



Linear relationships between the variables

  • P4080 & P4079, P4080 & P4078, and P4079 & P4075 seem to have a linear relationship.
  • By contrast, P4079 & P4077 and P4080 & P4076 seem to have almost no linear relationship (essentially random).
library(ggplot2)
panel.lm <- function(x, y, col=par("col"), bg=NA, pch=par("pch"),
                     cex=1, col.smooth="black", ...) {
  points(x, y, pch = 20, col = alpha("midnightblue", 0.2), bg = bg, cex = cex)
  abline(stats::lm(y~x), col = col.smooth, ...)
}

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if (missing(cex.cor)) cex.cor <- 0.8 / strwidth(txt)
  text(.5, 0.5, txt, cex = cex.cor * r)
}


panel.hist <- function(x, ...) {
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5))
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  nB <- length(breaks)
  y <- h$counts
  y <- y / max(y)
  rect(breaks[-nB], 0, breaks[-1], y, col = "darkblue", ...)
}


pairs(
  cnp6.narm[2:7],
  lower.panel = panel.lm,
  upper.panel = panel.cor,
  diag.panel = panel.hist,
  pch = 20,
  main = "scatter-plot matrix, correlation coef., histogram"
)

Let’s do a PCA analysis. -> The result does not seem significant.

pr.out <- prcomp(cnp6.narm[2:7], scale = TRUE)
Cols <- function(vec) {
  cols <- rainbow(length(unique(vec)))
  return(cols[as.numeric(as.factor(vec))])
}
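
Since the prose above only states the conclusion, here is a minimal sketch of how the PCA result could be inspected (the exact variance shares are not reproduced here):

summary(pr.out)                                         # proportion of variance explained per PC
screeplot(pr.out, type = "lines", main = "Scree plot")  # look for an elbow in the eigenvalues
plot(pr.out$x[, 1:2], pch = 20, col = alpha("midnightblue", 0.4),
     xlab = "PC1", ylab = "PC2", main = "First two principal components")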

2. Hierarchical Clustering

Which method fits best for our data?

  • Complete linkage computed on the maximum-distance matrix works best (see the comparison sketch after the dendrogram below).
library(NbClust)
library(factoextra)
Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
cnp.scaled <- cbind(cnp6.narm$RowNames, scale(cnp6.narm[, 2:7]))
# cnp.dist.man <- dist(cnp.scaled, method = "manhattan")
# cnp.dist.euc <- dist(cnp.scaled, method = "euclidean")
# cnp.dist.can <- dist(cnp.scaled, method = "canberra")
cnp.dist.max <- dist(cnp.scaled, method = "maximum")
# cnp.dist.mink <- dist(cnp.scaled, method = "minkowski")
#
# cnp.hclust.man <- hclust(cnp.dist.man, method = "average")
# cnp.hclust.euc <- hclust(cnp.dist.euc, method = "average")
# cnp.hclust.can <- hclust(cnp.dist.can, method = "average")
# cnp.hclust.max1 <- hclust(cnp.dist.max, method = "average") #less good
cnp.hclust.max2 <- hclust(cnp.dist.max, method = "complete")
# cnp.hclust.mink <- hclust(cnp.dist.mink, method = "average")
# summary(cnp.hclust.man)
# summary(cnp.hclust.euc)
# summary(cnp.hclust.can)
summary(cnp.hclust.max2)
            Length Class  Mode     
merge       804    -none- numeric  
height      402    -none- numeric  
order       403    -none- numeric  
labels      403    -none- character
method        1    -none- character
call          3    -none- call     
dist.method   1    -none- character
# summary(cnp.hclust.mink)
#
#
# par(mfrow=c(2,2))
#
# plot(cnp.hclust.euc)
# plot(cnp.hclust.man)
# plot(cnp.hclust.can)
# plot(cnp.hclust.euc)
# plot(cnp.hclust.max1)
plot(cnp.hclust.max2)
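
One way to compare linkage methods more quantitatively (a sketch, not part of the original analysis) is the agglomerative coefficient from the cluster package; values closer to 1 indicate a stronger clustering structure.

library(cluster)  # assumed available; not used elsewhere in this document
linkages <- c("average", "single", "complete", "ward")
sapply(linkages, function(m) agnes(cnp.dist.max, diss = TRUE, method = m)$ac)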

3. Determining the number of clusters

NbClust

  • Dividing the data into 3 groups seems to be best.
library("NbClust")
nb <- NbClust(
  cnp.scaled, distance = "maximum",
  min.nc = 2, max.nc = 10, method = "kmeans"
)

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 5 proposed 2 as the best number of clusters 
* 10 proposed 3 as the best number of clusters 
* 1 proposed 4 as the best number of clusters 
* 1 proposed 8 as the best number of clusters 
* 1 proposed 9 as the best number of clusters 
* 5 proposed 10 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  3 
 
 
******************************************************************* 
nbc <- fviz_nbclust(nb)
Among all indices: 
===================
* 2 proposed  0 as the best number of clusters
* 1 proposed  1 as the best number of clusters
* 5 proposed  2 as the best number of clusters
* 10 proposed  3 as the best number of clusters
* 1 proposed  4 as the best number of clusters
* 1 proposed  8 as the best number of clusters
* 1 proposed  9 as the best number of clusters
* 5 proposed  10 as the best number of clusters

Conclusion
=========================
* According to the majority rule, the best number of clusters is  3 .
nbc

Let’s visualize the result.

  • number of clusters : 3
  • distance method : maximum
  • clustering method : complete
library(RColorBrewer)
fviz_dend(
  cnp.hclust.max2, k = 3, cex = 0.3,
  k_colors = brewer.pal(3, "Set1"),
  color_labels_by_k = TRUE, rect = TRUE
)

hc.class <- cutree(cnp.hclust.max2, k = 3)
table(hc.class)
hc.class
  1   2   3 
140 160 103 
plot(table(hc.class))

4. Kmeans clustering

This is the distance matrix visualized as a plot.

fviz_dist(
  cnp.dist.max,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)

m <- 20
set.seed(100)
wss <- numeric(m)
for (i in 1:m) {
  cnp.km <- kmeans(cnp.scaled[, 2:7], centers = i)
  wss[i] <- cnp.km$tot.withinss
}

plot(
  wss, type = "b",
  xlab = "K (number of clusters)",
  ylab = "Total within sum of squares"
)

cnp.km.3 <- kmeans(
  cnp.scaled[, 2:7],
  centers = 3, nstart = 40
)

Visualizing results

panel.lm.km <- function(x, y, col=par("col"), bg=NA, pch=par("pch"),
                        cex=0.2, col.smooth="black", ...) {
  points(x, y, pch = cnp.km.3$cluster, col = cnp.km.3$cluster, bg = bg, cex = cex)
  abline(stats::lm(y~x), col = col.smooth, ...)
}

pairs(
  cnp.scaled[, 2:7],
  lower.panel = panel.lm.km,
  upper.panel = panel.cor,
  diag.panel = panel.hist,
  pch = 20,
  main = "scatter-plot matrix, correlation coef., histogram"
)

fviz_cluster(cnp.km.3, data = cnp.scaled[, 2:7])

cnp.km.3$cluster <- as.factor(cnp.km.3$cluster)
# ggplot(cnp.scaled, aes(p4075, p4076, color = cnp.km$cluster)) + geom_point()
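
A runnable version of the commented ggplot call above, as a minimal sketch: it converts the scaled matrix to a data frame first and uses the 3-cluster labels (cnp.km.3) rather than the cnp.km object left over from the elbow loop.

cnp.scaled.df <- as.data.frame(cnp.scaled[, 2:7])  # cnp.scaled is a plain matrix
cnp.scaled.df$cluster <- cnp.km.3$cluster          # already converted to a factor above
ggplot(cnp.scaled.df, aes(p4075, p4076, color = cluster)) +
  geom_point()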

Let’s check with other methods as well whether 3 clusters are optimal.

Elbow Method

fviz_nbclust(cnp.scaled, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2) +
  labs(subtitle = "Elbow method")

Silhouette method

fviz_nbclust(cnp.scaled, kmeans, method = "silhouette") + # This stays 2clusters are optimal cluster.
  labs(subtitle = "Silhouette method")

Gap statistic

nboot <- 50
set.seed(123)
fviz_nbclust(
  cnp.scaled, kmeans,
  nstart = 25, method = "gap_stat",
  nboot = 50
) +
  labs(subtitle = "Gap statistic method")

5. Mixture model

library(mclust)
Package 'mclust' version 5.4
Type 'citation("mclust")' for citing this R package in publications.
fit <- Mclust(cnp6.narm[, 2:7])
plot(fit, what = "classification")

table(fit$classification)

  1   2 
 85 318 
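
As a quick check (not part of the original output), the mixture-model classification can be cross-tabulated against the 3-cluster hierarchical solution from above:

table(mclust = fit$classification, hclust = hc.class)  # rows: mclust classes, columns: hclust clusters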

6. Analyzing the results

Analysis

cnp.cluster <- cbind(cnp.scaled, fit$classification)
colname <- colnames(cnp.cluster[, 2:7])
colnames(cnp.cluster) <- c("id", colname, "class")
colnames(cnp.cluster)
[1] "id"    "p4075" "p4076" "p4077" "p4078" "p4079" "p4080" "class"
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# cnp.scaled[, 2:7] %>%
#   mutate(Cluster = cnp.km.3$cluster) %>%
#   group_by(Cluster) %>%
#   summarise_all("mean")
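
A runnable version of the commented per-cluster summary above, as a minimal sketch: the scaled matrix is converted to a data frame first, and the grouping uses the 3-cluster k-means labels as in the comment.

as.data.frame(cnp.scaled[, 2:7]) %>%
  mutate(Cluster = cnp.km.3$cluster) %>%  # k-means cluster labels (factor)
  group_by(Cluster) %>%
  summarise_all("mean")                   # mean of each scaled probe per cluster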