---
output:
  html_document: default
  github_document: default
  html_notebook: default
---
0. About our data
- The gene information tells how many copies of a gene a person has. Each measurement is drawn from 6 probes, denoted by the variables p4075, p4076, p4077, p4078, p4079, p4080.
- The data comprise 405 observations (samples, presumably one per person).
1. Let's check what our data look like (linearity, normality, etc.)
We have 6 variables, and each has 2 missing values (NA).
cnp6 <- read.csv("./cnp6.csv")
head(cnp6)
  RowNames p4075 p4076 p4077 p4078 p4079 p4080
1 BA0485 1.26 1.07 1.31 0.78 0.97 1.11
2 BA0493 1.07 0.64 0.81 1.13 1.08 1.48
3 BA0517 0.93 0.89 1.13 1.02 1.04 1.23
4 BA0523 0.93 0.96 1.16 0.82 0.84 1.17
5 BA0543 1.17 0.87 0.74 0.99 0.96 0.86
6 BA0559 0.72 0.87 1.07 0.99 1.08 1.14
summary(cnp6)
   RowNames       p4075           p4076           p4077
BA0227 : 1 Min. :0.440 Min. :0.560 Min. :0.590
BA0229 : 1 1st Qu.:0.950 1st Qu.:0.970 1st Qu.:0.980
BA0267 : 1 Median :1.040 Median :1.090 Median :1.120
BA0269 : 1 Mean :1.057 Mean :1.102 Mean :1.136
BA0271 : 1 3rd Qu.:1.150 3rd Qu.:1.210 3rd Qu.:1.270
BA0273 : 1 Max. :2.150 Max. :2.010 Max. :2.240
(Other):399 NA's :2 NA's :2 NA's :2
p4078 p4079 p4080
Min. :0.650 Min. :0.6600 Min. :0.660
1st Qu.:0.930 1st Qu.:0.8900 1st Qu.:0.980
Median :1.000 Median :0.9800 Median :1.090
Mean :1.011 Mean :0.9888 Mean :1.111
3rd Qu.:1.080 3rd Qu.:1.0700 3rd Qu.:1.230
Max. :1.580 Max. :1.7200 Max. :1.820
NA's :2 NA's :2 NA's :2
str(cnp6)
'data.frame': 405 obs. of 7 variables:
$ RowNames: Factor w/ 405 levels "BA0227","BA0229",..: 39 40 41 42 43 44 45 46 275 276 ...
$ p4075 : num 1.26 1.07 0.93 0.93 1.17 0.72 0.94 0.92 0.78 0.96 ...
$ p4076 : num 1.07 0.64 0.89 0.96 0.87 0.87 0.86 0.95 1.27 0.85 ...
$ p4077 : num 1.31 0.81 1.13 1.16 0.74 1.07 1.26 1.03 0.93 0.94 ...
$ p4078 : num 0.78 1.13 1.02 0.82 0.99 0.99 1.06 1.03 0.78 1.17 ...
$ p4079 : num 0.97 1.08 1.04 0.84 0.96 1.08 1.03 1 0.86 1.06 ...
$ p4080 : num 1.11 1.48 1.23 1.17 0.86 1.14 1.17 1.18 0.98 0.89 ...
Let's remove the missing values.
# Count how many missing values each probe column has.
counting.missing.values <- function(x) {
  count.na <- rep(NA, ncol(x) - 1)
  for (i in seq_len(ncol(x) - 1)) {  # skip column 1 (RowNames)
    count.na[i] <- sum(is.na(x[, i + 1]))
  }
  print(count.na)
}
counting.missing.values(cnp6)
[1] 2 2 2 2 2 2
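For reference, the same counts fall out of one base-R line, with no loop needed:
colSums(is.na(cnp6[, -1]))  # NA count per probe column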
# Remove the rows with missing values.
cnp6.narm <- na.omit(cnp6)
dim(cnp6.narm)
[1] 403   7
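It can also be useful to note which samples the two incomplete rows belong to (a quick base-R check; output not part of the original run):
cnp6[!complete.cases(cnp6), "RowNames"]  # IDs of the dropped rows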
Linear relationships between variables
- The pairs p4080 & p4079, p4080 & p4078, and p4079 & p4075 appear to have a linear relationship.
- In contrast, p4079 & p4077 and p4080 & p4076 show almost no linear relationship (the scatter looks random).
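These visual impressions can be checked numerically by printing the correlation matrix directly (output omitted here, since it comes from the data):
round(cor(cnp6.narm[, 2:7]), 2)  # pairwise correlations between probes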
library(ggplot2)
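# Panel helpers for pairs(): scatter plots with a fitted regression line
# (lower panels), absolute correlations with text sized by magnitude
# (upper panels), and per-variable histograms (diagonal).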
panel.lm <- function(x, y, col=par("col"), bg=NA, pch=par("pch"),
cex=1, col.smooth="black", ...) {
points(x, y, pch = 20, col = alpha("midnightblue", 0.2), bg = bg, cex = cex)
abline(stats::lm(y~x), col = col.smooth, ...)
}
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if (missing(cex.cor)) cex.cor <- 0.8 / strwidth(txt)
text(.5, 0.5, txt, cex = cex.cor * r)
}
panel.hist <- function(x, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5))
h <- hist(x, plot = FALSE)
breaks <- h$breaks
nB <- length(breaks)
y <- h$counts
y <- y / max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = "darkblue", ...)
}
pairs(
cnp6.narm[2:7],
lower.panel = panel.lm,
upper.panel = panel.cor,
diag.panel = panel.hist,
pch = 20,
main = "scatter-plot matrix, correlation coef., histogram"
)
Let's do a PCA analysis. -> It doesn't seem to be significant.
pr.out <- prcomp(cnp6.narm[2:7], scale = TRUE)
Cols <- function(vec) {
cols <- rainbow(length(unique(vec)))
return(cols[as.numeric(as.factor(vec))])
}
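To substantiate that impression, here is a minimal base-R check of how much variance each component explains (output not part of the original run):
summary(pr.out)  # importance of components
pve <- pr.out$sdev^2 / sum(pr.out$sdev^2)  # proportion of variance explained
plot(pve, type = "b",
     xlab = "Principal component",
     ylab = "Proportion of variance explained")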
2. Hierarchical Clustering
Which method fits best for our data?
- Complete linkage on the maximum-distance matrix seems to work best (a rough comparison sketch follows).
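One rough way to compare linkage methods is the agglomerative coefficient from the cluster package (values closer to 1 suggest stronger clustering structure). This is only indicative: agnes supports the euclidean and manhattan metrics, not the maximum metric used below.
library(cluster)
# agglomerative coefficient for several linkage methods
sapply(c("average", "single", "complete", "ward"),
       function(m) agnes(scale(cnp6.narm[, 2:7]), method = m)$ac)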
library(NbClust)
library(factoextra)
# Column 1 keeps the sample ID; columns 2:7 hold the scaled probe values.
# The ID column must be excluded from any distance computation.
cnp.scaled <- cbind(cnp6.narm$RowNames, scale(cnp6.narm[, 2:7]))
# cnp.dist.man <- dist(cnp.scaled[, 2:7], method = "manhattan")
# cnp.dist.euc <- dist(cnp.scaled[, 2:7], method = "euclidean")
# cnp.dist.can <- dist(cnp.scaled[, 2:7], method = "canberra")
cnp.dist.max <- dist(cnp.scaled[, 2:7], method = "maximum")  # drop the id column
# cnp.dist.mink <- dist(cnp.scaled[, 2:7], method = "minkowski")
#
# cnp.hclust.man <- hclust(cnp.dist.man, method = "average")
# cnp.hclust.euc <- hclust(cnp.dist.euc, method = "average")
# cnp.hclust.can <- hclust(cnp.dist.can, method = "average")
# cnp.hclust.max1 <- hclust(cnp.dist.max, method = "average") #less good
cnp.hclust.max2 <- hclust(cnp.dist.max, method = "complete")
# cnp.hclust.mink <- hclust(cnp.dist.mink, method = "average")
# summary(cnp.hclust.man)
# summary(cnp.hclust.euc)
# summary(cnp.hclust.can)
summary(cnp.hclust.max2)
            Length Class  Mode
merge 804 -none- numeric
height 402 -none- numeric
order 403 -none- numeric
labels 403 -none- character
method 1 -none- character
call 3 -none- call
dist.method 1 -none- character
# summary(cnp.hclust.mink)
#
#
# par(mfrow=c(2,2))
#
# plot(cnp.hclust.euc)
# plot(cnp.hclust.man)
# plot(cnp.hclust.can)
# plot(cnp.hclust.euc)
# plot(cnp.hclust.max1)
plot(cnp.hclust.max2)
3. Determining the number of clusters
NbClust
- Dividing the data into 3 groups seems best.
library("NbClust")
nb <- NbClust(
cnp.scaled[, 2:7], distance = "maximum",
min.nc = 2, max.nc = 10, method = "kmeans"
)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 5 proposed 2 as the best number of clusters
* 10 proposed 3 as the best number of clusters
* 1 proposed 4 as the best number of clusters
* 1 proposed 8 as the best number of clusters
* 1 proposed 9 as the best number of clusters
* 5 proposed 10 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 3
*******************************************************************
nbc <- fviz_nbclust(nb)
Among all indices:
===================
* 2 proposed 0 as the best number of clusters
* 1 proposed 1 as the best number of clusters
* 5 proposed 2 as the best number of clusters
* 10 proposed 3 as the best number of clusters
* 1 proposed 4 as the best number of clusters
* 1 proposed 8 as the best number of clusters
* 1 proposed 9 as the best number of clusters
* 5 proposed 10 as the best number of clusters
Conclusion
=========================
* According to the majority rule, the best number of clusters is 3 .
nbc
Let's visualize the result.
- number of clusters : 3
- distance method : maximum
- clustering method : complete
library(RColorBrewer)
fviz_dend(
cnp.hclust.max2, k = 3, cex = 0.3,
k_colors = brewer.pal(3, "Set1"),
color_labels_by_k = TRUE, rect = TRUE
)
hc.class <- cutree(cnp.hclust.max2, k = 3)
table(hc.class)
hc.class
1 2 3
140 160 103
plot(table(hc.class))
4. K-means clustering
The distance matrix visualized as a heat map:
fviz_dist(
cnp.dist.max,
gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)
# Elbow method by hand: total within-cluster SS for k = 1..m
m <- 20
set.seed(100)
wss <- numeric(m)
for (i in 1:m) {
cnp.km <- kmeans(cnp.scaled[, 2:7], centers = i)
wss[i] <- cnp.km$tot.withinss
}
plot(
wss, type = "b",
xlab = "K (number of clusters)",
ylab = "Total within sum of squares"
)
cnp.km.3 <- kmeans(
cnp.scaled[, 2:7],
centers = 3, nstart = 40
)
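As a quick consistency check (not in the original run), the k-means labels can be cross-tabulated against the hierarchical ones:
# rows: hierarchical clusters, columns: k-means clusters
table(hclust = hc.class, kmeans = cnp.km.3$cluster)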
Visualizing the results
panel.lm.km <- function(x, y, col=par("col"), bg=NA, pch=par("pch"),
cex=0.2, col.smooth="black", ...) {
points(x, y, pch = cnp.km.3$cluster, col = cnp.km.3$cluster, bg = bg, cex = cex)
abline(stats::lm(y~x), col = col.smooth, ...)
}
pairs(
cnp.scaled[, 2:7],
lower.panel = panel.lm.km,
upper.panel = panel.cor,
diag.panel = panel.hist,
pch = 20,
main = "scatter-plot matrix, correlation coef., histogram"
)
fviz_cluster(cnp.km.3, data = cnp.scaled[, 2:7])
cnp.km.3$cluster <- as.factor(cnp.km.3$cluster)
# ggplot(cnp.scaled, aes(p4075, p4076, color = cnp.km$cluster)) + geom_point()
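A runnable version of the commented line above: cnp.scaled is a matrix, so it is converted to a data frame first, and the k-means object is cnp.km.3 rather than cnp.km.
df.km <- as.data.frame(cnp.scaled[, 2:7])
ggplot(df.km, aes(p4075, p4076, color = cnp.km.3$cluster)) +
  geom_point() +
  labs(color = "cluster")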
Let's check other methods as well to see whether 3 clusters are optimal.
Elbow method
fviz_nbclust(cnp.scaled, kmeans, method = "wss") +
geom_vline(xintercept = 3, linetype = 2) +
labs(subtitle = "Elbow method")Silhouette method
fviz_nbclust(cnp.scaled, kmeans, method = "silhouette") + # This stays 2clusters are optimal cluster.
labs(subtitle = "Silhouette method")Gap statistics
Gap statistics
nboot <- 50
set.seed(123)
fviz_nbclust(
cnp.scaled[, 2:7], kmeans,
nstart = 25, method = "gap_stat",
nboot = nboot
) +
labs(subtitle = "Gap statistic method")5. Mixture model
library(mclust)
fit <- Mclust(cnp6.narm[, 2:7])
plot(fit, what = "classification")
table(fit$classification)
1 2
85 318
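To see why Mclust settled on 2 components rather than the 3 suggested by NbClust, the model selection can be inspected directly (output omitted):
summary(fit)             # chosen model, number of components, log-likelihood
plot(fit, what = "BIC")  # BIC across model types and numbers of components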
6. Analyzing the results
# Attach the mixture-model class labels to the scaled data.
cnp.cluster <- cbind(cnp.scaled, fit$classification)
colname <- colnames(cnp.cluster[, 2:7])
colnames(cnp.cluster) <- c("id", colname, "class")
colnames(cnp.cluster)
[1] "id"    "p4075" "p4076" "p4077" "p4078" "p4079" "p4080" "class"
library(dplyr)
# cnp.scaled[, 2:7] %>%
# mutate(Cluster = cnp.km.3$cluster) %>%
# group_by(Cluster) %>%
# summarise_all("mean")
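A runnable version of the commented pipeline above (cnp.scaled is a matrix, so it is converted first); it gives the mean scaled probe value per k-means cluster:
as.data.frame(cnp.scaled[, 2:7]) %>%
  mutate(Cluster = cnp.km.3$cluster) %>%
  group_by(Cluster) %>%
  summarise_all(mean)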