########## DATASET 1 ##########
### Reading data. Name: Aggregation. N=788, k=7, D=2.
# Handy command for tab-separating a .txt file from the terminal:
#   column -t <filename>
# The file has no header row, so the columns get the default names V1, V2, V3.
df1 <- read.table("Aggregation.txt", sep = "", header = FALSE)
head(df1)
## V1 V2 V3
## 1 15.55 28.65 2
## 2 14.90 27.55 2
## 3 14.45 28.35 2
## 4 14.15 28.80 2
## 5 13.75 28.05 2
## 6 13.35 28.45 2
### Scatterplot with class labels
# V3 holds the class label. It is wrapped in factor() so that every class gets
# its own discrete colour instead of a position on a continuous gradient.
ggplot(df1, aes(x = V1, y = V2, color = factor(V3))) +
  geom_point()
### Scatterplot without class label
ggplot(df1, aes(x = V1, y = V2)) +
  geom_point()
##### Clustering alg. 1: k-means (centroid based) #####
# Seed the RNG so the random k-means initialisations are reproducible.
set.seed(101)

# Mean silhouette width of a k-means clustering.
#
# dataframe: data frame whose first two columns are the features to cluster;
#            any further columns (e.g. a class label) are ignored.
# k:         number of clusters passed to kmeans().
# algorithm: name of the kmeans() algorithm, e.g. "Forgy".
# Returns the average of the per-observation silhouette widths.
silhouette_score <- function(dataframe, k, algorithm) {
  features <- dataframe[, 1:2]
  km <- kmeans(features, k, nstart = 20, iter.max = 100, algorithm = algorithm)
  # The distances must be computed on the same columns that were clustered.
  # Using the whole data frame would let extra columns (such as the class
  # label in column 3) leak into the silhouette.
  sil <- silhouette(km$cluster, dist(features))
  # Column 3 of the silhouette matrix is the silhouette width.
  mean(sil[, 3])
}
# Average silhouette score for every cluster count from 2 up to k.
# The result feeds the silhouette-vs-k plot: element j belongs to j + 1
# clusters. `dataframe` and `algorithm` are handed to silhouette_score()
# unchanged.
silhouette_kmeans <- function(dataframe, k, algorithm) {
  # Nothing to evaluate below two clusters; NULL is what an empty c() yields.
  if (!(2 <= k)) {
    return(NULL)
  }
  vapply(
    2:k,
    function(n_clusters) silhouette_score(dataframe, n_clusters, algorithm),
    numeric(1)
  )
}
### Making silhouette plot for k = 2,..,15
sil_means_result <- silhouette_kmeans(df1, 15, "Forgy")
plot(2:15, sil_means_result, type = "b",
     xlab = "Number of clusters", ylab = "Average Silhouette Scores",
     frame.plot = FALSE)
### k-means solutions for k = 4,..,7
# `label` is refreshed after every fit so that each scatterplot is coloured by
# the clustering it belongs to. factor() gives one discrete colour per cluster
# instead of a continuous gradient.
# k = 4
km <- kmeans(df1[, 1:2], 4, nstart = 20, iter.max = 100, algorithm = "Forgy")
label <- km$cluster
ggplot(df1, aes(x = V1, y = V2, color = factor(label))) +
  geom_point()
# k = 5
km <- kmeans(df1[, 1:2], 5, nstart = 20, iter.max = 100, algorithm = "Forgy")
label <- km$cluster
ggplot(df1, aes(x = V1, y = V2, color = factor(label))) +
  geom_point()
# k = 6
km <- kmeans(df1[, 1:2], 6, nstart = 20, iter.max = 100, algorithm = "Forgy")
label <- km$cluster
ggplot(df1, aes(x = V1, y = V2, color = factor(label))) +
  geom_point()
# k = 7
km <- kmeans(df1[, 1:2], 7, nstart = 20, iter.max = 100, algorithm = "Forgy")
label <- km$cluster
ggplot(df1, aes(x = V1, y = V2, color = factor(label))) +
  geom_point()
# Print clustering parameters of the last fit (k = 7).
print(km)
## K-means clustering with 7 clusters of sizes 128, 48, 128, 167, 104, 135, 78
##
## Cluster means:
## V1 V2
## 1 32.694531 22.137891
## 2 21.160417 22.898958
## 3 20.767187 7.041016
## 4 9.259281 22.981138
## 5 33.142788 8.793750
## 6 14.755185 7.228519
## 7 7.368590 7.757051
##
## Clustering vector:
## [1] 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [38] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [75] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [112] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [149] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 4 4 4 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
## [186] 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 6 6 7 7 6 6 6 6 6 6 6 6 6 6 6
## [223] 6 6 6 6 6 6 6 6 6 7 7 7 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## [260] 6 6 6 6 6 6 6 6 6 6 6 7 7 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## [297] 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 3 3 3 3 3 3
## [334] 3 3 3 3 3 3 3 3 3 3 3 3 3 6 6 6 6 6 6 6 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [371] 3 3 6 6 3 3 3 6 6 6 6 6 6 6 6 6 6 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [408] 3 3 3 3 3 3 3 3 3 3 6 6 3 3 3 6 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [445] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 5 5 5 5
## [482] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
## [519] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
## [556] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 1 1 1 1 1 1 1 1 1 1
## [593] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [630] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [667] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [704] 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [741] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
## [778] 7 7 7 7 7 7 7 7 7 7 7
##
## Within cluster sum of squares by cluster:
## [1] 1707.7149 395.3172 1435.5544 3230.6612 1216.9905 1537.3741 1476.8292
## (between_SS / total_SS = 91.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# To estimate the optimal number of clusters, the elbow method can be used.
# Candidate cluster counts for the elbow plot.
nCluster <- 2:15
# One k-means fit per candidate count; keep only its total within-cluster
# sum of squares.
elbow_data <- tibble(
  k = nCluster,
  totWss = map_dbl(
    nCluster,
    function(p) {
      fit <- kmeans(df1[, 1:2], centers = p, nstart = 30, iter.max = 100)
      fit$tot.withinss
    }
  )
)
# Total within-cluster SS against k, with one axis tick per candidate count.
ggplot(data = elbow_data, mapping = aes(x = k, y = totWss)) +
  geom_point(size = 3) +
  geom_line() +
  scale_x_continuous(breaks = elbow_data$k) +
  theme_bw() +
  labs(x = "Number of clusters", y = "Total Within SS")
# Silhouette visualization
# Silhouette visualization
# Seed the RNG so the random k-means starts below are reproducible.
set.seed(101)
# Fits k-means with `nCluster` centres on `agdata`, computes the silhouette of
# that clustering on the distances of `agdata`, and returns the
# fviz_silhouette() plot for it.
viz_my_silhoutte <- function(agdata, nCluster) {
  fit <- kmeans(
    agdata,
    centers = nCluster,
    nstart = 20,
    iter.max = 100,
    algorithm = "Forgy"
  )
  widths <- silhouette(fit$cluster, dist(agdata))
  fviz_silhouette(widths)
}
# Silhouette plot of the 4-cluster solution on the two coordinate columns
# (the third column of df1 is left out).
agdata <- df1[, c(1, 2)]
k <- 4
viz_my_silhoutte(agdata, nCluster = k)
## cluster size ave.sil.width
## 1 1 332 0.46
## 2 2 110 0.63
## 3 3 187 0.59
## 4 4 159 0.50
# Hierarchical Clustering
##### Clustering alg. 2: DBSCAN #####
### Parameter search: what is best minPts and epsilon?
# NOTE(review): hard-coded, machine-specific working directory. It is kept
# because relative paths used after this point may depend on it; prefer
# starting R from the project directory and dropping this call.
setwd("C:/Users/yss/Desktop/R_docs")
# One row per (minPts, eps) setting that is tried. eps is drawn as a dashed
# reference line on the k-NN distance plot and then used as the DBSCAN radius.
dbscan_grid_df1 <- data.frame(
  min_pts = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 35),
  eps = c(0.8, 0.95, 1.05, 1.10, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 2.1, 3.3)
)
for (i in seq_len(nrow(dbscan_grid_df1))) {
  min_pts <- dbscan_grid_df1$min_pts[i]
  eps <- dbscan_grid_df1$eps[i]
  # The k-NN distances are taken on the two coordinate columns only, i.e. on
  # exactly the data DBSCAN clusters below; including the class label column
  # would distort the distances that eps is compared against.
  dbscan::kNNdistplot(df1[, 1:2], k = min_pts)
  abline(h = eps, lty = 2)
  db <- fpc::dbscan(df1[, 1:2], eps = eps, MinPts = min_pts)
  plot(db, df1, main = "DBSCAN", frame = FALSE)
}
## Original note, placed after the minPts = 7 (eps = 1.4) run:
## very close to true clustering
##### Clustering alg. 3: OPTICS #####
# Run OPTICS on the two coordinate columns of dataset 1 and plot the result.
op <- optics(
  df1[, 1:2],
  minPts = 7,
  eps = 2.5
)
plot(op)
########## DATASET 2 ##########
### Reading data. Name: Path-based N=300, k=3, D=2.
# The file has no header row, so the columns get the default names V1, V2, V3.
df2 <- read.table("pathbased.txt", sep = "", header = FALSE)
### Scatterplot with class labels
# V3 holds the class label. It is wrapped in factor() so that every class gets
# its own discrete colour instead of a position on a continuous gradient.
ggplot(df2, aes(x = V1, y = V2, color = factor(V3))) +
  geom_point()
### Scatterplot without class label
ggplot(df2, aes(x = V1, y = V2)) +
  geom_point()
##### Clustering alg. 1: k-means #####
# Average silhouette score for k = 2,..,10.
sil_means_result <- silhouette_kmeans(df2, 10, "Forgy")
plot(2:10, sil_means_result, type = "b",
     xlab = "Number of clusters", ylab = "Average Silhouette Scores",
     frame.plot = FALSE)
# Plotting result
km <- kmeans(df2[, 1:2], 3, nstart = 20, iter.max = 100, algorithm = "Forgy")
label <- km$cluster
# factor() gives one discrete colour per cluster instead of a gradient.
ggplot(df2, aes(x = V1, y = V2, color = factor(label))) +
  geom_point()
# Original author's verdict: k-means gives a poor result on this data set.
##### Clustering alg. 2: DBSCAN #####
### Parameter search
# One row per (minPts, eps) setting that is tried. eps is drawn as a dashed
# reference line on the k-NN distance plot and then used as the DBSCAN radius.
# NOTE(review): the eps values are kept from the original code, which computed
# the k-NN distance plots on df1 by mistake; re-check them against the df2
# plots produced below.
dbscan_grid_df2 <- data.frame(
  min_pts = c(1, 2, 3, 4, 5, 6, 7),
  eps = c(0.75, 0.9, 1, 1.1, 1.2, 1.3, 1.4)
)
for (i in seq_len(nrow(dbscan_grid_df2))) {
  min_pts <- dbscan_grid_df2$min_pts[i]
  eps <- dbscan_grid_df2$eps[i]
  # Everything in this section works on dataset 2. The distance plot and the
  # clustering both use its two coordinate columns, so the cluster vector has
  # one entry per row of the df2 that is plotted.
  dbscan::kNNdistplot(df2[, 1:2], k = min_pts)
  abline(h = eps, lty = 2)
  db <- fpc::dbscan(df2[, 1:2], eps = eps, MinPts = min_pts)
  plot(db, df2, main = "DBSCAN", frame = FALSE)
}
# DBSCAN has a hard time separating one of the round clusters
# in the middle from the circular cluster because
# the border is kind of blurry. Therefore we try HDBSCAN.
##### Clustering alg. 3: HDBSCAN #####
### Parameter search: best value for minPts?
# minPts = 2,..,14, followed by 30, 40 and 50 to get a sense of stability.
for (min_pts in c(2:14, 30, 40, 50)) {
  hdb <- hdbscan(df2[, 1:2], minPts = min_pts)
  # Cluster ids are shifted by one before being used as colour indices
  # (presumably so that an id of 0 still maps to a visible colour - confirm).
  plot(df2, col = hdb$cluster + 1, pch = 20)
}
# Gives a better result than DBSCAN, but it seems less stable with respect to
# the choice of minPts.
##### Clustering alg. 4: OPTICS #####
# Run OPTICS on the two coordinate columns of dataset 2 and plot the result.
op <- optics(
  df2[, 1:2],
  minPts = 6,
  eps = 4
)
plot(op)