DM868: Project 1

########## DATASET 1 ########## 
### Load dataset 1 ("Aggregation": N = 788 points, k = 7 classes, D = 2 features).
# The file is whitespace separated and has no header row, so the columns are
# named V1, V2 (coordinates) and V3 (class label) automatically.
# Handy terminal command for viewing the .txt file in aligned columns:
#   column -t <filename>
df1 <- read.table(file = "Aggregation.txt", header = FALSE, sep = "")
head(df1)

##      V1    V2 V3
## 1 15.55 28.65  2
## 2 14.90 27.55  2
## 3 14.45 28.35  2
## 4 14.15 28.80  2
## 5 13.75 28.05  2
## 6 13.35 28.45  2

### Scatterplot with class labels
# V3 is read as a number, which ggplot2 would map to a continuous colour
# gradient. The class label is categorical, so convert it to a factor to get
# one distinct colour per class.
ggplot(df1, aes(x = V1, y = V2, color = factor(V3))) +
  geom_point() +
  labs(color = "Class")

### Scatterplot without class label
ggplot(df1, aes(x = V1, y = V2)) + geom_point()

##### Clustering alg. 1: k-means (centroid based) ##### 

# Fix the RNG seed so the random k-means starts used below are reproducible.
set.seed(101)

# Mean silhouette width of a k-means clustering.
#
# Args:
#   dataframe: data frame whose first two columns are the features to cluster;
#              any further columns (e.g. a class label) are ignored.
#   k:         number of clusters passed to kmeans().
#   algorithm: kmeans() algorithm name, e.g. "Forgy".
# Returns:
#   The average silhouette width over all rows (numeric scalar).
silhouette_score <- function(dataframe, k, algorithm){
    features <- dataframe[, 1:2]
    km <- kmeans(features, k, nstart = 20, iter.max = 100, algorithm = algorithm)
    # The distances must be computed on the same columns that were clustered.
    # Using the whole data frame would let extra columns (such as the true
    # class label) leak into the silhouette and distort the score.
    sil <- silhouette(km$cluster, dist(features))
    # Column 3 of a silhouette object holds the per-observation widths.
    mean(sil[, 3])
}

# Average silhouette score of the k-means solution for every cluster count
# from 2 up to k. Used as input for the silhouette plot.
#
# Args:
#   dataframe: data passed on to silhouette_score().
#   k:         largest number of clusters to evaluate.
#   algorithm: kmeans() algorithm name passed on to silhouette_score().
# Returns:
#   Numeric vector whose j-th entry is the score for j + 1 clusters, or NULL
#   when k is below 2 (no cluster count to evaluate).
silhouette_kmeans <- function(dataframe, k, algorithm){
    if (k < 2) {
        return(NULL)
    }
    vapply(
        2:k,
        function(n_clusters) silhouette_score(dataframe, n_clusters, algorithm),
        numeric(1)
    )
}

### Making silhouette plot for k = 2,..,15
sil_means_result <- silhouette_kmeans(df1, 15, "Forgy")
plot(2:15, sil_means_result, type = "b",
     xlab = "Number of clusters", ylab = "Average Silhouette Scores",
     frame = FALSE)

# For each candidate k the cluster labels are refreshed from the new fit before
# plotting; otherwise every plot would keep showing the k = 4 solution.
# The labels are converted to a factor so that each cluster gets its own
# discrete colour instead of a position on a continuous gradient.

# k = 4
km <- kmeans(df1[,1:2], 4, nstart = 20, iter.max = 100, algorithm = "Forgy")
label <- factor(km$cluster)
ggplot(df1, aes(x = V1, y = V2, color = label)) + geom_point()

# k = 5
km <- kmeans(df1[,1:2], 5, nstart = 20, iter.max = 100, algorithm = "Forgy")
label <- factor(km$cluster)
ggplot(df1, aes(x = V1, y = V2, color = label)) + geom_point()

# k = 6
km <- kmeans(df1[,1:2], 6, nstart = 20, iter.max = 100, algorithm = "Forgy")
label <- factor(km$cluster)
ggplot(df1, aes(x = V1, y = V2, color = label)) + geom_point()

# k = 7 (km is left holding this fit for the code that follows)
km <- kmeans(df1[,1:2], 7, nstart = 20, iter.max = 100, algorithm = "Forgy")
label <- factor(km$cluster)
ggplot(df1, aes(x = V1, y = V2, color = label)) + geom_point()

Print clustering Parameters.

# Print clustering parameters of the most recent k-means fit stored in `km`
# (the k = 7 solution fitted just above): cluster sizes, cluster means,
# the clustering vector and the within-cluster sums of squares.
print(km)

## K-means clustering with 7 clusters of sizes 128, 48, 128, 167, 104, 135, 78
## 
## Cluster means:
##          V1        V2
## 1 32.694531 22.137891
## 2 21.160417 22.898958
## 3 20.767187  7.041016
## 4  9.259281 22.981138
## 5 33.142788  8.793750
## 6 14.755185  7.228519
## 7  7.368590  7.757051
## 
## Clustering vector:
##   [1] 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
##  [38] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
##  [75] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [112] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [149] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 4 4 4 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
## [186] 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 6 6 7 7 6 6 6 6 6 6 6 6 6 6 6
## [223] 6 6 6 6 6 6 6 6 6 7 7 7 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## [260] 6 6 6 6 6 6 6 6 6 6 6 7 7 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## [297] 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 3 3 3 3 3 3
## [334] 3 3 3 3 3 3 3 3 3 3 3 3 3 6 6 6 6 6 6 6 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [371] 3 3 6 6 3 3 3 6 6 6 6 6 6 6 6 6 6 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [408] 3 3 3 3 3 3 3 3 3 3 6 6 3 3 3 6 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [445] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 5 5 5 5
## [482] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
## [519] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
## [556] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 1 1 1 1 1 1 1 1 1 1
## [593] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [630] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [667] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [704] 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [741] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
## [778] 7 7 7 7 7 7 7 7 7 7 7
## 
## Within cluster sum of squares by cluster:
## [1] 1707.7149  395.3172 1435.5544 3230.6612 1216.9905 1537.3741 1476.8292
##  (between_SS / total_SS =  91.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

To estimate the optimal number of clusters, the elbow method can be used.

# Elbow method: total within-cluster sum of squares for k = 2, ..., 15.
nCluster <- 2:15
elbow_data <- tibble(
  k = nCluster,
  totWss = map_dbl(nCluster, function(p) {
    fit <- kmeans(df1[,1:2], centers = p, nstart = 30, iter.max = 100)
    fit$tot.withinss
  })
)

# Plot total within-cluster SS against k, with one axis tick per evaluated k.
elbow_plot <- ggplot(elbow_data, aes(x = k, y = totWss))
elbow_plot +
  geom_point(size = 3) +
  geom_line() +
  labs(x = "Number of clusters", y = "Total Within SS") +
  theme_bw() +
  scale_x_continuous(breaks = elbow_data$k)

Silhouette Visualization

# Silhouette visualization
# Seed the RNG so the random k-means starts are reproducible.
set.seed(101)

# Fit k-means on `agdata` with `nCluster` clusters and return the
# factoextra silhouette plot of that clustering.
#
# Args:
#   agdata:   data to cluster; all of its columns are used, both for
#             clustering and for the silhouette distances.
#   nCluster: number of clusters.
# Returns:
#   The plot object produced by fviz_silhouette().
viz_my_silhoutte <- function(agdata,nCluster){
  km_fit <- kmeans(agdata, centers = nCluster,
                   nstart = 20, iter.max = 100, algorithm = "Forgy")
  pairwise_dist <- dist(agdata)
  sil_widths <- silhouette(km_fit$cluster, pairwise_dist)
  fviz_silhouette(sil_widths)
}

# Silhouette plot of the 4-cluster k-means solution, computed on the two
# coordinate columns only.
agdata <- df1[, 1:2]
k <- 4
viz_my_silhoutte(agdata, k)

##   cluster size ave.sil.width
## 1       1  332          0.46
## 2       2  110          0.63
## 3       3  187          0.59
## 4       4  159          0.50

Hierarchical Clustering

##### Clustering alg. 2: DBSCAN #####

### Parameter search: what is best minPts and epsilon?
# NOTE: the script no longer calls setwd() with a machine-specific absolute
# path; run it from the directory that contains the data files instead.

# Only the two coordinate columns are features. The class label (V3) must not
# take part in the kNN distances or in the cluster plots.
xy1 <- df1[, 1:2]

# One row per (minPts, eps) combination to try. Each eps is the height of the
# reference line drawn on the corresponding kNN-distance plot.
# minPts = 7 / eps = 1.4 was noted as very close to the true clustering.
dbscan_grid1 <- data.frame(
  min_pts = c(1,   2,    3,    4,    5,   6,   7,   8,   9,   10,  15,  35),
  eps     = c(0.8, 0.95, 1.05, 1.10, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 2.1, 3.3)
)

for (i in seq_len(nrow(dbscan_grid1))) {
  min_pts <- dbscan_grid1$min_pts[i]
  eps <- dbscan_grid1$eps[i]

  # kNN-distance plot with the chosen eps marked as a dashed line.
  dbscan::kNNdistplot(xy1, k = min_pts)
  abline(h = eps, lty = 2)

  # Cluster with these parameters and plot the result.
  db <- fpc::dbscan(xy1, eps = eps, MinPts = min_pts)
  plot(db, xy1, main = "DBSCAN", frame = FALSE)
}

##### Clustering alg. 3: OPTICS #####
# Run OPTICS on the two coordinate columns of dataset 1 and draw the plot of
# the resulting object.
op <- optics(df1[, 1:2], minPts = 7, eps = 2.5)
plot(op)

########## DATASET 2 ########## 
### Reading data. Name: Path-based. N=300, k=3, D=2.
df2 <- read.table("pathbased.txt", sep = "", header = FALSE)

### Scatterplot with class labels
# The class label V3 is categorical: convert it to a factor so each class gets
# a distinct colour rather than a position on a continuous gradient.
ggplot(df2, aes(x = V1, y = V2, color = factor(V3))) +
  geom_point() +
  labs(color = "Class")

### Scatterplot without class label 
ggplot(df2, aes(x = V1, y = V2)) + geom_point()

##### Clustering alg. 1: k-means #####
# Average silhouette score for k = 2, ..., 10
sil_means_result <- silhouette_kmeans(df2, 10, "Forgy")
plot(2:10, sil_means_result, type = "b",
     xlab = "Number of clusters", ylab = "Average Silhouette Scores",
     frame = FALSE)

# Plotting result for k = 3, with cluster ids as a factor for discrete colours
km <- kmeans(df2[,1:2], 3, nstart = 20, iter.max = 100, algorithm = "Forgy")
label <- factor(km$cluster)
# Author's note: the k-means result on this dataset is poor.
ggplot(df2, aes(x = V1, y = V2, color = label)) + geom_point()

##### Clustering alg. 2: DBSCAN #####

### Parameter search
# This section analyses dataset 2, so the kNN-distance plots and the
# clustering must be computed on df2. Clustering df1 would yield one label per
# df1 row, which cannot be matched to the rows of df2 when plotting.
# Only the two coordinate columns are used as features.
xy2 <- df2[, 1:2]

# One row per (minPts, eps) combination to try. Each eps is the height of the
# reference line drawn on the corresponding kNN-distance plot.
# NOTE(review): the eps values are kept as originally written; re-check them
# against the df2 kNN-distance plots produced below.
dbscan_grid2 <- data.frame(
  min_pts = c(1,    2,   3, 4,   5,   6,   7),
  eps     = c(0.75, 0.9, 1, 1.1, 1.2, 1.3, 1.4)
)

for (i in seq_len(nrow(dbscan_grid2))) {
  min_pts <- dbscan_grid2$min_pts[i]
  eps <- dbscan_grid2$eps[i]

  # kNN-distance plot with the chosen eps marked as a dashed line.
  dbscan::kNNdistplot(xy2, k = min_pts)
  abline(h = eps, lty = 2)

  # Cluster with these parameters and plot the result.
  db <- fpc::dbscan(xy2, eps = eps, MinPts = min_pts)
  plot(db, xy2, main = "DBSCAN", frame = FALSE)
}

# DBSCAN has a hard time separating one of the round clusters
# in the middle from the circular cluster because
# the border between them is blurry. Therefore we try HDBSCAN.

##### Clustering alg. 3: HDBSCAN #####

### Parameter search: best value for minPts?
# Try minPts = 2, ..., 14, plus 30, 40 and 50 to get a sense of stability.
# For every value: cluster the two coordinate columns, then plot df2 coloured
# by cluster id (shifted by one so that id 0 also maps to a valid colour index).
for (min_pts in c(2:14, 30, 40, 50)) {
  hdb <- hdbscan(df2[,1:2], minPts = min_pts)
  plot(df2, col = hdb$cluster + 1, pch = 20)
}

# Gives better results than DBSCAN, but it seems less stable with respect to the choice of minPts. 

##### Clustering alg. 4: OPTICS #####
# Run OPTICS on the two coordinate columns of dataset 2 and draw the plot of
# the resulting object.
op <- optics(df2[, 1:2], minPts = 6, eps = 4)
plot(op)

DM868: Project 1

Ahmed Shaker Hisab Hisab, Anas Mahir Sadaldin Othman, Youssouf Souare

13/4/2020