# Load the mall-customers table. The first CSV column is used as row names;
# header row and comma separator are the read.csv() defaults.
csv_path <- "C:\\Users\\tariqm\\Documents\\R\\Datasets\\mall_customers.csv"
data <- read.csv(csv_path, row.names = 1)

# Gender is categorical, so store it as a factor.
data[["Gender"]] <- as.factor(data[["Gender"]])

# Standardize (center to mean 0, scale to sd 1) every column except the
# first one, which is dropped by position before scaling.
ndata <- scale(data[, -1, drop = FALSE], center = TRUE, scale = TRUE)

# Per-column summary of the standardized matrix.
summary(ndata)
##       Age          Annual.Income..k.. Spending.Score..1.100.
##  Min.   :-1.4926   Min.   :-1.73465   Min.   :-1.905240     
##  1st Qu.:-0.7230   1st Qu.:-0.72569   1st Qu.:-0.598292     
##  Median :-0.2040   Median : 0.03579   Median :-0.007745     
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.000000     
##  3rd Qu.: 0.7266   3rd Qu.: 0.66401   3rd Qu.: 0.882916     
##  Max.   : 2.2299   Max.   : 2.91037   Max.   : 1.889750
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Build one "number of clusters" diagnostic panel for k-means on ndata.
#   method   - criterion passed to fviz_nbclust()
#   subtitle - text shown under the (blank) title
# A dashed vertical reference line is drawn at k = 4 on every panel.
nbclust_panel <- function(method, subtitle) {
  fviz_nbclust(ndata, kmeans, method = method) +
    labs(title = "", subtitle = subtitle) +
    geom_vline(xintercept = 4, linetype = 2)
}

p1 <- nbclust_panel("wss", "Elbow method - NData")

p2 <- nbclust_panel("silhouette", "Silhouette method - NData")

p3 <- nbclust_panel("gap_stat", "Gap statistic method - NData")

# Stack the three diagnostics in a single figure.
gridExtra::grid.arrange(p1, p2, p3, top = "Optimal Number of Clusters")

library(cluster)
## Warning: package 'cluster' was built under R version 4.0.5
# K-means on the standardized features for k = 2, 3, 4 and 5, each drawn
# with cluster::clusplot().
#
# kmeans() starts from randomly chosen centers, so the seed is fixed here
# explicitly (rather than relying on whatever RNG state earlier calls left
# behind) and nstart = 25 random starts are tried per fit, keeping the best
# one. This makes the partitions reproducible and far less sensitive to an
# unlucky initialization.
#
# clusplot arguments: color/shade style the cluster ellipses, labels = 2
# labels all points and ellipses, lines = 0 draws no connecting lines.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
set.seed(123)

cluster_1 <- kmeans(ndata, centers = 2, nstart = 25)
clusplot(ndata, cluster_1$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

cluster_2 <- kmeans(ndata, centers = 3, nstart = 25)
clusplot(ndata, cluster_2$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

cluster_3 <- kmeans(ndata, centers = 4, nstart = 25)
clusplot(ndata, cluster_3$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

cluster_4 <- kmeans(ndata, centers = 5, nstart = 25)
clusplot(ndata, cluster_4$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

# fviz_cluster() panels for the k-means solutions with 2 to 5 clusters,
# drawn on the first two dimensions (axes = c(1, 2)).
#
# The panels reuse the models already fitted above instead of calling
# kmeans() again: k-means starts from random centers, so a fresh fit here
# could show a different partition than the clusplot() figure for the same
# k. Mapping: cluster_1 -> k = 2, cluster_2 -> k = 3, cluster_3 -> k = 4,
# cluster_4 -> k = 5.
p4 <- fviz_cluster(cluster_1, data = ndata, axes = c(1, 2)) +
  labs(title = "", subtitle = "NData - 2 Clusters")

p5 <- fviz_cluster(cluster_2, data = ndata, axes = c(1, 2)) +
  labs(title = "", subtitle = "NData - 3 Clusters")

p6 <- fviz_cluster(cluster_3, data = ndata, axes = c(1, 2)) +
  labs(title = "", subtitle = "NData - 4 Clusters")

p7 <- fviz_cluster(cluster_4, data = ndata, axes = c(1, 2)) +
  labs(title = "", subtitle = "NData - 5 Clusters")

# 2 x 2 grid of the four solutions.
gridExtra::grid.arrange(p4, p5, p6, p7, ncol = 2, top = "Clustering")

# Agglomerative hierarchical clustering of the standardized data, one tree
# per linkage method, all built on dist(ndata) (Euclidean by default).
#
# dist(ndata) is spelled out inside every call on purpose: the hclust object
# stores its call, so hoisting the distance into a variable would change
# what is recorded there.
# NOTE(review): the hclust() help page states that "ward.D" only matches
# Ward's criterion on squared distances ("ward.D2" squares them internally)
# and that "centroid" is likewise meant for squared Euclidean distances.
# Confirm that unsquared distances are intended for trees 4 and 5.
d.hclust1 <- hclust(d = dist(ndata), method = "average")   # average linkage
d.hclust2 <- hclust(d = dist(ndata), method = "single")    # single linkage
d.hclust3 <- hclust(d = dist(ndata), method = "complete")  # complete linkage
d.hclust4 <- hclust(d = dist(ndata), method = "ward.D")    # Ward (D variant)
d.hclust5 <- hclust(d = dist(ndata), method = "centroid")  # centroid linkage

# Draw the dendrogram of each linkage method in turn (labels hanging from
# the baseline, hang = -1) and outline its four-cluster cut with one
# rectangle per cluster, using border colours 2 to 5.
for (hc_tree in list(d.hclust1, d.hclust2, d.hclust3, d.hclust4, d.hclust5)) {
  plot(hc_tree, hang = -1)
  rect.hclust(hc_tree, k = 4, border = 2:5)
}

# Average linkage: cut the tree into four clusters and report their sizes.
member1 <- cutree(d.hclust1, k = 4)
table(member1)
## member1
##  1  2  3  4 
## 74 52 39 35
# Cluster means on the standardized scale.
aggregate(ndata, by = list(member1), FUN = mean)
##   Group.1        Age Annual.Income..k.. Spending.Score..1.100.
## 1       1  0.8504348         -0.5481816             -0.3719634
## 2       2 -1.0162607         -0.6707740              0.4599265
## 3       3 -0.4408110          0.9891010              1.2364001
## 4       4  0.2030004          1.0534500             -1.2745855
# Cluster means in the original units (first column of data excluded).
aggregate(data[, -1], by = list(member1), FUN = mean)
##   Group.1      Age Annual.Income..k.. Spending.Score..1.100.
## 1       1 50.72973           46.16216               40.59459
## 2       2 24.65385           42.94231               62.07692
## 3       3 32.69231           86.53846               82.12821
## 4       4 41.68571           88.22857               17.28571
# Single linkage: cut the tree into four clusters and report their sizes.
member2 <- cutree(d.hclust2, k = 4)
table(member2)
## member2
##   1   2   3   4 
##   1 197   1   1
# Cluster means on the standardized scale.
aggregate(ndata, by = list(member2), FUN = mean)
##   Group.1         Age Annual.Income..k.. Spending.Score..1.100.
## 1       1 -1.42100291       -1.734646247            -0.43371311
## 2       2  0.01655223        0.002643912             0.01721957
## 3       3 -1.34941586       -1.696572361            -1.71161783
## 4       4 -0.49037128        2.910367847            -1.24692520
# Cluster means in the original units (first column of data excluded).
aggregate(data[, -1], by = list(member2), FUN = mean)
##   Group.1      Age Annual.Income..k.. Spending.Score..1.100.
## 1       1 19.00000           15.00000               39.00000
## 2       2 39.08122           60.62944               50.64467
## 3       3 20.00000           16.00000                6.00000
## 4       4 32.00000          137.00000               18.00000
# Complete linkage: cut the tree into four clusters and report their sizes.
member3 <- cutree(d.hclust3, k = 4)
table(member3)
## member3
##  1  2  3  4 
## 69 57 39 35
# Cluster means on the standardized scale.
aggregate(ndata, by = list(member3), FUN = mean)
##   Group.1        Age Annual.Income..k.. Spending.Score..1.100.
## 1       1 -0.8285941         -0.6768544              0.2391932
## 2       2  1.1799932         -0.5042586             -0.3528675
## 3       3 -0.4408110          0.9891010              1.2364001
## 4       4  0.2030004          1.0534500             -1.2745855
# Cluster means in the original units (first column of data excluded).
aggregate(data[, -1], by = list(member3), FUN = mean)
##   Group.1      Age Annual.Income..k.. Spending.Score..1.100.
## 1       1 27.27536           42.78261               56.37681
## 2       2 55.33333           47.31579               41.08772
## 3       3 32.69231           86.53846               82.12821
## 4       4 41.68571           88.22857               17.28571
# ward.D linkage: cut the tree into four clusters and report their sizes.
member4 <- cutree(d.hclust4, k = 4)
table(member4)
## member4
##  1  2  3  4 
## 77 52 39 32
# Cluster means on the standardized scale.
aggregate(ndata, by = list(member4), FUN = mean)
##   Group.1        Age Annual.Income..k.. Spending.Score..1.100.
## 1       1  0.8456104         -0.5044147             -0.3799013
## 2       2 -1.0162607         -0.6707740              0.4599265
## 3       3 -0.4408110          0.9891010              1.2364001
## 4       4  0.1539122          1.0982888             -1.3401058
# Cluster means in the original units (first column of data excluded).
aggregate(data[, -1], by = list(member4), FUN = mean)
##   Group.1      Age Annual.Income..k.. Spending.Score..1.100.
## 1       1 50.66234           47.31169               40.38961
## 2       2 24.65385           42.94231               62.07692
## 3       3 32.69231           86.53846               82.12821
## 4       4 41.00000           89.40625               15.59375
# Centroid linkage: cut the tree into four clusters and report their sizes.
member5 <- cutree(d.hclust5, k = 4)
table(member5)
## member5
##   1   2   3   4 
##   2 191   4   3
# Silhouette plot of the four-cluster cut for each linkage method, in the
# order average, single, complete, ward.D, centroid.
# The cutree()/dist() expressions are kept inline and unchanged so the call
# recorded on each silhouette object stays exactly as before.
plot(silhouette(
  x = cutree(d.hclust1, 4),
  dist = dist(ndata)
))

plot(silhouette(
  x = cutree(d.hclust2, 4),
  dist = dist(ndata)
))

plot(silhouette(
  x = cutree(d.hclust3, 4),
  dist = dist(ndata)
))

plot(silhouette(
  x = cutree(d.hclust4, 4),
  dist = dist(ndata)
))

plot(silhouette(
  x = cutree(d.hclust5, 4),
  dist = dist(ndata)
))

library(dendextend)
## Warning: package 'dendextend' was built under R version 4.0.5
## 
## ---------------------
## Welcome to dendextend version 1.15.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
# Tanglegram for every unordered pair of the five linkage trees.
# combn() enumerates the index pairs in lexicographic order:
# (1,2) (1,3) (1,4) (1,5) (2,3) (2,4) (2,5) (3,4) (3,5) (4,5).
hc_trees <- list(d.hclust1, d.hclust2, d.hclust3, d.hclust4, d.hclust5)

for (idx in combn(seq_along(hc_trees), 2, simplify = FALSE)) {
  tanglegram(hc_trees[[idx[1]]], hc_trees[[idx[2]]])
}