# Load the mall-customers table; the first CSV column supplies the row names.
data <- read.csv(
  file = "C:\\Users\\tariqm\\Documents\\R\\Datasets\\mall_customers.csv",
  header = TRUE,
  sep = ",",
  row.names = 1
)

# Store Gender as a factor.
data[["Gender"]] <- as.factor(data[["Gender"]])

# Standardise every column except the first one (scale() defaults: centre each
# column and divide by its standard deviation), then print column summaries.
ndata <- scale(data[, -1, drop = FALSE])
summary(ndata)
## Age Annual.Income..k.. Spending.Score..1.100.
## Min. :-1.4926 Min. :-1.73465 Min. :-1.905240
## 1st Qu.:-0.7230 1st Qu.:-0.72569 1st Qu.:-0.598292
## Median :-0.2040 Median : 0.03579 Median :-0.007745
## Mean : 0.0000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.7266 3rd Qu.: 0.66401 3rd Qu.: 0.882916
## Max. : 2.2299 Max. : 2.91037 Max. : 1.889750
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Diagnostics for choosing the number of k-means clusters on the scaled data:
# total within-cluster sum of squares ("wss", elbow plot), average silhouette
# width, and the gap statistic.
#
# k-means starts from randomly chosen centres and the gap statistic relies on
# resampling, so without a fixed seed the curves differ from run to run. The
# seed makes the figure reproducible; `nstart` is forwarded by fviz_nbclust()
# to kmeans() so each k keeps the best of several random starts instead of
# depending on one initialisation.
set.seed(123)

# Each panel gets a dashed reference line at k = 4.
p1 <- fviz_nbclust(ndata, kmeans, method = "wss", nstart = 25) +
  labs(title = "", subtitle = "Elbow method - NData") +
  geom_vline(xintercept = 4, linetype = 2)
p2 <- fviz_nbclust(ndata, kmeans, method = "silhouette", nstart = 25) +
  labs(title = "", subtitle = "Silhouette method - NData") +
  geom_vline(xintercept = 4, linetype = 2)
p3 <- fviz_nbclust(ndata, kmeans, method = "gap_stat", nstart = 25) +
  labs(title = "", subtitle = "Gap statistic method - NData") +
  geom_vline(xintercept = 4, linetype = 2)

# Stack the three diagnostics under a common heading.
gridExtra::grid.arrange(p1, p2, p3, top = "Optimal Number of Clusters")

library(cluster)
## Warning: package 'cluster' was built under R version 4.0.5
# Fit k-means on the scaled data for k = 2, 3, 4 and 5 and draw each partition
# with cluster::clusplot().
# Note the object suffix is offset from the cluster count: cluster_1 holds the
# k = 2 fit, ..., cluster_4 holds the k = 5 fit.
#
# kmeans() picks its initial centres at random, so the seed makes the fits
# reproducible and `nstart = 25` keeps the best of 25 random starts rather than
# whatever a single initialisation converges to. TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
set.seed(123)

cluster_1 <- kmeans(ndata, centers = 2, nstart = 25)
clusplot(ndata, cluster_1$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

cluster_2 <- kmeans(ndata, centers = 3, nstart = 25)
clusplot(ndata, cluster_2$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

cluster_3 <- kmeans(ndata, centers = 4, nstart = 25)
clusplot(ndata, cluster_3$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

cluster_4 <- kmeans(ndata, centers = 5, nstart = 25)
clusplot(ndata, cluster_4$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

# Show the k = 2..5 partitions on the first two principal axes in a 2 x 2 grid.
# The fitted objects cluster_1..cluster_4 (k = 2..5) are reused instead of
# calling kmeans() again: a fresh fit starts from new random centres and could
# display a different partition from the one already fitted and plotted for
# the same k.
p4 <- fviz_cluster(cluster_1, data = ndata, axes = c(1, 2)) +
  labs(title = "", subtitle = "NData - 2 Clusters")
p5 <- fviz_cluster(cluster_2, data = ndata, axes = c(1, 2)) +
  labs(title = "", subtitle = "NData - 3 Clusters")
p6 <- fviz_cluster(cluster_3, data = ndata, axes = c(1, 2)) +
  labs(title = "", subtitle = "NData - 4 Clusters")
p7 <- fviz_cluster(cluster_4, data = ndata, axes = c(1, 2)) +
  labs(title = "", subtitle = "NData - 5 Clusters")
gridExtra::grid.arrange(p4, p5, p6, p7, ncol = 2, top = "Clustering")

# Agglomerative clustering of the scaled data on dist()'s default distances,
# one tree per linkage rule.
# NOTE(review): ?hclust states that "ward.D" applied to unsquared distances
# does not implement Ward's (1963) criterion, whereas "ward.D2" does -- confirm
# "ward.D" is what is wanted here. The same help page warns that "centroid"
# dendrograms can contain inversions that are hard to interpret.
d.hclust1 <- hclust(d = dist(ndata), method = "average")
d.hclust2 <- hclust(d = dist(ndata), method = "single")
d.hclust3 <- hclust(d = dist(ndata), method = "complete")
d.hclust4 <- hclust(d = dist(ndata), method = "ward.D")
d.hclust5 <- hclust(d = dist(ndata), method = "centroid")

# Draw the dendrograms in the order above, with all leaf labels hanging from
# the baseline (hang = -1), and frame the four-cluster cut of each tree using
# border colours 2 to 5.
invisible(lapply(
  list(d.hclust1, d.hclust2, d.hclust3, d.hclust4, d.hclust5),
  function(tree) {
    plot(tree, hang = -1)
    rect.hclust(tree, k = 4, border = 2:5)
  }
))

# Average linkage (d.hclust1): cut the tree into four clusters and profile them.
member1 <- cutree(d.hclust1, 4)
# Number of observations per cluster.
table(member1)
## member1
## 1 2 3 4
## 74 52 39 35
# Per-cluster means of the scaled variables.
aggregate(ndata, list(member1), mean)
## Group.1 Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 0.8504348 -0.5481816 -0.3719634
## 2 2 -1.0162607 -0.6707740 0.4599265
## 3 3 -0.4408110 0.9891010 1.2364001
## 4 4 0.2030004 1.0534500 -1.2745855
# Per-cluster means on the original scale (first column of `data` excluded).
aggregate(data[,-1], list(member1), mean)
## Group.1 Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 50.72973 46.16216 40.59459
## 2 2 24.65385 42.94231 62.07692
## 3 3 32.69231 86.53846 82.12821
## 4 4 41.68571 88.22857 17.28571
# Single linkage (d.hclust2): cut the tree into four clusters and profile them.
member2 <- cutree(d.hclust2, 4)
# Number of observations per cluster. The recorded output below shows one
# cluster of 197 observations and three singletons.
table(member2)
## member2
## 1 2 3 4
## 1 197 1 1
# Per-cluster means of the scaled variables.
aggregate(ndata, list(member2), mean)
## Group.1 Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 -1.42100291 -1.734646247 -0.43371311
## 2 2 0.01655223 0.002643912 0.01721957
## 3 3 -1.34941586 -1.696572361 -1.71161783
## 4 4 -0.49037128 2.910367847 -1.24692520
# Per-cluster means on the original scale (first column of `data` excluded).
aggregate(data[,-1], list(member2), mean)
## Group.1 Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 19.00000 15.00000 39.00000
## 2 2 39.08122 60.62944 50.64467
## 3 3 20.00000 16.00000 6.00000
## 4 4 32.00000 137.00000 18.00000
# Complete linkage (d.hclust3): cut the tree into four clusters and profile
# them.
member3 <- cutree(d.hclust3, 4)
# Number of observations per cluster.
table(member3)
## member3
## 1 2 3 4
## 69 57 39 35
# Per-cluster means of the scaled variables.
aggregate(ndata, list(member3), mean)
## Group.1 Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 -0.8285941 -0.6768544 0.2391932
## 2 2 1.1799932 -0.5042586 -0.3528675
## 3 3 -0.4408110 0.9891010 1.2364001
## 4 4 0.2030004 1.0534500 -1.2745855
# Per-cluster means on the original scale (first column of `data` excluded).
aggregate(data[,-1], list(member3), mean)
## Group.1 Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 27.27536 42.78261 56.37681
## 2 2 55.33333 47.31579 41.08772
## 3 3 32.69231 86.53846 82.12821
## 4 4 41.68571 88.22857 17.28571
# "ward.D" linkage (d.hclust4): cut the tree into four clusters and profile
# them.
member4 <- cutree(d.hclust4, 4)
# Number of observations per cluster.
table(member4)
## member4
## 1 2 3 4
## 77 52 39 32
# Per-cluster means of the scaled variables.
aggregate(ndata, list(member4), mean)
## Group.1 Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 0.8456104 -0.5044147 -0.3799013
## 2 2 -1.0162607 -0.6707740 0.4599265
## 3 3 -0.4408110 0.9891010 1.2364001
## 4 4 0.1539122 1.0982888 -1.3401058
# Per-cluster means on the original scale (first column of `data` excluded).
aggregate(data[,-1], list(member4), mean)
## Group.1 Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 50.66234 47.31169 40.38961
## 2 2 24.65385 42.94231 62.07692
## 3 3 32.69231 86.53846 82.12821
## 4 4 41.00000 89.40625 15.59375
# Centroid linkage (d.hclust5): cut the tree into four clusters. Only the
# cluster sizes are tabulated for this linkage; the recorded output shows 191
# of the observations in a single cluster.
member5 <- cutree(d.hclust5, 4)
table(member5)
## member5
## 1 2 3 4
## 2 191 4 3
# Silhouette plot of the four-cluster cut of each linkage tree, computed on
# the distances between rows of the scaled data.

# Average linkage.
plot(silhouette(x = cutree(d.hclust1, 4), dist = dist(ndata)))

# Single linkage.
plot(silhouette(x = cutree(d.hclust2, 4), dist = dist(ndata)))

# Complete linkage.
plot(silhouette(x = cutree(d.hclust3, 4), dist = dist(ndata)))

# "ward.D" linkage.
plot(silhouette(x = cutree(d.hclust4, 4), dist = dist(ndata)))

# Centroid linkage.
plot(silhouette(x = cutree(d.hclust5, 4), dist = dist(ndata)))

library(dendextend)
## Warning: package 'dendextend' was built under R version 4.0.5
##
## ---------------------
## Welcome to dendextend version 1.15.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
# Draw a tanglegram for every pair of the five linkage trees.
# combn(5, 2) lists the index pairs column by column in the order
# (1,2), (1,3), (1,4), (1,5), (2,3), (2,4), (2,5), (3,4), (3,5), (4,5).
# local() keeps the helper variables out of the global environment.
invisible(local({
  trees <- list(d.hclust1, d.hclust2, d.hclust3, d.hclust4, d.hclust5)
  index_pairs <- combn(length(trees), 2)
  for (j in seq_len(ncol(index_pairs))) {
    tanglegram(trees[[index_pairs[1, j]]], trees[[index_pairs[2, j]]])
  }
}))
