# Load the customer data set from a comma-separated file.
data <- read.csv(
  file = "C:\\Users\\tariqm\\Documents\\R\\Datasets\\Mall_Customers.csv",
  sep = ","
)

# Standardise every column except the first two; scale() centres each column
# and divides it by its standard deviation.
ndata <- scale(data[, -(1:2)])

library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Diagnostics for the number of k-means clusters on the standardised data:
# total within-cluster sum of squares for each candidate k ...
fviz_nbclust(x = ndata, FUNcluster = kmeans, method = "wss")

# ... and the silhouette criterion for each candidate k.
fviz_nbclust(x = ndata, FUNcluster = kmeans, method = "silhouette")

library(cluster) 
## Warning: package 'cluster' was built under R version 4.0.5
# Gap statistic for up to 10 k-means clusters (25 random starts per fit,
# 50 simulated reference data sets).
# It is computed on the standardised matrix `ndata`, the same input used by
# the other cluster-count diagnostics and by the clustering itself, so that
# columns with a larger numeric range do not dominate the distances.
# Seeding the RNG makes the random starts and reference sets reproducible.
set.seed(123)
stat_gap <- clusGap(
  ndata,
  FUNcluster = kmeans,  # spelled out; `FUN` only worked via partial matching
  nstart = 25,
  K.max = 10,
  B = 50
)
fviz_gap_stat(stat_gap)

# Pairwise Euclidean distances between the standardised observations
# ("euclidean" is dist()'s default method, written out here).
distance <- dist(x = ndata, method = "euclidean")

# Heatmap of the distance matrix: green for small, red for large distances.
fviz_dist(
  dist.obj = distance,
  gradient = list(low = "green", mid = "white", high = "red")
)

# Agglomerative clustering of the standardised observations with complete
# linkage (hclust()'s default method, written out for clarity).
data.hclust1 <- hclust(distance, method = "complete")

# Dendrogram with all leaves drawn down to the baseline (hang = -1).
# No `labels` argument is passed, so the leaves use hclust's default labels.
# NOTE(review): the earlier `labels = data$state` looked like a leftover from
# another data set; no `state` column is apparent here, in which case it
# evaluated to NULL and had no effect. Confirm against the CSV header.
plot(data.hclust1, hang = -1)

# Outline the four-cluster cut on the dendrogram.
rect.hclust(data.hclust1, k = 4, border = "green")

# Agglomerative clustering of the standardised observations with average
# linkage, for comparison with the complete-linkage tree.
data.hclust2 <- hclust(distance, method = "average")

# Dendrogram with all leaves drawn down to the baseline (hang = -1).
# No `labels` argument is passed, so the leaves use hclust's default labels.
# NOTE(review): the earlier `labels = data$state` looked like a leftover from
# another data set; no `state` column is apparent here, in which case it
# evaluated to NULL and had no effect. Confirm against the CSV header.
plot(data.hclust2, hang = -1)

# Outline the four-cluster cut on the dendrogram.
rect.hclust(data.hclust2, k = 4, border = "green")

# Cluster membership from cutting the complete-linkage tree into 4 groups.
member1 <- cutree(data.hclust1, k = 4)

# Cluster sizes.
table(member1)
## member1
##  1  2  3  4 
## 69 57 39 35

# Per-cluster means of the standardised features.
aggregate(ndata, by = list(member1), FUN = mean)
##   Group.1        Age Annual.Income..k.. Spending.Score..1.100.
## 1       1 -0.8285941         -0.6768544              0.2391932
## 2       2  1.1799932         -0.5042586             -0.3528675
## 3       3 -0.4408110          0.9891010              1.2364001
## 4       4  0.2030004          1.0534500             -1.2745855

# Per-cluster means of the same features on their original scale.
aggregate(data[, -c(1, 2)], by = list(member1), FUN = mean)
##   Group.1      Age Annual.Income..k.. Spending.Score..1.100.
## 1       1 27.27536           42.78261               56.37681
## 2       2 55.33333           47.31579               41.08772
## 3       3 32.69231           86.53846               82.12821
## 4       4 41.68571           88.22857               17.28571

# Silhouette plot for the 4-cluster cut. The cutree() call is kept inline
# because the plot title is built from the silhouette() call as written.
plot(
  silhouette(cutree(data.hclust1, 4), distance)
)

# Cluster membership from cutting the average-linkage tree into 4 groups.
member2 <- cutree(data.hclust2, k = 4)

# Cluster sizes.
table(member2)
## member2
##  1  2  3  4 
## 74 52 39 35

# Per-cluster means of the standardised features.
aggregate(ndata, by = list(member2), FUN = mean)
##   Group.1        Age Annual.Income..k.. Spending.Score..1.100.
## 1       1  0.8504348         -0.5481816             -0.3719634
## 2       2 -1.0162607         -0.6707740              0.4599265
## 3       3 -0.4408110          0.9891010              1.2364001
## 4       4  0.2030004          1.0534500             -1.2745855

# Per-cluster means of the same features on their original scale.
aggregate(data[, -c(1, 2)], by = list(member2), FUN = mean)
##   Group.1      Age Annual.Income..k.. Spending.Score..1.100.
## 1       1 50.72973           46.16216               40.59459
## 2       2 24.65385           42.94231               62.07692
## 3       3 32.69231           86.53846               82.12821
## 4       4 41.68571           88.22857               17.28571

# Silhouette plot for the 4-cluster cut. The cutree() call is kept inline
# because the plot title is built from the silhouette() call as written.
plot(
  silhouette(cutree(data.hclust2, 4), distance)
)

# k-means with four clusters on the standardised features. The seed makes the
# random initial centres reproducible; several random starts (the best fit is
# kept) reduce the risk of ending in a poor local optimum.
set.seed(123)
kc <- kmeans(ndata, centers = 4, nstart = 25)

# Two-dimensional cluster plot. It is drawn from `ndata`, the matrix that was
# actually clustered, rather than the raw data frame whose first two columns
# were excluded from the clustering.
clusplot(ndata, kc$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)

# Show the same fitted partition `kc`; refitting k-means here would start
# from new random centres and could display different clusters.
fviz_cluster(kc, data = ndata, axes = c(1, 2))

library(psych)
## Warning: package 'psych' was built under R version 4.0.5
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Scatter-plot matrix of the feature columns on their original scale
# (first two columns excluded), with no gap between panels and filled
# circle markers.
pairs.panels(
  x = data[, -c(1, 2)],
  pch = 21,
  bg = c("red", "yellow", "blue"),
  gap = 0
)

# Principal component analysis of the feature columns (first two columns
# excluded); each variable is centred and scaled to unit variance first.
pc <- prcomp(x = data[, -c(1, 2)], center = TRUE, scale. = TRUE)

# Components stored in the fitted object.
attributes(pc)
## $names
## [1] "sdev"     "rotation" "center"   "scale"    "x"       
## 
## $class
## [1] "prcomp"

# Column means used for centring.
pc$center
##                    Age     Annual.Income..k.. Spending.Score..1.100. 
##                  38.85                  60.56                  50.20

# Column standard deviations used for scaling.
pc$scale
##                    Age     Annual.Income..k.. Spending.Score..1.100. 
##               13.96901               26.26472               25.82352

# Standard deviations of the components and the loading (rotation) matrix.
print(pc)
## Standard deviations (1, .., p=3):
## [1] 1.1523823 0.9996256 0.8202217
## 
## Rotation (n x k) = (3 x 3):
##                                PC1         PC2         PC3
## Age                     0.70638235 -0.03014116 0.707188441
## Annual.Income..k..     -0.04802398 -0.99883160 0.005397916
## Spending.Score..1.100. -0.70619946  0.03777499 0.707004506

# Share of the total variance carried by each component.
summary(pc)
## Importance of components:
##                           PC1    PC2    PC3
## Standard deviation     1.1524 0.9996 0.8202
## Proportion of Variance 0.4427 0.3331 0.2243
## Cumulative Proportion  0.4427 0.7758 1.0000
# Scatter-plot matrix of the principal component scores, drawn with the same
# panel settings as the plot of the original features.
pairs.panels(
  x = pc$x,
  pch = 21,
  bg = c("red", "yellow", "blue"),
  gap = 0
)

library(devtools)
## Warning: package 'devtools' was built under R version 4.0.5
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.0.5
library(ggbiplot)
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 4.0.5
## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.5
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
## Loading required package: grid
# PCA biplot with observations grouped by the Gender column. An ellipse is
# drawn per group (ellipse.prob = 0.68) together with a reference circle; the
# legend has no title and is laid out horizontally above the plot.
g <- ggbiplot(
  pc,
  obs.scale = 1,
  var.scale = 1,
  groups = data$Gender,
  ellipse = TRUE,
  circle = TRUE,
  ellipse.prob = 0.68
) +
  scale_color_discrete(name = "") +
  theme(legend.direction = "horizontal", legend.position = "top")

print(g)