data <- read.csv ("C:\\Users\\tariqm\\Documents\\R\\Datasets\\state_crime.csv", sep = ",")

ndata <- scale(data[-1])

library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
p1 <- fviz_nbclust(data[-1], kmeans, method = "wss") + labs(title = "", subtitle = "Elbow method - Data") + geom_vline(xintercept = 4, linetype = 2)
      
p2 <- fviz_nbclust(ndata, kmeans, method = "wss")  + labs(title = "", subtitle = "Elbow method - NData") + geom_vline(xintercept = 4, linetype = 2)
     
p3 <- fviz_nbclust(data[-1], kmeans, method = "silhouette")  + labs(title = "", subtitle = "Silhouette method - Data") + geom_vline(xintercept = 4, linetype = 2)
     
p4 <- fviz_nbclust(ndata, kmeans, method = "silhouette")  + labs(title = "", subtitle = "Silhouette method - NData") + geom_vline(xintercept = 4, linetype = 2)

p5 <- fviz_nbclust(data[-1], kmeans, method = "gap_stat")  + labs(title = "", subtitle = "Gap statistic method - Data") + geom_vline(xintercept = 4, linetype = 2)

p6 <- fviz_nbclust(ndata, kmeans, method = "gap_stat")  + labs(title = "", subtitle = "Gap statistic method - NData") + geom_vline(xintercept = 4, linetype = 2)
     
gridExtra::grid.arrange(p1,p2,p3,p4,p5,p6, top = "Optimal Number of Clusters - Data vs NData")

library(cluster)
## Warning: package 'cluster' was built under R version 4.0.5
cluster_1 <- kmeans(ndata,2)
c1 <- clusplot(ndata, cluster_1$cluster, color = T, shade = T, labels = 2, lines = 0)

cluster_2 <- kmeans(ndata,3)
c2 <- clusplot(ndata, cluster_2$cluster, color = T, shade = T, labels = 2, lines = 0)

cluster_3 <- kmeans(ndata,4)
c3 <- clusplot(ndata, cluster_3$cluster, color = T, shade = T, labels = 2, lines = 0)

cluster_4 <- kmeans(ndata,5)
c4 <- clusplot(ndata, cluster_4$cluster, color = T, shade = T, labels = 2, lines = 0)

p7 <- fviz_cluster( kmeans(data[-1],2), data = data[-1], axes = c(1,2)) + labs(title = "", subtitle = "Data - 2 Clusters")

p8 <- fviz_cluster( kmeans(ndata,2), data = ndata, axes = c(1,2))  + labs(title = "", subtitle = "NData - 2 Clusters") 

p9 <- fviz_cluster( kmeans(data[-1],3), data = data[-1], axes = c(1,2))  + labs(title = "", subtitle = "Data - 3 Clusters") 

p10 <- fviz_cluster( kmeans(ndata,3), data = ndata, axes = c(1,2))  + labs(title = "", subtitle = "NData - 3 Clusters") 

gridExtra::grid.arrange(p7,p8,p9,p10, ncol = 2, top = "Clustering - Data vs Ndata")

p11 <- fviz_cluster( kmeans(data[-1],4), data = data[-1], axes = c(1,2))  + labs(title = "", subtitle = "Data - 4 Clusters") 

p12 <- fviz_cluster( kmeans(ndata,4), data = ndata, axes = c(1,2))  + labs(title = "", subtitle = "NData - 4 Clusters") 

p13 <- fviz_cluster( kmeans(data[-1],5), data = data[-1], axes = c(1,2))  + labs(title = "", subtitle = "Data - 5 Clusters") 

p14 <- fviz_cluster( kmeans(ndata,5), data = ndata, axes = c(1,2))  + labs(title = "", subtitle = "NData - 5 Clusters") 

gridExtra::grid.arrange(p11,p12,p13,p14, ncol = 2, top = "Clustering - Data vs Ndata")

d.hclust1 <- hclust(dist(ndata), method = "average")
d.hclust2 <- hclust(dist(ndata), method = "single")
d.hclust3 <- hclust(dist(ndata), method = "complete")
d.hclust4 <- hclust(dist(ndata), method = "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
plot(d.hclust1, hang = -1, labels = data$state)
rect.hclust(d.hclust1, k = 4, border = 2:5)

member1 <- cutree(d.hclust1, 4)
table(member1)
## member1
##  1  2  3  4 
##  7  1 12 30
aggregate(ndata, list(member1), mean)
##   Group.1     Murder    Assault   UrbanPop        Rape
## 1       1  1.5803956  0.9662584 -0.7775109  0.04844071
## 2       2  0.5078625  1.1068225 -1.2117642  2.48420294
## 3       3  0.7106707  1.0338263  0.8838371  1.17633437
## 4       4 -0.6699560 -0.6758849 -0.1317235 -0.56464334
aggregate(data[,-1], list(member1), mean)
##   Group.1   Murder  Assault UrbanPop     Rape
## 1       1 14.67143 251.2857 54.28571 21.68571
## 2       2 10.00000 263.0000 48.00000 44.50000
## 3       3 10.88333 256.9167 78.33333 32.25000
## 4       4  4.87000 114.4333 63.63333 15.94333
plot(d.hclust2, hang = -1, labels = data$state)
rect.hclust(d.hclust2, k = 4, border = 2:5)

member2 <- cutree(d.hclust2, 4)
table(member2)
## member2
##  1  2  3  4 
## 46  1  2  1
aggregate(ndata, list(member2), mean)
##   Group.1      Murder    Assault    UrbanPop      Rape
## 1       1 -0.07710374 -0.1155488 -0.05683055 -0.181203
## 2       2  0.50786248  1.1068225 -1.21176419  2.484203
## 3       3  0.64561903  1.1188219  1.41349461  2.356085
## 4       4  1.74767144  1.9707777  0.99898006  1.138967
aggregate(data[,-1], list(member2), mean)
##   Group.1    Murder  Assault UrbanPop     Rape
## 1       1  7.452174 161.1304 64.71739 19.53478
## 2       2 10.000000 263.0000 48.00000 44.50000
## 3       3 10.600000 264.0000 86.00000 43.30000
## 4       4 15.400000 335.0000 80.00000 31.90000
plot(d.hclust3, hang = -1, labels = data$state)
rect.hclust(d.hclust3, k = 4, border = 2:5)

member3 <- cutree(d.hclust3, 4)
table(member3)
## member3
##  1  2  3  4 
##  8 11 21 10
aggregate(ndata, list(member3), mean)
##   Group.1     Murder    Assault   UrbanPop       Rape
## 1       1  1.4463290  0.9838289 -0.8317925  0.3529110
## 2       2  0.7499801  1.1199128  0.9361748  1.2156432
## 3       3 -0.4400338 -0.4353831  0.3607592 -0.2830385
## 4       4 -1.0579703 -1.1046626 -1.1219527 -1.0251554
aggregate(data[,-1], list(member3), mean)
##   Group.1    Murder  Assault UrbanPop     Rape
## 1       1 14.087500 252.7500 53.50000 24.53750
## 2       2 11.054545 264.0909 79.09091 32.61818
## 3       3  5.871429 134.4762 70.76190 18.58095
## 4       4  3.180000  78.7000 49.30000 11.63000
plot(d.hclust4, hang = -1, labels = data$state)
rect.hclust(d.hclust4, k = 4, border = 2:5)

member4 <- cutree(d.hclust4, 4)
table(member4)
## member4
##  1  2  3  4 
##  7 12 19 12
aggregate(ndata, list(member4), mean)
##   Group.1     Murder    Assault   UrbanPop        Rape
## 1       1  1.5803956  0.9662584 -0.7775109  0.04844071
## 2       2  0.7298036  1.1188219  0.7571799  1.32135653
## 3       3 -0.3621789 -0.3444705  0.3953887 -0.21863180
## 4       4 -1.0782511 -1.1370610 -0.9296640 -1.00344660
aggregate(data[,-1], list(member4), mean)
##   Group.1    Murder  Assault UrbanPop     Rape
## 1       1 14.671429 251.2857 54.28571 21.68571
## 2       2 10.966667 264.0000 76.50000 33.60833
## 3       3  6.210526 142.0526 71.26316 19.18421
## 4       4  3.091667  76.0000 52.08333 11.83333
plot(silhouette(cutree(d.hclust1,4), dist(ndata))) 

plot(silhouette(cutree(d.hclust2,4), dist(ndata)))

plot(silhouette(cutree(d.hclust3,4), dist(ndata)))

plot(silhouette(cutree(d.hclust4,4), dist(ndata))) 

library(dendextend)
## Warning: package 'dendextend' was built under R version 4.0.5
## 
## ---------------------
## Welcome to dendextend version 1.15.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
tanglegram(d.hclust1,d.hclust2)

tanglegram(d.hclust1,d.hclust3)

tanglegram(d.hclust1,d.hclust4)

tanglegram(d.hclust2,d.hclust3)

tanglegram(d.hclust2,d.hclust4)

tanglegram(d.hclust3,d.hclust4)

library(factoextra)
library("FactoMineR")
## Warning: package 'FactoMineR' was built under R version 4.0.5
res.pca <- PCA(ndata, graph = FALSE)
eig.val <- get_eigenvalue(res.pca)
res.pca$eig
##        eigenvalue percentage of variance cumulative percentage of variance
## comp 1  2.4802416              62.006039                          62.00604
## comp 2  0.9897652              24.744129                          86.75017
## comp 3  0.3565632               8.914080                          95.66425
## comp 4  0.1734301               4.335752                         100.00000
fviz_eig(res.pca, addlabels = TRUE)

fviz_pca_var(res.pca, col.var = "black")

fviz_cos2(res.pca, choice = "var")

fviz_pca_var(res.pca, col.var = "cos2", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE)

fviz_pca_var(res.pca, alpha.var = "cos2")

c1 <- fviz_contrib(res.pca, choice = "var", axes = 1)
c2 <- fviz_contrib(res.pca, choice = "var", axes = 2)
c3 <- fviz_contrib(res.pca, choice = "var", axes = 3)
c4 <- fviz_contrib(res.pca, choice = "var", axes = 4)

gridExtra::grid.arrange(c1,c2,c3,c4)

fviz_pca_var(res.pca, col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))

set.seed(123)
my.cont.var <- rnorm(4)
fviz_pca_var(res.pca, col.var = my.cont.var,
             gradient.cols = c("blue", "yellow", "red"),
             legend.title = "Cont.Var")

#set.seed(123)
#res.km <- kmeans(var$coord, centers = 2, nstart = 25)
#grp <- as.factor(res.km$cluster)
#fviz_pca_var(res.pca, col.var = grp, 
#             palette = c("#0073C2FF", "#EFC000FF", "#868686FF"),
#             legend.title = "Cluster")

res.desc <- dimdesc(res.pca, axes = c(1,2), proba = 0.05)
res.desc$Dim.1
## $quanti
##          correlation      p.value
## Assault    0.9184432 5.757351e-21
## Rape       0.8558394 2.403444e-15
## Murder     0.8439764 1.393084e-14
## UrbanPop   0.4381168 1.461945e-03
## 
## attr(,"class")
## [1] "condes" "list"
fviz_pca_ind(res.pca, col.ind = "cos2", 
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)

fviz_pca_ind(res.pca, pointsize = "cos2", 
             pointshape = 21, fill = "#E7B800",
             repel = TRUE)

fviz_pca_ind(res.pca, col.ind = "cos2", pointsize = "cos2",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE) 

fviz_cos2(res.pca, choice = "ind") + coord_flip()

fviz_contrib(res.pca, choice = "ind", axes = 1:2) + coord_flip()

set.seed(123)
my.cont.var <- rnorm(50)

fviz_pca_ind(res.pca, col.ind = my.cont.var,
             gradient.cols = c("blue", "yellow", "red"),
             legend.title = "Cont.Var")