data <- read.csv ("C:\\Users\\tariqm\\Documents\\R\\Datasets\\state_crime.csv", sep = ",")
ndata <- scale(data[-1])
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
p1 <- fviz_nbclust(data[-1], kmeans, method = "wss") + labs(title = "", subtitle = "Elbow method - Data") + geom_vline(xintercept = 4, linetype = 2)
p2 <- fviz_nbclust(ndata, kmeans, method = "wss") + labs(title = "", subtitle = "Elbow method - NData") + geom_vline(xintercept = 4, linetype = 2)
p3 <- fviz_nbclust(data[-1], kmeans, method = "silhouette") + labs(title = "", subtitle = "Silhouette method - Data") + geom_vline(xintercept = 4, linetype = 2)
p4 <- fviz_nbclust(ndata, kmeans, method = "silhouette") + labs(title = "", subtitle = "Silhouette method - NData") + geom_vline(xintercept = 4, linetype = 2)
p5 <- fviz_nbclust(data[-1], kmeans, method = "gap_stat") + labs(title = "", subtitle = "Gap statistic method - Data") + geom_vline(xintercept = 4, linetype = 2)
p6 <- fviz_nbclust(ndata, kmeans, method = "gap_stat") + labs(title = "", subtitle = "Gap statistic method - NData") + geom_vline(xintercept = 4, linetype = 2)
gridExtra::grid.arrange(p1,p2,p3,p4,p5,p6, top = "Optimal Number of Clusters - Data vs NData")

library(cluster)
## Warning: package 'cluster' was built under R version 4.0.5
cluster_1 <- kmeans(ndata,2)
c1 <- clusplot(ndata, cluster_1$cluster, color = T, shade = T, labels = 2, lines = 0)

cluster_2 <- kmeans(ndata,3)
c2 <- clusplot(ndata, cluster_2$cluster, color = T, shade = T, labels = 2, lines = 0)

cluster_3 <- kmeans(ndata,4)
c3 <- clusplot(ndata, cluster_3$cluster, color = T, shade = T, labels = 2, lines = 0)

cluster_4 <- kmeans(ndata,5)
c4 <- clusplot(ndata, cluster_4$cluster, color = T, shade = T, labels = 2, lines = 0)

p7 <- fviz_cluster( kmeans(data[-1],2), data = data[-1], axes = c(1,2)) + labs(title = "", subtitle = "Data - 2 Clusters")
p8 <- fviz_cluster( kmeans(ndata,2), data = ndata, axes = c(1,2)) + labs(title = "", subtitle = "NData - 2 Clusters")
p9 <- fviz_cluster( kmeans(data[-1],3), data = data[-1], axes = c(1,2)) + labs(title = "", subtitle = "Data - 3 Clusters")
p10 <- fviz_cluster( kmeans(ndata,3), data = ndata, axes = c(1,2)) + labs(title = "", subtitle = "NData - 3 Clusters")
gridExtra::grid.arrange(p7,p8,p9,p10, ncol = 2, top = "Clustering - Data vs Ndata")

p11 <- fviz_cluster( kmeans(data[-1],4), data = data[-1], axes = c(1,2)) + labs(title = "", subtitle = "Data - 4 Clusters")
p12 <- fviz_cluster( kmeans(ndata,4), data = ndata, axes = c(1,2)) + labs(title = "", subtitle = "NData - 4 Clusters")
p13 <- fviz_cluster( kmeans(data[-1],5), data = data[-1], axes = c(1,2)) + labs(title = "", subtitle = "Data - 5 Clusters")
p14 <- fviz_cluster( kmeans(ndata,5), data = ndata, axes = c(1,2)) + labs(title = "", subtitle = "NData - 5 Clusters")
gridExtra::grid.arrange(p11,p12,p13,p14, ncol = 2, top = "Clustering - Data vs Ndata")

d.hclust1 <- hclust(dist(ndata), method = "average")
d.hclust2 <- hclust(dist(ndata), method = "single")
d.hclust3 <- hclust(dist(ndata), method = "complete")
d.hclust4 <- hclust(dist(ndata), method = "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
plot(d.hclust1, hang = -1, labels = data$state)
rect.hclust(d.hclust1, k = 4, border = 2:5)

member1 <- cutree(d.hclust1, 4)
table(member1)
## member1
## 1 2 3 4
## 7 1 12 30
aggregate(ndata, list(member1), mean)
## Group.1 Murder Assault UrbanPop Rape
## 1 1 1.5803956 0.9662584 -0.7775109 0.04844071
## 2 2 0.5078625 1.1068225 -1.2117642 2.48420294
## 3 3 0.7106707 1.0338263 0.8838371 1.17633437
## 4 4 -0.6699560 -0.6758849 -0.1317235 -0.56464334
aggregate(data[,-1], list(member1), mean)
## Group.1 Murder Assault UrbanPop Rape
## 1 1 14.67143 251.2857 54.28571 21.68571
## 2 2 10.00000 263.0000 48.00000 44.50000
## 3 3 10.88333 256.9167 78.33333 32.25000
## 4 4 4.87000 114.4333 63.63333 15.94333
plot(d.hclust2, hang = -1, labels = data$state)
rect.hclust(d.hclust2, k = 4, border = 2:5)

member2 <- cutree(d.hclust2, 4)
table(member2)
## member2
## 1 2 3 4
## 46 1 2 1
aggregate(ndata, list(member2), mean)
## Group.1 Murder Assault UrbanPop Rape
## 1 1 -0.07710374 -0.1155488 -0.05683055 -0.181203
## 2 2 0.50786248 1.1068225 -1.21176419 2.484203
## 3 3 0.64561903 1.1188219 1.41349461 2.356085
## 4 4 1.74767144 1.9707777 0.99898006 1.138967
aggregate(data[,-1], list(member2), mean)
## Group.1 Murder Assault UrbanPop Rape
## 1 1 7.452174 161.1304 64.71739 19.53478
## 2 2 10.000000 263.0000 48.00000 44.50000
## 3 3 10.600000 264.0000 86.00000 43.30000
## 4 4 15.400000 335.0000 80.00000 31.90000
plot(d.hclust3, hang = -1, labels = data$state)
rect.hclust(d.hclust3, k = 4, border = 2:5)

member3 <- cutree(d.hclust3, 4)
table(member3)
## member3
## 1 2 3 4
## 8 11 21 10
aggregate(ndata, list(member3), mean)
## Group.1 Murder Assault UrbanPop Rape
## 1 1 1.4463290 0.9838289 -0.8317925 0.3529110
## 2 2 0.7499801 1.1199128 0.9361748 1.2156432
## 3 3 -0.4400338 -0.4353831 0.3607592 -0.2830385
## 4 4 -1.0579703 -1.1046626 -1.1219527 -1.0251554
aggregate(data[,-1], list(member3), mean)
## Group.1 Murder Assault UrbanPop Rape
## 1 1 14.087500 252.7500 53.50000 24.53750
## 2 2 11.054545 264.0909 79.09091 32.61818
## 3 3 5.871429 134.4762 70.76190 18.58095
## 4 4 3.180000 78.7000 49.30000 11.63000
plot(d.hclust4, hang = -1, labels = data$state)
rect.hclust(d.hclust4, k = 4, border = 2:5)

member4 <- cutree(d.hclust4, 4)
table(member4)
## member4
## 1 2 3 4
## 7 12 19 12
aggregate(ndata, list(member4), mean)
## Group.1 Murder Assault UrbanPop Rape
## 1 1 1.5803956 0.9662584 -0.7775109 0.04844071
## 2 2 0.7298036 1.1188219 0.7571799 1.32135653
## 3 3 -0.3621789 -0.3444705 0.3953887 -0.21863180
## 4 4 -1.0782511 -1.1370610 -0.9296640 -1.00344660
aggregate(data[,-1], list(member4), mean)
## Group.1 Murder Assault UrbanPop Rape
## 1 1 14.671429 251.2857 54.28571 21.68571
## 2 2 10.966667 264.0000 76.50000 33.60833
## 3 3 6.210526 142.0526 71.26316 19.18421
## 4 4 3.091667 76.0000 52.08333 11.83333
plot(silhouette(cutree(d.hclust1,4), dist(ndata)))

plot(silhouette(cutree(d.hclust2,4), dist(ndata)))

plot(silhouette(cutree(d.hclust3,4), dist(ndata)))

plot(silhouette(cutree(d.hclust4,4), dist(ndata)))

library(dendextend)
## Warning: package 'dendextend' was built under R version 4.0.5
##
## ---------------------
## Welcome to dendextend version 1.15.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
tanglegram(d.hclust1,d.hclust2)

tanglegram(d.hclust1,d.hclust3)

tanglegram(d.hclust1,d.hclust4)

tanglegram(d.hclust2,d.hclust3)

tanglegram(d.hclust2,d.hclust4)

tanglegram(d.hclust3,d.hclust4)

library(factoextra)
library("FactoMineR")
## Warning: package 'FactoMineR' was built under R version 4.0.5
res.pca <- PCA(ndata, graph = FALSE)
eig.val <- get_eigenvalue(res.pca)
res.pca$eig
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.4802416 62.006039 62.00604
## comp 2 0.9897652 24.744129 86.75017
## comp 3 0.3565632 8.914080 95.66425
## comp 4 0.1734301 4.335752 100.00000
fviz_eig(res.pca, addlabels = TRUE)

fviz_pca_var(res.pca, col.var = "black")

fviz_cos2(res.pca, choice = "var")

fviz_pca_var(res.pca, col.var = "cos2", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE)

fviz_pca_var(res.pca, alpha.var = "cos2")

c1 <- fviz_contrib(res.pca, choice = "var", axes = 1)
c2 <- fviz_contrib(res.pca, choice = "var", axes = 2)
c3 <- fviz_contrib(res.pca, choice = "var", axes = 3)
c4 <- fviz_contrib(res.pca, choice = "var", axes = 4)
gridExtra::grid.arrange(c1,c2,c3,c4)

fviz_pca_var(res.pca, col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))

set.seed(123)
my.cont.var <- rnorm(4)
fviz_pca_var(res.pca, col.var = my.cont.var,
gradient.cols = c("blue", "yellow", "red"),
legend.title = "Cont.Var")

#set.seed(123)
#res.km <- kmeans(var$coord, centers = 2, nstart = 25)
#grp <- as.factor(res.km$cluster)
#fviz_pca_var(res.pca, col.var = grp,
# palette = c("#0073C2FF", "#EFC000FF", "#868686FF"),
# legend.title = "Cluster")
res.desc <- dimdesc(res.pca, axes = c(1,2), proba = 0.05)
res.desc$Dim.1
## $quanti
## correlation p.value
## Assault 0.9184432 5.757351e-21
## Rape 0.8558394 2.403444e-15
## Murder 0.8439764 1.393084e-14
## UrbanPop 0.4381168 1.461945e-03
##
## attr(,"class")
## [1] "condes" "list"
fviz_pca_ind(res.pca, col.ind = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)

fviz_pca_ind(res.pca, pointsize = "cos2",
pointshape = 21, fill = "#E7B800",
repel = TRUE)

fviz_pca_ind(res.pca, col.ind = "cos2", pointsize = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)

fviz_cos2(res.pca, choice = "ind") + coord_flip()

fviz_contrib(res.pca, choice = "ind", axes = 1:2) + coord_flip()

set.seed(123)
my.cont.var <- rnorm(50)
fviz_pca_ind(res.pca, col.ind = my.cont.var,
gradient.cols = c("blue", "yellow", "red"),
legend.title = "Cont.Var")
