library("factoextra")
## Warning: package 'factoextra' was built under R version 4.1.1
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
data (decathlon2)
head (decathlon2)
## X100m Long.jump Shot.put High.jump X400m X110m.hurdle Discus
## SEBRLE 11.04 7.58 14.83 2.07 49.81 14.69 43.75
## CLAY 10.76 7.40 14.26 1.86 49.37 14.05 50.72
## BERNARD 11.02 7.23 14.25 1.92 48.93 14.99 40.87
## YURKOV 11.34 7.09 15.19 2.10 50.42 15.31 46.26
## ZSIVOCZKY 11.13 7.30 13.48 2.01 48.62 14.17 45.67
## McMULLEN 10.83 7.31 13.76 2.13 49.91 14.38 44.41
## Pole.vault Javeline X1500m Rank Points Competition
## SEBRLE 5.02 63.19 291.7 1 8217 Decastar
## CLAY 4.92 60.15 301.5 2 8122 Decastar
## BERNARD 5.32 62.77 280.1 4 8067 Decastar
## YURKOV 4.72 63.44 276.4 5 8036 Decastar
## ZSIVOCZKY 4.42 55.37 268.0 7 8004 Decastar
## McMULLEN 4.42 56.37 285.1 8 7995 Decastar
decathlon2.active <- decathlon2[1:23, 1:10]
head(decathlon2.active[, 1:6], 4)
## X100m Long.jump Shot.put High.jump X400m X110m.hurdle
## SEBRLE 11.04 7.58 14.83 2.07 49.81 14.69
## CLAY 10.76 7.40 14.26 1.86 49.37 14.05
## BERNARD 11.02 7.23 14.25 1.92 48.93 14.99
## YURKOV 11.34 7.09 15.19 2.10 50.42 15.31
library("FactoMineR")
## Warning: package 'FactoMineR' was built under R version 4.1.1
res.pca <- PCA(decathlon2.active, graph = FALSE)
print(res.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 23 individuals, described by 10 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
library("factoextra")
eig.val<-get_eigenvalue(res.pca)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 4.1242133 41.242133 41.24213
## Dim.2 1.8385309 18.385309 59.62744
## Dim.3 1.2391403 12.391403 72.01885
## Dim.4 0.8194402 8.194402 80.21325
## Dim.5 0.7015528 7.015528 87.22878
## Dim.6 0.4228828 4.228828 91.45760
## Dim.7 0.3025817 3.025817 94.48342
## Dim.8 0.2744700 2.744700 97.22812
## Dim.9 0.1552169 1.552169 98.78029
## Dim.10 0.1219710 1.219710 100.00000
library("factoextra")
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))
var <- get_pca_var(res.pca)
var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
# Coordenadas
head (var$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m -0.8506257 -0.17939806 0.3015564 0.03357320 -0.1944440
## Long.jump 0.7941806 0.28085695 -0.1905465 -0.11538956 0.2331567
## Shot.put 0.7339127 0.08540412 0.5175978 0.12846837 -0.2488129
## High.jump 0.6100840 -0.46521415 0.3300852 0.14455012 0.4027002
## X400m -0.7016034 0.29017826 0.2835329 0.43082552 0.1039085
## X110m.hurdle -0.7641252 -0.02474081 0.4488873 -0.01689589 0.2242200
# Cos2: calidad en el mapa de factores
head (var$cos2)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m 0.7235641 0.0321836641 0.09093628 0.0011271597 0.03780845
## Long.jump 0.6307229 0.0788806285 0.03630798 0.0133147506 0.05436203
## Shot.put 0.5386279 0.0072938636 0.26790749 0.0165041211 0.06190783
## High.jump 0.3722025 0.2164242070 0.10895622 0.0208947375 0.16216747
## X400m 0.4922473 0.0842034209 0.08039091 0.1856106269 0.01079698
## X110m.hurdle 0.5838873 0.0006121077 0.20149984 0.0002854712 0.05027463
# Contribuciones de los principales componentes
head (var$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m 17.544293 1.7505098 7.338659 0.13755240 5.389252
## Long.jump 15.293168 4.2904162 2.930094 1.62485936 7.748815
## Shot.put 13.060137 0.3967224 21.620432 2.01407269 8.824401
## High.jump 9.024811 11.7715838 8.792888 2.54987951 23.115504
## X400m 11.935544 4.5799296 6.487636 22.65090599 1.539012
## X110m.hurdle 14.157544 0.0332933 16.261261 0.03483735 7.166193
# Coordinates of variables
head(var$coord, 4)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m -0.8506257 -0.17939806 0.3015564 0.0335732 -0.1944440
## Long.jump 0.7941806 0.28085695 -0.1905465 -0.1153896 0.2331567
## Shot.put 0.7339127 0.08540412 0.5175978 0.1284684 -0.2488129
## High.jump 0.6100840 -0.46521415 0.3300852 0.1445501 0.4027002
fviz_pca_var(res.pca, col.var="black")
head (var$cos2, 4)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m 0.7235641 0.032183664 0.09093628 0.00112716 0.03780845
## Long.jump 0.6307229 0.078880629 0.03630798 0.01331475 0.05436203
## Shot.put 0.5386279 0.007293864 0.26790749 0.01650412 0.06190783
## High.jump 0.3722025 0.216424207 0.10895622 0.02089474 0.16216747
library("corrplot")
## corrplot 0.91 loaded
corrplot(var$cos2, is.corr=FALSE)
# Toral cos2 de variables en Dim.1 y Dim.2
fviz_cos2(res.pca, choice = "var", axes = 1:2)
# Color by cos2 values: quality on the factor map
fviz_pca_var(res.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE
)
head(var$contrib, 4)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m 17.544293 1.7505098 7.338659 0.1375524 5.389252
## Long.jump 15.293168 4.2904162 2.930094 1.6248594 7.748815
## Shot.put 13.060137 0.3967224 21.620432 2.0140727 8.824401
## High.jump 9.024811 11.7715838 8.792888 2.5498795 23.115504
library(corrplot)
corrplot(var$contrib, is.corr=FALSE)
# Contribución de varibales en PC1
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)
# Contribución de varibales en PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)
fviz_contrib(res.pca, choice = "var", axes = 1:2, top = 10)
fviz_pca_var(res.pca, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)
fviz_pca_var(res.pca, alpha.var = "contrib")
# Create a random continuous variable of length 10
set.seed(123)
my.cont.var<-rnorm(10)
# Color variables by the continuous variable
fviz_pca_var(res.pca, col.var = my.cont.var,
gradient.cols=c("blue", "yellow", "red"),
legend.title = "Cont.Var")
# Create a grouping variable using kmeans
# Create 3 groups of variables (center = 3)
set.seed(123)
res.km<-kmeans(var$coord, centers = 3, nstart = 25)
grp <- as.factor(res.km$cluster)
# Color variables by groups
fviz_pca_var(res.pca, col.var = grp,
palette = c("#0073C2FF", "#EFC000FF", "#868686FF"),
legend.tittle = "Cluster")
res.desc<-dimdesc(res.pca, axes = c(1,2), proba=0.05)
# Descipción de la dimensión 1
res.desc$Dim.1
## $quanti
## correlation p.value
## Long.jump 0.7941806 6.059893e-06
## Discus 0.7432090 4.842563e-05
## Shot.put 0.7339127 6.723102e-05
## High.jump 0.6100840 1.993677e-03
## Javeline 0.4282266 4.149192e-02
## X400m -0.7016034 1.910387e-04
## X110m.hurdle -0.7641252 2.195812e-05
## X100m -0.8506257 2.727129e-07
##
## attr(,"class")
## [1] "condes" "list"
# Descipción de la dimensión 2
res.desc$Dim.2
## $quanti
## correlation p.value
## Pole.vault 0.8074511 3.205016e-06
## X1500m 0.7844802 9.384747e-06
## High.jump -0.4652142 2.529390e-02
##
## attr(,"class")
## [1] "condes" "list"
ind<-get_pca_ind(res.pca)
ind
## Principal Component Analysis Results for individuals
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the individuals"
## 2 "$cos2" "Cos2 for the individuals"
## 3 "$contrib" "contributions of the individuals"
# Coordenada de los individuos
head(ind$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## SEBRLE 0.1955047 1.5890567 0.6424912 0.08389652 1.16829387
## CLAY 0.8078795 2.4748137 -1.3873827 1.29838232 -0.82498206
## BERNARD -1.3591340 1.6480950 0.2005584 -1.96409420 0.08419345
## YURKOV -0.8889532 -0.4426067 2.5295843 0.71290837 0.40782264
## ZSIVOCZKY -0.1081216 -2.0688377 -1.3342591 -0.10152796 -0.20145217
## McMULLEN 0.1212195 -1.0139102 -0.8625170 1.34164291 1.62151286
# Calidad de los individuos
head(ind$cos2)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## SEBRLE 0.007530179 0.49747323 0.081325232 0.001386688 0.2689026575
## CLAY 0.048701249 0.45701660 0.143628117 0.125791741 0.0507850580
## BERNARD 0.197199804 0.28996555 0.004294015 0.411819183 0.0007567259
## YURKOV 0.096109800 0.02382571 0.778230322 0.061812637 0.0202279796
## ZSIVOCZKY 0.001574385 0.57641944 0.239754152 0.001388216 0.0054654972
## McMULLEN 0.002175437 0.15219499 0.110137872 0.266486530 0.3892621478
# Contribución de los individuos
head(ind$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## SEBRLE 0.04029447 5.9714533 1.4483919 0.03734589 8.45894063
## CLAY 0.68805664 14.4839248 6.7537381 8.94458283 4.21794385
## BERNARD 1.94740183 6.4234107 0.1411345 20.46819433 0.04393073
## YURKOV 0.83308415 0.4632733 22.4517396 2.69663605 1.03075263
## ZSIVOCZKY 0.01232413 10.1217143 6.2464325 0.05469230 0.25151025
## McMULLEN 0.01549089 2.4310854 2.6102794 9.55055888 16.29493304
fviz_pca_ind(res.pca)
fviz_pca_ind(res.pca, col.ind="cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE
)
fviz_pca_ind(res.pca, pointsize="cos2",
pointshape = 21, fill = "#E7B800",
repel = TRUE
)
fviz_pca_ind(res.pca, col.ind="cos2", pointsize="cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE
)
fviz_cos2(res.pca, choice = "ind")
# Contribución total en PC1 PC2
fviz_contrib(res.pca, choice = "ind", axes = 1:2)
# Crear una variable continua aleatoria de tamaño 23
# Mismo tamaño de individuos activos en el análisis de componentes principales
set.seed(123)
my.cont.var <- rnorm(23)
#Colorear las variables según la variable continua
fviz_pca_ind(res.pca, col.ind = my.cont.var,
gradient.cols = c("blue", "yellow", "red"),
legend.tittle = "Cont.Var")
head(iris, 3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
# La variable Species (index=5) es removida
# antes del análisis
iris.pca <- PCA(iris[-5], graph = FALSE)
fviz_pca_ind(iris.pca,
geom.ind = "point",
col.ind = iris$Species,
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE,
legend.tittle = "Groups"
)
fviz_pca_ind(iris.pca, geom.ind = "point", col.ind = iris$Species,
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, ellipse.type = "confidence",
legend.tittle = "Groups"
)
fviz_pca_ind(iris.pca,
label = "none",
habillage = iris$Species,
addEllipses = TRUE,
palette = "jco"
)
fviz_pca_var(res.pca, axes = c(2, 3))
fviz_pca_ind(res.pca, axes = c(2, 3))
fviz_pca_var(res.pca, geom.var = c("point", "text"))
fviz_pca_ind(res.pca, geom.ind = "text")
fviz_pca_var(res.pca, arrowsize = 1, labelsize = 5,
repel = TRUE)
fviz_pca_ind(res.pca,
pointsize = 3, pointshape = 21, fill = "lightblue",
labelsize = 5, repel = TRUE)
###Este argumento de nivel de elipse permite también cambiar el tamaño de la concentración de la elipse un probabilidad normal. Por ejemplo, especifique ellipse.level = 0.95 or ellipse.level = 0.66
fviz_pca_ind(iris.pca, geom.ind = "point",
col.ind = iris$Species, # color by groups
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, ellipse.type = "confidence",
legend.title = "Groups"
)
fviz_pca_ind(iris.pca, geom.ind = "point",
col.ind = iris$Species, # color by groups
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, ellipse.type = "convex",
legend.title = "Groups"
)
#" 3.4.5.5. Puntos medios del grupo
fviz_pca_ind(iris.pca,
geom.ind = "point", # show points only (but not "text")
group.ind = iris$Species, # color by groups
legend.title = "Groups",
mean.point = FALSE)
fviz_pca_var(res.pca, axes.linetype = "blank")
ind.p <- fviz_pca_ind(iris.pca, geom = "point", col.ind = iris$Species)
ggpubr::ggpar(ind.p,
title = "Principal Component Analysis",
subtitle = "Iris data set",
caption = "Source: factoextra",
xlab = "PC1", ylab = "PC2",
legend.title = "Species", legend.position = "top",
ggtheme = theme_gray(), palette = "jco"
)
fviz_pca_biplot(res.pca, repel = TRUE,
col.var = "#2E9FDF", # Variables color
col.ind = "#696969" # Individuals color
)
fviz_pca_biplot(iris.pca,
col.ind = iris$Species, palette = "jco",
addEllipses = TRUE, label = "var",
col.var = "black", repel = TRUE,
legend.title = "Species")
fviz_pca_biplot(iris.pca,
# Fill individuals by groups
geom.ind = "point",
pointshape = 21,
pointsize = 2.5,
fill.ind = iris$Species,
col.ind = "black",
# Color variable by groups
col.var = factor(c("sepal", "sepal", "petal", "petal")),
legend.title = list(fill = "Species", color = "Clusters"),
repel = TRUE # Avoid label overplotting
)+
ggpubr::fill_palette("jco")+ # Indiviual fill color
ggpubr::color_palette("npg") # Variable colors
fviz_pca_biplot(iris.pca,
# Individuals
geom.ind = "point",
fill.ind = iris$Species, col.ind = "black",
pointshape = 21, pointsize = 2,
palette = "jco",
addEllipses = TRUE,
# Variables
alpha.var ="contrib", col.var = "contrib",
gradient.cols = "RdYlBu",
legend.title = list(fill = "Species", color = "Contrib",
alpha = "Contrib")
)
PCA(X, ind.sup = NULL, quanti.sup = NULL, quali.sup = NULL, graph = TRUE)
res.pca <- PCA(decathlon2, ind.sup = 24:27,
quanti.sup = 11:12, quali.sup = 13, graph=FALSE)
res.pca$quanti.sup
## $coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Rank -0.7014777 -0.24519443 -0.1834294 0.05575186 -0.07382647
## Points 0.9637075 0.07768262 0.1580225 -0.16623092 -0.03114711
##
## $cor
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Rank -0.7014777 -0.24519443 -0.1834294 0.05575186 -0.07382647
## Points 0.9637075 0.07768262 0.1580225 -0.16623092 -0.03114711
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Rank 0.4920710 0.060120310 0.03364635 0.00310827 0.0054503477
## Points 0.9287322 0.006034589 0.02497110 0.02763272 0.0009701427
fviz_pca_var(res.pca)
fviz_pca_var(res.pca,
col.var = "black", # Active variables
col.quanti.sup = "red" # Suppl. quantitative variables
)
# Hide active variables on the plot,
# show only supplementary variables
fviz_pca_var(res.pca, invisible = "var")
# Hide supplementary variables
fviz_pca_var(res.pca, invisible = "quanti.sup")
# Plot of active variables
p <- fviz_pca_var(res.pca, invisible = "quanti.sup")
# Add supplementary active variables
fviz_add(p, res.pca$quanti.sup$coord,
geom = c("arrow", "text"),
color = "red")
res.pca$ind.sup
## $coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## KARPOV 0.7947206 0.77951227 -1.6330203 1.7242283 -0.75070396
## WARNERS -0.3864645 -0.12159237 -1.7387332 -0.7063341 -0.03230011
## Nool -0.5591306 1.97748871 -0.4830358 -2.2784526 -0.25461493
## Drews -1.1092038 0.01741477 -3.0488182 -1.5343468 -0.32642192
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## KARPOV 0.05104677 4.911173e-02 0.21553730 0.24028620 0.0455487744
## WARNERS 0.02422707 2.398250e-03 0.49039677 0.08092862 0.0001692349
## Nool 0.02897149 3.623868e-01 0.02162236 0.48108780 0.0060077529
## Drews 0.09207094 2.269527e-05 0.69560547 0.17617609 0.0079736753
##
## $dist
## KARPOV WARNERS Nool Drews
## 3.517470 2.482899 3.284943 3.655527
+Visualizar todos los individuos (activos y suplementarios). En la gráfica puedes agregar también las variables cualitativas suplementarias (quali.sup), cuyas coordenadas son accesibles usando res.pca\(quali.supp\)coord.
p <- fviz_pca_ind(res.pca, col.ind.sup = "blue", repel = TRUE)
p <- fviz_add(p, res.pca$quali.sup$coord, color = "red")
p
res.pca$quali
## $coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Decastar -1.343451 0.1218097 -0.03789524 0.1808357 0.1343364
## OlympicG 1.231497 -0.1116589 0.03473730 -0.1657661 -0.1231417
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Decastar 0.9051233 0.007440939 0.0007201669 0.01639956 0.009050062
## OlympicG 0.9051233 0.007440939 0.0007201669 0.01639956 0.009050062
##
## $v.test
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Decastar -2.970766 0.4034256 -0.1528767 0.8971036 0.7202457
## OlympicG 2.970766 -0.4034256 0.1528767 -0.8971036 -0.7202457
##
## $dist
## Decastar OlympicG
## 1.412108 1.294433
##
## $eta2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Competition 0.4011568 0.00739783 0.001062332 0.03658159 0.02357972
fviz_pca_ind(res.pca, habillage = 13,
addEllipses =TRUE, ellipse.type = "confidence",
palette = "jco", repel = TRUE)
df<- read.csv("C:\\Users\\USUARIO\\Downloads\\Base de datos 2.csv")
dft <- dplyr::select(df, areasymbol,
soil_erosion = soil.erosion,
watereros, winderos, sci, sciom, scifo,
slope = Slope)
View(dft)
head(dft)
## areasymbol soil_erosion watereros winderos sci sciom scifo slope
## 1 OK009 0.63 0.63 0.00 0.43 -0.21 0.92 0.5
## 2 OK009 0.19 0.19 0.00 0.70 0.37 0.93 0.5
## 3 OK009 1.85 0.74 1.11 0.06 -0.13 0.15 0.5
## 4 OK009 0.81 0.36 0.46 0.33 0.32 0.16 0.5
## 5 OK009 1.56 1.56 0.00 0.30 -0.37 0.92 2.0
## 6 OK009 0.39 0.39 0.00 0.58 0.10 0.93 2.0
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df_areasymbol <- group_by(dft, areasymbol)
View(df_areasymbol)
head(df_areasymbol)
## # A tibble: 6 x 8
## # Groups: areasymbol [1]
## areasymbol soil_erosion watereros winderos sci sciom scifo slope
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 OK009 0.63 0.63 0 0.43 -0.21 0.92 0.5
## 2 OK009 0.19 0.19 0 0.7 0.37 0.93 0.5
## 3 OK009 1.85 0.74 1.11 0.06 -0.13 0.15 0.5
## 4 OK009 0.81 0.36 0.46 0.33 0.32 0.16 0.5
## 5 OK009 1.56 1.56 0 0.3 -0.37 0.92 2
## 6 OK009 0.39 0.39 0 0.58 0.1 0.93 2
df_pca <- summarize(df_areasymbol,
soil_erosion = mean(soil_erosion, na.rm = TRUE),
watereros = mean(watereros, na.rm = TRUE),
winderos = mean(winderos, na.rm = TRUE),
sci = mean(sci, na.rm = TRUE),
sciom = mean(sciom, na.rm = TRUE),
scifo = mean(scifo, na.rm = TRUE),
slope = mean(slope, na.rm = TRUE))
View(df_pca)
str(df_pca)
## tibble [50 x 8] (S3: tbl_df/tbl/data.frame)
## $ areasymbol : chr [1:50] "OK009" "OK015" "OK031" "OK033" ...
## $ soil_erosion: num [1:50] 7.42 7.55 21.03 19.72 12.08 ...
## $ watereros : num [1:50] 3.27 5.53 6.07 5.49 4.37 ...
## $ winderos : num [1:50] 4.15 2.01 14.96 14.23 7.71 ...
## $ sci : num [1:50] -0.191 -0.249 -1.324 -1.231 -0.59 ...
## $ sciom : num [1:50] -0.159 -0.276 -0.311 -0.337 -0.238 ...
## $ scifo : num [1:50] 0.641 0.641 0.641 0.641 0.641 ...
## $ slope : num [1:50] 4.36 5.27 4.57 3.15 4.09 ...
df_pca <- df_pca[, 2:ncol(df_pca)]
df_pca
## # A tibble: 50 x 7
## soil_erosion watereros winderos sci sciom scifo slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.42 3.27 4.15 -0.191 -0.159 0.641 4.36
## 2 7.55 5.53 2.01 -0.249 -0.276 0.641 5.27
## 3 21.0 6.07 15.0 -1.32 -0.311 0.641 4.57
## 4 19.7 5.49 14.2 -1.23 -0.337 0.641 3.15
## 5 12.1 4.37 7.71 -0.590 -0.238 0.641 4.09
## 6 6.46 2.62 3.84 -0.103 -0.126 0.641 3.55
## 7 5.72 2.13 3.58 -0.0124 -0.0470 0.641 3.72
## 8 3.24 1.53 1.71 0.205 0.00862 0.641 2.85
## 9 6.81 4.18 2.63 -0.147 -0.170 0.641 4.24
## 10 5.81 2.76 3.05 -0.0643 -0.159 0.641 2.84
## # ... with 40 more rows
library(FactoMineR)
library(factoextra)
datos.active <- df_pca[1:43792, 1:5]
head (datos.active[, 1:5],4)
## # A tibble: 4 x 5
## soil_erosion watereros winderos sci sciom
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.42 3.27 4.15 -0.191 -0.159
## 2 7.55 5.53 2.01 -0.249 -0.276
## 3 21.0 6.07 15.0 -1.32 -0.311
## 4 19.7 5.49 14.2 -1.23 -0.337
library(FactoMineR)
library(factoextra)
PCA(df_pca, scale.unit = TRUE, ncp = 5, graph = TRUE)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 50 individuals, described by 7 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
library(FactoMineR)
res.pca <- PCA(datos.active, graph = FALSE)
## Warning in PCA(datos.active, graph = FALSE): Missing values are imputed by
## the mean of the variable: you should use the imputePCA function of the missMDA
## package
print(res.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 43792 individuals, described by 5 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
library(factoextra)
eig.val <- get_eigenvalue(res.pca)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 3.866706e+00 7.733412e+01 77.33412
## Dim.2 8.874969e-01 1.774994e+01 95.08405
## Dim.3 2.457970e-01 4.915940e+00 99.99999
## Dim.4 2.919159e-07 5.838317e-06 100.00000
## Dim.5 8.896662e-10 1.779332e-08 100.00000
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0,50))
countries <- read.csv("C:\\Users\\USUARIO\\Downloads\\countries_of_the_world.csv", na.string = c("", "NA"))
head(countries)
## Country Region Population Area..sq..mi..
## 1 Afghanistan ASIA (EX. NEAR EAST) 31056997 647500
## 2 Albania EASTERN EUROPE 3581655 28748
## 3 Algeria NORTHERN AFRICA 32930091 2381740
## 4 American Samoa OCEANIA 57794 199
## 5 Andorra WESTERN EUROPE 71201 468
## 6 Angola SUB-SAHARAN AFRICA 12127071 1246700
## Pop..Density..per.sq..mi.. Coastline..coast.area.ratio. Net.migration
## 1 48,0 0,00 23,06
## 2 124,6 1,26 -4,93
## 3 13,8 0,04 -0,39
## 4 290,4 58,29 -20,71
## 5 152,1 0,00 6,6
## 6 9,7 0,13 0
## Infant.mortality..per.1000.births. GDP....per.capita. Literacy....
## 1 163,07 700 36,0
## 2 21,52 4500 86,5
## 3 31 6000 70,0
## 4 9,27 8000 97,0
## 5 4,05 19000 100,0
## 6 191,19 1900 42,0
## Phones..per.1000. Arable.... Crops.... Other.... Climate Birthrate Deathrate
## 1 3,2 12,13 0,22 87,65 1 46,6 20,34
## 2 71,2 21,09 4,42 74,49 3 15,11 5,22
## 3 78,1 3,22 0,25 96,53 1 17,14 4,61
## 4 259,5 10 15 75 2 22,46 3,27
## 5 497,2 2,22 0 97,78 3 8,71 6,25
## 6 7,8 2,41 0,24 97,35 <NA> 45,11 24,2
## Agriculture Industry Service
## 1 0,38 0,24 0,38
## 2 0,232 0,188 0,579
## 3 0,101 0,6 0,298
## 4 <NA> <NA> <NA>
## 5 <NA> <NA> <NA>
## 6 0,096 0,658 0,246
any(is.na(countries))
## [1] TRUE
sum(is.na(countries))
## [1] 110
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.1.1
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2021 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(countries, legend = TRUE, col = c("yellow", "dodgerblue"))
str(countries)
## 'data.frame': 227 obs. of 20 variables:
## $ Country : chr "Afghanistan " "Albania " "Algeria " "American Samoa " ...
## $ Region : chr "ASIA (EX. NEAR EAST) " "EASTERN EUROPE " "NORTHERN AFRICA " "OCEANIA " ...
## $ Population : int 31056997 3581655 32930091 57794 71201 12127071 13477 69108 39921833 2976372 ...
## $ Area..sq..mi.. : int 647500 28748 2381740 199 468 1246700 102 443 2766890 29800 ...
## $ Pop..Density..per.sq..mi.. : chr "48,0" "124,6" "13,8" "290,4" ...
## $ Coastline..coast.area.ratio. : chr "0,00" "1,26" "0,04" "58,29" ...
## $ Net.migration : chr "23,06" "-4,93" "-0,39" "-20,71" ...
## $ Infant.mortality..per.1000.births.: chr "163,07" "21,52" "31" "9,27" ...
## $ GDP....per.capita. : int 700 4500 6000 8000 19000 1900 8600 11000 11200 3500 ...
## $ Literacy.... : chr "36,0" "86,5" "70,0" "97,0" ...
## $ Phones..per.1000. : chr "3,2" "71,2" "78,1" "259,5" ...
## $ Arable.... : chr "12,13" "21,09" "3,22" "10" ...
## $ Crops.... : chr "0,22" "4,42" "0,25" "15" ...
## $ Other.... : chr "87,65" "74,49" "96,53" "75" ...
## $ Climate : chr "1" "3" "1" "2" ...
## $ Birthrate : chr "46,6" "15,11" "17,14" "22,46" ...
## $ Deathrate : chr "20,34" "5,22" "4,61" "3,27" ...
## $ Agriculture : chr "0,38" "0,232" "0,101" NA ...
## $ Industry : chr "0,24" "0,188" "0,6" NA ...
## $ Service : chr "0,38" "0,579" "0,298" NA ...
head(as.character(countries$Agriculture))
## [1] "0,38" "0,232" "0,101" NA NA "0,096"
for (i in 3:length(names(countries))){
countries[,i] <- as.numeric(gsub(",",'.',(sapply(countries[,i], as.character))))
}
str(countries)
## 'data.frame': 227 obs. of 20 variables:
## $ Country : chr "Afghanistan " "Albania " "Algeria " "American Samoa " ...
## $ Region : chr "ASIA (EX. NEAR EAST) " "EASTERN EUROPE " "NORTHERN AFRICA " "OCEANIA " ...
## $ Population : num 31056997 3581655 32930091 57794 71201 ...
## $ Area..sq..mi.. : num 647500 28748 2381740 199 468 ...
## $ Pop..Density..per.sq..mi.. : num 48 124.6 13.8 290.4 152.1 ...
## $ Coastline..coast.area.ratio. : num 0 1.26 0.04 58.29 0 ...
## $ Net.migration : num 23.06 -4.93 -0.39 -20.71 6.6 ...
## $ Infant.mortality..per.1000.births.: num 163.07 21.52 31 9.27 4.05 ...
## $ GDP....per.capita. : num 700 4500 6000 8000 19000 1900 8600 11000 11200 3500 ...
## $ Literacy.... : num 36 86.5 70 97 100 42 95 89 97.1 98.6 ...
## $ Phones..per.1000. : num 3.2 71.2 78.1 259.5 497.2 ...
## $ Arable.... : num 12.13 21.09 3.22 10 2.22 ...
## $ Crops.... : num 0.22 4.42 0.25 15 0 0.24 0 4.55 0.48 2.3 ...
## $ Other.... : num 87.7 74.5 96.5 75 97.8 ...
## $ Climate : num 1 3 1 2 3 NA 2 2 3 4 ...
## $ Birthrate : num 46.6 15.11 17.14 22.46 8.71 ...
## $ Deathrate : num 20.34 5.22 4.61 3.27 6.25 ...
## $ Agriculture : num 0.38 0.232 0.101 NA NA 0.096 0.04 0.038 0.095 0.239 ...
## $ Industry : num 0.24 0.188 0.6 NA NA 0.658 0.18 0.22 0.358 0.343 ...
## $ Service : num 0.38 0.579 0.298 NA NA 0.246 0.78 0.743 0.547 0.418 ...
countries$Climate = ifelse(is.na(countries$Climate), 'Unknown', countries$Climate)
countries$Climate = as.factor(countries$Climate)
num_cols = c(3:20)
num_cols = num_cols[num_cols != 15]
for (index in num_cols)
{countries[,index] = ifelse(is.na(countries[,index]),ave(countries[,index],
FUN =function(x) mean(x, na.rm=TRUE)), countries[,index]) }
missmap(countries, legend = TRUE, col = c("red", "dodgerblue"))
library(FactoMineR)
res <- PCA(countries, scale.unit = TRUE, quali.sup = c(1,2,15), ncp = 5, graph = T)
summary(res)
##
## Call:
## PCA(X = countries, scale.unit = TRUE, ncp = 5, quali.sup = c(1,
## 2, 15), graph = T)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 5.200 2.423 1.825 1.485 1.280 1.009 0.785
## % of var. 30.587 14.250 10.734 8.734 7.529 5.936 4.618
## Cumulative % of var. 30.587 44.837 55.572 64.306 71.835 77.771 82.389
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 0.696 0.562 0.516 0.416 0.387 0.192 0.137
## % of var. 4.094 3.308 3.036 2.445 2.274 1.131 0.807
## Cumulative % of var. 86.482 89.790 92.826 95.271 97.545 98.676 99.483
## Dim.15 Dim.16 Dim.17
## Variance 0.085 0.002 0.000
## % of var. 0.502 0.015 0.000
## Cumulative % of var. 99.985 100.000 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2
## 1 | 7.642 | -5.001 2.118 0.428 | 2.455
## 2 | 2.173 | -0.015 0.000 0.000 | -1.280
## 3 | 3.706 | -0.787 0.053 0.045 | 1.616
## 4 | 4.820 | 0.480 0.020 0.010 | -2.516
## 5 | 3.145 | 2.033 0.350 0.418 | 1.705
## 6 | 7.342 | -5.395 2.466 0.540 | 2.504
## 7 | 3.723 | 2.035 0.351 0.299 | 1.607
## 8 | 2.691 | 1.806 0.276 0.450 | -0.909
## 9 | 1.945 | 0.835 0.059 0.184 | 0.787
## 10 | 2.403 | -0.115 0.001 0.002 | -0.597
## ctr cos2 Dim.3 ctr cos2
## 1 1.096 0.103 | -0.600 0.087 0.006 |
## 2 0.298 0.347 | -0.041 0.000 0.000 |
## 3 0.475 0.190 | 1.560 0.588 0.177 |
## 4 1.151 0.272 | -0.469 0.053 0.009 |
## 5 0.529 0.294 | -0.369 0.033 0.014 |
## 6 1.140 0.116 | 1.031 0.257 0.020 |
## 7 0.469 0.186 | -1.399 0.472 0.141 |
## 8 0.150 0.114 | -0.389 0.037 0.021 |
## 9 0.113 0.164 | 1.073 0.278 0.305 |
## 10 0.065 0.062 | 0.638 0.098 0.070 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr
## Population | -0.023 0.011 0.001 | -0.009 0.003
## Area..sq..mi.. | 0.027 0.014 0.001 | 0.278 3.184
## Pop..Density..per.sq..mi.. | 0.254 1.241 0.065 | 0.117 0.569
## Coastline..coast.area.ratio. | 0.172 0.569 0.030 | -0.284 3.336
## Net.migration | 0.178 0.608 0.032 | 0.494 10.076
## Infant.mortality..per.1000.births. | -0.911 15.958 0.830 | 0.120 0.590
## GDP....per.capita. | 0.788 11.943 0.621 | 0.286 3.366
## Literacy.... | 0.782 11.770 0.612 | -0.090 0.334
## Phones..per.1000. | 0.848 13.829 0.719 | 0.154 0.975
## Arable.... | 0.115 0.254 0.013 | -0.623 16.006
## cos2 Dim.3 ctr cos2
## Population 0.000 | 0.598 19.620 0.358 |
## Area..sq..mi.. 0.077 | 0.500 13.726 0.250 |
## Pop..Density..per.sq..mi.. 0.014 | -0.365 7.298 0.133 |
## Coastline..coast.area.ratio. 0.081 | -0.497 13.532 0.247 |
## Net.migration 0.244 | -0.033 0.060 0.001 |
## Infant.mortality..per.1000.births. 0.014 | -0.057 0.177 0.003 |
## GDP....per.capita. 0.082 | 0.005 0.001 0.000 |
## Literacy.... 0.008 | 0.118 0.765 0.014 |
## Phones..per.1000. 0.024 | -0.073 0.294 0.005 |
## Arable.... 0.388 | 0.465 11.855 0.216 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2
## Afghanistan | 7.642 | -5.001 0.428 -2.193 | 2.455
## Albania | 2.173 | -0.015 0.000 -0.007 | -1.280
## Algeria | 3.706 | -0.787 0.045 -0.345 | 1.616
## American Samoa | 4.820 | 0.480 0.010 0.211 | -2.516
## Andorra | 3.145 | 2.033 0.418 0.891 | 1.705
## Angola | 7.342 | -5.395 0.540 -2.366 | 2.504
## Anguilla | 3.723 | 2.035 0.299 0.893 | 1.607
## Antigua & Barbuda | 2.691 | 1.806 0.450 0.792 | -0.909
## Argentina | 1.945 | 0.835 0.184 0.366 | 0.787
## Armenia | 2.403 | -0.115 0.002 -0.051 | -0.597
## cos2 v.test Dim.3 cos2 v.test
## Afghanistan 0.103 1.578 | -0.600 0.006 -0.444 |
## Albania 0.347 -0.822 | -0.041 0.000 -0.030 |
## Algeria 0.190 1.039 | 1.560 0.177 1.155 |
## American Samoa 0.272 -1.616 | -0.469 0.009 -0.347 |
## Andorra 0.294 1.096 | -0.369 0.014 -0.273 |
## Angola 0.116 1.609 | 1.031 0.020 0.763 |
## Anguilla 0.186 1.032 | -1.399 0.141 -1.035 |
## Antigua & Barbuda 0.114 -0.584 | -0.389 0.021 -0.288 |
## Argentina 0.164 0.506 | 1.073 0.305 0.795 |
## Armenia 0.062 -0.383 | 0.638 0.070 0.472 |
countries.active <- countries [1:23, 4:10]
head(countries.active[, 1:6],4)
## Area..sq..mi.. Pop..Density..per.sq..mi.. Coastline..coast.area.ratio.
## 1 647500 48.0 0.00
## 2 28748 124.6 1.26
## 3 2381740 13.8 0.04
## 4 199 290.4 58.29
## Net.migration Infant.mortality..per.1000.births. GDP....per.capita.
## 1 23.06 163.07 700
## 2 -4.93 21.52 4500
## 3 -0.39 31.00 6000
## 4 -20.71 9.27 8000
str(countries.active)
## 'data.frame': 23 obs. of 7 variables:
## $ Area..sq..mi.. : num 647500 28748 2381740 199 468 ...
## $ Pop..Density..per.sq..mi.. : num 48 124.6 13.8 290.4 152.1 ...
## $ Coastline..coast.area.ratio. : num 0 1.26 0.04 58.29 0 ...
## $ Net.migration : num 23.06 -4.93 -0.39 -20.71 6.6 ...
## $ Infant.mortality..per.1000.births.: num 163.07 21.52 31 9.27 4.05 ...
## $ GDP....per.capita. : num 700 4500 6000 8000 19000 1900 8600 11000 11200 3500 ...
## $ Literacy.... : num 36 86.5 70 97 100 42 95 89 97.1 98.6 ...
library(FactoMineR)
n.pca <- PCA(countries.active, graph = FALSE)
print(n.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 23 individuals, described by 7 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
library(factoextra)
eig.val <- get_eigenvalue(n.pca)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.7257763 38.939661 38.93966
## Dim.2 1.5487229 22.124613 61.06427
## Dim.3 1.2491447 17.844924 78.90920
## Dim.4 0.6702525 9.575036 88.48423
## Dim.5 0.4212294 6.017563 94.50180
## Dim.6 0.2597508 3.710725 98.21252
## Dim.7 0.1251234 1.787478 100.00000
fviz_eig(n.pca, addlabels = TRUE, ylim = c(0,50))
var <- get_pca_var(n.pca)
var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
head(var$coord)
## Dim.1 Dim.2 Dim.3
## Area..sq..mi.. -0.06226865 -0.56931087 0.60693852
## Pop..Density..per.sq..mi.. 0.49358248 0.72960937 0.07229704
## Coastline..coast.area.ratio. 0.62940214 0.57468659 0.13434223
## Net.migration -0.31876214 0.17621077 0.77206966
## Infant.mortality..per.1000.births. -0.84708843 0.35173582 0.12489353
## GDP....per.capita. 0.78443220 -0.04971073 0.47811101
## Dim.4 Dim.5
## Area..sq..mi.. 0.53609413 -0.01256514
## Pop..Density..per.sq..mi.. 0.20902382 -0.39421143
## Coastline..coast.area.ratio. 0.15954837 0.45916260
## Net.migration -0.49508260 -0.02766298
## Infant.mortality..per.1000.births. 0.12738614 0.20128258
## GDP....per.capita. -0.05194893 -0.03052811
head(var$cos2)
## Dim.1 Dim.2 Dim.3
## Area..sq..mi.. 0.003877384 0.324114866 0.368374366
## Pop..Density..per.sq..mi.. 0.243623662 0.532329836 0.005226862
## Coastline..coast.area.ratio. 0.396147059 0.330264672 0.018047836
## Net.migration 0.101609300 0.031050237 0.596091557
## Infant.mortality..per.1000.births. 0.717558812 0.123718084 0.015598394
## GDP....per.capita. 0.615333878 0.002471156 0.228590140
## Dim.4 Dim.5
## Area..sq..mi.. 0.287396915 0.0001578828
## Pop..Density..per.sq..mi.. 0.043690958 0.1554026506
## Coastline..coast.area.ratio. 0.025455682 0.2108302935
## Net.migration 0.245106781 0.0007652403
## Infant.mortality..per.1000.births. 0.016227229 0.0405146763
## GDP....per.capita. 0.002698691 0.0009319657
head(var$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4
## Area..sq..mi.. 0.1422488 20.9278796 29.4901285 42.8789010
## Pop..Density..per.sq..mi.. 8.9377717 34.3721806 0.4184352 6.5185817
## Coastline..coast.area.ratio. 14.5333666 21.3249684 1.4448155 3.7979240
## Net.migration 3.7277197 2.0048930 47.7199779 36.5693187
## Infant.mortality..per.1000.births. 26.3249342 7.9883937 1.2487260 2.4210620
## GDP....per.capita. 22.5746288 0.1595609 18.2997332 0.4026379
## Dim.5
## Area..sq..mi.. 0.03748144
## Pop..Density..per.sq..mi.. 36.89263763
## Coastline..coast.area.ratio. 50.05117729
## Net.migration 0.18166827
## Infant.mortality..per.1000.births. 9.61819676
## GDP....per.capita. 0.22124895
head(var$coord, 4)
## Dim.1 Dim.2 Dim.3 Dim.4
## Area..sq..mi.. -0.06226865 -0.5693109 0.60693852 0.5360941
## Pop..Density..per.sq..mi.. 0.49358248 0.7296094 0.07229704 0.2090238
## Coastline..coast.area.ratio. 0.62940214 0.5746866 0.13434223 0.1595484
## Net.migration -0.31876214 0.1762108 0.77206966 -0.4950826
## Dim.5
## Area..sq..mi.. -0.01256514
## Pop..Density..per.sq..mi.. -0.39421143
## Coastline..coast.area.ratio. 0.45916260
## Net.migration -0.02766298
fviz_pca_var(n.pca, col.var = "black")
head(var$cos2, 4)
## Dim.1 Dim.2 Dim.3 Dim.4
## Area..sq..mi.. 0.003877384 0.32411487 0.368374366 0.28739691
## Pop..Density..per.sq..mi.. 0.243623662 0.53232984 0.005226862 0.04369096
## Coastline..coast.area.ratio. 0.396147059 0.33026467 0.018047836 0.02545568
## Net.migration 0.101609300 0.03105024 0.596091557 0.24510678
## Dim.5
## Area..sq..mi.. 0.0001578828
## Pop..Density..per.sq..mi.. 0.1554026506
## Coastline..coast.area.ratio. 0.2108302935
## Net.migration 0.0007652403
library("corrplot")
corrplot(var$cos2, is.corr=FALSE)
#### También es posible crear un gráfico de barras de las variables cos2 utilizando la función fviz_cos2()[en factoextra]:
fviz_cos2(n.pca, choice = "var", axes = 1:2)
fviz_pca_var(n.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
REPEL = TRUE)
# Cambiar la transparencia por valores cos2
fviz_pca_var(n.pca, alpha.var = "cos2")
head(var$contrib, 4)
## Dim.1 Dim.2 Dim.3 Dim.4
## Area..sq..mi.. 0.1422488 20.927880 29.4901285 42.878901
## Pop..Density..per.sq..mi.. 8.9377717 34.372181 0.4184352 6.518582
## Coastline..coast.area.ratio. 14.5333666 21.324968 1.4448155 3.797924
## Net.migration 3.7277197 2.004893 47.7199779 36.569319
## Dim.5
## Area..sq..mi.. 0.03748144
## Pop..Density..per.sq..mi.. 36.89263763
## Coastline..coast.area.ratio. 50.05117729
## Net.migration 0.18166827
library("corrplot")
corrplot(var$contrib, is.corr = FALSE)
fviz_contrib(n.pca, choice = "var", axes = 1, top = 10)
fviz_contrib(n.pca, choice = "var", axes = 2, top = 10)
fviz_contrib(n.pca, choice = "var", axes = 1:2, top = 10)
fviz_pca_var(n.pca, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))
fviz_pca_var(n.pca, alpha.var = "contrib")
set.seed(123)
my.cont.var <- rnorm(7)
fviz_pca_var(n.pca, col.var = my.cont.var,
gradient.cols = c("blue", "yellow", "red", "orange", "pink", "purple","green"),
legend.title = "Cont.Var")
###Como no tenemos ninguna variable de agrupación en nuestros conjuntos de datos para clasificar las variables, la crearemos. #### En el siguiente ejemplo de demostración, comenzamos clasificando las variables en 3 grupos utilizando el algoritmo de agrupación de kmeans. A continuación, utilizamos los clusters devueltos por el algoritmo kmeans para colorear las variables.
set.seed(123)
res.km <- kmeans(var$coord, centers = 3, nstart = 25)
grp <- as.factor(res.km$cluster)
fviz_pca_var(n.pca, col.var = grp,
palette = c("#0073C2FF", "#EFC000FF", "868686FF"),
legend.title = "Cluster")
res.desc <- dimdesc(n.pca, axes = c(1,2), proba = 0.05)
res.desc$Dim.1
## $quanti
## correlation p.value
## Literacy.... 0.8047523 3.662401e-06
## GDP....per.capita. 0.7844322 9.404557e-06
## Coastline..coast.area.ratio. 0.6294021 1.291714e-03
## Pop..Density..per.sq..mi.. 0.4935825 1.668819e-02
## Infant.mortality..per.1000.births. -0.8470884 3.430159e-07
##
## attr(,"class")
## [1] "condes" "list"
res.desc$Dim.2
## $quanti
## correlation p.value
## Pop..Density..per.sq..mi.. 0.7296094 7.791019e-05
## Coastline..coast.area.ratio. 0.5746866 4.127162e-03
## Literacy.... -0.4525197 3.014994e-02
## Area..sq..mi.. -0.5693109 4.577407e-03
##
## attr(,"class")
## [1] "condes" "list"
ind <- get_pca_ind(n.pca)
ind
## Principal Component Analysis Results for individuals
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the individuals"
## 2 "$cos2" "Cos2 for the individuals"
## 3 "$contrib" "contributions of the individuals"
head(ind$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## 1 -3.9262822 1.47297709 2.0710787 -0.9974377 0.35354097
## 2 -0.3097826 -0.43849402 -1.1468045 -0.0315631 -0.09709536
## 3 -0.9914323 -0.86293449 0.1866867 0.6709429 -0.06323149
## 4 1.2815603 -0.08769639 -2.3645664 1.4250895 0.67461697
## 5 0.5383881 -0.56567767 0.3756779 -1.2742044 -0.29624424
## 6 -3.4814812 0.86793597 0.2092392 1.0820690 0.76686875
head(ind$cos2)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## 1 0.66948480 0.094225769 0.186282002 0.0432065198 0.005428217
## 2 0.05667823 0.113561105 0.776749593 0.0005883849 0.005567999
## 3 0.36654707 0.277689386 0.012996622 0.1678706539 0.001490975
## 4 0.16838403 0.000788472 0.573225909 0.2082126245 0.046659202
## 5 0.11749951 0.129712912 0.057210599 0.6581476336 0.035575005
## 6 0.76160020 0.047334121 0.002750965 0.0735713711 0.036952262
head(ind$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## 1 24.5892325 6.09102562 14.9297530 6.453648244 1.29012844
## 2 0.1530721 0.53979119 4.5776039 0.006462387 0.09730841
## 3 1.5678629 2.09051890 0.1213071 2.920148025 0.04126866
## 4 2.6197514 0.02159046 19.4608792 13.174009838 4.69751281
## 5 0.4623522 0.89833127 0.4912367 10.532025447 0.90584374
## 6 19.3334811 2.11482195 0.1523861 7.595276858 6.07009550
fviz_pca_ind(n.pca)
fviz_pca_ind(n.pca, col.ind = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
#### Tengamos en cuenta que, los individuos que son similares se agrupan en el gráfico.
fviz_pca_ind(n.pca, point.size = "cos2",
pointshape = 21, fill = "#E7B800",
repel = TRUE)
fviz_pca_ind(n.pca, col.ind = "cos2", pointsize = "cos2",
gradient.cols = c("#00AFBB","#E7B800", "#FC4E07"),
repel = TRUE )
fviz_cos2(n.pca, choice = "ind")
fviz_cos2(n.pca, choice = "ind", axes = 1:2)
#### Contribución total en PC1 y PC2
set.seed(123)
my.cont.var <- rnorm(23)
str(my.cont.var)
## num [1:23] -0.5605 -0.2302 1.5587 0.0705 0.1293 ...
fviz_pca_ind(n.pca, col.ind = my.cont.var,
gradient.cols = c("blue", "yellow", "red"),
legend.title = "Cont.Var")
res <- PCA(countries, scale.unit = TRUE, quali.sup = c(1,2,15), ncp = 5, graph = T)
res.hcpc<-HCPC(res ,nb.clust=4,consol=TRUE,min=4,max=10,graph=TRUE)
plot.HCPC(res.hcpc, choice = 'tree', ind.names = F)
res.hcpc$desc.var
##
## Link between the cluster variable and the categorical variables (chi-square test)
## =================================================================================
## p.value df
## Region 4.318832e-32 30
## Climate 4.763335e-08 18
##
## Description of each cluster by the categories
## =============================================
## $`1`
## Cla/Mod Mod/Cla Global
## Region=SUB-SAHARAN AFRICA 78.431373 75.471698 22.466960
## Climate=Climate_2 31.531532 66.037736 48.898678
## Region=EASTERN EUROPE 0.000000 0.000000 5.286344
## Region=WESTERN EUROPE 0.000000 0.000000 12.334802
## Climate=Climate_3 4.166667 3.773585 21.145374
## Region=LATIN AMER. & CARIB 2.222222 1.886792 19.823789
## p.value v.test
## Region=SUB-SAHARAN AFRICA 3.349460e-23 9.921716
## Climate=Climate_2 4.635364e-03 2.831338
## Region=EASTERN EUROPE 3.751916e-02 -2.080070
## Region=WESTERN EUROPE 3.337638e-04 -3.587578
## Climate=Climate_3 1.091659e-04 -3.869261
## Region=LATIN AMER. & CARIB 2.691977e-05 -4.198071
##
## $`2`
## Cla/Mod Mod/Cla Global
## Region=LATIN AMER. & CARIB 62.22222 29.787234 19.823789
## Region=NORTHERN AFRICA 100.00000 6.382979 2.643172
## Climate=Climate_1 62.06897 19.148936 12.775330
## Region=C.W. OF IND. STATES 75.00000 9.574468 5.286344
## Region=NEAR EAST 68.75000 11.702128 7.048458
## Region=SUB-SAHARAN AFRICA 17.64706 9.574468 22.466960
## Region=WESTERN EUROPE 0.00000 0.000000 12.334802
## p.value v.test
## Region=LATIN AMER. & CARIB 1.900139e-03 3.105412
## Region=NORTHERN AFRICA 4.579835e-03 2.835191
## Climate=Climate_1 1.843826e-02 2.356697
## Region=C.W. OF IND. STATES 2.014684e-02 2.323602
## Region=NEAR EAST 2.655889e-02 2.217940
## Region=SUB-SAHARAN AFRICA 6.098568e-05 -4.008964
## Region=WESTERN EUROPE 8.394196e-08 -5.358444
##
## $`3`
## Cla/Mod Mod/Cla Global
## Region=NORTHERN AMERICA 40 33.33333 2.2026432
## Country=United States 100 16.66667 0.4405286
## Country=Russia 100 16.66667 0.4405286
## Country=India 100 16.66667 0.4405286
## Country=China 100 16.66667 0.4405286
## Country=Canada 100 16.66667 0.4405286
## Country=Brazil 100 16.66667 0.4405286
## p.value v.test
## Region=NORTHERN AMERICA 0.005743768 2.762061
## Country=United States 0.026431718 2.219809
## Country=Russia 0.026431718 2.219809
## Country=India 0.026431718 2.219809
## Country=China 0.026431718 2.219809
## Country=Canada 0.026431718 2.219809
## Country=Brazil 0.026431718 2.219809
##
## $`4`
## Cla/Mod Mod/Cla Global
## Region=WESTERN EUROPE 100.000000 37.837838 12.334802
## Climate=Climate_3 64.583333 41.891892 21.145374
## Region=BALTICS 100.000000 4.054054 1.321586
## Climate=Climate_1.5 0.000000 0.000000 3.524229
## Climate=Climate_1 10.344828 4.054054 12.775330
## Climate=Climate_2 23.423423 35.135135 48.898678
## Region=SUB-SAHARAN AFRICA 3.921569 2.702703 22.466960
## p.value v.test
## Region=WESTERN EUROPE 3.669061e-16 8.149013
## Climate=Climate_3 3.059578e-07 5.119740
## Region=BALTICS 3.369537e-02 2.123698
## Climate=Climate_1.5 4.005749e-02 -2.053156
## Climate=Climate_1 4.189220e-03 -2.863551
## Climate=Climate_2 4.089476e-03 -2.871176
## Region=SUB-SAHARAN AFRICA 4.591219e-08 -5.466456
##
##
## Link between the cluster variable and the quantitative variables
## ================================================================
## Eta2 P-value
## Infant.mortality..per.1000.births. 0.74699012 2.915334e-66
## Area..sq..mi.. 0.70665282 4.130799e-59
## Birthrate 0.68883078 2.928182e-56
## Phones..per.1000. 0.68068267 5.196742e-55
## GDP....per.capita. 0.60307145 1.680444e-44
## Literacy.... 0.56623713 3.229679e-40
## Agriculture 0.52784257 3.992075e-36
## Population 0.45969072 1.260852e-29
## Service 0.43770833 1.050326e-27
## Deathrate 0.41782250 4.947534e-26
## Net.migration 0.16422222 1.018363e-08
## Crops.... 0.10333454 2.083735e-05
## Industry 0.09523640 5.469109e-05
## Pop..Density..per.sq..mi.. 0.04471092 1.676227e-02
##
## Description of each cluster by quantitative variables
## =====================================================
## $`1`
## v.test Mean in category Overall mean
## Infant.mortality..per.1000.births. 12.414794 87.9926415 35.5069643
## Birthrate 11.283418 37.1800000 22.1147321
## Agriculture 10.277027 0.3261509 0.1508443
## Deathrate 9.592699 14.9467925 9.2413453
## Industry -2.090134 0.2492075 0.2827109
## Coastline..coast.area.ratio. -2.316284 1.0292453 21.1653304
## GDP....per.capita. -6.768329 1528.3018868 9689.8230088
## Service -7.288327 0.4248302 0.5652830
## Phones..per.1000. -8.199803 13.2339623 236.0614350
## Literacy.... -10.914391 58.0037736 82.8382775
## sd in category Overall sd p.value
## Infant.mortality..per.1000.births. 27.9360014 3.507671e+01 2.172457e-35
## Birthrate 7.2485954 1.107780e+01 1.584667e-29
## Agriculture 0.1537108 1.415299e-01 8.944394e-25
## Deathrate 5.7345447 4.934764e+00 8.581003e-22
## Industry 0.1306651 1.329939e-01 3.660575e-02
## Coastline..coast.area.ratio. 3.5545415 7.212747e+01 2.054274e-02
## GDP....per.capita. 1287.4396646 1.000477e+04 1.302780e-11
## Service 0.1193302 1.598896e-01 3.138287e-13
## Phones..per.1000. 13.4113011 2.254669e+02 2.407811e-16
## Literacy.... 18.0666164 1.887876e+01 9.838620e-28
##
## $`2`
## v.test Mean in category Overall mean
## Crops.... 4.795404 7.7157896 4.5642222
## Industry 4.343945 0.3284222 0.2827109
## Literacy.... 2.543365 86.6374580 82.8382775
## Service -2.408861 0.5348083 0.5652830
## Infant.mortality..per.1000.births. -2.484834 28.6105414 35.5069643
## Phones..per.1000. -4.381240 157.9008969 236.0614350
## GDP....per.capita. -5.221655 5556.2747129 9689.8230088
## Net.migration -5.490116 -2.0669747 0.0381250
## Deathrate -5.802548 6.9756955 9.2413453
## sd in category Overall sd p.value
## Crops.... 11.3526605 8.306034e+00 1.623471e-06
## Industry 0.1435132 1.329939e-01 1.399468e-05
## Literacy.... 12.3737343 1.887876e+01 1.097904e-02
## Service 0.1265579 1.598896e-01 1.600240e-02
## Infant.mortality..per.1000.births. 16.6138878 3.507671e+01 1.296118e-02
## Phones..per.1000. 98.3013717 2.254669e+02 1.180059e-05
## GDP....per.capita. 4382.8812703 1.000477e+04 1.773311e-07
## Net.migration 4.9174244 4.846000e+00 4.016691e-08
## Deathrate 3.1421890 4.934764e+00 6.531482e-09
##
## $`3`
## v.test Mean in category Overall mean sd in category Overall sd
## Area..sq..mi.. 12.59516 9681301 598227 4021158 1786335
## Population 10.17577 511973437 28740284 499929759 117631367
## p.value
## Area..sq..mi.. 2.244984e-36
## Population 2.543799e-24
##
## $`4`
## v.test Mean in category Overall mean
## Phones..per.1000. 11.470498 4.834279e+02 2.360614e+02
## GDP....per.capita. 11.117314 2.032838e+04 9.689823e+03
## Service 9.010700 7.030847e-01 5.652830e-01
## Literacy.... 6.927970 9.534822e+01 8.283828e+01
## Net.migration 5.274639 2.482973e+00 3.812500e-02
## Pop..Density..per.sq..mi.. 3.167211 8.808703e+02 3.790471e+02
## Coastline..coast.area.ratio. 1.974568 3.478757e+01 2.116533e+01
## Area..sq..mi.. -2.430370 1.829759e+05 5.982270e+05
## Deathrate -2.462195 8.079189e+00 9.241345e+00
## Industry -3.020129 2.442931e-01 2.827109e-01
## Crops.... -3.034856 2.153165e+00 4.564222e+00
## Agriculture -7.256933 5.260690e-02 1.508443e-01
## Infant.mortality..per.1000.births. -8.285923 7.707568e+00 3.550696e+01
## Birthrate -9.038224 1.253811e+01 2.211473e+01
## sd in category Overall sd p.value
## Phones..per.1000. 1.766953e+02 2.254669e+02 1.855883e-30
## GDP....per.capita. 9.040996e+03 1.000477e+04 1.033367e-28
## Service 1.110149e-01 1.598896e-01 2.047458e-19
## Literacy.... 5.554338e+00 1.887876e+01 4.269213e-12
## Net.migration 4.561822e+00 4.846000e+00 1.330174e-07
## Pop..Density..per.sq..mi.. 2.791255e+03 1.656525e+03 1.539084e-03
## Coastline..coast.area.ratio. 6.445480e+01 7.212747e+01 4.831717e-02
## Area..sq..mi.. 8.880410e+05 1.786335e+06 1.508344e-02
## Deathrate 2.602342e+00 4.934764e+00 1.380896e-02
## Industry 1.003280e-01 1.329939e-01 2.526667e-03
## Crops.... 3.098353e+00 8.306034e+00 2.406502e-03
## Agriculture 5.265687e-02 1.415299e-01 3.959658e-13
## Infant.mortality..per.1000.births. 4.561772e+00 3.507671e+01 1.171962e-16
## Birthrate 3.578951e+00 1.107780e+01 1.592380e-19
cluster <- data.frame(res.hcpc$data.clust)
library(dplyr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.3 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
cluster %>% group_by(clust) %>% summarize(Flower = n())
## # A tibble: 4 x 2
## clust Flower
## <fct> <int>
## 1 1 53
## 2 2 94
## 3 3 6
## 4 4 74
cluster <- cluster %>% arrange(by = clust)
cluster[,c('Country', 'clust')]
## Country clust
## 1 Afghanistan 1
## 2 Angola 1
## 3 Bangladesh 1
## 4 Benin 1
## 5 Bhutan 1
## 6 Botswana 1
## 7 Burkina Faso 1
## 8 Burma 1
## 9 Burundi 1
## 10 Cambodia 1
## 11 Cameroon 1
## 12 Central African Rep. 1
## 13 Chad 1
## 14 Comoros 1
## 15 Congo, Dem. Rep. 1
## 16 Congo, Repub. of the 1
## 17 Cote d'Ivoire 1
## 18 Djibouti 1
## 19 Eritrea 1
## 20 Ethiopia 1
## 21 Gambia, The 1
## 22 Ghana 1
## 23 Guinea 1
## 24 Guinea-Bissau 1
## 25 Haiti 1
## 26 Kenya 1
## 27 Laos 1
## 28 Lesotho 1
## 29 Liberia 1
## 30 Madagascar 1
## 31 Malawi 1
## 32 Mali 1
## 33 Mauritania 1
## 34 Mozambique 1
## 35 Nepal 1
## 36 Niger 1
## 37 Nigeria 1
## 38 Pakistan 1
## 39 Papua New Guinea 1
## 40 Rwanda 1
## 41 Senegal 1
## 42 Sierra Leone 1
## 43 Somalia 1
## 44 Sudan 1
## 45 Swaziland 1
## 46 Tajikistan 1
## 47 Tanzania 1
## 48 Togo 1
## 49 Uganda 1
## 50 Vanuatu 1
## 51 Yemen 1
## 52 Zambia 1
## 53 Zimbabwe 1
## 54 Albania 2
## 55 Algeria 2
## 56 American Samoa 2
## 57 Argentina 2
## 58 Armenia 2
## 59 Azerbaijan 2
## 60 Belize 2
## 61 Bolivia 2
## 62 Bosnia & Herzegovina 2
## 63 Brunei 2
## 64 Bulgaria 2
## 65 Cape Verde 2
## 66 Chile 2
## 67 Colombia 2
## 68 Cook Islands 2
## 69 Costa Rica 2
## 70 Cuba 2
## 71 Dominica 2
## 72 Dominican Republic 2
## 73 East Timor 2
## 74 Ecuador 2
## 75 Egypt 2
## 76 El Salvador 2
## 77 Equatorial Guinea 2
## 78 Fiji 2
## 79 Gabon 2
## 80 Gaza Strip 2
## 81 Georgia 2
## 82 Greenland 2
## 83 Grenada 2
## 84 Guatemala 2
## 85 Guyana 2
## 86 Honduras 2
## 87 Indonesia 2
## 88 Iran 2
## 89 Iraq 2
## 90 Jamaica 2
## 91 Jordan 2
## 92 Kazakhstan 2
## 93 Kiribati 2
## 94 Korea, North 2
## 95 Kyrgyzstan 2
## 96 Lebanon 2
## 97 Libya 2
## 98 Macedonia 2
## 99 Malaysia 2
## 100 Maldives 2
## 101 Marshall Islands 2
## 102 Mayotte 2
## 103 Mexico 2
## 104 Micronesia, Fed. St. 2
## 105 Moldova 2
## 106 Mongolia 2
## 107 Montserrat 2
## 108 Morocco 2
## 109 Namibia 2
## 110 Nauru 2
## 111 Nicaragua 2
## 112 Oman 2
## 113 Panama 2
## 114 Paraguay 2
## 115 Peru 2
## 116 Philippines 2
## 117 Puerto Rico 2
## 118 Qatar 2
## 119 Romania 2
## 120 Saint Helena 2
## 121 Saint Lucia 2
## 122 Saint Vincent and the Grenadines 2
## 123 Samoa 2
## 124 Sao Tome & Principe 2
## 125 Saudi Arabia 2
## 126 Serbia 2
## 127 Seychelles 2
## 128 Solomon Islands 2
## 129 South Africa 2
## 130 Sri Lanka 2
## 131 Suriname 2
## 132 Syria 2
## 133 Thailand 2
## 134 Tonga 2
## 135 Trinidad & Tobago 2
## 136 Tunisia 2
## 137 Turkey 2
## 138 Turkmenistan 2
## 139 Tuvalu 2
## 140 Ukraine 2
## 141 United Arab Emirates 2
## 142 Uzbekistan 2
## 143 Venezuela 2
## 144 Vietnam 2
## 145 Wallis and Futuna 2
## 146 West Bank 2
## 147 Western Sahara 2
## 148 Brazil 3
## 149 Canada 3
## 150 China 3
## 151 India 3
## 152 Russia 3
## 153 United States 3
## 154 Andorra 4
## 155 Anguilla 4
## 156 Antigua & Barbuda 4
## 157 Aruba 4
## 158 Australia 4
## 159 Austria 4
## 160 Bahamas, The 4
## 161 Bahrain 4
## 162 Barbados 4
## 163 Belarus 4
## 164 Belgium 4
## 165 Bermuda 4
## 166 British Virgin Is. 4
## 167 Cayman Islands 4
## 168 Croatia 4
## 169 Cyprus 4
## 170 Czech Republic 4
## 171 Denmark 4
## 172 Estonia 4
## 173 Faroe Islands 4
## 174 Finland 4
## 175 France 4
## 176 French Guiana 4
## 177 French Polynesia 4
## 178 Germany 4
## 179 Gibraltar 4
## 180 Greece 4
## 181 Guadeloupe 4
## 182 Guam 4
## 183 Guernsey 4
## 184 Hong Kong 4
## 185 Hungary 4
## 186 Iceland 4
## 187 Ireland 4
## 188 Isle of Man 4
## 189 Israel 4
## 190 Italy 4
## 191 Japan 4
## 192 Jersey 4
## 193 Korea, South 4
## 194 Kuwait 4
## 195 Latvia 4
## 196 Liechtenstein 4
## 197 Lithuania 4
## 198 Luxembourg 4
## 199 Macau 4
## 200 Malta 4
## 201 Martinique 4
## 202 Mauritius 4
## 203 Monaco 4
## 204 Netherlands 4
## 205 Netherlands Antilles 4
## 206 New Caledonia 4
## 207 New Zealand 4
## 208 N. Mariana Islands 4
## 209 Norway 4
## 210 Palau 4
## 211 Poland 4
## 212 Portugal 4
## 213 Reunion 4
## 214 Saint Kitts & Nevis 4
## 215 St Pierre & Miquelon 4
## 216 San Marino 4
## 217 Singapore 4
## 218 Slovakia 4
## 219 Slovenia 4
## 220 Spain 4
## 221 Sweden 4
## 222 Switzerland 4
## 223 Taiwan 4
## 224 Turks & Caicos Is 4
## 225 United Kingdom 4
## 226 Uruguay 4
## 227 Virgin Islands 4
plot.HCPC(res.hcpc, axes = 1:2)
data.frame el conjunto de datos y observarlos, hacemos:df <- data.frame(res.hcpc$call$X)
head(df)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 clust
## 118 -6.030281 0.8968411 -1.5402495 1.9771110 0.4218002 1
## 184 -6.004205 1.3319730 -0.5304170 1.2636678 -0.2594869 1
## 189 -5.452586 1.5963044 -1.5034654 1.7919220 0.6359090 1
## 6 -5.395421 2.5036958 1.0311855 -0.3314996 -0.8679051 1
## 152 -5.346937 1.4163410 -1.1462500 1.5842756 0.6499661 1
## 1 -5.000564 2.4554476 -0.6001591 3.5634204 -0.1727132 1
library(ggplot2)
ggplot(df, aes(Dim.1, Dim.2))+geom_point(aes(col = clust))+theme_bw()
fviz_pca_var(res.pca, axes = c(2, 3))
fviz_pca_var(res, axes = c(2, 3))
fviz_pca_ind(res, axes = c(2, 3))
fviz_pca_var(res, geom.var = c("point", "text"))
fviz_pca_ind(res, geom.ind = "text")
fviz_pca_var(res, arrowsize = 1, labelsize = 5, repel = TRUE)
fviz_pca_ind(res,
pointsize = 3, pointshape = 21, fill = "lightblue", labelsize = 5, repel = TRUE)
## Warning: ggrepel: 150 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
fviz_pca_var(res, axes.linetype = "blank")
fviz_pca_biplot(res, repel = TRUE,
col.var = "#2E9FDF", # Variables color
col.ind = "#696969" # Individuals color
)
## Warning: ggrepel: 146 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
res.pca <- PCA(countries.active, ind.sup = 20:23, quanti.sup = 4:5, quali.sup = 1, graph=FALSE)
res.pca$quanti.sup
## $coord
## Dim.1 Dim.2 Dim.3 Dim.4
## Net.migration -0.2603223 -0.14776963 0.2138053 -0.3938129
## Infant.mortality..per.1000.births. -0.7957248 -0.02016766 -0.1135108 -0.3558943
##
## $cor
## Dim.1 Dim.2 Dim.3 Dim.4
## Net.migration -0.2603223 -0.14776963 0.2138053 -0.3938129
## Infant.mortality..per.1000.births. -0.7957248 -0.02016766 -0.1135108 -0.3558943
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4
## Net.migration 0.06776769 0.0218358622 0.04571271 0.1550886
## Infant.mortality..per.1000.births. 0.63317802 0.0004067345 0.01288470 0.1266607
fviz_pca_var(res.pca)
#### Tenga en cuenta que, por defecto, las variables cuantitativas suplementarias se muestran en color azul y con líneas discontinuas.
fviz_pca_var(res.pca,
col.var = "black",
col.quanti.sup = "red")
fviz_pca_var(res.pca, invisible = "var")
fviz_pca_var(res.pca, invisible = "quanti.sup")
p <- fviz_pca_var(res.pca, invisible = "quanti.sup")
fviz_add(p, res.pca$quanti.sup$coord, geom = c("arrow", "text"), color = "red")
res.pca$ind.sup
## $coord
## Dim.1 Dim.2 Dim.3 Dim.4
## 20 1.2836708 -0.4079976 1.6058185 -0.3450825
## 21 -0.4407413 -0.8925518 -0.3380844 0.6756283
## 22 -2.4376802 -0.1699341 -0.2363116 -0.8185180
## 23 5.7929803 6.8626280 -3.3336629 -2.8179050
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4
## 20 0.3652056 0.036893115 0.571509019 0.02639223
## 21 0.1243874 0.510124121 0.073191250 0.29229720
## 22 0.8873085 0.004312034 0.008338567 0.10004092
## 23 0.3365684 0.472334994 0.111458341 0.07963829
##
## $dist
## 20 21 22 23
## 2.124149 1.249670 2.587852 9.985398
p <- fviz_pca_ind(res.pca, col.ind.sup = "blue", repel = TRUE)
p <- fviz_add(p, res.pca$quali.sup$coord, color = "red")
p
res.pca$quali
## $coord
## Dim.1 Dim.2 Dim.3 Dim.4
## 102 1.0457047 0.8642140 -1.95548086 -0.2785755425
## 193 1.9108544 0.6101877 0.37394683 -0.8092294439
## 199 1.0407484 1.2231968 -1.70201612 0.0005519662
## 431 0.8326336 1.1390749 0.53308540 0.4211130014
## 443 0.4895620 0.3103696 -0.88459562 -0.2281872832
## 468 0.6625871 -0.8037378 0.73241594 0.1674635525
## 665 0.6877632 2.3013498 1.18690984 0.4241474363
## 13940 0.8926390 -0.4130873 -0.46583600 -0.3463871940
## 28748 -0.7189218 -0.5255241 -0.14911817 0.5558804451
## 29800 -0.4236393 -0.7397208 -0.24234775 1.0234819457
## 83870 1.3358455 -1.0552603 1.29407665 -0.6292859104
## 86600 -0.4819050 -0.7422559 -0.25500435 0.9676427591
## 144000 -2.2928035 2.2872645 1.25504337 0.1162093178
## 207600 -0.2177313 -0.9123945 -0.17199567 0.8492937026
## 647500 -2.6246559 -0.1723868 -0.27256317 -0.9795214353
## 1246700 -2.3491150 -0.3471964 -0.28647172 -0.8882631641
## 2381740 -1.1756490 -0.6870230 -0.13390984 -0.1844988043
## 2766890 0.0474485 -1.0335368 0.07219825 0.4142315453
## 7686850 1.3386344 -1.3035335 1.07166298 -0.5960668943
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4
## 102 0.190442776 0.130073530 0.665968188 1.351551e-02
## 193 0.757798873 0.077272634 0.029021407 1.359071e-01
## 199 0.197792612 0.273219225 0.528988107 5.563441e-08
## 431 0.282707084 0.529094594 0.115883705 7.231462e-02
## 443 0.204745618 0.082292011 0.668480596 4.448177e-02
## 468 0.266155458 0.391632073 0.325210858 1.700161e-02
## 665 0.064287254 0.719800705 0.191461938 2.445010e-02
## 13940 0.610843610 0.130816319 0.166358263 9.198181e-02
## 28748 0.459721952 0.245650241 0.019778484 2.748493e-01
## 29800 0.097915766 0.298535344 0.032043361 5.715055e-01
## 83870 0.359145412 0.224118175 0.337037226 7.969919e-02
## 86600 0.130136036 0.308732275 0.036439287 5.246924e-01
## 144000 0.435279943 0.433179379 0.130422484 1.118193e-03
## 207600 0.029070573 0.510478148 0.018140399 4.423109e-01
## 647500 0.866268675 0.003736939 0.009342050 1.206523e-01
## 1246700 0.847676141 0.018517054 0.012606221 1.212006e-01
## 2381740 0.725110981 0.247623381 0.009407497 1.785814e-02
## 2766890 0.001805059 0.856442812 0.004179264 1.375729e-01
## 7686850 0.358754424 0.340186979 0.229926880 7.113172e-02
##
## $v.test
## Dim.1 Dim.2 Dim.3 Dim.4
## 102 0.8048809 0.8019722 -2.21244554 -0.4582086163
## 193 1.4707882 0.5662413 0.42308622 -1.3310425621
## 199 0.8010660 1.1351007 -1.92567366 0.0009078889
## 431 0.6408797 1.0570373 0.60313678 0.6926580990
## 443 0.3768168 0.2880164 -1.00083805 -0.3753286393
## 468 0.5099945 -0.7458516 0.82866082 0.2754485983
## 665 0.5293726 2.1356038 1.34287860 0.6976492199
## 13940 0.6870659 -0.3833362 -0.52705030 -0.5697470620
## 28748 -0.5533555 -0.4876752 -0.16871340 0.9143272497
## 29800 -0.3260760 -0.6864452 -0.27419404 1.6834508946
## 83870 1.0282027 -0.9792592 1.46412792 -1.0350665522
## 86600 -0.3709232 -0.6887978 -0.28851382 1.5916050842
## 144000 -1.7647751 2.1225329 1.41996538 0.1911442414
## 207600 -0.1675882 -0.8466828 -0.19459718 1.3969413426
## 647500 -2.0202025 -0.1599713 -0.30837999 -1.6111434533
## 1246700 -1.8081181 -0.3221909 -0.32411622 -1.4610393711
## 2381740 -0.9048992 -0.6375427 -0.15150659 -0.3034686430
## 2766890 0.0365212 -0.9591003 0.08168563 0.6813392933
## 7686850 1.0303494 -1.2096514 1.21248744 -0.9804270125
##
## $dist
## 102 193 199 431 443 468 665 13940
## 2.396221 2.195081 2.340134 1.565978 1.081933 1.284326 2.712542 1.142118
## 28748 29800 83870 86600 144000 207600 647500 1246700
## 1.060313 1.353848 2.229056 1.335865 3.475222 1.277009 2.819980 2.551463
## 2381740 2766890 7686850
## 1.380624 1.116804 2.234927
##
## $eta2
## Dim.1 Dim.2 Dim.3 Dim.4
## Area..sq..mi.. 1 1 1 1
fviz_pca_ind(res.pca, habillage = 1,
addEllipses =TRUE,
ellipse.type = "confidence",
palette = "jco",
repel = TRUE)
## Warning: Computation failed in `stat_conf_ellipse()`:
## valor ausente donde TRUE/FALSE es necesario
## Warning: This manual palette can handle a maximum of 10 values. You have
## supplied 19.
## Warning: Removed 9 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_point).
fviz_pca_var(res.pca, select.var = list(cos2 = 0.6))
fviz_pca_var(res.pca, select.var= list(cos2 = 5))
name <- list(name = c("Net.migration", "Infant.mortality..per.1000.briths."))
fviz_pca_var(res.pca, select.var = name)
fviz_pca_biplot(res.pca, select.ind = list(contrib = 5), select.var = list(contrib = 5),
ggtheme = theme_minimal())
scree.plot <- fviz_eig(res.pca)
ind.plot <- fviz_pca_ind(res.pca)
var.plot <- fviz_pca_var(res.pca)
print(scree.plot)
print(ind.plot)
print(var.plot)
# Modelo K-Means
library(tidyverse) # data manipulation
library(cluster) # clustering algorithms
library(factoextra) # clustering algorithms & visualization
df <- USArrests
df <- na.omit(df)
df <- scale(df)
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
\(d(x, y)=\sqrt{\sum (x_{i}-y_{i})^2}\)
\(d(x, y)={\sum |(x_{i}-y_{i})|}\)
\(d(x, y)=1-\frac{\sum (x_{i}-\bar x)(y_{i}-\bar y)}{\sqrt{\sum (x_{i}-\bar x)^2\sum (y_{i}-\bar y)^2}}\)
\(d(x, y)=1-\frac{\sum (x′_{i}-\bar x′)(y′_{i}-\bar y′)}{\sqrt{\sum (x′_{i}-\bar x′)^2\sum (y′_{i}-\bar y′)^2}}\)
\(x′_{i}=rank(x_{i})\)
\(d(x, y)=1-\frac{n_{c}-n_{d}}{\frac{1}{2}n(n-1)}\)
distance <- get_dist(df)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
\(W(C_{k})=\sum (x_{i}-\mu_{k})^2\)
\(x_{i}\) ### es un dato perteneciente a el clúster \(C_{k}\) ### y \(\mu_{k}\) ### es la media de los punto asignados en dicho clúster
\(tot.withiness=\sum W(C_{k})\)
k2 <- kmeans(df, centers = 2, nstart = 25)
str(k2)
## List of 9
## $ cluster : Named int [1:50] 2 2 2 1 2 2 1 1 2 2 ...
## ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ centers : num [1:2, 1:4] -0.67 1.005 -0.676 1.014 -0.132 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
## $ totss : num 196
## $ withinss : num [1:2] 56.1 46.7
## $ tot.withinss: num 103
## $ betweenss : num 93.1
## $ size : int [1:2] 30 20
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
k2
## K-means clustering with 2 clusters of sizes 30, 20
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 -0.669956 -0.6758849 -0.1317235 -0.5646433
## 2 1.004934 1.0138274 0.1975853 0.8469650
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 2 2 2 1 2
## Colorado Connecticut Delaware Florida Georgia
## 2 1 1 2 2
## Hawaii Idaho Illinois Indiana Iowa
## 1 1 2 1 1
## Kansas Kentucky Louisiana Maine Maryland
## 1 1 2 1 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 1 2 1 2 2
## Montana Nebraska Nevada New Hampshire New Jersey
## 1 1 2 1 1
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 2 1 1
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 1 1 1 1 2
## South Dakota Tennessee Texas Utah Vermont
## 1 2 2 1 1
## Virginia Washington West Virginia Wisconsin Wyoming
## 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 56.11445 46.74796
## (between_SS / total_SS = 47.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(k2, data = df)
df %>%
as_tibble() %>%
mutate(cluster = k2$cluster,
state = row.names(USArrests)) %>%
ggplot(aes(UrbanPop, Murder, color = factor(cluster), label = state)) +
geom_text()
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.1.1
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, p3, p4, nrow = 2)
\(minimize (\sum W(C_{k}))\)
donde \(C_{k}\) es el cluster \((k^{th})\) y \(W(C_{k}\) es la variación dentro del conglomerado. La suma total del cuadrado dentro del conglomerado (wss) mide la compacidad del conglomerado y queremos que sea lo más pequeño posible. Por tanto, podemos utilizar el siguiente algoritmo para definir los clústeres óptimos:
set.seed(123)
# function to compute total within-cluster sum of square
wss <- function(k) {
kmeans(df, k, nstart = 10 )$tot.withinss
}
# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15
# extract wss for 2-15 clusters
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")
# function to compute average silhouette for k clusters
avg_sil <- function(k) {
km.res <- kmeans(df, centers = k, nstart = 25)
ss <- silhouette(km.res$cluster, dist(df))
mean(ss[, 3])
}
# Compute and plot wss for k = 2 to k = 15
k.values <- 2:15
# extract avg silhouette for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(k.values, avg_sil_values,
type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Average Silhouettes")
fviz_nbclust(df, kmeans, method = "silhouette")
\(Gap_a(k) = E^*_n log(W_k) - log(W_k)\)
# compute gap statistic
set.seed(123)
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25,
K.max = 10, B = 50)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 4
## logW E.logW gap SE.sim
## [1,] 3.458369 3.640154 0.1817845 0.04422857
## [2,] 3.135112 3.372283 0.2371717 0.03559601
## [3,] 2.977727 3.233771 0.2560446 0.03749193
## [4,] 2.826221 3.119172 0.2929511 0.04067348
## [5,] 2.738868 3.019965 0.2810969 0.04185469
## [6,] 2.666967 2.930002 0.2630347 0.04105040
## [7,] 2.609895 2.852152 0.2422572 0.04184725
## [8,] 2.539156 2.778562 0.2394054 0.04292750
## [9,] 2.468162 2.711752 0.2435901 0.04344197
## [10,] 2.407265 2.647595 0.2403307 0.04548446
fviz_gap_stat(gap_stat)
# Compute k-means clustering with k = 4
set.seed(123)
final <- kmeans(df, 4, nstart = 25)
print(final)
## K-means clustering with 4 clusters of sizes 8, 13, 16, 13
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 1.4118898 0.8743346 -0.8145211 0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 4 0.6950701 1.0394414 0.7226370 1.27693964
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 4 4 1 4
## Colorado Connecticut Delaware Florida Georgia
## 4 3 3 4 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 2 4 3 2
## Kansas Kentucky Louisiana Maine Maryland
## 3 2 1 2 4
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 4 2 1 4
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 4 2 3
## New Mexico New York North Carolina North Dakota Ohio
## 4 4 1 2 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 4 3 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 8.316061 11.952463 16.212213 19.922437
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(final, data = df)
USArrests %>%
mutate(Cluster = final$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
## # A tibble: 4 x 5
## Cluster Murder Assault UrbanPop Rape
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 13.9 244. 53.8 21.4
## 2 2 3.6 78.5 52.1 12.2
## 3 3 5.66 139. 73.9 18.8
## 4 4 10.8 257. 76 33.2
library(tidyverse)
library(cluster)
library(factoextra)
dfs <- read.csv("C:\\Users\\USUARIO\\Downloads\\mall_customers.csv")
View(dfs)
str(dfs)
## 'data.frame': 200 obs. of 5 variables:
## $ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : chr "Male" "Male" "Female" "Female" ...
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual.Income..k.. : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending.Score..1.100.: int 39 81 6 77 40 76 6 94 3 72 ...
dfs <- na.omit(dfs)
library(dplyr)
c <- dplyr::select(dfs, CustomerID, Spending.Score..1.100.)
dfs <- scale(c)
head(c)
## CustomerID Spending.Score..1.100.
## 1 1 39
## 2 2 81
## 3 3 6
## 4 4 77
## 5 5 40
## 6 6 76
distance <- get_dist(dfs)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) +
theme(text = element_text(size = 7),
axis.title = element_text(size = 7),
axis.text = element_text(size = 7))
k2
## K-means clustering with 2 clusters of sizes 101, 99
##
## Cluster means:
## CustomerID Spending.Score..1.100.
## 1 -0.8552297 -0.01387943
## 2 0.8725071 0.01415983
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 95.37144 153.35045
## (between_SS / total_SS = 37.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(k2, data = c)
library(tidyverse)
c %>%
as_tibble() %>%
mutate(cluster = k2$cluster,
state = row.names(c)) %>%
ggplot(aes(CustomerID, Spending.Score..1.100., color = factor(cluster), label = CustomerID)) +
geom_text()
set.seed(123)
wss <- function(k) {
kmeans(dfs, k, nstart = 10 )$tot.withinss
}
k.values <- 1:15
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
set.seed(123)
fviz_nbclust(dfs, kmeans, method = "wss")
library(cluster)
avg_sil <- function(k) {
km.res <- kmeans(dfs, centers = k, nstart = 25)
ss <- silhouette(km.res$cluster, dist(dfs))
mean(ss[, 3])
}
k.values <- 2:15
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(k.values, avg_sil_values,
type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Average Silhouettes")
fviz_nbclust(dfs, kmeans, method = "silhouette")
set.seed(123)
gap_stat <- clusGap(dfs, FUN = kmeans, nstart = 25,
K.max = 10, B = 50)
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = dfs, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 1
## logW E.logW gap SE.sim
## [1,] 4.476995 4.807922 0.3309275 0.02295009
## [2,] 4.206978 4.525650 0.3186716 0.02235368
## [3,] 3.901223 4.285966 0.3847432 0.02164186
## [4,] 3.696870 4.090762 0.3938925 0.02146859
## [5,] 3.442368 3.977308 0.5349397 0.01882190
## [6,] 3.310858 3.876316 0.5654585 0.01870485
## [7,] 3.243231 3.784177 0.5409460 0.02088222
## [8,] 3.176834 3.701425 0.5245908 0.01919271
## [9,] 3.114256 3.626897 0.5126412 0.02144987
## [10,] 3.057683 3.563489 0.5058055 0.02224318
fviz_gap_stat(gap_stat)
set.seed(123)
final <- kmeans(dfs, 4, nstart = 25)
print(final)
## K-means clustering with 4 clusters of sizes 23, 39, 99, 39
##
## Cluster means:
## CustomerID Spending.Score..1.100.
## 1 -1.3389961 -1.1341194
## 2 1.0448378 -1.2012503
## 3 -0.5191064 0.2496354
## 4 1.0625582 1.2364001
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 3 1 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 3 2 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
##
## Within cluster sum of squares by cluster:
## [1] 6.798524 13.200052 60.934760 10.895640
## (between_SS / total_SS = 76.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(final, data = dfs)
library(tidyverse)
c %>%
mutate(Cluster = final$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
## # A tibble: 4 x 3
## Cluster CustomerID Spending.Score..1.100.
## <int> <dbl> <dbl>
## 1 1 23 20.9
## 2 2 161. 19.2
## 3 3 70.5 56.6
## 4 4 162 82.1