library("FactoMineR")
## Warning: package 'FactoMineR' was built under R version 4.1.2
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
data(decathlon2)
head(decathlon2)
## X100m Long.jump Shot.put High.jump X400m X110m.hurdle Discus
## SEBRLE 11.04 7.58 14.83 2.07 49.81 14.69 43.75
## CLAY 10.76 7.40 14.26 1.86 49.37 14.05 50.72
## BERNARD 11.02 7.23 14.25 1.92 48.93 14.99 40.87
## YURKOV 11.34 7.09 15.19 2.10 50.42 15.31 46.26
## ZSIVOCZKY 11.13 7.30 13.48 2.01 48.62 14.17 45.67
## McMULLEN 10.83 7.31 13.76 2.13 49.91 14.38 44.41
## Pole.vault Javeline X1500m Rank Points Competition
## SEBRLE 5.02 63.19 291.7 1 8217 Decastar
## CLAY 4.92 60.15 301.5 2 8122 Decastar
## BERNARD 5.32 62.77 280.1 4 8067 Decastar
## YURKOV 4.72 63.44 276.4 5 8036 Decastar
## ZSIVOCZKY 4.42 55.37 268.0 7 8004 Decastar
## McMULLEN 4.42 56.37 285.1 8 7995 Decastar
str(decathlon2)
## 'data.frame': 27 obs. of 13 variables:
## $ X100m : num 11 10.8 11 11.3 11.1 ...
## $ Long.jump : num 7.58 7.4 7.23 7.09 7.3 7.31 6.81 7.56 6.97 7.27 ...
## $ Shot.put : num 14.8 14.3 14.2 15.2 13.5 ...
## $ High.jump : num 2.07 1.86 1.92 2.1 2.01 2.13 1.95 1.86 1.95 1.98 ...
## $ X400m : num 49.8 49.4 48.9 50.4 48.6 ...
## $ X110m.hurdle: num 14.7 14.1 15 15.3 14.2 ...
## $ Discus : num 43.8 50.7 40.9 46.3 45.7 ...
## $ Pole.vault : num 5.02 4.92 5.32 4.72 4.42 4.42 4.92 4.82 4.72 4.62 ...
## $ Javeline : num 63.2 60.1 62.8 63.4 55.4 ...
## $ X1500m : num 292 302 280 276 268 ...
## $ Rank : int 1 2 4 5 7 8 9 10 11 12 ...
## $ Points : int 8217 8122 8067 8036 8004 7995 7802 7733 7708 7651 ...
## $ Competition : Factor w/ 2 levels "Decastar","OlympicG": 1 1 1 1 1 1 1 1 1 1 ...
decathlon2.active <- decathlon2[1:23, 1:10]
head(decathlon2.active)
## X100m Long.jump Shot.put High.jump X400m X110m.hurdle Discus
## SEBRLE 11.04 7.58 14.83 2.07 49.81 14.69 43.75
## CLAY 10.76 7.40 14.26 1.86 49.37 14.05 50.72
## BERNARD 11.02 7.23 14.25 1.92 48.93 14.99 40.87
## YURKOV 11.34 7.09 15.19 2.10 50.42 15.31 46.26
## ZSIVOCZKY 11.13 7.30 13.48 2.01 48.62 14.17 45.67
## McMULLEN 10.83 7.31 13.76 2.13 49.91 14.38 44.41
## Pole.vault Javeline X1500m
## SEBRLE 5.02 63.19 291.7
## CLAY 4.92 60.15 301.5
## BERNARD 5.32 62.77 280.1
## YURKOV 4.72 63.44 276.4
## ZSIVOCZKY 4.42 55.37 268.0
## McMULLEN 4.42 56.37 285.1
res.pca <- PCA(decathlon2.active, graph = FALSE)
print(res.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 23 individuals, described by 10 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
eig.val <- get_eigenvalue(res.pca)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 4.1242133 41.242133 41.24213
## Dim.2 1.8385309 18.385309 59.62744
## Dim.3 1.2391403 12.391403 72.01885
## Dim.4 0.8194402 8.194402 80.21325
## Dim.5 0.7015528 7.015528 87.22878
## Dim.6 0.4228828 4.228828 91.45760
## Dim.7 0.3025817 3.025817 94.48342
## Dim.8 0.2744700 2.744700 97.22812
## Dim.9 0.1552169 1.552169 98.78029
## Dim.10 0.1219710 1.219710 100.00000
La suma de los valores propios da una varianza total de 10. En la segunda columna se indica la proporción explicada por cada valor propio. Por ejemplo, 4.124 dividido por 10 es igual a 0.4124, es decir, alrededor del 41.24% de la variación se explica por este primer valor propio.
En nuestro análisis, los 3 primeros Componentes Principales explican el 72% de la variación lo cual es un porcentaje aceptablemente grande. Un método alternativo para determinar el número de componentes principales es observar un Screen Plot con la funcuón fviz_eig(), la cual es el grádico de los valores propios ordenados de forma descendente.
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))
- Un método sencillo para extraer los resultados, para las variables, de la salida de un PCA es utilizar la función %%get_pca_var(). Dicha función da como resultado una lista de matrices que contiene todos los resultados de las variables activas (coordenada,s correlación entre variables y ejes, coseno cuadrado y contribuciones).
var <- get_pca_var(res.pca)
var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
-Los componentes de get_pca_var() se pueden utilizar en el gráfico de variables de la siguiente forma:
var$coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m -0.850625692 -0.17939806 0.30155643 0.03357320 -0.1944440
## Long.jump 0.794180641 0.28085695 -0.19054653 -0.11538956 0.2331567
## Shot.put 0.733912733 0.08540412 0.51759781 0.12846837 -0.2488129
## High.jump 0.610083985 -0.46521415 0.33008517 0.14455012 0.4027002
## X400m -0.701603377 0.29017826 0.28353292 0.43082552 0.1039085
## X110m.hurdle -0.764125197 -0.02474081 0.44888733 -0.01689589 0.2242200
## Discus 0.743209016 0.04966086 0.17652518 0.39500915 -0.4082391
## Pole.vault -0.217268042 0.80745110 0.09405773 -0.33898477 -0.2216853
## Javeline 0.428226639 0.38610928 0.60412432 -0.33173454 0.1978128
## X1500m 0.004278487 0.78448019 -0.21947068 0.44800961 0.2632527
var$cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m 7.235641e-01 0.0321836641 0.090936280 0.0011271597 0.03780845
## Long.jump 6.307229e-01 0.0788806285 0.036307981 0.0133147506 0.05436203
## Shot.put 5.386279e-01 0.0072938636 0.267907488 0.0165041211 0.06190783
## High.jump 3.722025e-01 0.2164242070 0.108956221 0.0208947375 0.16216747
## X400m 4.922473e-01 0.0842034209 0.080390914 0.1856106269 0.01079698
## X110m.hurdle 5.838873e-01 0.0006121077 0.201499837 0.0002854712 0.05027463
## Discus 5.523596e-01 0.0024662013 0.031161138 0.1560322304 0.16665918
## Pole.vault 4.720540e-02 0.6519772763 0.008846856 0.1149106765 0.04914437
## Javeline 1.833781e-01 0.1490803723 0.364966189 0.1100478063 0.03912992
## X1500m 1.830545e-05 0.6154091638 0.048167378 0.2007126089 0.06930197
var$contrib
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m 1.754429e+01 1.7505098 7.3386590 0.13755240 5.389252
## Long.jump 1.529317e+01 4.2904162 2.9300944 1.62485936 7.748815
## Shot.put 1.306014e+01 0.3967224 21.6204325 2.01407269 8.824401
## High.jump 9.024811e+00 11.7715838 8.7928883 2.54987951 23.115504
## X400m 1.193554e+01 4.5799296 6.4876363 22.65090599 1.539012
## X110m.hurdle 1.415754e+01 0.0332933 16.2612611 0.03483735 7.166193
## Discus 1.339309e+01 0.1341398 2.5147385 19.04132022 23.755756
## Pole.vault 1.144592e+00 35.4618611 0.7139512 14.02307063 7.005084
## Javeline 4.446377e+00 8.1086683 29.4531777 13.42963254 5.577615
## X1500m 4.438531e-04 33.4728757 3.8871610 24.49386930 9.878367
var$coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m -0.850625692 -0.17939806 0.30155643 0.03357320 -0.1944440
## Long.jump 0.794180641 0.28085695 -0.19054653 -0.11538956 0.2331567
## Shot.put 0.733912733 0.08540412 0.51759781 0.12846837 -0.2488129
## High.jump 0.610083985 -0.46521415 0.33008517 0.14455012 0.4027002
## X400m -0.701603377 0.29017826 0.28353292 0.43082552 0.1039085
## X110m.hurdle -0.764125197 -0.02474081 0.44888733 -0.01689589 0.2242200
## Discus 0.743209016 0.04966086 0.17652518 0.39500915 -0.4082391
## Pole.vault -0.217268042 0.80745110 0.09405773 -0.33898477 -0.2216853
## Javeline 0.428226639 0.38610928 0.60412432 -0.33173454 0.1978128
## X1500m 0.004278487 0.78448019 -0.21947068 0.44800961 0.2632527
fviz_pca_var(res.pca, col.var = "red")
head(var$cos2, 4)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m 0.7235641 0.032183664 0.09093628 0.00112716 0.03780845
## Long.jump 0.6307229 0.078880629 0.03630798 0.01331475 0.05436203
## Shot.put 0.5386279 0.007293864 0.26790749 0.01650412 0.06190783
## High.jump 0.3722025 0.216424207 0.10895622 0.02089474 0.16216747
library("corrplot")
## Warning: package 'corrplot' was built under R version 4.1.2
## corrplot 0.92 loaded
corrplot(var$cos2, is.corr=FALSE)
fviz_cos2(res.pca, choice = "var", axes = 1:2)
• Los valores de cos2 se utilizan para estimar la calidad de la representación. • Cuanto más cerca esté una variable del círculo de correlaciones, mejor será su representación en el mapa de factores (y lo más importante es interpretar estos componentes) • Las variables que están cerradas al centro de la gráfica son menos importantes para la primeros componentes.
fviz_pca_var(res.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
- Tenga en cuenta que también es posible cambiar la transparencia de las variables según su valores de cos2 usando la opción alpha.var = “cos2”. Por ejemplo:
fviz_pca_var(res.pca, alpha.var = "cos2")
• Las variables que están correlacionadas con PC1 (es decir, Dim.1) y PC2 (es decir, Dim.2) son las más importante para explicar la variabilidad en el conjunto de datos. • Variables que no se correlacionan con ningún PC o correlacionan con las últimas dimensiones son variables con una contribución baja y podrían eliminarse para simplificar el análisis.
La contribución de las variables se puede extraer de la siguiente manera:
head(var$contrib, 4)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## X100m 17.544293 1.7505098 7.338659 0.1375524 5.389252
## Long.jump 15.293168 4.2904162 2.930094 1.6248594 7.748815
## Shot.put 13.060137 0.3967224 21.620432 2.0140727 8.824401
## High.jump 9.024811 11.7715838 8.792888 2.5498795 23.115504
library("corrplot")
corrplot(var$contrib, is.corr=FALSE)
- La función fviz_contrib () [paquete factoextra] se puede utilizar para dibujar un diagrama de barras de la variable contribuciones. Si sus datos contienen muchas variables, puede decidir mostrar solo la parte superior variables contribuyentes. El código R a continuación muestra las 10 principales variables que contribuyen a la componentes principales:
# Contributions of variables to PC1
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)
# Contributions of variables to PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)
fviz_contrib(res.pca, choice = "var", axes = 1:2, top = 10)
- Se puede ver que las variables - X100m, Long.jump y Pole.vault - contribuyen más a las dimensiones 1 y 2.
fviz_pca_var(res.pca, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)
fviz_pca_var(res.pca, alpha.var = "contrib")
set.seed(123)
my.cont.var <- rnorm(10)
fviz_pca_var(res.pca, col.var = my.cont.var,
gradient.cols = c("blue", "yellow", "red"),
legend.title = "Cont.Var")
## Color por grupos.
set.seed(123)
res.km <- kmeans(var$coord, centers = 3, nstart = 25)
grp <- as.factor(res.km$cluster)
fviz_pca_var(res.pca, col.var = grp,
palette = c("#0073C2FF", "#EFC000FF", "#868686FF"),
legend.title = "Cluster")
- Tenga en cuenta que, para cambiar el color de los grupos, se debe utilizar la paleta de argumentos. Para cambiar los colores del degradado, se debe utilizar el argumento gradient.cols.
library(FactoMineR )
res.desc <- dimdesc(res.pca, axes = c(1,2), proba = 0.05)
# Description of dimension 1
res.desc$Dim.1
## $quanti
## correlation p.value
## Long.jump 0.7941806 6.059893e-06
## Discus 0.7432090 4.842563e-05
## Shot.put 0.7339127 6.723102e-05
## High.jump 0.6100840 1.993677e-03
## Javeline 0.4282266 4.149192e-02
## X400m -0.7016034 1.910387e-04
## X110m.hurdle -0.7641252 2.195812e-05
## X100m -0.8506257 2.727129e-07
##
## attr(,"class")
## [1] "condes" "list"
res.desc$Dim.2
## $quanti
## correlation p.value
## Pole.vault 0.8074511 3.205016e-06
## X1500m 0.7844802 9.384747e-06
## High.jump -0.4652142 2.529390e-02
##
## attr(,"class")
## [1] "condes" "list"
ind <- get_pca_ind(res.pca)
ind
## Principal Component Analysis Results for individuals
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the individuals"
## 2 "$cos2" "Cos2 for the individuals"
## 3 "$contrib" "contributions of the individuals"
# Coordinates of individuals
head(ind$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## SEBRLE 0.1955047 1.5890567 0.6424912 0.08389652 1.16829387
## CLAY 0.8078795 2.4748137 -1.3873827 1.29838232 -0.82498206
## BERNARD -1.3591340 1.6480950 0.2005584 -1.96409420 0.08419345
## YURKOV -0.8889532 -0.4426067 2.5295843 0.71290837 0.40782264
## ZSIVOCZKY -0.1081216 -2.0688377 -1.3342591 -0.10152796 -0.20145217
## McMULLEN 0.1212195 -1.0139102 -0.8625170 1.34164291 1.62151286
# Quality of individuals
head(ind$cos2)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## SEBRLE 0.007530179 0.49747323 0.081325232 0.001386688 0.2689026575
## CLAY 0.048701249 0.45701660 0.143628117 0.125791741 0.0507850580
## BERNARD 0.197199804 0.28996555 0.004294015 0.411819183 0.0007567259
## YURKOV 0.096109800 0.02382571 0.778230322 0.061812637 0.0202279796
## ZSIVOCZKY 0.001574385 0.57641944 0.239754152 0.001388216 0.0054654972
## McMULLEN 0.002175437 0.15219499 0.110137872 0.266486530 0.3892621478
# Contributions of individuals
head(ind$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## SEBRLE 0.04029447 5.9714533 1.4483919 0.03734589 8.45894063
## CLAY 0.68805664 14.4839248 6.7537381 8.94458283 4.21794385
## BERNARD 1.94740183 6.4234107 0.1411345 20.46819433 0.04393073
## YURKOV 0.83308415 0.4632733 22.4517396 2.69663605 1.03075263
## ZSIVOCZKY 0.01232413 10.1217143 6.2464325 0.05469230 0.25151025
## McMULLEN 0.01549089 2.4310854 2.6102794 9.55055888 16.29493304
fviz_pca_ind(res.pca)
- Al igual que las variables, también es posible colorear a los individuos por sus valores de cos2:
fviz_pca_ind(res.pca, col.ind = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping (slow if many points)
)
fviz_pca_ind(res.pca, pointsize = "cos2",
pointshape = 21, fill = "#E7B800",
repel = TRUE # Avoid text overlapping (slow if many points)
)
fviz_pca_ind(res.pca, col.ind = "cos2", pointsize = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping (slow if many points)
)
fviz_cos2(res.pca, choice = "ind")
fviz_contrib(res.pca, choice = "ind", axes = 1:2)
set.seed(123)
my.cont.var <- rnorm(23)
fviz_pca_ind(res.pca, col.ind = my.cont.var,
gradient.cols = c("blue", "yellow", "red"),
legend.title = "Cont.Var")
head(iris, 3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
iris.pca <- PCA(iris[,-5], graph = FALSE)
fviz_pca_ind(iris.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = iris$Species, # color by groups
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)
fviz_pca_ind(iris.pca, geom.ind = "point", col.ind = iris$Species,
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, ellipse.type = "confidence",
legend.title = "Groups"
)
- Tenga en cuenta que los valores permitidos para la paleta incluyen:
• “gris” para paletas de colores grises;
• paletas de cerveza, p. Ej. “RdBu”, “Blues”, …; Para ver todo, escriba esto en R: RColor- Brewer :: display.brewer.all ().
• paleta de colores personalizada, p. Ej. c (“azul”, “rojo”); • y paletas de revistas científicas del paquete ggsci R, por ejemplo: “npg”, “aaas”, “lancet”, “Jco”, “ucscgb”, “uchicago”, “simpsons” y “rickandmorty”. - Por ejemplo, para usar la paleta de colores jco (revista de oncología clínica), escriba esto:
fviz_pca_ind(iris.pca,
label = "none", # hide individual labels
habillage = iris$Species, # color by groups
addEllipses = TRUE, # Concentration ellipses
palette = "jco"
)
fviz_pca_var(res.pca, axes = c(2, 3))
fviz_pca_ind(res.pca, axes = c(2, 3))
## Elementos de la trama: punto, texto, flecha. - El argumento geom (para geometría) y derivados se utilizan para especificar la geometría elementos o elementos gráficos que se utilizarán para trazar. 1) geom.var: un texto que especifica la geometría que se utilizará para trazar las variables. Permitido los valores son la combinación de c (“punto”, “flecha”, “texto”). • Utilice geom.var = “punto”, para mostrar solo puntos; • Utilice geom.var = “texto” para mostrar solo etiquetas de texto; • Utilice geom.var = c (“punto”, “texto”) para mostrar tanto los puntos como las etiquetas de texto • Utilice geom.var = c (“flecha”, “texto”) para mostrar flechas y etiquetas (predeterminado).
fviz_pca_var(res.pca, geom.var = c("point", "text"))
fviz_pca_ind(res.pca, geom.ind = "text")
### Tamaño y forma de los elementos de la trama.
fviz_pca_var(res.pca, arrowsize = 1, labelsize = 5,
repel = TRUE)
# Change points size, shape and fill color
# Change labelsize
fviz_pca_ind(res.pca,
pointsize = 3, pointshape = 21, fill = "lightblue",
labelsize = 5, repel = TRUE)
El argumento ellipse.level también está disponible para cambiar el tamaño de la elipse de concentración. En probabilidad normal.
fviz_pca_ind(iris.pca, geom.ind = "point",
col.ind = iris$Species, # color by groups
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, ellipse.type = "confidence",
legend.title = "Groups"
)
# Convex hull
fviz_pca_ind(iris.pca, geom.ind = "point",
col.ind = iris$Species, # color by groups
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, ellipse.type = "convex",
legend.title = "Groups"
)
Al colorear los individuos por grupos (sección 3.4.4.4), los puntos medios de los grupos (baricentros) también se muestran de forma predeterminada.
Para eliminar los puntos medios, use el argumento mean.point = FALSE.
fviz_pca_ind(iris.pca,
geom.ind = "point", # show points only (but not "text")
group.ind = iris$Species, # color by groups
legend.title = "Groups",
mean.point = FALSE)
fviz_pca_var(res.pca, axes.linetype = "blank")
ind.p <- fviz_pca_ind(iris.pca, geom = "point", col.ind = iris$Species)
ggpubr::ggpar(ind.p,
title = "Principal Component Analysis",
subtitle = "Iris data set",
caption = "Source: factoextra",
xlab = "PC1", ylab = "PC2",
legend.title = "Species", legend.position = "top",
ggtheme = theme_gray(), palette = "jco"
)
### Biplot.
fviz_pca_biplot(res.pca, repel = TRUE,
col.var = "#2E9FDF", # Variables color
col.ind = "#696969" # Individuals color
)
- En términos generales, un biplot se puede interpretar de la siguiente manera: • un individuo que está en el mismo lado de una variable dada tiene un valor alto para esta variable; • un individuo que está en el lado opuesto de una variable dada tiene un valor bajo para esta variable.
fviz_pca_biplot(iris.pca,
col.ind = iris$Species, palette = "jco",
addEllipses = TRUE, label = "var",
col.var = "black", repel = TRUE,
legend.title = "Species")
En el siguiente ejemplo, queremos colorear tanto los individuos como las variables por grupos.El truco consiste en usar pointshape = 21 para puntos individuales.
El color de la línea de borde de puntos individuales se establece en “negro” utilizando col.ind. Para colorear variables por grupos, se utilizará el argumento col.var. Para personalizar los colores individuales y variables, usamos las funciones auxiliares fill_palette () y color_palette () [en el paquete ggpubr].
fviz_pca_biplot(iris.pca,
# Fill individuals by groups
geom.ind = "point",
pointshape = 21,
pointsize = 2.5,
fill.ind = iris$Species,
col.ind = "black",
# Color variable by groups
col.var = factor(c("sepal", "sepal", "petal", "petal")),
legend.title = list(fill = "Species", color = "Clusters"),
repel = TRUE # Avoid label overplotting
)+
ggpubr::fill_palette("jco")+ # Indiviual fill color
ggpubr::color_palette("npg")
fviz_pca_biplot(iris.pca,
# Individuals
geom.ind = "point",
fill.ind = iris$Species, col.ind = "black",
pointshape = 21, pointsize = 2,
palette = "jco",
addEllipses = TRUE,
# Variables
alpha.var ="contrib", col.var = "contrib",
gradient.cols = "RdYlBu",
legend.title = list(fill = "Species", color = "Contrib",
alpha = "Contrib")
)
Como se describió anteriormente (sección 3.3.2), los conjuntos de datos de decatlón2 contienen variables continuas (cuanti.sup, columnas 11:12), variable cualitativa suplementarias (cali.sup, columna 13) e individuos suplementarios (ind.sup, filas 24:27).
Las variables y los individuos suplementarios no se utilizan para la determinación de los componentes principales. Sus coordenadas se predicen utilizando solo la información proporcionada por el análisis de componentes principales realizado sobre variables activas / individuos.
PCA(X, ind.sup = NULL, quanti.sup = NULL, quali.sup = NULL, graph = TRUE)
• X: un marco de datos. Las filas son individuos y las columnas son variables numéricas.
• ind.sup: un vector numérico que especifica los índices del indicador suplementario individuales
• quanti.sup, quali.sup: un vector numérico que especifica, respectivamente, los índices de las variables cuantitativas y cualitativas.
• gráfico: un valor lógico. Si es VERDADERO, se muestra un gráfico.
res.pca <- PCA(decathlon2, ind.sup = 24:27,
quanti.sup = 11:12, quali.sup = 13, graph=FALSE)
res.pca$quanti.sup
## $coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Rank -0.7014777 -0.24519443 -0.1834294 0.05575186 -0.07382647
## Points 0.9637075 0.07768262 0.1580225 -0.16623092 -0.03114711
##
## $cor
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Rank -0.7014777 -0.24519443 -0.1834294 0.05575186 -0.07382647
## Points 0.9637075 0.07768262 0.1580225 -0.16623092 -0.03114711
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Rank 0.4920710 0.060120310 0.03364635 0.00310827 0.0054503477
## Points 0.9287322 0.006034589 0.02497110 0.02763272 0.0009701427
fviz_pca_var(res.pca)
Tenga en cuenta que, de forma predeterminada, las variables cuantitativas complementarias se muestran en color azul y líneas discontinuas.
Más argumentos para personalizar la trama:
fviz_pca_var(res.pca,
col.var = "black", # Active variables
col.quanti.sup = "red" # Suppl. quantitative variables
)
# Hide active variables on the plot,
# show only supplementary variables
fviz_pca_var(res.pca, invisible = "var")
# Hide supplementary variables
fviz_pca_var(res.pca, invisible = "quanti.sup")
p <- fviz_pca_var(res.pca, invisible = "quanti.sup")
fviz_add(p, res.pca$quanti.sup$coord,
geom = c("arrow", "text"),
color = "red")
res.pca$ind.sup
## $coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## KARPOV 0.7947206 0.77951227 -1.6330203 1.7242283 -0.75070396
## WARNERS -0.3864645 -0.12159237 -1.7387332 -0.7063341 -0.03230011
## Nool -0.5591306 1.97748871 -0.4830358 -2.2784526 -0.25461493
## Drews -1.1092038 0.01741477 -3.0488182 -1.5343468 -0.32642192
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## KARPOV 0.05104677 4.911173e-02 0.21553730 0.24028620 0.0455487744
## WARNERS 0.02422707 2.398250e-03 0.49039677 0.08092862 0.0001692349
## Nool 0.02897149 3.623868e-01 0.02162236 0.48108780 0.0060077529
## Drews 0.09207094 2.269527e-05 0.69560547 0.17617609 0.0079736753
##
## $dist
## KARPOV WARNERS Nool Drews
## 3.517470 2.482899 3.284943 3.655527
p <- fviz_pca_ind(res.pca, col.ind.sup = "blue", repel = TRUE)
p <- fviz_add(p, res.pca$quali.sup$coord, color = "red")
p
Tenga en cuenta que las variables cualitativas complementarias también se pueden utilizar para colorear individuos por grupos. Esto puede ayudar a interpretar los datos. Los conjuntos de datos decathlon2 contienen una variable cualitativa complementaria en las columnas 13 correspondiente al tipo de competiciones.
Los resultados relativos a la variable cualitativa complementaria son:
res.pca$quali
## $coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Decastar -1.343451 0.1218097 -0.03789524 0.1808357 0.1343364
## OlympicG 1.231497 -0.1116589 0.03473730 -0.1657661 -0.1231417
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Decastar 0.9051233 0.007440939 0.0007201669 0.01639956 0.009050062
## OlympicG 0.9051233 0.007440939 0.0007201669 0.01639956 0.009050062
##
## $v.test
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Decastar -2.970766 0.4034256 -0.1528767 0.8971036 0.7202457
## OlympicG 2.970766 -0.4034256 0.1528767 -0.8971036 -0.7202457
##
## $dist
## Decastar OlympicG
## 1.412108 1.294433
##
## $eta2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Competition 0.4011568 0.00739783 0.001062332 0.03658159 0.02357972
fviz_pca_ind(res.pca, habillage = 13,
addEllipses =TRUE, ellipse.type = "confidence",
palette = "jco", repel = TRUE)
• nombre: es un vector de caracteres que contiene los nombres de individuos / variables que se van a trazar • cos2: si cos2 está en [0, 1], ex: 0.6, entonces individuos / variables con un cos2> 0.6 están trazados.
• si cos2> 1, ej: 5, entonces los 5 principales individuos / variables activos y los 5 principales se trazan las columnas / filas complementarias con el cos2 más alto.
• contrib: si contrib> 1, ex: 5, entonces los 5 principales individuos / variables con el se grafican las contribuciones más altas.
fviz_pca_var(res.pca, select.var = list(cos2 = 0.6))
# Top 5 active variables with the highest cos2
fviz_pca_var(res.pca, select.var= list(cos2 = 5))
# Select by names
name <- list(name = c("Long.jump", "High.jump", "X100m"))
fviz_pca_var(res.pca, select.var = name)
# top 5 contributing individuals and variable
fviz_pca_biplot(res.pca, select.ind = list(contrib = 5),
select.var = list(contrib = 5),
ggtheme = theme_minimal())
- Cuando la selección se realiza de acuerdo con los valores de contribución, suplementarios los individuos / variables no se muestran porque no contribuyen a la construcción de los ejes.
Exportar trazados a archivos PDF / PNG El paquete factoextra produce gráficos basados en ggplot2. Para guardar cualquier ggplots, el código R estándar es el siguiente:
library(graphics)
pdf("myplot.pdf")
print(plot)
## function (x, y, ...)
## UseMethod("plot")
## <bytecode: 0x0000000019736050>
## <environment: namespace:base>
dev.off()
## png
## 2
En los siguientes ejemplos, le mostraremos cómo guardar los diferentes gráficos en archivos pdf o png.
El primer paso es crear los gráficos que desea como objeto R:
scree.plot <- fviz_eig(res.pca)
# Plot of individuals
ind.plot <- fviz_pca_ind(res.pca)
# Plot of variables
var.plot <- fviz_pca_var(res.pca)
pdf("PCA.pdf") # Create a new pdf device
print(scree.plot)
print(ind.plot)
print(var.plot)
dev.off()
## png
## 2
png("pca-scree-plot.png")
print(scree.plot)
dev.off()
## png
## 2
# Print individuals plot to a png file
png("pca-variables.png")
print(var.plot)
dev.off()
## png
## 2
# Print variables plot to a png file
png("pca-individuals.png")
print(ind.plot)
dev.off()
## png
## 2
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.1.2
ggexport(plotlist = list(scree.plot, ind.plot, var.plot),
filename = "PCA.pdf")
## file saved to PCA.pdf
ggexport(plotlist = list(scree.plot, ind.plot, var.plot),
nrow = 2, ncol = 2,
filename = "PCA.pdf")
## file saved to PCA.pdf
ggexport(plotlist = list(scree.plot, ind.plot, var.plot),
filename = "PCA.png")
## [1] "PCA%03d.png"
## file saved to PCA%03d.png
write.infile(res.pca, "pca.txt", sep = "\t")
# Export into a CSV file
write.infile(res.pca, "pca.csv", sep = ";")
En conclusión, describimos cómo realizar e interpretar el análisis de componentes principales (PCA). Calculamos PCA usando la función PCA () [FactoMineR]. A continuación, usamos el paquete factoextra R para producir una visualización basada en ggplot2 de los resultados de PCA. Hay otras funciones [paquetes] para calcular PCA en R:
res.pca <- prcomp(iris[, -5], scale. = TRUE)
res.pca <- princomp(iris[, -5], cor = TRUE)
library("ade4")
## Warning: package 'ade4' was built under R version 4.1.2
##
## Attaching package: 'ade4'
## The following object is masked from 'package:FactoMineR':
##
## reconst
res.pca <- dudi.pca(iris[, -5], scannf = FALSE, nf = 5)
library("ExPosition")
## Warning: package 'ExPosition' was built under R version 4.1.1
## Loading required package: prettyGraphs
## Warning: package 'prettyGraphs' was built under R version 4.1.1
res.pca <- epPCA(iris[, -5], graph = FALSE)
fviz_eig(res.pca) # Scree plot
fviz_pca_ind(res.pca) # Graph of individuals
fviz_pca_var(res.pca)
Para este informe, comenzamos cargando los datos.
countries = read.csv("C:/Users/Usuario/Desktop/Analisis Estadístico/countries_of_the_world.csv", na.string = c("", "NA"))
head(countries)
## Country Region Population Area..sq..mi..
## 1 Afghanistan ASIA (EX. NEAR EAST) 31056997 647500
## 2 Albania EASTERN EUROPE 3581655 28748
## 3 Algeria NORTHERN AFRICA 32930091 2381740
## 4 American Samoa OCEANIA 57794 199
## 5 Andorra WESTERN EUROPE 71201 468
## 6 Angola SUB-SAHARAN AFRICA 12127071 1246700
## Pop..Density..per.sq..mi.. Coastline..coast.area.ratio. Net.migration
## 1 48,0 0,00 23,06
## 2 124,6 1,26 -4,93
## 3 13,8 0,04 -0,39
## 4 290,4 58,29 -20,71
## 5 152,1 0,00 6,6
## 6 9,7 0,13 0
## Infant.mortality..per.1000.births. GDP....per.capita. Literacy....
## 1 163,07 700 36,0
## 2 21,52 4500 86,5
## 3 31 6000 70,0
## 4 9,27 8000 97,0
## 5 4,05 19000 100,0
## 6 191,19 1900 42,0
## Phones..per.1000. Arable.... Crops.... Other.... Climate Birthrate Deathrate
## 1 3,2 12,13 0,22 87,65 1 46,6 20,34
## 2 71,2 21,09 4,42 74,49 3 15,11 5,22
## 3 78,1 3,22 0,25 96,53 1 17,14 4,61
## 4 259,5 10 15 75 2 22,46 3,27
## 5 497,2 2,22 0 97,78 3 8,71 6,25
## 6 7,8 2,41 0,24 97,35 <NA> 45,11 24,2
## Agriculture Industry Service
## 1 0,38 0,24 0,38
## 2 0,232 0,188 0,579
## 3 0,101 0,6 0,298
## 4 <NA> <NA> <NA>
## 5 <NA> <NA> <NA>
## 6 0,096 0,658 0,246
any(is.na(countries))
## [1] TRUE
sum(is.na(countries))
## [1] 110
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.1.2
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2021 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(countries, legend = TRUE, col = c("yellow", "dodgerblue"))
str(countries)
## 'data.frame': 227 obs. of 20 variables:
## $ Country : chr "Afghanistan " "Albania " "Algeria " "American Samoa " ...
## $ Region : chr "ASIA (EX. NEAR EAST) " "EASTERN EUROPE " "NORTHERN AFRICA " "OCEANIA " ...
## $ Population : int 31056997 3581655 32930091 57794 71201 12127071 13477 69108 39921833 2976372 ...
## $ Area..sq..mi.. : int 647500 28748 2381740 199 468 1246700 102 443 2766890 29800 ...
## $ Pop..Density..per.sq..mi.. : chr "48,0" "124,6" "13,8" "290,4" ...
## $ Coastline..coast.area.ratio. : chr "0,00" "1,26" "0,04" "58,29" ...
## $ Net.migration : chr "23,06" "-4,93" "-0,39" "-20,71" ...
## $ Infant.mortality..per.1000.births.: chr "163,07" "21,52" "31" "9,27" ...
## $ GDP....per.capita. : int 700 4500 6000 8000 19000 1900 8600 11000 11200 3500 ...
## $ Literacy.... : chr "36,0" "86,5" "70,0" "97,0" ...
## $ Phones..per.1000. : chr "3,2" "71,2" "78,1" "259,5" ...
## $ Arable.... : chr "12,13" "21,09" "3,22" "10" ...
## $ Crops.... : chr "0,22" "4,42" "0,25" "15" ...
## $ Other.... : chr "87,65" "74,49" "96,53" "75" ...
## $ Climate : chr "1" "3" "1" "2" ...
## $ Birthrate : chr "46,6" "15,11" "17,14" "22,46" ...
## $ Deathrate : chr "20,34" "5,22" "4,61" "3,27" ...
## $ Agriculture : chr "0,38" "0,232" "0,101" NA ...
## $ Industry : chr "0,24" "0,188" "0,6" NA ...
## $ Service : chr "0,38" "0,579" "0,298" NA ...
head(as.character(countries$Agriculture))
## [1] "0,38" "0,232" "0,101" NA NA "0,096"
for (i in 3:length(names(countries))){
countries[,i] <- as.numeric(gsub(",",'.',(sapply(countries[,i], as.character))))
}
str(countries)
## 'data.frame': 227 obs. of 20 variables:
## $ Country : chr "Afghanistan " "Albania " "Algeria " "American Samoa " ...
## $ Region : chr "ASIA (EX. NEAR EAST) " "EASTERN EUROPE " "NORTHERN AFRICA " "OCEANIA " ...
## $ Population : num 31056997 3581655 32930091 57794 71201 ...
## $ Area..sq..mi.. : num 647500 28748 2381740 199 468 ...
## $ Pop..Density..per.sq..mi.. : num 48 124.6 13.8 290.4 152.1 ...
## $ Coastline..coast.area.ratio. : num 0 1.26 0.04 58.29 0 ...
## $ Net.migration : num 23.06 -4.93 -0.39 -20.71 6.6 ...
## $ Infant.mortality..per.1000.births.: num 163.07 21.52 31 9.27 4.05 ...
## $ GDP....per.capita. : num 700 4500 6000 8000 19000 1900 8600 11000 11200 3500 ...
## $ Literacy.... : num 36 86.5 70 97 100 42 95 89 97.1 98.6 ...
## $ Phones..per.1000. : num 3.2 71.2 78.1 259.5 497.2 ...
## $ Arable.... : num 12.13 21.09 3.22 10 2.22 ...
## $ Crops.... : num 0.22 4.42 0.25 15 0 0.24 0 4.55 0.48 2.3 ...
## $ Other.... : num 87.7 74.5 96.5 75 97.8 ...
## $ Climate : num 1 3 1 2 3 NA 2 2 3 4 ...
## $ Birthrate : num 46.6 15.11 17.14 22.46 8.71 ...
## $ Deathrate : num 20.34 5.22 4.61 3.27 6.25 ...
## $ Agriculture : num 0.38 0.232 0.101 NA NA 0.096 0.04 0.038 0.095 0.239 ...
## $ Industry : num 0.24 0.188 0.6 NA NA 0.658 0.18 0.22 0.358 0.343 ...
## $ Service : num 0.38 0.579 0.298 NA NA 0.246 0.78 0.743 0.547 0.418 ...
Nota: El clima es una variable categórica: no podemos imputar la media. Convertiremos el NA a Desconocido como factor, será una característica de la columna Clima, significa no disponible.
countries$Climate = ifelse(is.na(countries$Climate), 'Unknown', countries$Climate)
countries$Climate = as.factor(countries$Climate)
str(countries)
## 'data.frame': 227 obs. of 20 variables:
## $ Country : chr "Afghanistan " "Albania " "Algeria " "American Samoa " ...
## $ Region : chr "ASIA (EX. NEAR EAST) " "EASTERN EUROPE " "NORTHERN AFRICA " "OCEANIA " ...
## $ Population : num 31056997 3581655 32930091 57794 71201 ...
## $ Area..sq..mi.. : num 647500 28748 2381740 199 468 ...
## $ Pop..Density..per.sq..mi.. : num 48 124.6 13.8 290.4 152.1 ...
## $ Coastline..coast.area.ratio. : num 0 1.26 0.04 58.29 0 ...
## $ Net.migration : num 23.06 -4.93 -0.39 -20.71 6.6 ...
## $ Infant.mortality..per.1000.births.: num 163.07 21.52 31 9.27 4.05 ...
## $ GDP....per.capita. : num 700 4500 6000 8000 19000 1900 8600 11000 11200 3500 ...
## $ Literacy.... : num 36 86.5 70 97 100 42 95 89 97.1 98.6 ...
## $ Phones..per.1000. : num 3.2 71.2 78.1 259.5 497.2 ...
## $ Arable.... : num 12.13 21.09 3.22 10 2.22 ...
## $ Crops.... : num 0.22 4.42 0.25 15 0 0.24 0 4.55 0.48 2.3 ...
## $ Other.... : num 87.7 74.5 96.5 75 97.8 ...
## $ Climate : Factor w/ 7 levels "1","1.5","2",..: 1 5 1 3 5 7 3 3 5 6 ...
## $ Birthrate : num 46.6 15.11 17.14 22.46 8.71 ...
## $ Deathrate : num 20.34 5.22 4.61 3.27 6.25 ...
## $ Agriculture : num 0.38 0.232 0.101 NA NA 0.096 0.04 0.038 0.095 0.239 ...
## $ Industry : num 0.24 0.188 0.6 NA NA 0.658 0.18 0.22 0.358 0.343 ...
## $ Service : num 0.38 0.579 0.298 NA NA 0.246 0.78 0.743 0.547 0.418 ...
num_cols = c(3:20)
num_cols = num_cols[num_cols != 15]
# Since the 15th column is also categorical, we exclude it
for (index in num_cols)
{countries[,index] = ifelse(is.na(countries[,index]),ave(countries[,index],
FUN =function(x) mean(x, na.rm=TRUE)), countries[,index]) }
missmap(countries, legend = TRUE, col = c("yellow", "dodgerblue"))
- Veamos como interpreta R las columnas de nuestra base de datos
str(countries)
## 'data.frame': 227 obs. of 20 variables:
## $ Country : chr "Afghanistan " "Albania " "Algeria " "American Samoa " ...
## $ Region : chr "ASIA (EX. NEAR EAST) " "EASTERN EUROPE " "NORTHERN AFRICA " "OCEANIA " ...
## $ Population : num 31056997 3581655 32930091 57794 71201 ...
## $ Area..sq..mi.. : num 647500 28748 2381740 199 468 ...
## $ Pop..Density..per.sq..mi.. : num 48 124.6 13.8 290.4 152.1 ...
## $ Coastline..coast.area.ratio. : num 0 1.26 0.04 58.29 0 ...
## $ Net.migration : num 23.06 -4.93 -0.39 -20.71 6.6 ...
## $ Infant.mortality..per.1000.births.: num 163.07 21.52 31 9.27 4.05 ...
## $ GDP....per.capita. : num 700 4500 6000 8000 19000 1900 8600 11000 11200 3500 ...
## $ Literacy.... : num 36 86.5 70 97 100 42 95 89 97.1 98.6 ...
## $ Phones..per.1000. : num 3.2 71.2 78.1 259.5 497.2 ...
## $ Arable.... : num 12.13 21.09 3.22 10 2.22 ...
## $ Crops.... : num 0.22 4.42 0.25 15 0 0.24 0 4.55 0.48 2.3 ...
## $ Other.... : num 87.7 74.5 96.5 75 97.8 ...
## $ Climate : Factor w/ 7 levels "1","1.5","2",..: 1 5 1 3 5 7 3 3 5 6 ...
## $ Birthrate : num 46.6 15.11 17.14 22.46 8.71 ...
## $ Deathrate : num 20.34 5.22 4.61 3.27 6.25 ...
## $ Agriculture : num 0.38 0.232 0.101 0.151 0.151 ...
## $ Industry : num 0.24 0.188 0.6 0.283 0.283 ...
## $ Service : num 0.38 0.579 0.298 0.565 0.565 ...
View(countries)
library("FactoMineR")
res.pca = PCA(countries, scale.unit = TRUE, quali.sup = c(1,2,15), ncp = 5, graph = FALSE)
print(res.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 227 individuals, described by 20 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$quali.sup" "results for the supplementary categorical variables"
## 12 "$quali.sup$coord" "coord. for the supplementary categories"
## 13 "$quali.sup$v.test" "v-test of the supplementary categories"
## 14 "$call" "summary statistics"
## 15 "$call$centre" "mean of the variables"
## 16 "$call$ecart.type" "standard error of the variables"
## 17 "$call$row.w" "weights for the individuals"
## 18 "$call$col.w" "weights for the variables"
library("factoextra")
eig.val <- get_eigenvalue(res.pca)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 5.199798e+00 3.058704e+01 30.58704
## Dim.2 2.422569e+00 1.425041e+01 44.83745
## Dim.3 1.824847e+00 1.073439e+01 55.57184
## Dim.4 1.484760e+00 8.733881e+00 64.30572
## Dim.5 1.279901e+00 7.528831e+00 71.83455
## Dim.6 1.009175e+00 5.936326e+00 77.77088
## Dim.7 7.850656e-01 4.618033e+00 82.38891
## Dim.8 6.959031e-01 4.093547e+00 86.48246
## Dim.9 5.623432e-01 3.307901e+00 89.79036
## Dim.10 5.161298e-01 3.036058e+00 92.82642
## Dim.11 4.156580e-01 2.445047e+00 95.27147
## Dim.12 3.865689e-01 2.273934e+00 97.54540
## Dim.13 1.922629e-01 1.130958e+00 98.67636
## Dim.14 1.371155e-01 8.065615e-01 99.48292
## Dim.15 8.541160e-02 5.024212e-01 99.98534
## Dim.16 2.491993e-03 1.465878e-02 100.00000
## Dim.17 1.222947e-07 7.193806e-07 100.00000
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 35))
var <- get_pca_var(res.pca)
var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
head(var$coord)
## Dim.1 Dim.2 Dim.3
## Population -0.02347349 -0.008573752 0.59835722
## Area..sq..mi.. 0.02738842 0.277733292 0.50047468
## Pop..Density..per.sq..mi.. 0.25407600 0.117450977 -0.36494682
## Coastline..coast.area.ratio. 0.17195135 -0.284284828 -0.49693402
## Net.migration 0.17774214 0.494067596 -0.03306897
## Infant.mortality..per.1000.births. -0.91092339 0.119593853 -0.05678322
## Dim.4 Dim.5
## Population 0.11612002 0.63522080
## Area..sq..mi.. -0.01641998 0.64309861
## Pop..Density..per.sq..mi.. 0.16859420 0.24532797
## Coastline..coast.area.ratio. -0.20921362 0.41072239
## Net.migration 0.47670860 -0.07503242
## Infant.mortality..per.1000.births. 0.19732351 0.02208708
head(var$cos2)
## Dim.1 Dim.2 Dim.3
## Population 0.0005510048 7.350922e-05 0.358031367
## Area..sq..mi.. 0.0007501256 7.713578e-02 0.250474902
## Pop..Density..per.sq..mi.. 0.0645546134 1.379473e-02 0.133186181
## Coastline..coast.area.ratio. 0.0295672678 8.081786e-02 0.246943420
## Net.migration 0.0315922671 2.441028e-01 0.001093557
## Infant.mortality..per.1000.births. 0.8297814155 1.430269e-02 0.003224335
## Dim.4 Dim.5
## Population 0.0134838598 0.4035054706
## Area..sq..mi.. 0.0002696156 0.4135758230
## Pop..Density..per.sq..mi.. 0.0284240028 0.0601858144
## Coastline..coast.area.ratio. 0.0437703401 0.1686928809
## Net.migration 0.2272510914 0.0056298646
## Infant.mortality..per.1000.births. 0.0389365681 0.0004878392
head(var$contrib)
## Dim.1 Dim.2 Dim.3
## Population 0.01059666 0.00303435 19.61980656
## Area..sq..mi.. 0.01442605 3.18404909 13.72580613
## Pop..Density..per.sq..mi.. 1.24148321 0.56942580 7.29848653
## Coastline..coast.area.ratio. 0.56862344 3.33603989 13.53228397
## Net.migration 0.60756725 10.07619617 0.05992595
## Infant.mortality..per.1000.births. 15.95795620 0.59039352 0.17669072
## Dim.4 Dim.5
## Population 0.90815098 31.52629570
## Area..sq..mi.. 0.01815887 32.31310265
## Pop..Density..per.sq..mi.. 1.91438403 4.70237933
## Coastline..coast.area.ratio. 2.94797467 13.18014757
## Net.migration 15.30558044 0.43986709
## Infant.mortality..per.1000.births. 2.62241546 0.03811537
fviz_pca_var(res.pca, col.var = "black",labelsize = 2.5, repel = TRUE)
library("corrplot")
corrplot(var$cos2, is.corr=FALSE)
fviz_cos2(res.pca, choice = "var", axes = 1:2)
fviz_pca_var(res.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_var(res.pca, alpha.var = "cos2")
library("corrplot")
corrplot(var$contrib, is.corr=FALSE)
- Ahora, mediante la función fviz_contrib visualizaremos las contribuciones de los datos.
fviz_contrib(res.pca, choice = "var", axes = 1, top = 17)
fviz_contrib(res.pca, choice = "var", axes = 2, top = 17)
fviz_contrib(res.pca, choice = "var", axes = 1:2, top = 17)
fviz_pca_var(res.pca, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)
fviz_pca_var(res.pca, alpha.var = "contrib")
set.seed(123)
my.cont.var <- rnorm(17)
fviz_pca_var(res.pca, col.var = my.cont.var,
gradient.cols = c("blue", "yellow", "red"),
legend.title = "Cont.Var")
set.seed(123)
res.km <- kmeans(var$coord, centers = 3, nstart = 25)
grp <- as.factor(res.km$cluster)
fviz_pca_var(res.pca, col.var = grp,
palette = c("#0073C2FF", "#EFC000FF", "#868686FF"),
legend.title = "Cluster")
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.0 v forcats 0.5.1
## v purrr 0.3.4
## Warning: package 'dplyr' was built under R version 4.1.1
## Warning: package 'forcats' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(cluster)
library(factoextra)
-Las filas son observaciones (individuos) y las columnas son variables. -Cualquier valor que falte en los datos debe eliminarse o estimarse. -Los datos deben estar estandarizados (es decir, escalados) para que las variables sean comparables.
Cargamos la base de datos y omitimos los valores nulos
df <- USArrests
df <- na.omit(df)
Empezamos estandarizando los datos con la función %%scale%%
df <- scale(df)
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
k2 <- kmeans(df, centers = 2, nstart = 25)
str(k2)
## List of 9
## $ cluster : Named int [1:50] 2 2 2 1 2 2 1 1 2 2 ...
## ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ centers : num [1:2, 1:4] -0.67 1.005 -0.676 1.014 -0.132 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
## $ totss : num 196
## $ withinss : num [1:2] 56.1 46.7
## $ tot.withinss: num 103
## $ betweenss : num 93.1
## $ size : int [1:2] 30 20
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
fviz_cluster(k2, data = df)
df %>%
as_tibble() %>%
mutate(cluster = k2$cluster,
state = row.names(USArrests)) %>%
ggplot(aes(UrbanPop, Murder, color = factor(cluster), label = state)) +
geom_text()
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, p3, p4, nrow = 2)
Aunque esta evaluación visual dice donde ocurren las diluciones verdaderas (o no, como en los grupos 2 y 4 para k=5), no nos dice cuál es el número optimo de grupos.
Determinación de clústeres óptimos
Existen 3 métodos más populares para determinar los clústeres óptimos, que incluyen:
set.seed(123)
wss <- function(k) {
kmeans(df, k, nstart = 10 )$tot.withinss
}
k.values <- 1:15
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")
El enfoque principal es medir la calidad de un agrupamiento. Determina que tan bien se encuentra cada objeto dentro de su grupo, por lo que un ancho de silueta medio alto indica una buena agrupación. El número óptimo de clústers k es el que maximiza la silueta promedio en un rango de valores posibles para k.
Se calcula a continuación:
avg_sil <- function(k) {
km.res <- kmeans(df, centers = k, nstart = 25)
ss <- silhouette(km.res$cluster, dist(df))
mean(ss[, 3])
}
k.values <- 2:15
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(k.values, avg_sil_values,
type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Average Silhouettes")
fviz_nbclust(df, kmeans, method = "silhouette")
El enfoque se puede aplicar a cualquier método de agrupación.Compara la variación intragrupo total para diferentes valores de k con sus valores esperados bajo una distribución de referencia nula de los datos. El conjunto de datos de referencia se genera utilizando simulaciones de Monte Carlo del proceso de muestreo.
El algoritmo implica:
set.seed(123)
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25,
K.max = 10, B = 50)
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 4
## logW E.logW gap SE.sim
## [1,] 3.458369 3.640154 0.1817845 0.04422857
## [2,] 3.135112 3.372283 0.2371717 0.03559601
## [3,] 2.977727 3.233771 0.2560446 0.03749193
## [4,] 2.826221 3.119172 0.2929511 0.04067348
## [5,] 2.738868 3.019965 0.2810969 0.04185469
## [6,] 2.666967 2.930002 0.2630347 0.04105040
## [7,] 2.609895 2.852152 0.2422572 0.04184725
## [8,] 2.539156 2.778562 0.2394054 0.04292750
## [9,] 2.468162 2.711752 0.2435901 0.04344197
## [10,] 2.407265 2.647595 0.2403307 0.04548446
fviz_gap_stat(gap_stat)
set.seed(123)
final <- kmeans(df, 4, nstart = 25)
print(final)
## K-means clustering with 4 clusters of sizes 8, 13, 16, 13
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 1.4118898 0.8743346 -0.8145211 0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 4 0.6950701 1.0394414 0.7226370 1.27693964
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 4 4 1 4
## Colorado Connecticut Delaware Florida Georgia
## 4 3 3 4 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 2 4 3 2
## Kansas Kentucky Louisiana Maine Maryland
## 3 2 1 2 4
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 4 2 1 4
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 4 2 3
## New Mexico New York North Carolina North Dakota Ohio
## 4 4 1 2 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 4 3 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 8.316061 11.952463 16.212213 19.922437
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(final, data = df)
USArrests %>%
mutate(Cluster = final$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
## # A tibble: 4 x 5
## Cluster Murder Assault UrbanPop Rape
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 13.9 244. 53.8 21.4
## 2 2 3.6 78.5 52.1 12.2
## 3 3 5.66 139. 73.9 18.8
## 4 4 10.8 257. 76 33.2
library(dplyr)
library(tidyverse)
library(cluster)
library(factoextra)
df2 <- read.csv("C:/Users/Usuario/Desktop/Analisis Estadístico/mall_customers.csv")
dff <- df2
df2 <- na.omit(df2)
df2 <- select (df2, CustomerID, Age, Annual.Income..k.., Spending.Score..1.100.)
rownames(df2) <- df2$CustomerID
df2 <- scale(df2)
head(df2)
## CustomerID Age Annual.Income..k.. Spending.Score..1.100.
## 1 -1.719098 -1.4210029 -1.734646 -0.4337131
## 2 -1.701821 -1.2778288 -1.734646 1.1927111
## 3 -1.684543 -1.3494159 -1.696572 -1.7116178
## 4 -1.667266 -1.1346547 -1.696572 1.0378135
## 5 -1.649989 -0.5619583 -1.658498 -0.3949887
## 6 -1.632711 -1.2062418 -1.658498 0.9990891
k2 <- kmeans(df2, centers = 2, nstart = 25)
str(k2)
## List of 9
## $ cluster : Named int [1:200] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:200] "1" "2" "3" "4" ...
## $ centers : num [1:2, 1:4] -0.809 0.894 0.241 -0.267 -0.753 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "CustomerID" "Age" "Annual.Income..k.." "Spending.Score..1.100."
## $ totss : num 796
## $ withinss : num [1:2] 271 242
## $ tot.withinss: num 513
## $ betweenss : num 283
## $ size : int [1:2] 105 95
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
fviz_cluster(k2, data = df2)
df2 %>%
as_tibble() %>%
mutate(cluster = k2$cluster,
state = row.names(df2)) %>%
ggplot(aes(Annual.Income..k.., Spending.Score..1.100., color = factor(cluster), label = state)) +
geom_text()
k3 <- kmeans(df2, centers = 3, nstart = 25)
k4 <- kmeans(df2, centers = 4, nstart = 25)
k5 <- kmeans(df2, centers = 5, nstart = 25)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
p2 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
p3 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
p4 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)
Determinación de clústeres óptimos
Existen 3 métodos más populares para determinar los clústeres óptimos, que incluyen:
set.seed(123)
wss <- function(k) {
kmeans(df2, k, nstart = 10 )$tot.withinss
}
k.values <- 1:15
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
set.seed(123)
fviz_nbclust(df2, kmeans, method = "wss")
El enfoque principal es medir la calidad de un agrupamiento. Determina que tan bien se encuentra cada objeto dentro de su grupo, por lo que un ancho de silueta medio alto indica una buena agrupación. El número óptimo de clústers k es el que maximiza la silueta promedio en un rango de valores posibles para k.
Se calcula a continuación:
avg_sil <- function(k) {
km.res <- kmeans(df2, centers = k, nstart = 25)
ss <- silhouette(km.res$cluster, dist(df2))
mean(ss[, 3])
}
k.values <- 2:15
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(k.values, avg_sil_values,
type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Average Silhouettes")
fviz_nbclust(df2, kmeans, method = "silhouette")
El algoritmo implica: 1. Agrupar los datos observados, cariando el número de grupos y calcular el correspondiente Wk. 2. Generar B conjuntos de datos de referencia y agrupar cada uno de ellos con un número variable de grupos. Calcular las estadísticas de brecha estimadas. 3.Dejar w, calcular la desviación estándar y definir Sk. 4. Elija el número de clústers como el más pequeño k
Para cacular el método de la estadística de la brecha, se puede usar la función clusGap
set.seed(123)
gap_stat <- clusGap(df2, FUN = kmeans, nstart = 25,
K.max = 10, B = 200)
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df2, FUNcluster = kmeans, K.max = 10, B = 200, nstart = 25)
## B=200 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 7
## logW E.logW gap SE.sim
## [1,] 4.863253 5.081448 0.2181943 0.01914861
## [2,] 4.633029 4.862444 0.2294147 0.01972575
## [3,] 4.459949 4.732268 0.2723187 0.01936747
## [4,] 4.270541 4.627206 0.3566657 0.01985567
## [5,] 4.160244 4.543353 0.3831089 0.01878113
## [6,] 4.030821 4.470868 0.4400465 0.01770540
## [7,] 3.961284 4.408471 0.4471869 0.01674066
## [8,] 3.913092 4.353396 0.4403043 0.01702480
## [9,] 3.851528 4.304869 0.4533410 0.01761517
## [10,] 3.805422 4.261498 0.4560758 0.01667538
fviz_gap_stat(gap_stat)
set.seed(123)
final <- kmeans(df2, 6, nstart = 25)
print(final)
## K-means clustering with 6 clusters of sizes 37, 44, 24, 35, 39, 21
##
## Cluster means:
## CustomerID Age Annual.Income..k.. Spending.Score..1.100.
## 1 -0.1589985 -0.8560457 -0.1303979 -0.05379550
## 2 -0.3192386 1.2521226 -0.2610138 -0.03150757
## 3 -1.3382762 -0.9735839 -1.3221791 1.03458649
## 4 1.1045175 0.2030004 1.0534500 -1.27458548
## 5 1.0625582 -0.4408110 0.9891010 1.23640011
## 6 -1.3357051 0.4777583 -1.3049552 -1.19344867
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 3 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 2 3 6 3 6 3 2 1 1 1 2 1 1 2 2 2 2 2 1 2
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 2 1 2 2 2 1 2 2 1 1 2 2 2 2 2 1 2 1 1 2
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 2 1 2 2 1 2 2 1 1 2 2 1 2 1 1 1 2 1 2 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 2 2 1 2 1 2 2 2 2 2 1 1 1 1 1 2 2 2 2
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 1 1 1 5 1 5 4 5 4 5 4 5 1 5 4 5 4 5 4 5
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 4 5 1 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
##
## Within cluster sum of squares by cluster:
## [1] 24.51556 28.98960 13.07633 43.83787 28.26118 21.49026
## (between_SS / total_SS = 79.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(final, data = df2)
dff %>%
mutate(Cluster = final$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
## Warning in mean.default(Gender): argument is not numeric or logical: returning
## NA
## Warning in mean.default(Gender): argument is not numeric or logical: returning
## NA
## Warning in mean.default(Gender): argument is not numeric or logical: returning
## NA
## Warning in mean.default(Gender): argument is not numeric or logical: returning
## NA
## Warning in mean.default(Gender): argument is not numeric or logical: returning
## NA
## Warning in mean.default(Gender): argument is not numeric or logical: returning
## NA
## # A tibble: 6 x 6
## Cluster CustomerID Gender Age Annual.Income..k.. Spending.Score..1.100.
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 91.3 NA 26.9 57.1 48.8
## 2 2 82.0 NA 56.3 53.7 49.4
## 3 3 23.0 NA 25.2 25.8 76.9
## 4 4 164. NA 41.7 88.2 17.3
## 5 5 162 NA 32.7 86.5 82.1
## 6 6 23.2 NA 45.5 26.3 19.4