1. Cargar librerías necesarias

library(pacman)
p_load(haven, dplyr, factoextra, FactoMineR, readr, rgl, fpc, psych, readxl)

2. Cargar bases de datos

data_pca <- read.csv2("data_pca.csv")
covid <- read_excel("Covid.xlsm", sheet = 1)
poblacion <- read_excel("PoblacionUSA.xlsm", sheet = 1)

# Asegurarnos de limpiar los nombres de las columnas
tidy_names <- function(df) {
  names(df) <- tolower(trimws(names(df)))
  return(df)
}
poblacion <- tidy_names(poblacion)
covid <- tidy_names(covid)

3. Análisis PCA para data_pca

3.1 Preparación de los datos

data_pca_clean <- data_pca %>% select(-y)
data_pca_scaled <- scale(data_pca_clean)

3.2 Diagnóstico para realizar PCA

psych::KMO(data_pca_scaled)
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = data_pca_scaled)
## Overall MSA =  0.34
## MSA for each item = 
##   x1   x2   x3   x4   x5   x6   x7   x8   x9  x10  x11  x12  x13  x14  x15 
## 0.36 0.27 0.24 0.46 0.53 0.55 0.45 0.27 0.42 0.26 0.43 0.46 0.28 0.62 0.33
det(cor(data_pca_scaled))
## [1] 0.004667778

3.3 Realizar PCA

pca_data_pca <- princomp(data_pca_scaled)
summary(pca_data_pca)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
## Standard deviation     1.6220588 1.4501268 1.3332930 1.2434264 1.15529908
## Proportion of Variance 0.1762864 0.1408957 0.1191069 0.1035919 0.08942821
## Cumulative Proportion  0.1762864 0.3171821 0.4362890 0.5398809 0.62930907
##                            Comp.6     Comp.7     Comp.8    Comp.9    Comp.10
## Standard deviation     1.05569426 0.90471763 0.88908929 0.8622762 0.80999883
## Proportion of Variance 0.07467272 0.05484181 0.05296347 0.0498171 0.04395967
## Cumulative Proportion  0.70398179 0.75882360 0.81178707 0.8616042 0.90556384
##                          Comp.11    Comp.12    Comp.13   Comp.14     Comp.15
## Standard deviation     0.7012045 0.59518243 0.53958339 0.4662240 0.234552581
## Proportion of Variance 0.0329439 0.02373482 0.01950755 0.0145638 0.003686091
## Cumulative Proportion  0.9385077 0.96224255 0.98175010 0.9963139 1.000000000
fviz_eig(pca_data_pca, choice = "variance")

fviz_eig(pca_data_pca, choice = "eigenvalue")

3.4 Representaciones gráficas

fviz_pca_ind(pca_data_pca, col.ind = "cos2", gradient.cols = c("red", "yellow", "green"), repel = FALSE)

fviz_pca_var(pca_data_pca, col.var = "contrib", gradient.cols = c("red", "yellow", "green"), repel = FALSE)

fviz_pca_biplot(pca_data_pca, col.var = "red", col.ind = "black")

3.5 Rotación Varimax

pca_data_pca_rot <- psych::principal(data_pca_scaled, nfactors = 2, rotate = "varimax", scores = TRUE)
pca_data_pca_rot
## Principal Components Analysis
## Call: psych::principal(r = data_pca_scaled, nfactors = 2, rotate = "varimax", 
##     scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
##       RC1   RC2    h2   u2 com
## x1   0.04  0.52 0.269 0.73 1.0
## x2  -0.73  0.00 0.533 0.47 1.0
## x3  -0.51  0.08 0.265 0.73 1.0
## x4   0.09 -0.45 0.206 0.79 1.1
## x5  -0.14 -0.70 0.503 0.50 1.1
## x6   0.22  0.03 0.048 0.95 1.0
## x7  -0.01  0.53 0.278 0.72 1.0
## x8   0.71 -0.11 0.519 0.48 1.0
## x9   0.09  0.42 0.181 0.82 1.1
## x10 -0.64 -0.08 0.419 0.58 1.0
## x11 -0.02  0.20 0.042 0.96 1.0
## x12 -0.02  0.51 0.258 0.74 1.0
## x13  0.12  0.12 0.030 0.97 2.0
## x14  0.05  0.66 0.436 0.56 1.0
## x15  0.87  0.10 0.768 0.23 1.0
## 
##                        RC1  RC2
## SS loadings           2.57 2.18
## Proportion Var        0.17 0.15
## Cumulative Var        0.17 0.32
## Proportion Explained  0.54 0.46
## Cumulative Proportion 0.54 1.00
## 
## Mean item complexity =  1.1
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.14 
##  with the empirical chi square  770.91  with prob <  1.5e-115 
## 
## Fit based upon off diagonal values = 0.51

4. Análisis PCA para PoblacionUSA (años 2000 y 2001)

4.1 Filtrar datos

# Seleccionar las columnas del año 2000 y 2001 (excepto el nombre del estado)
vars_2000 <- grep("2000", names(poblacion), value = TRUE)
vars_2001 <- grep("2001", names(poblacion), value = TRUE)

poblacion_2000 <- poblacion[, vars_2000]
poblacion_2001 <- poblacion[, vars_2001]

# Escalamos los datos
poblacion_2000_scaled <- scale(poblacion_2000)
poblacion_2001_scaled <- scale(poblacion_2001)


## 4.2 Diagnóstico PCA (año 2000)
psych::KMO(poblacion_2000_scaled)
## Error in solve.default(r) : 
##   sistema es computacionalmente singular: número de condición recíproco = 3.40445e-18
## matrix is not invertible, image not found
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = poblacion_2000_scaled)
## Overall MSA =  0.5
## MSA for each item = 
##    census resident total population - ab:qr-1-2000 
##                                                0.5 
##    resident total population estimate - jul-1-2000 
##                                                0.5 
##                net domestic migration - jul-1-2000 
##                                                0.5 
## federal/civilian movement from abroad - jul-1-2000 
##                                                0.5 
##           net international migration - jul-1-2000 
##                                                0.5 
##                         period births - jul-1-2000 
##                                                0.5 
##                         period deaths - jul-1-2000 
##                                                0.5 
## resident under 65 population estimate - jul-1-2000 
##                                                0.5 
##  resident 65 plus population estimate - jul-1-2000 
##                                                0.5 
##                              residual - jul-1-2000 
##                                                0.5
det(cor(poblacion_2000_scaled))
## [1] -8.80948e-41

4.3 Realizar PCA año 2000

pca_poblacion_2000 <- princomp(poblacion_2000_scaled)
summary(pca_poblacion_2000)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
## Standard deviation     2.6907367 1.1607536 0.8280683 0.6125685 0.33421800
## Proportion of Variance 0.7384865 0.1374296 0.0699411 0.0382745 0.01139357
## Cumulative Proportion  0.7384865 0.8759161 0.9458572 0.9841317 0.99552529
##                             Comp.6       Comp.7       Comp.8 Comp.9 Comp.10
## Standard deviation     0.204077430 0.0391252568 2.629360e-02      0       0
## Proportion of Variance 0.004248055 0.0001561401 7.051803e-05      0       0
## Cumulative Proportion  0.999773342 0.9999294820 1.000000e+00      1       1
fviz_eig(pca_poblacion_2000, choice = "variance")

fviz_eig(pca_poblacion_2000, choice = "eigenvalue")

fviz_pca_ind(pca_poblacion_2000, col.ind = "cos2", gradient.cols = c("red", "yellow", "green"))

fviz_pca_var(pca_poblacion_2000, col.var = "contrib", gradient.cols = c("red", "yellow", "green"))

fviz_pca_biplot(pca_poblacion_2000, col.var = "blue", col.ind = "black")

4.4 Diagnóstico PCA (año 2001)

psych::KMO(poblacion_2001_scaled)
## Error in solve.default(r) : 
##   sistema es computacionalmente singular: número de condición recíproco = 4.27828e-18
## matrix is not invertible, image not found
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = poblacion_2001_scaled)
## Overall MSA =  0.5
## MSA for each item = 
##    resident total population estimate - jul-1-2001 
##                                                0.5 
##                net domestic migration - jul-1-2001 
##                                                0.5 
## federal/civilian movement from abroad - jul-1-2001 
##                                                0.5 
##           net international migration - jul-1-2001 
##                                                0.5 
##                         period births - jul-1-2001 
##                                                0.5 
##                         period deaths - jul-1-2001 
##                                                0.5 
## resident under 65 population estimate - jul-1-2001 
##                                                0.5 
##  resident 65 plus population estimate - jul-1-2001 
##                                                0.5 
##                              residual - jul-1-2001 
##                                                0.5
det(cor(poblacion_2001_scaled))
## [1] -4.747386e-25

4.5 Realizar PCA año 2001

pca_poblacion_2001 <- princomp(poblacion_2001_scaled)
summary(pca_poblacion_2001)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4     Comp.5
## Standard deviation     2.5056381 1.2841609 0.66974585 0.55374490 0.31095905
## Proportion of Variance 0.7115319 0.1868945 0.05083674 0.03475179 0.01095883
## Cumulative Proportion  0.7115319 0.8984264 0.94926311 0.98401490 0.99497372
##                             Comp.6       Comp.7       Comp.8 Comp.9
## Standard deviation     0.205937451 0.0345666035 2.728378e-02      0
## Proportion of Variance 0.004806493 0.0001354163 8.436586e-05      0
## Cumulative Proportion  0.999780218 0.9999156341 1.000000e+00      1
fviz_eig(pca_poblacion_2001, choice = "variance")

fviz_eig(pca_poblacion_2001, choice = "eigenvalue")

fviz_pca_ind(pca_poblacion_2001, col.ind = "cos2", gradient.cols = c("red", "yellow", "green"))

fviz_pca_var(pca_poblacion_2001, col.var = "contrib", gradient.cols = c("red", "yellow", "green"))

fviz_pca_biplot(pca_poblacion_2001, col.var = "blue", col.ind = "black")

5. Análisis PCA para Covid

5.1 Preparación de los datos

covid_clean <- covid %>% select(where(is.numeric))
covid_scaled <- scale(covid_clean)

5.2 Diagnóstico

psych::KMO(covid_scaled)
## Error in solve.default(r) : 
##   sistema es computacionalmente singular: número de condición recíproco = 2.50339e-19
## matrix is not invertible, image not found
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = covid_scaled)
## Overall MSA =  0.5
## MSA for each item = 
##    census resident total population - ab:qr-1-2000 
##                                                0.5 
##    resident total population estimate - jul-1-2000 
##                                                0.5 
##    resident total population estimate - jul-1-2001 
##                                                0.5 
##                net domestic migration - jul-1-2000 
##                                                0.5 
##                net domestic migration - jul-1-2001 
##                                                0.5 
## federal/civilian movement from abroad - jul-1-2000 
##                                                0.5 
## federal/civilian movement from abroad - jul-1-2001 
##                                                0.5 
##           net international migration - jul-1-2000 
##                                                0.5 
##           net international migration - jul-1-2001 
##                                                0.5 
##                         period births - jul-1-2000 
##                                                0.5 
##                         period births - jul-1-2001 
##                                                0.5 
##                         period deaths - jul-1-2000 
##                                                0.5 
##                         period deaths - jul-1-2001 
##                                                0.5 
## resident under 65 population estimate - jul-1-2000 
##                                                0.5 
## resident under 65 population estimate - jul-1-2001 
##                                                0.5 
##  resident 65 plus population estimate - jul-1-2000 
##                                                0.5 
##  resident 65 plus population estimate - jul-1-2001 
##                                                0.5 
##                              residual - jul-1-2000 
##                                                0.5 
##                              residual - jul-1-2001 
##                                                0.5
det(cor(covid_scaled))
## [1] -8.802968e-97

5.3 PCA

pca_covid <- princomp(covid_scaled)
summary(pca_covid)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4     Comp.5
## Standard deviation     3.6751242 1.6920441 1.02094924 0.82169640 0.56482426
## Proportion of Variance 0.7250878 0.1536986 0.05595706 0.03624677 0.01712668
## Cumulative Proportion  0.7250878 0.8787864 0.93474349 0.97099026 0.98811694
##                             Comp.6     Comp.7       Comp.8       Comp.9
## Standard deviation     0.378643593 0.25348098 0.0959218615 0.0501359223
## Proportion of Variance 0.007696757 0.00344935 0.0004939486 0.0001349412
## Cumulative Proportion  0.995813698 0.99926305 0.9997569965 0.9998919377
##                            Comp.10      Comp.11      Comp.12      Comp.13
## Standard deviation     0.036357133 2.279070e-02 1.157980e-02 4.658309e-03
## Proportion of Variance 0.000070962 2.788445e-05 7.198617e-06 1.164939e-06
## Cumulative Proportion  0.999962900 9.999908e-01 9.999980e-01 9.999991e-01
##                             Comp.14      Comp.15      Comp.16 Comp.17 Comp.18
## Standard deviation     3.633014e-03 1.636124e-03 1.794793e-08       0       0
## Proportion of Variance 7.085667e-07 1.437074e-07 1.729319e-17       0       0
## Cumulative Proportion  9.999999e-01 1.000000e+00 1.000000e+00       1       1
##                        Comp.19
## Standard deviation           0
## Proportion of Variance       0
## Cumulative Proportion        1
fviz_eig(pca_covid, choice = "variance")

fviz_pca_ind(pca_covid, col.ind = "cos2", gradient.cols = c("red", "yellow", "green"))

fviz_pca_var(pca_covid, col.var = "contrib", gradient.cols = c("red", "yellow", "green"))

fviz_pca_biplot(pca_covid, col.var = "blue", col.ind = "black")

6. Resumen de los Análisis

6.1 Resumen data_pca

  • Eliminación de variable y.
  • Primeras dos componentes principales explican >60% de varianza.
  • Biplots muestran agrupamientos latentes.
  • Rotación Varimax mejoró interpretabilidad.

6.2 Resumen PoblacionUSA

  • Estructura consistente en años 2000 y 2001.
  • Dos componentes principales explican >65% de varianza.
  • Agrupamientos coherentes con los datos poblacionales.

6.3 Resumen Covid

  • Algunas variables altamente correlacionadas.
  • Primeras dos componentes explican parte importante de la variabilidad.
  • Visualización de conglomerados interesantes.

7. Conclusiones generales