EDA Proyecto final

Librerias:

library(pacman)

Warning: package 'pacman' was built under R version 4.5.2

p_load(umap, cluster, factoextra, tidyverse, skimr, naniar, 
       tictoc, DataExplorer, ggplot2, plotly, psych, NbClust)

Import

library(readr)
data_origin <- read_csv("data/data.csv")

Rows: 165 Columns: 15
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (1): Country
dbl (14): Property Rights, Government Integrity, Judicial Effectiveness, Tax...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

EDA

Estructura de datos

# resumen general 
skim(data_origin)

Data summary
Name	data_origin
Number of rows	165
Number of columns	15
_______________________
Column type frequency:
character	1
numeric	14
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
Country	0	1	4	28	0	165	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Property Rights	0	1.00	52.23	26.60	0.7	30.80	47.40	74.10	100.0	▂▆▇▃▆
Government Integrity	0	1.00	42.92	22.69	3.3	25.10	39.30	57.20	98.0	▃▇▃▂▂
Judicial Effectiveness	0	1.00	46.50	28.07	2.7	26.50	43.10	68.10	99.3	▆▇▇▃▅
Tax Burden	6	0.96	77.65	12.99	0.0	71.50	78.30	86.45	99.9	▁▁▁▇▇
Government Spending	6	0.96	67.50	23.63	0.0	55.60	73.80	85.85	98.1	▁▂▃▆▇
Fiscal Health	6	0.96	63.13	30.30	0.0	44.60	70.80	89.55	100.0	▃▂▃▅▇
Business Freedom	6	0.96	63.39	17.80	5.0	50.45	67.10	77.90	93.0	▁▃▅▇▇
Labor Freedom	6	0.96	56.43	10.23	5.0	52.30	57.30	62.10	81.8	▁▁▂▇▂
Monetary Freedom	6	0.96	67.18	15.03	0.0	67.15	70.90	74.60	88.7	▁▁▁▇▇
Trade Freedom	5	0.97	70.71	11.90	0.0	64.85	72.20	79.60	95.0	▁▁▂▇▆
Investment Freedom	6	0.96	54.18	20.58	0.0	45.00	60.00	70.00	95.0	▁▂▆▇▂
Financial Freedom	6	0.96	48.77	18.55	0.0	40.00	50.00	60.00	80.0	▁▃▃▇▃
Free and fair elections	0	1.00	5.45	4.00	0.0	0.50	6.58	9.58	10.0	▆▁▂▂▇
Civil liberties	0	1.00	5.35	2.86	0.0	2.94	5.29	7.94	10.0	▅▇▇▆▇

Missings

# visualización de patrón de NA
gg_miss_var(data_origin)

Visualización

Distribuciones

# Histogramas
data_origin %>%
  pivot_longer(-Country, names_to = "Variable", values_to = "Valor") %>%
  ggplot(aes(x = Valor)) +
  geom_histogram(fill = "#69b3a2", bins = 30) +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal()

Warning: Removed 53 rows containing non-finite outside the scale range
(`stat_bin()`).

Boxplots

data_origin %>%
  select(-Country) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Valor") %>%
  ggplot(aes(x = Variable, y = Valor)) +
  geom_boxplot(fill = "orange") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Distribución de todas las variables", x = NULL, y = NULL)

Warning: Removed 53 rows containing non-finite outside the scale range
(`stat_boxplot()`).

# 1. Transformar a formato largo
top10_long <- data_origin %>%
  pivot_longer(-Country, names_to = "Variable", values_to = "Valor") %>%
  group_by(Variable) %>%
  slice_max(order_by = Valor, n = 10) %>%
  ungroup()

# 2. Visualizar todos los Top 10 con facet_wrap
ggplot(top10_long, aes(x = reorder(Country, Valor), y = Valor)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  facet_wrap(~ Variable, scales = "free_y") +
  labs(title = "Top 10 países por cada variable", x = NULL, y = NULL) +
  theme_minimal()

Preprocesamiento

Imputación

# Tratamiento del 3.6% de nans

data_subset <- data_origin %>%
  drop_na()

Selección

data_subset <- as.data.frame(data_subset)
rownames(data_subset) <- data_subset$Country

data_subset <- subset(data_subset, select = -Country)

Escalamiento

# centrar por mediana, escalar por IQR
data_scaled <- scale(
  data_subset, 
  center = apply(data_subset, 2, median),  # centrado por mediana
  scale = apply(data_subset, 2, IQR)       # escala por IQR
)

Correlación

# Matriz de correlación
cor_data <- data_scaled
cor_matrix <- cor(cor_data)

# Convertir a tidy para ggplot2
cor_long <- cor_matrix %>%
  as.data.frame() %>%
  rownames_to_column("Var1") %>%
  pivot_longer(-Var1, names_to = "Var2", values_to = "Correlation")

# Gráfico tipo heatmap
ggplot(cor_long, aes(Var1, Var2, fill = Correlation)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "red", high = "blue", mid = "white", midpoint = 0) +
  theme_minimal() +
  coord_fixed() +
  labs(title = "Matriz de correlaciones") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Análisis PCA

Tests

psych::KMO(data_subset)

Kaiser-Meyer-Olkin factor adequacy
Call: psych::KMO(r = data_subset)
Overall MSA =  0.9
MSA for each item = 
        Property Rights    Government Integrity  Judicial Effectiveness 
                   0.93                    0.93                    0.91 
             Tax Burden     Government Spending           Fiscal Health 
                   0.63                    0.86                    0.68 
       Business Freedom           Labor Freedom        Monetary Freedom 
                   0.95                    0.95                    0.91 
          Trade Freedom      Investment Freedom       Financial Freedom 
                   0.93                    0.88                    0.91 
Free and fair elections         Civil liberties 
                   0.89                    0.88

# Interpretación: el conjunto de variables tiene correlaciones suficientes para reducción de dimensión mayor a 0.6

# Prueba de esfericidad de Bartlett

psych::cortest.bartlett(cor(data_subset), n = nrow(data_subset))

$chisq
[1] 2237.514

$p.value
[1] 0

$df
[1] 91

# Rechazo H0, la matriz de correlación no es una identidad, y por tanto es válida para PCA.

# PCA con prcomp sobre datos robustamente escalados
pc <- prcomp(data_scaled, center = FALSE, scale. = FALSE)

Varianza

summary(pc)

Importance of components:
                          PC1    PC2    PC3     PC4     PC5     PC6     PC7
Standard deviation     2.5314 1.5382 0.9451 0.77191 0.64829 0.62707 0.49760
Proportion of Variance 0.5381 0.1987 0.0750 0.05003 0.03529 0.03302 0.02079
Cumulative Proportion  0.5381 0.7367 0.8117 0.86177 0.89705 0.93007 0.95086
                           PC8     PC9    PC10   PC11    PC12    PC13   PC14
Standard deviation     0.42610 0.39613 0.30846 0.2846 0.17791 0.14908 0.1293
Proportion of Variance 0.01524 0.01318 0.00799 0.0068 0.00266 0.00187 0.0014
Cumulative Proportion  0.96611 0.97928 0.98727 0.9941 0.99673 0.99860 1.0000

# Si quieres usar fviz_eig
 fviz_eig(pc)

Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
Ignoring empty aesthetic: `width`.

 # PC1 = 53.8%
 # PC2 = 19.9%

Pesos

# Pesos de las variables originales en los componentes
loadings <- pc$rotation

# Loadings de PC1 ordenadas
sort(pc$rotation[,1], decreasing = TRUE)

    Government Spending              Tax Burden           Fiscal Health 
             0.06406693             -0.01308185             -0.06563946 
Free and fair elections         Civil liberties  Judicial Effectiveness 
            -0.11063381             -0.13251918             -0.16394167 
        Property Rights    Government Integrity        Business Freedom 
            -0.16918054             -0.18596478             -0.20528153 
          Trade Freedom      Investment Freedom       Financial Freedom 
            -0.23601626             -0.25563386             -0.28303691 
          Labor Freedom        Monetary Freedom 
            -0.30196330             -0.73283571

# Loadings de PC2 ordenadas
sort(pc$rotation[,2], decreasing = TRUE)

       Monetary Freedom     Government Spending              Tax Burden 
             0.56632403              0.34839387              0.31262578 
          Fiscal Health           Trade Freedom           Labor Freedom 
             0.07216788             -0.12162914             -0.13663665 
     Investment Freedom Free and fair elections        Business Freedom 
            -0.14183553             -0.14232899             -0.17119720 
        Civil liberties       Financial Freedom         Property Rights 
            -0.21471369             -0.23188001             -0.25553830 
 Judicial Effectiveness    Government Integrity 
            -0.28320213             -0.32624466

Visualización

library(factoextra)
fviz_pca_var(pc, col.var = "contrib", gradient.cols = c("blue", "orange", "red"))

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ℹ The deprecated feature was likely used in the ggpubr package.
  Please report the issue at <https://github.com/kassambara/ggpubr/issues>.

Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the factoextra package.
  Please report the issue at <https://github.com/kassambara/factoextra/issues>.

# PC1: 
# loadings negativos -> Monetary Freedom, Labor Freedom, Government Integrity, Judicial Effectiveness, Property Rights

# PC2 -> 
# positivico -> Monetary Freedom, Government Spending,  Tax Burden ...
# negativo -> Labor Freedom ,  Investment Freedom, ...

# Proyección de los países en las 2 primeras dimensiones
fviz_pca_ind(pc,
             geom.ind = "point",
             col.ind = "cos2",
             gradient.cols = c("blue", "orange", "red"),
             repel = TRUE,
             label = "none")

Caracterización

pca_df <- as.data.frame(pc$x[, 1:2])  # PC1 y PC2
rownames(pca_df) <- rownames(data_subset)

library(BBmisc)

Warning: package 'BBmisc' was built under R version 4.5.2


Adjuntando el paquete: 'BBmisc'

The following objects are masked from 'package:dplyr':

    coalesce, collapse, symdiff

The following object is masked from 'package:base':

    isFALSE

# Invertimos PC1 multiplicando por -1
pc1_invertido <- -pc$x[,1]

# Normalizamos en escala 0–100
indice_libertad <- normalize(pc1_invertido, 
                             method = "range", 
                             range = c(0, 100))

# incorporacion del índice y países
indice_df <- data.frame(
  Indice_Libertad = indice_libertad
)

# Unir con tus datos originales (sin country)
data_con_indice <- cbind(indice_df, data_subset)
head(data_con_indice)

          Indice_Libertad Property Rights Government Integrity
Albania          83.68368            58.3                 39.6
Algeria          68.34656            27.6                 29.5
Angola           66.95929            39.9                 28.2
Argentina        48.68559            34.7                 39.7
Armenia          81.68663            49.9                 50.4
Australia        94.94585            90.4                 86.2
          Judicial Effectiveness Tax Burden Government Spending Fiscal Health
Albania                     59.2       88.8                71.9          76.4
Algeria                     29.8       80.3                64.4          58.1
Angola                      25.7       86.9                89.5          89.1
Argentina                   56.5       70.8                57.1          67.9
Armenia                     31.3       88.1                77.6          81.3
Australia                   95.3       62.1                54.8          79.5
          Business Freedom Labor Freedom Monetary Freedom Trade Freedom
Albania               74.2          51.3             75.8          83.4
Algeria               59.1          52.4             70.9          57.4
Angola                45.2          50.8             64.2          70.4
Argentina             55.3          55.2             23.3          64.4
Armenia               73.0          59.2             72.2          72.0
Australia             92.5          65.3             75.0          90.0
          Investment Freedom Financial Freedom Free and fair elections
Albania                   60                60                    7.00
Algeria                   20                20                    3.08
Angola                    30                40                    4.50
Argentina                 65                60                    9.17
Armenia                   70                60                    7.92
Australia                 80                80                   10.00
          Civil liberties
Albania              7.06
Algeria              3.82
Angola               2.35
Argentina            8.53
Armenia              5.29
Australia            9.71

Clustering

data_scaled: variables originales escaladas
pca_df: data con los componentes PC1 y PC2

Elección de K

# data_scaled
set.seed(20202209)
res.nbclust <- NbClust(data_scaled, distance = "euclidean",
                       min.nc = 2, max.nc = 10, 
                       method = "kmeans", index = "all")

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot.

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 2 proposed 2 as the best number of clusters 
* 12 proposed 3 as the best number of clusters 
* 4 proposed 4 as the best number of clusters 
* 2 proposed 5 as the best number of clusters 
* 1 proposed 9 as the best number of clusters 
* 2 proposed 10 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  3 
 
 
*******************************************************************

# pca_df
set.seed(20202209)
res.nbclust <- NbClust(pca_df, distance = "euclidean",
                       min.nc = 2, max.nc = 10, 
                       method = "kmeans", index = "all")

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot.

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 4 proposed 2 as the best number of clusters 
* 11 proposed 3 as the best number of clusters 
* 6 proposed 5 as the best number of clusters 
* 1 proposed 6 as the best number of clusters 
* 1 proposed 10 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  3 
 
 
*******************************************************************

K means

k_optimo <- 3

# data 1
set.seed(20202209)
tic()
km <- kmeans(data_scaled, 
             centers = k_optimo,
             iter.max = 100,
             nstart = 25,
             algorithm = "Lloyd")
toc()

0.02 sec elapsed

# data 2
set.seed(20202209)
tic()
km <- kmeans(pca_df, 
             centers = k_optimo,
             iter.max = 100,
             nstart = 25,
             algorithm = "Lloyd")
toc()

0 sec elapsed