library(pacman)Warning: package 'pacman' was built under R version 4.5.2
p_load(umap, cluster, factoextra, tidyverse, skimr, naniar,
tictoc, DataExplorer, ggplot2, plotly, psych, NbClust)Librerias:
library(pacman)Warning: package 'pacman' was built under R version 4.5.2
p_load(umap, cluster, factoextra, tidyverse, skimr, naniar,
tictoc, DataExplorer, ggplot2, plotly, psych, NbClust)library(readr)
data_origin <- read_csv("data/data.csv")Rows: 165 Columns: 15
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Country
dbl (14): Property Rights, Government Integrity, Judicial Effectiveness, Tax...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# resumen general
skim(data_origin)| Name | data_origin |
| Number of rows | 165 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Country | 0 | 1 | 4 | 28 | 0 | 165 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Property Rights | 0 | 1.00 | 52.23 | 26.60 | 0.7 | 30.80 | 47.40 | 74.10 | 100.0 | ▂▆▇▃▆ |
| Government Integrity | 0 | 1.00 | 42.92 | 22.69 | 3.3 | 25.10 | 39.30 | 57.20 | 98.0 | ▃▇▃▂▂ |
| Judicial Effectiveness | 0 | 1.00 | 46.50 | 28.07 | 2.7 | 26.50 | 43.10 | 68.10 | 99.3 | ▆▇▇▃▅ |
| Tax Burden | 6 | 0.96 | 77.65 | 12.99 | 0.0 | 71.50 | 78.30 | 86.45 | 99.9 | ▁▁▁▇▇ |
| Government Spending | 6 | 0.96 | 67.50 | 23.63 | 0.0 | 55.60 | 73.80 | 85.85 | 98.1 | ▁▂▃▆▇ |
| Fiscal Health | 6 | 0.96 | 63.13 | 30.30 | 0.0 | 44.60 | 70.80 | 89.55 | 100.0 | ▃▂▃▅▇ |
| Business Freedom | 6 | 0.96 | 63.39 | 17.80 | 5.0 | 50.45 | 67.10 | 77.90 | 93.0 | ▁▃▅▇▇ |
| Labor Freedom | 6 | 0.96 | 56.43 | 10.23 | 5.0 | 52.30 | 57.30 | 62.10 | 81.8 | ▁▁▂▇▂ |
| Monetary Freedom | 6 | 0.96 | 67.18 | 15.03 | 0.0 | 67.15 | 70.90 | 74.60 | 88.7 | ▁▁▁▇▇ |
| Trade Freedom | 5 | 0.97 | 70.71 | 11.90 | 0.0 | 64.85 | 72.20 | 79.60 | 95.0 | ▁▁▂▇▆ |
| Investment Freedom | 6 | 0.96 | 54.18 | 20.58 | 0.0 | 45.00 | 60.00 | 70.00 | 95.0 | ▁▂▆▇▂ |
| Financial Freedom | 6 | 0.96 | 48.77 | 18.55 | 0.0 | 40.00 | 50.00 | 60.00 | 80.0 | ▁▃▃▇▃ |
| Free and fair elections | 0 | 1.00 | 5.45 | 4.00 | 0.0 | 0.50 | 6.58 | 9.58 | 10.0 | ▆▁▂▂▇ |
| Civil liberties | 0 | 1.00 | 5.35 | 2.86 | 0.0 | 2.94 | 5.29 | 7.94 | 10.0 | ▅▇▇▆▇ |
# visualización de patrón de NA
gg_miss_var(data_origin)# Histogramas
data_origin %>%
pivot_longer(-Country, names_to = "Variable", values_to = "Valor") %>%
ggplot(aes(x = Valor)) +
geom_histogram(fill = "#69b3a2", bins = 30) +
facet_wrap(~Variable, scales = "free") +
theme_minimal()Warning: Removed 53 rows containing non-finite outside the scale range
(`stat_bin()`).
data_origin %>%
select(-Country) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Valor") %>%
ggplot(aes(x = Variable, y = Valor)) +
geom_boxplot(fill = "orange") +
coord_flip() +
theme_minimal() +
labs(title = "Distribución de todas las variables", x = NULL, y = NULL)Warning: Removed 53 rows containing non-finite outside the scale range
(`stat_boxplot()`).
# 1. Transformar a formato largo
top10_long <- data_origin %>%
pivot_longer(-Country, names_to = "Variable", values_to = "Valor") %>%
group_by(Variable) %>%
slice_max(order_by = Valor, n = 10) %>%
ungroup()
# 2. Visualizar todos los Top 10 con facet_wrap
ggplot(top10_long, aes(x = reorder(Country, Valor), y = Valor)) +
geom_col(fill = "steelblue") +
coord_flip() +
facet_wrap(~ Variable, scales = "free_y") +
labs(title = "Top 10 países por cada variable", x = NULL, y = NULL) +
theme_minimal()# Tratamiento del 3.6% de nans
data_subset <- data_origin %>%
drop_na()data_subset <- as.data.frame(data_subset)
rownames(data_subset) <- data_subset$Countrydata_subset <- subset(data_subset, select = -Country)# centrar por mediana, escalar por IQR
data_scaled <- scale(
data_subset,
center = apply(data_subset, 2, median), # centrado por mediana
scale = apply(data_subset, 2, IQR) # escala por IQR
)# Matriz de correlación
cor_data <- data_scaled
cor_matrix <- cor(cor_data)
# Convertir a tidy para ggplot2
cor_long <- cor_matrix %>%
as.data.frame() %>%
rownames_to_column("Var1") %>%
pivot_longer(-Var1, names_to = "Var2", values_to = "Correlation")
# Gráfico tipo heatmap
ggplot(cor_long, aes(Var1, Var2, fill = Correlation)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "red", high = "blue", mid = "white", midpoint = 0) +
theme_minimal() +
coord_fixed() +
labs(title = "Matriz de correlaciones") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))psych::KMO(data_subset)Kaiser-Meyer-Olkin factor adequacy
Call: psych::KMO(r = data_subset)
Overall MSA = 0.9
MSA for each item =
Property Rights Government Integrity Judicial Effectiveness
0.93 0.93 0.91
Tax Burden Government Spending Fiscal Health
0.63 0.86 0.68
Business Freedom Labor Freedom Monetary Freedom
0.95 0.95 0.91
Trade Freedom Investment Freedom Financial Freedom
0.93 0.88 0.91
Free and fair elections Civil liberties
0.89 0.88
# Interpretación: el conjunto de variables tiene correlaciones suficientes para reducción de dimensión mayor a 0.6# Prueba de esfericidad de Bartlett
psych::cortest.bartlett(cor(data_subset), n = nrow(data_subset))$chisq
[1] 2237.514
$p.value
[1] 0
$df
[1] 91
# Rechazo H0, la matriz de correlación no es una identidad, y por tanto es válida para PCA.# PCA con prcomp sobre datos robustamente escalados
pc <- prcomp(data_scaled, center = FALSE, scale. = FALSE) summary(pc) Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 2.5314 1.5382 0.9451 0.77191 0.64829 0.62707 0.49760
Proportion of Variance 0.5381 0.1987 0.0750 0.05003 0.03529 0.03302 0.02079
Cumulative Proportion 0.5381 0.7367 0.8117 0.86177 0.89705 0.93007 0.95086
PC8 PC9 PC10 PC11 PC12 PC13 PC14
Standard deviation 0.42610 0.39613 0.30846 0.2846 0.17791 0.14908 0.1293
Proportion of Variance 0.01524 0.01318 0.00799 0.0068 0.00266 0.00187 0.0014
Cumulative Proportion 0.96611 0.97928 0.98727 0.9941 0.99673 0.99860 1.0000
# Si quieres usar fviz_eig
fviz_eig(pc)Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
Ignoring empty aesthetic: `width`.
# PC1 = 53.8%
# PC2 = 19.9%# Pesos de las variables originales en los componentes
loadings <- pc$rotation
# Loadings de PC1 ordenadas
sort(pc$rotation[,1], decreasing = TRUE) Government Spending Tax Burden Fiscal Health
0.06406693 -0.01308185 -0.06563946
Free and fair elections Civil liberties Judicial Effectiveness
-0.11063381 -0.13251918 -0.16394167
Property Rights Government Integrity Business Freedom
-0.16918054 -0.18596478 -0.20528153
Trade Freedom Investment Freedom Financial Freedom
-0.23601626 -0.25563386 -0.28303691
Labor Freedom Monetary Freedom
-0.30196330 -0.73283571
# Loadings de PC2 ordenadas
sort(pc$rotation[,2], decreasing = TRUE) Monetary Freedom Government Spending Tax Burden
0.56632403 0.34839387 0.31262578
Fiscal Health Trade Freedom Labor Freedom
0.07216788 -0.12162914 -0.13663665
Investment Freedom Free and fair elections Business Freedom
-0.14183553 -0.14232899 -0.17119720
Civil liberties Financial Freedom Property Rights
-0.21471369 -0.23188001 -0.25553830
Judicial Effectiveness Government Integrity
-0.28320213 -0.32624466
library(factoextra)
fviz_pca_var(pc, col.var = "contrib", gradient.cols = c("blue", "orange", "red"))Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ℹ The deprecated feature was likely used in the ggpubr package.
Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the factoextra package.
Please report the issue at <https://github.com/kassambara/factoextra/issues>.
# PC1:
# loadings negativos -> Monetary Freedom, Labor Freedom, Government Integrity, Judicial Effectiveness, Property Rights
# PC2 ->
# positivico -> Monetary Freedom, Government Spending, Tax Burden ...
# negativo -> Labor Freedom , Investment Freedom, ...# Proyección de los países en las 2 primeras dimensiones
fviz_pca_ind(pc,
geom.ind = "point",
col.ind = "cos2",
gradient.cols = c("blue", "orange", "red"),
repel = TRUE,
label = "none")pca_df <- as.data.frame(pc$x[, 1:2]) # PC1 y PC2
rownames(pca_df) <- rownames(data_subset)library(BBmisc)Warning: package 'BBmisc' was built under R version 4.5.2
Adjuntando el paquete: 'BBmisc'
The following objects are masked from 'package:dplyr':
coalesce, collapse, symdiff
The following object is masked from 'package:base':
isFALSE
# Invertimos PC1 multiplicando por -1
pc1_invertido <- -pc$x[,1]
# Normalizamos en escala 0–100
indice_libertad <- normalize(pc1_invertido,
method = "range",
range = c(0, 100))# incorporacion del índice y países
indice_df <- data.frame(
Indice_Libertad = indice_libertad
)
# Unir con tus datos originales (sin country)
data_con_indice <- cbind(indice_df, data_subset)
head(data_con_indice) Indice_Libertad Property Rights Government Integrity
Albania 83.68368 58.3 39.6
Algeria 68.34656 27.6 29.5
Angola 66.95929 39.9 28.2
Argentina 48.68559 34.7 39.7
Armenia 81.68663 49.9 50.4
Australia 94.94585 90.4 86.2
Judicial Effectiveness Tax Burden Government Spending Fiscal Health
Albania 59.2 88.8 71.9 76.4
Algeria 29.8 80.3 64.4 58.1
Angola 25.7 86.9 89.5 89.1
Argentina 56.5 70.8 57.1 67.9
Armenia 31.3 88.1 77.6 81.3
Australia 95.3 62.1 54.8 79.5
Business Freedom Labor Freedom Monetary Freedom Trade Freedom
Albania 74.2 51.3 75.8 83.4
Algeria 59.1 52.4 70.9 57.4
Angola 45.2 50.8 64.2 70.4
Argentina 55.3 55.2 23.3 64.4
Armenia 73.0 59.2 72.2 72.0
Australia 92.5 65.3 75.0 90.0
Investment Freedom Financial Freedom Free and fair elections
Albania 60 60 7.00
Algeria 20 20 3.08
Angola 30 40 4.50
Argentina 65 60 9.17
Armenia 70 60 7.92
Australia 80 80 10.00
Civil liberties
Albania 7.06
Algeria 3.82
Angola 2.35
Argentina 8.53
Armenia 5.29
Australia 9.71
data_scaled: variables originales escaladas
pca_df: data con los componentes PC1 y PC2
# data_scaled
set.seed(20202209)
res.nbclust <- NbClust(data_scaled, distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "kmeans", index = "all")*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 2 proposed 2 as the best number of clusters
* 12 proposed 3 as the best number of clusters
* 4 proposed 4 as the best number of clusters
* 2 proposed 5 as the best number of clusters
* 1 proposed 9 as the best number of clusters
* 2 proposed 10 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 3
*******************************************************************
# pca_df
set.seed(20202209)
res.nbclust <- NbClust(pca_df, distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "kmeans", index = "all")*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 4 proposed 2 as the best number of clusters
* 11 proposed 3 as the best number of clusters
* 6 proposed 5 as the best number of clusters
* 1 proposed 6 as the best number of clusters
* 1 proposed 10 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 3
*******************************************************************
k_optimo <- 3# data 1
set.seed(20202209)
tic()
km <- kmeans(data_scaled,
centers = k_optimo,
iter.max = 100,
nstart = 25,
algorithm = "Lloyd")
toc()0.02 sec elapsed
# data 2
set.seed(20202209)
tic()
km <- kmeans(pca_df,
centers = k_optimo,
iter.max = 100,
nstart = 25,
algorithm = "Lloyd")
toc()0 sec elapsed