Primero de todo cargamos las librerias que necesitaremos
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.4.3
library(rnaturalearthdata)
## Warning: package 'rnaturalearthdata' was built under R version 4.4.3
##
## Adjuntando el paquete: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
##
## countries110
library(dplyr)
library(cluster)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.4.3
library(gridExtra)
##
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(tidyr)
# Cargar datos de notas medias por país (previamente procesados)
load("estudiantesPISAfiltrados.R")
NotasPlausibles = datosPisaEstudiantes_filtrado %>%
select(matches("^PV|^CNT$|^CNTSCHID$"))
NotasMedias = NotasPlausibles %>% transmute(CNT = NotasPlausibles$CNT,
CNTSCHID = NotasPlausibles$CNTSCHID,
MathMean = rowMeans(NotasPlausibles[,3:12]),
LectureMean = rowMeans(NotasPlausibles[,13:22]),
ScienceMean = rowMeans(NotasPlausibles[,23:32]))
# 1. Calcular medias por país con manejo de NA/NaN
NotasMediasPais <- NotasMedias %>%
group_by(CNT) %>%
summarise(
MathMean_Country = mean(MathMean, na.rm = TRUE),
LectureMean_Country = mean(LectureMean, na.rm = TRUE),
ScienceMean_Country = mean(ScienceMean, na.rm = TRUE)
) %>%
# Reemplazar NaN por NA (cuando todos los valores son NA)
mutate(across(where(is.numeric), ~ifelse(is.nan(.), NA, .))) %>%
# Eliminar filas con cualquier NA
filter(complete.cases(.)) %>%
# Convertir a data.frame (kmeans funciona mejor con data.frames que con tibbles)
as.data.frame()
# Verificar que no hay NA/NaN/Inf
summary(NotasMediasPais)
## CNT MathMean_Country LectureMean_Country ScienceMean_Country
## Length:76 Min. :343.5 Min. :337.7 Min. :356.2
## Class :character 1st Qu.:396.2 1st Qu.:406.0 1st Qu.:411.8
## Mode :character Median :449.1 Median :448.5 Median :461.2
## Mean :445.1 Mean :442.6 Mean :454.4
## 3rd Qu.:483.6 3rd Qu.:484.0 3rd Qu.:497.6
## Max. :574.6 Max. :543.2 Max. :561.5
sum(is.na(NotasMediasPais))
## [1] 0
sum(is.nan(as.matrix(NotasMediasPais[,-1])))
## [1] 0
sum(is.infinite(as.matrix(NotasMediasPais[,-1])))
## [1] 0
Ahora hacemos como antes limpiamos y aagrupamos lo datos por países. Como este código es muy rápido de ejecutar no hemos guardado los resultados.
# 2. Estandarizar los datos (recomendado para k-means)
datos_estandarizados <- scale(NotasMediasPais[,-1])
# 3. Clustering con k-means (5 clusters)
set.seed(100)
clust_kmeans <- kmeans(datos_estandarizados, centers = 5, nstart = 20)
# 4. Asignar clusters a los países
NotasMediasPais$Cluster <- clust_kmeans$cluster
# 5. Visualización
world <- ne_countries(scale = "medium", returnclass = "sf")
world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Kazakhstan"] = "QAZ"
world$iso_a3[world$admin == "Taiwan"] = "TAP"
world$iso_a3[world$admin == "Qatar"] ="QUR"
world$iso_a3[world$admin == "China"] = "QCI"
world_clustered <- world %>%
left_join(NotasMediasPais, by = c("iso_a3" = "CNT"))
ggplot(world_clustered) +
geom_sf(aes(fill = as.factor(Cluster)), color = "white", size = 0.1) +
scale_fill_brewer(palette = "Set2", na.value = "grey90") +
labs(title = "Clústeres de países basados en notas PISA 2022",
subtitle = "5 clusters calculados con k-means",
fill = "Cluster") +
theme_minimal() +
theme(legend.position = "bottom")
Imprimimos los códigos de los países que pertenecen a cada clúster.
split(NotasMediasPais$CNT, NotasMediasPais$Cluster)
## $`1`
## [1] "AUS" "AUT" "BEL" "CAN" "CHE" "CZE" "DEU" "DNK" "ESP" "FIN" "FRA" "GBR"
## [13] "HRV" "HUN" "IRL" "ISR" "ITA" "LTU" "LVA" "NLD" "NOR" "NZL" "POL" "PRT"
## [25] "SVN" "SWE" "USA"
##
## $`2`
## [1] "ARE" "BRN" "CHL" "GRC" "ISL" "KAZ" "MLT" "QUR" "ROU" "SRB" "SVK" "TUR"
## [13] "URY"
##
## $`3`
## [1] "EST" "HKG" "JPN" "KOR" "MAC" "SGP" "TAP"
##
## $`4`
## [1] "ARG" "BGR" "BRA" "COL" "CRI" "GEO" "JAM" "MDA" "MEX" "MNE" "MNG" "MYS"
## [13] "PER" "QAT" "QAZ" "SAU" "THA"
##
## $`5`
## [1] "ALB" "DOM" "IDN" "JOR" "KSV" "MAR" "MKD" "PAN" "PHL" "PSE" "SLV" "UZB"
Hacemos el grafico de barras donde se diferencien las medias de cada cluster en cada rama.
library(dplyr)
library(ggplot2)
library(FactoMineR)
library(factoextra)
# 1. Preparación de datos con clusters
pisa_cluster_data <- NotasMediasPais %>%
mutate(Cluster = as.factor(clust_kmeans$cluster))
# 2. Calcular medias y desviaciones estándar por cluster (versión sencilla)
medias_cluster_pisa <- pisa_cluster_data %>%
group_by(Cluster) %>%
summarise(
Math_mean = mean(MathMean_Country, na.rm = TRUE),
Math_sd = sd(MathMean_Country, na.rm = TRUE),
Lecture_mean = mean(LectureMean_Country, na.rm = TRUE),
Lecture_sd = sd(LectureMean_Country, na.rm = TRUE),
Science_mean = mean(ScienceMean_Country, na.rm = TRUE),
Science_sd = sd(ScienceMean_Country, na.rm = TRUE)
)
# 3. Preparar datos para visualización
medias_plot_pisa <- medias_cluster_pisa %>%
pivot_longer(
cols = -Cluster,
names_to = c("Variable", "stat"),
names_sep = "_",
values_to = "value"
) %>%
pivot_wider(
names_from = "stat",
values_from = "value"
) %>%
mutate(Variable = case_when(
Variable == "Math" ~ "Matemáticas",
Variable == "Lecture" ~ "Lectura",
Variable == "Science" ~ "Ciencias",
TRUE ~ Variable
))
# 4. Visualización con gráfico de barras
ggplot(medias_plot_pisa, aes(x = Cluster, y = mean, fill = Variable)) +
geom_bar(stat = "identity", position = position_dodge(0.9), width = 0.8) +
geom_errorbar(
aes(ymin = mean - sd, ymax = mean + sd),
width = 0.25,
position = position_dodge(0.9),
color = "black"
) +
scale_fill_brewer(
palette = "Set1",
labels = c("Matemáticas", "Lectura", "Ciencias")
) +
labs(
title = "Puntuaciones medias PISA por cluster 2022",
subtitle = "Barras muestran medias con desviación estándar",
x = "Cluster",
y = "Puntuación media",
fill = "Área"
) +
theme_minimal() +
theme(
legend.position = "bottom",
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5)
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) # Ajustar eje Y
Cargamos y hacemos un merge con los datos del banco mundial.
load("bancomundial.R")
# Combinar datos de PISA con Banco Mundial
datos_combi <- merge(
NotasMediasPais,
datos_filtrado,
by.x = "CNT",
by.y = "Country.Code",
all.x = TRUE
)
Pasamos todas las columnas a numéricas
# 1. Convertir columnas relevantes a numéricas
bm_pisa_numeric <- datos_combi %>%
mutate(across(-CNT, ~ {
# Primero convertir a carácter por si hay factores
x_char <- as.character(.x)
# Luego a numérico, convirtiendo cadenas vacías y "NA" a NA reales
as.numeric(ifelse(x_char %in% c("", "NA"), NA, x_char))
}))
# 2. Seleccionar solo columnas con suficientes datos
bm_pisa_clean <- bm_pisa_numeric %>%
# Eliminar columnas con más del 50% de NAs
select(where(~ mean(is.na(.x)) <= 0.5)) %>%
# Eliminar filas con cualquier NA restante
filter(complete.cases(.))
# 3. Verificar estructura
str(bm_pisa_clean)
## 'data.frame': 41 obs. of 42 variables:
## $ CNT : chr "ARE" "AUS" "BEL" "BRN" ...
## $ MathMean_Country : num 438 489 498 441 490 ...
## $ LectureMean_Country : num 426 501 487 428 498 ...
## $ ScienceMean_Country : num 440 510 499 445 507 ...
## $ Cluster : num 2 1 1 2 1 1 2 4 1 1 ...
## $ Compulsory education, duration (years) : num 12 11 12 9 10 11 12 12 10 13 ...
## $ Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative) : num 51.1 36.8 39.4 17.6 32.4 ...
## $ Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative) : num 85.5 95 89.6 87.5 96.1 ...
## $ Educational attainment, at least completed post-secondary, population 25+, total (%) (cumulative) : num 56.4 53.9 41.5 37.4 67.3 ...
## $ Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative) : num 93.2 99.5 96.6 90.7 96.1 ...
## $ Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative) : num 74.4 81.7 75.1 68.8 89.2 ...
## $ Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative) : num 11.48 9.89 17.28 4.34 10.94 ...
## $ Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative) : num 0.83 1.85 0.925 0.629 0 ...
## $ GNI per capita, Atlas method (current US$) : num 46070 60710 53680 30970 53300 ...
## $ GNI per capita, PPP (constant 2021 international $) : num 68868 59028 60988 74309 57734 ...
## $ Labor force with advanced education (% of total working-age population with advanced education) : num 77.9 78.8 76 80.9 73.4 ...
## $ Labor force with basic education (% of total working-age population with basic education) : num 85.9 41.9 27.7 46.3 42.8 ...
## $ Labor force with intermediate education (% of total working-age population with intermediate education): num 70.7 72.8 57.4 69 61.1 ...
## $ Unemployment with advanced education (% of total labor force with advanced education) : num 4.12 2.28 3.02 6.63 4.12 ...
## $ Unemployment with basic education (% of total labor force with basic education) : num 0.421 7.71 13.461 5.095 11.304 ...
## $ Unemployment with intermediate education (% of total labor force with intermediate education) : num 1.02 3.98 5.71 4.76 6.78 ...
## $ Individuals using the Internet (% of population) : num 100 94.9 94 99 94 ...
## $ Fixed broadband subscriptions : num 3770000 9580000 5070000 90200 16400000 ...
## $ Mobile cellular subscriptions : num 20000000 28700000 11800000 528723 35400000 ...
## $ Secure Internet servers : num 16388 1255844 360344 9606 1699682 ...
## $ GDP growth (annual %) : num 7.51 4.24 4.23 -1.63 3.82 ...
## $ GDP per capita (constant 2015 US$) : num 42688 61010 44584 28549 45227 ...
## $ Access to electricity (% of population) : num 100 100 100 100 100 100 100 100 100 100 ...
## $ Access to clean fuels and technologies for cooking (% of population) : num 100 100 100 100 100 100 100 93.6 100 100 ...
## $ Population ages 0-14, total : num 1659913 4743867 1930701 96456 6054208 ...
## $ Population ages 15-64, total : num 8249860 16828565 7440659 330673 25509499 ...
## $ Population ages 0-14 (% of total population) : num 16.5 18.2 16.5 21.2 15.5 ...
## $ Population ages 15-64 (% of total population) : num 81.9 64.7 63.7 72.6 65.5 ...
## $ Population ages 65 and above (% of total population) : num 1.64 17.08 19.77 6.2 18.94 ...
## $ Population ages 65 and above, total : num 165203 4441967 2308850 28241 7375348 ...
## $ Population, total : num 10074977 26014399 11680210 455370 38939056 ...
## $ Preprimary education, duration (years) : num 4 5 6 3 6 2 6 6 3 6 ...
## $ Proportion of seats held by women in national parliaments (%) : num 50 38.41 42.67 9.09 30.47 ...
## $ Employment in industry (% of total employment) (modeled ILO estimate) : num 29.3 18.6 19.1 21.5 19.4 ...
## $ Employment in agriculture (% of total employment) (modeled ILO estimate) : num 1.36 2.177 0.908 1.517 1.307 ...
## $ Employment in services (% of total employment) (modeled ILO estimate) : num 69.3 79.2 80 77 79.3 ...
## $ Birth rate, crude (per 1,000 people) : num 9.89 11.6 9.8 13.49 9 ...
# 4. Estandarizar los datos para el clustering
bm_pisa_scaled <- bm_pisa_clean %>%
select(-CNT) %>% # Excluir la columna de país
scale() %>%
as.data.frame()
# 5. Realizar el clustering k-means (k=5)
set.seed(100)
clust_bm <- kmeans(bm_pisa_scaled, centers = 5, nstart = 20)
# 6. Preparar datos para visualización
resultados_cluster <- bm_pisa_clean %>%
select(CNT) %>%
mutate(Cluster = clust_bm$cluster)
# 7. Visualización del mapa
library(rnaturalearth)
library(ggplot2)
world <- ne_countries(scale = "medium", returnclass = "sf")
world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Kazakhstan"] = "QAZ"
world$iso_a3[world$admin == "Taiwan"] = "TAP"
world$iso_a3[world$admin == "Qatar"]="QUR"
world$iso_a3[world$admin == "China"] = "QCI"
world_clustered <- world %>%
left_join(resultados_cluster, by = c("iso_a3" = "CNT"))
ggplot(world_clustered) +
geom_sf(aes(fill = as.factor(Cluster)), color = "white", size = 0.1) +
scale_fill_brewer(palette = "Set2", na.value = "grey90") +
labs(title = "Clústeres basados en PISA y Banco Mundial",
subtitle = "5 clusters (k-means con datos estandarizados) 2022",
fill = "Cluster") +
theme_minimal()
library(dplyr)
library(grid)
resultados_cluster %>%
group_by(Cluster) %>%
summarise(Paises = paste(sort(CNT), collapse = ", ")) %>%
print(n = Inf)
## # A tibble: 5 × 2
## Cluster Paises
## <int> <chr>
## 1 1 IDN, PHL
## 2 2 BRN, CHL, COL, DOM, MDA, MKD, MNG, MYS, PAN
## 3 3 CZE, EST, GRC, HRV, HUN, LTU, LVA, POL, ROU, SRB, SVK, SVN
## 4 4 ARE, AUS, BEL, CHE, DNK, FIN, IRL, ISR, MLT, NLD, NOR, SWE
## 5 5 CAN, DEU, ESP, FRA, ITA, KOR
Este gráfico muestra las medias y desviaciones estándar de variables clave (como GDP, educación o rendimiento PISA) agrupadas por cluster. Cada barra representa el promedio de una variable en un cluster específico, mientras que las líneas de error indican la variabilidad. Permite comparar el desempeño de los clusters en distintos indicadores socioeconómicos y educativos.
library(dplyr)
library(ggplot2)
library(tidyr)
library(purrr)
# 1. Limpieza y preparación de datos
bm_pisa_clean <- datos_combi %>%
mutate(across(-CNT, ~ {
x_char <- as.character(.x)
as.numeric(ifelse(x_char %in% c("", "NA"), NA, x_char))
})) %>%
# Eliminar columnas con >50% NAs
select(where(~ mean(is.na(.x)) <= 0.5)) %>%
# Eliminar filas con NAs (alternativa: tidyr::fill() o imputación)
filter(complete.cases(.))
# 2. Estandarización y clustering
set.seed(100)
bm_pisa_scaled <- bm_pisa_clean %>%
select(-CNT) %>%
scale() %>%
as.data.frame()
clust_bm <- kmeans(bm_pisa_scaled, centers = 5, nstart = 20)
# 3. Asignar clusters a los datos originales
bm_cluster_data <- bm_pisa_clean %>%
mutate(Cluster = as.factor(clust_bm$cluster))
# 4. Análisis de variables importantes (ajusta los patrones según tus datos)
vars_importantes <- names(bm_cluster_data) %>%
grep("GDP|Internet|education|employment|GNI|PISA|math|science|reading",
., value = TRUE, ignore.case = TRUE)
# 5. Estadísticas por cluster
medias_cluster <- bm_cluster_data %>%
select(Cluster, all_of(vars_importantes)) %>%
group_by(Cluster) %>%
summarise(across(
everything(),
list(mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE))
))
# 6. Preparar para visualización
medias_plot <- medias_cluster %>%
pivot_longer(
cols = -Cluster,
names_to = c("Variable", ".value"),
names_sep = "_"
)
## Warning: Expected 2 pieces. Additional pieces discarded in 4 rows [1, 2, 3, 4].
# 7. Visualización de indicadores
ggplot(medias_plot, aes(x = Cluster, y = mean, fill = Variable)) +
geom_col(position = position_dodge(0.8), width = 0.7) +
geom_errorbar(
aes(ymin = mean - sd, ymax = mean + sd),
width = 0.2,
position = position_dodge(0.8),
color = "black"
) +
labs(title = "Indicadores por Cluster 2022") +
theme_minimal()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_col()`).
Mostramos cuales han sido las variables más discriminantes
library(dplyr)
# Asegúrate de tener esto: variables estandarizadas + clúster
bm_pisa_scaled$Cluster <- as.factor(clust_bm$cluster)
# Calcular medias por clúster
media_por_cluster <- bm_pisa_scaled %>%
group_by(Cluster) %>%
summarise(across(everything(), mean))
# Quitar la columna "Cluster" y calcular SD por variable (entre clústeres)
sd_entre_clusters <- media_por_cluster %>%
select(-Cluster) %>%
summarise(across(everything(), sd)) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Desviacion")
# Top 10 variables más discriminantes
top_10_vars <- sd_entre_clusters %>%
arrange(desc(Desviacion)) %>%
slice_head(n = 10)
print(top_10_vars)
## # A tibble: 10 × 2
## Variable Desviacion
## <chr> <dbl>
## 1 Population ages 0-14, total 1.83
## 2 Mobile cellular subscriptions 1.72
## 3 Population, total 1.68
## 4 Population ages 15-64, total 1.68
## 5 Educational attainment, at least completed upper secondary, popul… 1.26
## 6 LectureMean_Country 1.23
## 7 ScienceMean_Country 1.22
## 8 Educational attainment, at least completed lower secondary, popul… 1.21
## 9 MathMean_Country 1.21
## 10 Birth rate, crude (per 1,000 people) 1.20
# Extraer los nombres de las variables top
vars_top10 <- top_10_vars$Variable
# Medias por clúster solo para esas variables
medias_top10 <- media_por_cluster %>%
select(Cluster, all_of(vars_top10))
print(medias_top10)
## # A tibble: 5 × 11
## Cluster Population ages 0-14, tot…¹ Mobile cellular subs…² `Population, total`
## <fct> <dbl> <dbl> <dbl>
## 1 1 3.92 3.64 3.55
## 2 2 -0.183 -0.233 -0.260
## 3 3 -0.331 -0.374 -0.369
## 4 4 -0.304 -0.367 -0.358
## 5 5 0.238 0.617 0.658
## # ℹ abbreviated names: ¹`Population ages 0-14, total`,
## # ²`Mobile cellular subscriptions`
## # ℹ 7 more variables: `Population ages 15-64, total` <dbl>,
## # `Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)` <dbl>,
## # LectureMean_Country <dbl>, ScienceMean_Country <dbl>,
## # `Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)` <dbl>,
## # MathMean_Country <dbl>, `Birth rate, crude (per 1,000 people)` <dbl>
Variables mas discriminanates.
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 4.4.3
# Convertir columna Cluster en nombres de fila
matriz_heatmap <- medias_top10
rownames(matriz_heatmap) <- paste("Cluster", matriz_heatmap$Cluster)
## Warning: Setting row names on a tibble is deprecated.
matriz_heatmap$Cluster <- NULL
matriz_heatmap <- as.matrix(matriz_heatmap)
# Graficar heatmap
pheatmap(matriz_heatmap,
cluster_rows = FALSE,
cluster_cols = TRUE,
main = "Top 10 variables más discriminantes entre clústeres")