Primero de todo cargamos las librerias que necesitaremos

library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.4.3
library(rnaturalearthdata)
## Warning: package 'rnaturalearthdata' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
## 
##     countries110
library(dplyr)
library(cluster)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.4.3
library(gridExtra)
## 
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(tidyr)



# Cargar datos de notas medias por país (previamente procesados)
load("estudiantesPISAfiltrados.R")
NotasPlausibles = datosPisaEstudiantes_filtrado %>%
  select(matches("^PV|^CNT$|^CNTSCHID$"))
NotasMedias = NotasPlausibles %>% transmute(CNT = NotasPlausibles$CNT,
                                        CNTSCHID = NotasPlausibles$CNTSCHID,
                                        MathMean = rowMeans(NotasPlausibles[,3:12]),
                                        LectureMean = rowMeans(NotasPlausibles[,13:22]),
                                        ScienceMean = rowMeans(NotasPlausibles[,23:32]))

# 1. Calcular medias por país con manejo de NA/NaN
NotasMediasPais <- NotasMedias %>% 
  group_by(CNT) %>% 
  summarise(
    MathMean_Country = mean(MathMean, na.rm = TRUE),
    LectureMean_Country = mean(LectureMean, na.rm = TRUE),
    ScienceMean_Country = mean(ScienceMean, na.rm = TRUE)
  ) %>%
  # Reemplazar NaN por NA (cuando todos los valores son NA)
  mutate(across(where(is.numeric), ~ifelse(is.nan(.), NA, .))) %>%
  # Eliminar filas con cualquier NA
  filter(complete.cases(.)) %>%
  # Convertir a data.frame (kmeans funciona mejor con data.frames que con tibbles)
  as.data.frame()

# Verificar que no hay NA/NaN/Inf
summary(NotasMediasPais)
##      CNT            MathMean_Country LectureMean_Country ScienceMean_Country
##  Length:76          Min.   :343.5    Min.   :337.7       Min.   :356.2      
##  Class :character   1st Qu.:396.2    1st Qu.:406.0       1st Qu.:411.8      
##  Mode  :character   Median :449.1    Median :448.5       Median :461.2      
##                     Mean   :445.1    Mean   :442.6       Mean   :454.4      
##                     3rd Qu.:483.6    3rd Qu.:484.0       3rd Qu.:497.6      
##                     Max.   :574.6    Max.   :543.2       Max.   :561.5
sum(is.na(NotasMediasPais))
## [1] 0
sum(is.nan(as.matrix(NotasMediasPais[,-1])))
## [1] 0
sum(is.infinite(as.matrix(NotasMediasPais[,-1])))
## [1] 0

Ahora hacemos como antes limpiamos y aagrupamos lo datos por países. Como este código es muy rápido de ejecutar no hemos guardado los resultados.

# 2. Estandarizar los datos (recomendado para k-means)
datos_estandarizados <- scale(NotasMediasPais[,-1])

# 3. Clustering con k-means (5 clusters)
set.seed(100)
clust_kmeans <- kmeans(datos_estandarizados, centers = 5, nstart = 20)

# 4. Asignar clusters a los países
NotasMediasPais$Cluster <- clust_kmeans$cluster

# 5. Visualización
world <- ne_countries(scale = "medium", returnclass = "sf")

world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Kazakhstan"] = "QAZ"
world$iso_a3[world$admin == "Taiwan"] = "TAP"
world$iso_a3[world$admin == "Qatar"] ="QUR"
world$iso_a3[world$admin == "China"] = "QCI"
world_clustered <- world %>%
  left_join(NotasMediasPais, by = c("iso_a3" = "CNT"))

ggplot(world_clustered) +
  geom_sf(aes(fill = as.factor(Cluster)), color = "white", size = 0.1) +
  scale_fill_brewer(palette = "Set2", na.value = "grey90") +
  labs(title = "Clústeres de países basados en notas PISA 2022",
       subtitle = "5 clusters calculados con k-means",
       fill = "Cluster") +
  theme_minimal() +
  theme(legend.position = "bottom")

Imprimimos los códigos de los países que pertenecen a cada clúster.

split(NotasMediasPais$CNT, NotasMediasPais$Cluster)
## $`1`
##  [1] "AUS" "AUT" "BEL" "CAN" "CHE" "CZE" "DEU" "DNK" "ESP" "FIN" "FRA" "GBR"
## [13] "HRV" "HUN" "IRL" "ISR" "ITA" "LTU" "LVA" "NLD" "NOR" "NZL" "POL" "PRT"
## [25] "SVN" "SWE" "USA"
## 
## $`2`
##  [1] "ARE" "BRN" "CHL" "GRC" "ISL" "KAZ" "MLT" "QUR" "ROU" "SRB" "SVK" "TUR"
## [13] "URY"
## 
## $`3`
## [1] "EST" "HKG" "JPN" "KOR" "MAC" "SGP" "TAP"
## 
## $`4`
##  [1] "ARG" "BGR" "BRA" "COL" "CRI" "GEO" "JAM" "MDA" "MEX" "MNE" "MNG" "MYS"
## [13] "PER" "QAT" "QAZ" "SAU" "THA"
## 
## $`5`
##  [1] "ALB" "DOM" "IDN" "JOR" "KSV" "MAR" "MKD" "PAN" "PHL" "PSE" "SLV" "UZB"

Hacemos el grafico de barras donde se diferencien las medias de cada cluster en cada rama.

library(dplyr)
library(ggplot2)
library(FactoMineR)
library(factoextra)

# 1. Preparación de datos con clusters
pisa_cluster_data <- NotasMediasPais %>%
  mutate(Cluster = as.factor(clust_kmeans$cluster))

# 2. Calcular medias y desviaciones estándar por cluster (versión sencilla)
medias_cluster_pisa <- pisa_cluster_data %>%
  group_by(Cluster) %>%
  summarise(
    Math_mean = mean(MathMean_Country, na.rm = TRUE),
    Math_sd = sd(MathMean_Country, na.rm = TRUE),
    Lecture_mean = mean(LectureMean_Country, na.rm = TRUE),
    Lecture_sd = sd(LectureMean_Country, na.rm = TRUE),
    Science_mean = mean(ScienceMean_Country, na.rm = TRUE),
    Science_sd = sd(ScienceMean_Country, na.rm = TRUE)
  )

# 3. Preparar datos para visualización
medias_plot_pisa <- medias_cluster_pisa %>%
  pivot_longer(
    cols = -Cluster,
    names_to = c("Variable", "stat"),
    names_sep = "_",
    values_to = "value"
  ) %>%
  pivot_wider(
    names_from = "stat",
    values_from = "value"
  ) %>%
  mutate(Variable = case_when(
    Variable == "Math" ~ "Matemáticas",
    Variable == "Lecture" ~ "Lectura",
    Variable == "Science" ~ "Ciencias",
    TRUE ~ Variable
  ))

# 4. Visualización con gráfico de barras
ggplot(medias_plot_pisa, aes(x = Cluster, y = mean, fill = Variable)) +
  geom_bar(stat = "identity", position = position_dodge(0.9), width = 0.8) +
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.25,
    position = position_dodge(0.9),
    color = "black"
  ) +
  scale_fill_brewer(
    palette = "Set1",
    labels = c("Matemáticas", "Lectura", "Ciencias")
  ) +
  labs(
    title = "Puntuaciones medias PISA por cluster 2022",
    subtitle = "Barras muestran medias con desviación estándar",
    x = "Cluster",
    y = "Puntuación media",
    fill = "Área"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))  # Ajustar eje Y

Cargamos y hacemos un merge con los datos del banco mundial.

load("bancomundial.R")


# Combinar datos de PISA con Banco Mundial
datos_combi <- merge(
  NotasMediasPais, 
  datos_filtrado, 
  by.x = "CNT", 
  by.y = "Country.Code",
  all.x = TRUE
)

Pasamos todas las columnas a numéricas

# 1. Convertir columnas relevantes a numéricas
bm_pisa_numeric <- datos_combi %>%
  mutate(across(-CNT, ~ {
    # Primero convertir a carácter por si hay factores
    x_char <- as.character(.x)
    # Luego a numérico, convirtiendo cadenas vacías y "NA" a NA reales
    as.numeric(ifelse(x_char %in% c("", "NA"), NA, x_char))
  }))

# 2. Seleccionar solo columnas con suficientes datos
bm_pisa_clean <- bm_pisa_numeric %>%
  # Eliminar columnas con más del 50% de NAs
  select(where(~ mean(is.na(.x)) <= 0.5)) %>%
  # Eliminar filas con cualquier NA restante
  filter(complete.cases(.))

# 3. Verificar estructura
str(bm_pisa_clean)
## 'data.frame':    41 obs. of  42 variables:
##  $ CNT                                                                                                    : chr  "ARE" "AUS" "BEL" "BRN" ...
##  $ MathMean_Country                                                                                       : num  438 489 498 441 490 ...
##  $ LectureMean_Country                                                                                    : num  426 501 487 428 498 ...
##  $ ScienceMean_Country                                                                                    : num  440 510 499 445 507 ...
##  $ Cluster                                                                                                : num  2 1 1 2 1 1 2 4 1 1 ...
##  $ Compulsory education, duration (years)                                                                 : num  12 11 12 9 10 11 12 12 10 13 ...
##  $ Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)      : num  51.1 36.8 39.4 17.6 32.4 ...
##  $ Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)     : num  85.5 95 89.6 87.5 96.1 ...
##  $ Educational attainment, at least completed post-secondary, population 25+, total (%) (cumulative)      : num  56.4 53.9 41.5 37.4 67.3 ...
##  $ Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)       : num  93.2 99.5 96.6 90.7 96.1 ...
##  $ Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)     : num  74.4 81.7 75.1 68.8 89.2 ...
##  $ Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)        : num  11.48 9.89 17.28 4.34 10.94 ...
##  $ Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)                 : num  0.83 1.85 0.925 0.629 0 ...
##  $ GNI per capita, Atlas method (current US$)                                                             : num  46070 60710 53680 30970 53300 ...
##  $ GNI per capita, PPP (constant 2021 international $)                                                    : num  68868 59028 60988 74309 57734 ...
##  $ Labor force with advanced education (% of total working-age population with advanced education)        : num  77.9 78.8 76 80.9 73.4 ...
##  $ Labor force with basic education (% of total working-age population with basic education)              : num  85.9 41.9 27.7 46.3 42.8 ...
##  $ Labor force with intermediate education (% of total working-age population with intermediate education): num  70.7 72.8 57.4 69 61.1 ...
##  $ Unemployment with advanced education (% of total labor force with advanced education)                  : num  4.12 2.28 3.02 6.63 4.12 ...
##  $ Unemployment with basic education (% of total labor force with basic education)                        : num  0.421 7.71 13.461 5.095 11.304 ...
##  $ Unemployment with intermediate education (% of total labor force with intermediate education)          : num  1.02 3.98 5.71 4.76 6.78 ...
##  $ Individuals using the Internet (% of population)                                                       : num  100 94.9 94 99 94 ...
##  $ Fixed broadband subscriptions                                                                          : num  3770000 9580000 5070000 90200 16400000 ...
##  $ Mobile cellular subscriptions                                                                          : num  20000000 28700000 11800000 528723 35400000 ...
##  $ Secure Internet servers                                                                                : num  16388 1255844 360344 9606 1699682 ...
##  $ GDP growth (annual %)                                                                                  : num  7.51 4.24 4.23 -1.63 3.82 ...
##  $ GDP per capita (constant 2015 US$)                                                                     : num  42688 61010 44584 28549 45227 ...
##  $ Access to electricity (% of population)                                                                : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ Access to clean fuels and technologies for cooking (% of population)                                   : num  100 100 100 100 100 100 100 93.6 100 100 ...
##  $ Population ages 0-14, total                                                                            : num  1659913 4743867 1930701 96456 6054208 ...
##  $ Population ages 15-64, total                                                                           : num  8249860 16828565 7440659 330673 25509499 ...
##  $ Population ages 0-14 (% of total population)                                                           : num  16.5 18.2 16.5 21.2 15.5 ...
##  $ Population ages 15-64 (% of total population)                                                          : num  81.9 64.7 63.7 72.6 65.5 ...
##  $ Population ages 65 and above (% of total population)                                                   : num  1.64 17.08 19.77 6.2 18.94 ...
##  $ Population ages 65 and above, total                                                                    : num  165203 4441967 2308850 28241 7375348 ...
##  $ Population, total                                                                                      : num  10074977 26014399 11680210 455370 38939056 ...
##  $ Preprimary education, duration (years)                                                                 : num  4 5 6 3 6 2 6 6 3 6 ...
##  $ Proportion of seats held by women in national parliaments (%)                                          : num  50 38.41 42.67 9.09 30.47 ...
##  $ Employment in industry (% of total employment) (modeled ILO estimate)                                  : num  29.3 18.6 19.1 21.5 19.4 ...
##  $ Employment in agriculture (% of total employment) (modeled ILO estimate)                               : num  1.36 2.177 0.908 1.517 1.307 ...
##  $ Employment in services (% of total employment) (modeled ILO estimate)                                  : num  69.3 79.2 80 77 79.3 ...
##  $ Birth rate, crude (per 1,000 people)                                                                   : num  9.89 11.6 9.8 13.49 9 ...
# 4. Estandarizar los datos para el clustering
bm_pisa_scaled <- bm_pisa_clean %>%
  select(-CNT) %>%  # Excluir la columna de país
  scale() %>%
  as.data.frame()

# 5. Realizar el clustering k-means (k=5)
set.seed(100)
clust_bm <- kmeans(bm_pisa_scaled, centers = 5, nstart = 20)

# 6. Preparar datos para visualización
resultados_cluster <- bm_pisa_clean %>%
  select(CNT) %>%
  mutate(Cluster = clust_bm$cluster)

# 7. Visualización del mapa
library(rnaturalearth)
library(ggplot2)

world <- ne_countries(scale = "medium", returnclass = "sf")

world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Kazakhstan"] = "QAZ"
world$iso_a3[world$admin == "Taiwan"] = "TAP"
world$iso_a3[world$admin == "Qatar"]="QUR"
world$iso_a3[world$admin == "China"] = "QCI"

world_clustered <- world %>%
  left_join(resultados_cluster, by = c("iso_a3" = "CNT"))

ggplot(world_clustered) +
  geom_sf(aes(fill = as.factor(Cluster)), color = "white", size = 0.1) +
  scale_fill_brewer(palette = "Set2", na.value = "grey90") +
  labs(title = "Clústeres basados en PISA y Banco Mundial",
       subtitle = "5 clusters (k-means con datos estandarizados) 2022",
       fill = "Cluster") +
  theme_minimal()

library(dplyr)
library(grid)

resultados_cluster %>%
  group_by(Cluster) %>%
  summarise(Paises = paste(sort(CNT), collapse = ", ")) %>%
  print(n = Inf)
## # A tibble: 5 × 2
##   Cluster Paises                                                    
##     <int> <chr>                                                     
## 1       1 IDN, PHL                                                  
## 2       2 BRN, CHL, COL, DOM, MDA, MKD, MNG, MYS, PAN               
## 3       3 CZE, EST, GRC, HRV, HUN, LTU, LVA, POL, ROU, SRB, SVK, SVN
## 4       4 ARE, AUS, BEL, CHE, DNK, FIN, IRL, ISR, MLT, NLD, NOR, SWE
## 5       5 CAN, DEU, ESP, FRA, ITA, KOR

Este gráfico muestra las medias y desviaciones estándar de variables clave (como GDP, educación o rendimiento PISA) agrupadas por cluster. Cada barra representa el promedio de una variable en un cluster específico, mientras que las líneas de error indican la variabilidad. Permite comparar el desempeño de los clusters en distintos indicadores socioeconómicos y educativos.

library(dplyr)
library(ggplot2)
library(tidyr)
library(purrr)

# 1. Limpieza y preparación de datos
bm_pisa_clean <- datos_combi %>%
  mutate(across(-CNT, ~ {
    x_char <- as.character(.x)
    as.numeric(ifelse(x_char %in% c("", "NA"), NA, x_char))
  })) %>%
  # Eliminar columnas con >50% NAs
  select(where(~ mean(is.na(.x)) <= 0.5)) %>%
  # Eliminar filas con NAs (alternativa: tidyr::fill() o imputación)
  filter(complete.cases(.))

# 2. Estandarización y clustering
set.seed(100)
bm_pisa_scaled <- bm_pisa_clean %>%
  select(-CNT) %>%
  scale() %>%
  as.data.frame()

clust_bm <- kmeans(bm_pisa_scaled, centers = 5, nstart = 20)

# 3. Asignar clusters a los datos originales
bm_cluster_data <- bm_pisa_clean %>%
  mutate(Cluster = as.factor(clust_bm$cluster))

# 4. Análisis de variables importantes (ajusta los patrones según tus datos)
vars_importantes <- names(bm_cluster_data) %>%
  grep("GDP|Internet|education|employment|GNI|PISA|math|science|reading", 
       ., value = TRUE, ignore.case = TRUE)

# 5. Estadísticas por cluster
medias_cluster <- bm_cluster_data %>%
  select(Cluster, all_of(vars_importantes)) %>%
  group_by(Cluster) %>%
  summarise(across(
    everything(),
    list(mean = ~mean(., na.rm = TRUE),
         sd = ~sd(., na.rm = TRUE))
  ))

# 6. Preparar para visualización
medias_plot <- medias_cluster %>%
  pivot_longer(
    cols = -Cluster,
    names_to = c("Variable", ".value"),
    names_sep = "_"
  )
## Warning: Expected 2 pieces. Additional pieces discarded in 4 rows [1, 2, 3, 4].
# 7. Visualización de indicadores
ggplot(medias_plot, aes(x = Cluster, y = mean, fill = Variable)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.2,
    position = position_dodge(0.8),
    color = "black"
  ) +
  labs(title = "Indicadores por Cluster 2022") +
  theme_minimal() 
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_col()`).

Mostramos cuales han sido las variables más discriminantes

library(dplyr)

# Asegúrate de tener esto: variables estandarizadas + clúster
bm_pisa_scaled$Cluster <- as.factor(clust_bm$cluster)

# Calcular medias por clúster
media_por_cluster <- bm_pisa_scaled %>%
  group_by(Cluster) %>%
  summarise(across(everything(), mean))

# Quitar la columna "Cluster" y calcular SD por variable (entre clústeres)
sd_entre_clusters <- media_por_cluster %>%
  select(-Cluster) %>%
  summarise(across(everything(), sd)) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Desviacion")

# Top 10 variables más discriminantes
top_10_vars <- sd_entre_clusters %>%
  arrange(desc(Desviacion)) %>%
  slice_head(n = 10)

print(top_10_vars)
## # A tibble: 10 × 2
##    Variable                                                           Desviacion
##    <chr>                                                                   <dbl>
##  1 Population ages 0-14, total                                              1.83
##  2 Mobile cellular subscriptions                                            1.72
##  3 Population, total                                                        1.68
##  4 Population ages 15-64, total                                             1.68
##  5 Educational attainment, at least completed upper secondary, popul…       1.26
##  6 LectureMean_Country                                                      1.23
##  7 ScienceMean_Country                                                      1.22
##  8 Educational attainment, at least completed lower secondary, popul…       1.21
##  9 MathMean_Country                                                         1.21
## 10 Birth rate, crude (per 1,000 people)                                     1.20
# Extraer los nombres de las variables top
vars_top10 <- top_10_vars$Variable

# Medias por clúster solo para esas variables
medias_top10 <- media_por_cluster %>%
  select(Cluster, all_of(vars_top10))

print(medias_top10)
## # A tibble: 5 × 11
##   Cluster Population ages 0-14, tot…¹ Mobile cellular subs…² `Population, total`
##   <fct>                         <dbl>                  <dbl>               <dbl>
## 1 1                             3.92                   3.64                3.55 
## 2 2                            -0.183                 -0.233              -0.260
## 3 3                            -0.331                 -0.374              -0.369
## 4 4                            -0.304                 -0.367              -0.358
## 5 5                             0.238                  0.617               0.658
## # ℹ abbreviated names: ¹​`Population ages 0-14, total`,
## #   ²​`Mobile cellular subscriptions`
## # ℹ 7 more variables: `Population ages 15-64, total` <dbl>,
## #   `Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)` <dbl>,
## #   LectureMean_Country <dbl>, ScienceMean_Country <dbl>,
## #   `Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)` <dbl>,
## #   MathMean_Country <dbl>, `Birth rate, crude (per 1,000 people)` <dbl>

Variables mas discriminanates.

library(pheatmap)
## Warning: package 'pheatmap' was built under R version 4.4.3
# Convertir columna Cluster en nombres de fila
matriz_heatmap <- medias_top10
rownames(matriz_heatmap) <- paste("Cluster", matriz_heatmap$Cluster)
## Warning: Setting row names on a tibble is deprecated.
matriz_heatmap$Cluster <- NULL
matriz_heatmap <- as.matrix(matriz_heatmap)

# Graficar heatmap
pheatmap(matriz_heatmap,
         cluster_rows = FALSE,
         cluster_cols = TRUE,
         main = "Top 10 variables más discriminantes entre clústeres")