Cluster 2018

# Cargar librerías necesarias
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

library(rnaturalearth)

## Warning: package 'rnaturalearth' was built under R version 4.4.3

library(rnaturalearthdata)

## Warning: package 'rnaturalearthdata' was built under R version 4.4.3

## 
## Adjuntando el paquete: 'rnaturalearthdata'

## The following object is masked from 'package:rnaturalearth':
## 
##     countries110

library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(cluster)
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.4.3

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(FactoMineR)

## Warning: package 'FactoMineR' was built under R version 4.4.3

library(gridExtra)

## 
## Adjuntando el paquete: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(tidyr)


# Cargar datos de notas medias por país (previamente procesados)
load("estudiantesPISAfiltrados2018.R")
NotasPlausibles2018 = datosPisaEstudiantes_filtrado2018 %>%
  select(matches("^PV|^CNT$|^CNTSCHID$"))
NotasMedias = NotasPlausibles2018 %>% transmute(CNT = NotasPlausibles2018$CNT,
                                        CNTSCHID = NotasPlausibles2018$CNTSCHID,
                                        MathMean = rowMeans(NotasPlausibles2018[,3:12]),
                                        LectureMean = rowMeans(NotasPlausibles2018[,13:22]),
                                        ScienceMean = rowMeans(NotasPlausibles2018[,23:32]))

# 1. Calcular medias por país con manejo de NA/NaN
NotasMediasPais2018 <- NotasMedias %>% 
  group_by(CNT) %>% 
  summarise(
    MathMean_Country = mean(MathMean, na.rm = TRUE),
    LectureMean_Country = mean(LectureMean, na.rm = TRUE),
    ScienceMean_Country = mean(ScienceMean, na.rm = TRUE)
  ) %>%
  # Reemplazar NaN por NA (cuando todos los valores son NA)
  mutate(across(where(is.numeric), ~ifelse(is.nan(.), NA, .))) %>%
  # Eliminar filas con cualquier NA
  filter(complete.cases(.)) %>%
  # Convertir a data.frame (kmeans funciona mejor con data.frames que con tibbles)
  as.data.frame()

# Verificar que no hay NA/NaN/Inf
summary(NotasMediasPais2018)

##      CNT            MathMean_Country LectureMean_Country ScienceMean_Country
##  Length:77          Min.   :341.3    Min.   :339.9       Min.   :352.4      
##  Class :character   1st Qu.:428.6    1st Qu.:426.6       1st Qu.:429.0      
##  Mode  :character   Median :479.0    Median :474.4       Median :475.2      
##                     Mean   :466.7    Mean   :462.6       Mean   :465.7      
##                     3rd Qu.:502.7    3rd Qu.:503.2       3rd Qu.:499.7      
##                     Max.   :593.5    Max.   :560.9       Max.   :594.0

sum(is.na(NotasMediasPais2018))

## [1] 0

sum(is.nan(as.matrix(NotasMediasPais2018[,-1])))

## [1] 0

sum(is.infinite(as.matrix(NotasMediasPais2018[,-1])))

## [1] 0

load("estudiantesPISAfiltrados.R")
NotasPlausibles = datosPisaEstudiantes_filtrado %>%
  select(matches("^PV|^CNT$|^CNTSCHID$"))
NotasMedias = NotasPlausibles %>% transmute(CNT = NotasPlausibles$CNT,
                                        CNTSCHID = NotasPlausibles$CNTSCHID,
                                        MathMean = rowMeans(NotasPlausibles[,3:12]),
                                        LectureMean = rowMeans(NotasPlausibles[,13:22]),
                                        ScienceMean = rowMeans(NotasPlausibles[,23:32]))

# 1. Calcular medias por país con manejo de NA/NaN
NotasMediasPais <- NotasMedias %>%
  group_by(CNT) %>%
  summarise(
    MathMean_Country = mean(MathMean, na.rm = TRUE),
    LectureMean_Country = mean(LectureMean, na.rm = TRUE),
    ScienceMean_Country = mean(ScienceMean, na.rm = TRUE)
  )

NotasMediasPais2018$CNT <- recode(NotasMediasPais2018$CNT,
  "KSV" = "XKX",   # Kosovo
  "QAZ" = "KAZ",   # Asumiendo que QAZ es Kazajistán
  "QCI" = "CHN",   # Si representa China
  "QMR" = "MAR",   # Marruecos
  "QRT" = "TUR",   # Turquía
  "TAP" = "TWN" )    # Taiwán

paises2018 <- unique(datosPisaEstudiantes_filtrado2018$CNT)

paisesOtra <- unique(datosPisaEstudiantes_filtrado$CNT)

paisesSoloEn2018 <- setdiff(paises2018, paisesOtra)
paisesSoloEnOtra <- setdiff(paisesOtra, paises2018)
paisesComunes <- intersect(paises2018, paisesOtra)

cat("✅ Países que están SOLO en PISA 2018:\n")

## ✅ Países que están SOLO en PISA 2018:

print(paisesSoloEn2018)

## [1] "BIH" "BLR" "LUX" "RUS" "VNM" "UKR" "QCI" "QMR" "QRT"

cat("\n✅ Países que están SOLO en la otra base:\n")

## 
## ✅ Países que están SOLO en la otra base:

print(paisesSoloEnOtra)

## [1] "SLV" "PSE" "JAM" "MNG" "QUR" "MKD" "UZB"

cat("\n✅ Países comunes en ambas bases:\n")

## 
## ✅ Países comunes en ambas bases:

print(paisesComunes)

##  [1] "ALB" "QAZ" "ARG" "AUS" "AUT" "BEL" "BRA" "BRN" "BGR" "CAN" "CHL" "TAP"
## [13] "COL" "CRI" "HRV" "CZE" "DNK" "DOM" "EST" "FIN" "FRA" "GEO" "DEU" "GRC"
## [25] "HKG" "HUN" "ISL" "IDN" "IRL" "ISR" "ITA" "KSV" "JPN" "KAZ" "JOR" "KOR"
## [37] "LVA" "LTU" "MAC" "MYS" "MLT" "MEX" "MDA" "MNE" "MAR" "NLD" "NZL" "NOR"
## [49] "PAN" "PER" "PHL" "POL" "PRT" "QAT" "ROU" "SAU" "SRB" "SGP" "SVK" "SVN"
## [61] "ESP" "SWE" "CHE" "THA" "ARE" "TUR" "GBR" "USA" "URY"

Se decide estudiar cuáles son los países que difieren en ambas bases de datos y encontramos que hay bastantes distintos. Trás la optención de estos resultados se buscó información de porque esos países que aparecen en 2018 dejan de aparecer.

Rusia (RUS) y Bielorrusia (BLR): Probable exclusión o retiro por razones geopolíticas tras la invasión rusa a Ucrania en 2022. Muchos organismos internacionales han restringido la participación de Rusia en actividades multilaterales.

Ucrania (UKR): Por la guerra activa, la capacidad de implementar pruebas educativas a nivel nacional se ha visto comprometida.

Vietnam (VNM): Ha tenido intermitencias en su participación. Posiblemente por decisiones administrativas internas o reformas educativas en curso.

Bosnia y Herzegovina (BIH): Ha mostrado participación irregular debido a limitaciones institucionales y presupuestarias.

Luxemburgo (LUX): Aunque miembro activo en ciclos anteriores, puede haber decidido no participar por razones metodológicas o cambios internos en sus prioridades educativas.

También se buscó información para averiguar por que en 2022 aparecen países que no aparecian antes.

El Salvador (SLV), Jamaica (JAM), Mongolia (MNG), Uzbekistán (UZB), Macedonia del Norte (MKD): Nuevos participantes. Muchos países en vías de desarrollo están sumándose gradualmente a PISA conforme mejoran sus capacidades estadísticas y educativas.

Palestina (PSE): Su presencia en bases internacionales varía según reconocimiento y capacidad logística para aplicar la prueba en contextos de conflicto.

Se continua el análisis realizando un análisis clustering de los resultados de 2018 para poder compararlos con el análisis clustering realizado en el objetivo anterior (de los resultados de 2022)

Para comenzar lo primero que se debe decidir es el número óptimo de clusters

p1 = fviz_nbclust(x = NotasMediasPais2018[,-1], FUNcluster = hcut, method = "silhouette", hc_method = "ward.D2", k.max = 10, verbose = FALSE, hc_metric = "euclidean") + labs(title = "Num. optimo clusters")

p2 = fviz_nbclust(x = NotasMediasPais2018[,-1], FUNcluster = hcut, method = "wss", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

# 2. Estandarizar los datos (recomendado para k-means)
datos_estandarizados <- scale(NotasMediasPais2018[,-1])

# 3. Clustering con k-means (5 clusters)
set.seed(100)
clust_kmeans <- kmeans(datos_estandarizados, centers = 5, nstart = 20)

# 4. Asignar clusters a los países
NotasMediasPais2018$Cluster <- clust_kmeans$cluster

# 5. Visualización
world <- ne_countries(scale = "medium", returnclass = "sf")
world$iso_a3[world$name == "France"] <- "FRA"
world$iso_a3[world$name == "Norway"] <- "NOR"

world_clustered <- world %>%
  left_join(NotasMediasPais2018, by = c("iso_a3" = "CNT"))



ggplot(world_clustered) +
  geom_sf(aes(fill = as.factor(Cluster)), color = "white", size = 0.1) +
  scale_fill_brewer(palette = "Set2", na.value = "grey90") +
  labs(title = "Clústeres de países basados en notas PISA 2018",
       subtitle = "5 clusters calculados con k-means",
       fill = "Cluster") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Listar países por clúster
split(NotasMediasPais2018$CNT, NotasMediasPais2018$Cluster)

## $`1`
##  [1] "BLR" "CHL" "DNK" "ESP" "FRA" "GRC" "HRV" "HUN" "ISL" "ISR" "ITA" "LTU"
## [13] "LUX" "LVA" "MLT" "PRT" "MAR" "TUR" "RUS" "SVK" "SVN" "TUR" "UKR" "USA"
## 
## $`2`
##  [1] "AUS" "AUT" "BEL" "CAN" "CHE" "CZE" "DEU" "EST" "FIN" "GBR" "IRL" "JPN"
## [13] "KOR" "NLD" "NOR" "NZL" "POL" "SWE" "TWN"
## 
## $`3`
##  [1] "ALB" "ARE" "ARG" "BGR" "BIH" "BRA" "BRN" "COL" "CRI" "IDN" "JOR" "KAZ"
## [13] "MDA" "MEX" "MNE" "MYS" "PER" "QAT" "KAZ" "ROU" "SRB" "THA" "URY"
## 
## $`4`
## [1] "DOM" "GEO" "XKX" "MAR" "PAN" "PHL" "SAU"
## 
## $`5`
## [1] "HKG" "MAC" "CHN" "SGP"

library(dplyr)
library(ggplot2)
library(FactoMineR)
library(factoextra)

# 1. Preparación de datos con clusters
pisa_cluster_data <- NotasMediasPais2018 %>%
  mutate(Cluster = as.factor(clust_kmeans$cluster))

# 2. Calcular medias y desviaciones estándar por cluster (versión sencilla)
medias_cluster_pisa <- pisa_cluster_data %>%
  group_by(Cluster) %>%
  summarise(
    Math_mean = mean(MathMean_Country, na.rm = TRUE),
    Math_sd = sd(MathMean_Country, na.rm = TRUE),
    Lecture_mean = mean(LectureMean_Country, na.rm = TRUE),
    Lecture_sd = sd(LectureMean_Country, na.rm = TRUE),
    Science_mean = mean(ScienceMean_Country, na.rm = TRUE),
    Science_sd = sd(ScienceMean_Country, na.rm = TRUE)
  )

# 3. Preparar datos para visualización
medias_plot_pisa <- medias_cluster_pisa %>%
  pivot_longer(
    cols = -Cluster,
    names_to = c("Variable", "stat"),
    names_sep = "_",
    values_to = "value"
  ) %>%
  pivot_wider(
    names_from = "stat",
    values_from = "value"
  ) %>%
  mutate(Variable = case_when(
    Variable == "Math" ~ "Matemáticas",
    Variable == "Lecture" ~ "Lectura",
    Variable == "Science" ~ "Ciencias",
    TRUE ~ Variable
  ))

# 4. Visualización con gráfico de barras
ggplot(medias_plot_pisa, aes(x = Cluster, y = mean, fill = Variable)) +
  geom_bar(stat = "identity", position = position_dodge(0.9), width = 0.8) +
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.25,
    position = position_dodge(0.9),
    color = "black"
  ) +
  scale_fill_brewer(
    palette = "Set1",
    labels = c("Matemáticas", "Lectura", "Ciencias")
  ) +
  labs(
    title = "Puntuaciones medias PISA por cluster 2018",
    subtitle = "Barras muestran medias con desviación estándar",
    x = "Cluster",
    y = "Puntuación media",
    fill = "Área"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))  # Ajustar eje Y

Ahora se hace el cluster añadiendo las variables que se encuentran en la base de datos del banco mundial

bancomundial <- read.csv("datos_filtrados_finales2018.csv")


# Combinar datos de PISA con Banco Mundial
datos_combi <- merge(
  NotasMediasPais2018, 
  bancomundial, 
  by.x = "CNT", 
  by.y = "Country.Code",
  all.x = TRUE
)

library(dplyr)
library(purrr)

# 1. Convertir columnas relevantes a numéricas
bm_pisa_numeric <- datos_combi %>%
  mutate(across(-CNT, ~ {
    # Primero convertir a carácter por si hay factores
    x_char <- as.character(.x)
    # Luego a numérico, convirtiendo cadenas vacías y "NA" a NA reales
    as.numeric(ifelse(x_char %in% c("", "NA"), NA, x_char))
  }))

# 2. Seleccionar solo columnas con suficientes datos
bm_pisa_clean <- bm_pisa_numeric %>%
  # Eliminar columnas con más del 50% de NAs
  select(where(~ mean(is.na(.x)) <= 0.5)) %>%
  # Eliminar filas con cualquier NA restante
  filter(complete.cases(.))

# 3. Verificar estructura
str(bm_pisa_clean)

## 'data.frame':    40 obs. of  42 variables:
##  $ CNT                                                                                                    : chr  "ARE" "AUS" "BEL" "BIH" ...
##  $ MathMean_Country                                                                                       : num  432 493 519 411 432 ...
##  $ LectureMean_Country                                                                                    : num  428 507 504 408 410 ...
##  $ ScienceMean_Country                                                                                    : num  431 506 510 402 433 ...
##  $ Cluster                                                                                                : num  3 2 2 3 3 2 2 1 3 2 ...
##  $ Compulsory.education..duration..years.                                                                 : num  12 11 12 9 9 10 11 12 12 10 ...
##  $ Educational.attainment..at.least.Bachelor.s.or.equivalent..population.25...total......cumulative.      : num  51.1 36.8 39.4 12.3 17.6 ...
##  $ Educational.attainment..at.least.completed.lower.secondary..population.25...total......cumulative.     : num  85.5 95 89.6 91 87.5 ...
##  $ Educational.attainment..at.least.completed.post.secondary..population.25...total......cumulative.      : num  56.4 53.9 41.5 15.1 37.4 ...
##  $ Educational.attainment..at.least.completed.primary..population.25..years..total......cumulative.       : num  93.2 99.5 96.6 95.4 90.7 ...
##  $ Educational.attainment..at.least.completed.upper.secondary..population.25...total......cumulative.     : num  74.4 81.7 75.1 72.3 68.8 ...
##  $ Educational.attainment..at.least.Master.s.or.equivalent..population.25...total......cumulative.        : num  11.48 9.89 17.28 1.96 4.34 ...
##  $ Educational.attainment..Doctoral.or.equivalent..population.25...total......cumulative.                 : num  0.83 1.85 0.925 0.21 0.629 ...
##  $ GNI.per.capita..Atlas.method..current.US..                                                             : num  46070 60710 53680 7710 30970 ...
##  $ GNI.per.capita..PPP..constant.2021.international...                                                    : num  68868 59028 60988 19034 74309 ...
##  $ Labor.force.with.advanced.education....of.total.working.age.population.with.advanced.education.        : num  77.9 78.8 76 80.3 80.9 ...
##  $ Labor.force.with.basic.education....of.total.working.age.population.with.basic.education.              : num  85.9 41.9 27.7 24.7 46.3 ...
##  $ Labor.force.with.intermediate.education....of.total.working.age.population.with.intermediate.education.: num  70.7 72.8 57.4 60.7 69 ...
##  $ Unemployment.with.advanced.education....of.total.labor.force.with.advanced.education.                  : num  4.12 2.28 3.02 10.49 6.63 ...
##  $ Unemployment.with.basic.education....of.total.labor.force.with.basic.education.                        : num  0.421 7.71 13.461 16.087 5.095 ...
##  $ Unemployment.with.intermediate.education....of.total.labor.force.with.intermediate.education.          : num  1.02 3.98 5.71 13.06 4.76 ...
##  $ Individuals.using.the.Internet....of.population.                                                       : num  100 94.9 94 78.8 99 ...
##  $ Fixed.broadband.subscriptions                                                                          : num  3770000 9580000 5070000 876000 90200 ...
##  $ Mobile.cellular.subscriptions                                                                          : num  20000000 28700000 11800000 3810000 528723 ...
##  $ Secure.Internet.servers                                                                                : num  16388 1255844 360344 15308 9606 ...
##  $ GDP.growth..annual...                                                                                  : num  7.51 4.24 4.23 4.23 -1.63 ...
##  $ GDP.per.capita..constant.2015.US..                                                                     : num  42688 61010 44584 6327 28549 ...
##  $ Access.to.electricity....of.population.                                                                : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ Access.to.clean.fuels.and.technologies.for.cooking....of.population.                                   : num  100 100 100 41.1 100 100 100 100 93.6 100 ...
##  $ Population.ages.0.14..total                                                                            : num  1659913 4743867 1930701 428690 96456 ...
##  $ Population.ages.15.64..total                                                                           : num  8249860 16828565 7440659 2099794 330673 ...
##  $ Population.ages.0.14....of.total.population.                                                           : num  16.5 18.2 16.5 13.4 21.2 ...
##  $ Population.ages.15.64....of.total.population.                                                          : num  81.9 64.7 63.7 65.5 72.6 ...
##  $ Population.ages.65.and.above....of.total.population.                                                   : num  1.64 17.08 19.77 21.1 6.2 ...
##  $ Population.ages.65.and.above..total                                                                    : num  165203 4441967 2308850 676318 28241 ...
##  $ Population..total                                                                                      : num  10074977 26014399 11680210 3204802 455370 ...
##  $ Preprimary.education..duration..years.                                                                 : num  4 5 6 6 3 6 2 6 6 3 ...
##  $ Proportion.of.seats.held.by.women.in.national.parliaments....                                          : num  50 38.41 42.67 16.67 9.09 ...
##  $ Employment.in.industry....of.total.employment...modeled.ILO.estimate.                                  : num  29.3 18.6 19.1 31 21.5 ...
##  $ Employment.in.agriculture....of.total.employment...modeled.ILO.estimate.                               : num  1.36 2.177 0.908 16.873 1.517 ...
##  $ Employment.in.services....of.total.employment...modeled.ILO.estimate.                                  : num  69.3 79.2 80 52.2 77 ...
##  $ Birth.rate..crude..per.1.000.people.                                                                   : num  9.89 11.6 9.8 8.28 13.49 ...

# 4. Estandarizar los datos para el clustering
bm_pisa_scaled <- bm_pisa_clean %>%
  select(-CNT) %>%  # Excluir la columna de país
  scale() %>%
  as.data.frame()

# 5. Realizar el clustering k-means (k=5)
set.seed(100)
clust_bm <- kmeans(bm_pisa_scaled, centers = 5, nstart = 20)

# 6. Preparar datos para visualización
resultados_cluster <- bm_pisa_clean %>%
  select(CNT) %>%
  mutate(Cluster = clust_bm$cluster)

# 7. Visualización del mapa
library(rnaturalearth)
library(ggplot2)

world <- ne_countries(scale = "medium", returnclass = "sf")

world$iso_a3[world$name == "France"] <- "FRA"
world$iso_a3[world$name == "Norway"] <- "NOR"

world_clustered <- world %>%
  left_join(resultados_cluster, by = c("iso_a3" = "CNT"))

ggplot(world_clustered) +
  geom_sf(aes(fill = as.factor(Cluster)), color = "white", size = 0.1) +
  scale_fill_brewer(palette = "Set2", na.value = "grey90") +
  labs(title = "Clústeres basados en PISA y Banco Mundial 2018",
       subtitle = "5 clusters (k-means con datos estandarizados)",
       fill = "Cluster") +
  theme_minimal()

library(dplyr)

resultados_cluster %>%
  group_by(Cluster) %>%
  summarise(Paises = paste(sort(CNT), collapse = ", ")) %>%
  print(n = Inf)

## # A tibble: 5 × 2
##   Cluster Paises                                                    
##     <int> <chr>                                                     
## 1       1 BIH, BRN, CHL, COL, DOM, MDA, MYS, PAN, SRB               
## 2       2 CZE, EST, GRC, HRV, HUN, LTU, LVA, POL, ROU, SVK, SVN     
## 3       3 CAN, DEU, ESP, FRA, ITA, KOR                              
## 4       4 ARE, AUS, BEL, CHE, DNK, FIN, IRL, ISR, MLT, NLD, NOR, SWE
## 5       5 IDN, PHL

library(dplyr)
library(ggplot2)
library(tidyr)
library(purrr)

# 1. Limpieza y preparación de datos
bm_pisa_clean <- datos_combi %>%
  mutate(across(-CNT, ~ {
    x_char <- as.character(.x)
    as.numeric(ifelse(x_char %in% c("", "NA"), NA, x_char))
  })) %>%
  # Eliminar columnas con >50% NAs
  select(where(~ mean(is.na(.x)) <= 0.5)) %>%
  # Eliminar filas con NAs (alternativa: tidyr::fill() o imputación)
  filter(complete.cases(.))

# 2. Estandarización y clustering
set.seed(100)
bm_pisa_scaled <- bm_pisa_clean %>%
  select(-CNT) %>%
  scale() %>%
  as.data.frame()

clust_bm <- kmeans(bm_pisa_scaled, centers = 5, nstart = 20)

# 3. Asignar clusters a los datos originales
bm_cluster_data <- bm_pisa_clean %>%
  mutate(Cluster = as.factor(clust_bm$cluster))

# 4. Análisis de variables importantes (ajusta los patrones según tus datos)
vars_importantes <- names(bm_cluster_data) %>%
  grep("GDP|Internet|education|employment|GNI|PISA|math|science|reading", 
       ., value = TRUE, ignore.case = TRUE)

# 5. Estadísticas por cluster
medias_cluster <- bm_cluster_data %>%
  select(Cluster, all_of(vars_importantes)) %>%
  group_by(Cluster) %>%
  summarise(across(
    everything(),
    list(mean = ~mean(., na.rm = TRUE),
         sd = ~sd(., na.rm = TRUE))
  ))

# 6. Preparar para visualización
medias_plot <- medias_cluster %>%
  pivot_longer(
    cols = -Cluster,
    names_to = c("Variable", ".value"),
    names_sep = "_"
  )

## Warning: Expected 2 pieces. Additional pieces discarded in 4 rows [1, 2, 3, 4].

# 7. Visualización de indicadores
ggplot(medias_plot, aes(x = Cluster, y = mean, fill = Variable)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.2,
    position = position_dodge(0.8),
    color = "black"
  ) +
  labs(title = "Indicadores por Cluster 2018") +
  theme_minimal()

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_col()`).

library(ggplot2)
library(dplyr)
library(tidyr)

library(dplyr)

# Asegúrate de tener esto: variables estandarizadas + clúster
bm_pisa_scaled$Cluster <- as.factor(clust_bm$cluster)

# Calcular medias por clúster
media_por_cluster <- bm_pisa_scaled %>%
  group_by(Cluster) %>%
  summarise(across(everything(), mean))

# Quitar la columna "Cluster" y calcular SD por variable (entre clústeres)
sd_entre_clusters <- media_por_cluster %>%
  select(-Cluster) %>%
  summarise(across(everything(), sd)) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Desviacion")

# Top 10 variables más discriminantes
top_10_vars <- sd_entre_clusters %>%
  arrange(desc(Desviacion)) %>%
  slice_head(n = 10)

print(top_10_vars)

## # A tibble: 10 × 2
##    Variable                                                           Desviacion
##    <chr>                                                                   <dbl>
##  1 Population.ages.0.14..total                                              1.81
##  2 Mobile.cellular.subscriptions                                            1.70
##  3 Population..total                                                        1.66
##  4 Population.ages.15.64..total                                             1.66
##  5 Educational.attainment..at.least.completed.upper.secondary..popul…       1.26
##  6 Educational.attainment..at.least.completed.lower.secondary..popul…       1.26
##  7 LectureMean_Country                                                      1.25
##  8 Birth.rate..crude..per.1.000.people.                                     1.24
##  9 Population.ages.0.14....of.total.population.                             1.21
## 10 Population.ages.65.and.above..total                                      1.16

# Extraer los nombres de las variables top
vars_top10 <- top_10_vars$Variable

# Medias por clúster solo para esas variables
medias_top10 <- media_por_cluster %>%
  select(Cluster, all_of(vars_top10))
print(medias_top10)

## # A tibble: 5 × 11
##   Cluster Population.ages.0.14..total Mobile.cellular.subscr…¹ Population..total
##   <fct>                         <dbl>                    <dbl>             <dbl>
## 1 1                            -0.190                   -0.234            -0.260
## 2 2                            -0.333                   -0.379            -0.373
## 3 3                             0.226                    0.599             0.639
## 4 4                            -0.310                   -0.376            -0.367
## 5 5                             3.87                     3.59              3.51 
## # ℹ abbreviated name: ¹Mobile.cellular.subscriptions
## # ℹ 7 more variables: Population.ages.15.64..total <dbl>,
## #   Educational.attainment..at.least.completed.upper.secondary..population.25...total......cumulative. <dbl>,
## #   Educational.attainment..at.least.completed.lower.secondary..population.25...total......cumulative. <dbl>,
## #   LectureMean_Country <dbl>, Birth.rate..crude..per.1.000.people. <dbl>,
## #   Population.ages.0.14....of.total.population. <dbl>,
## #   Population.ages.65.and.above..total <dbl>

library(pheatmap)

## Warning: package 'pheatmap' was built under R version 4.4.3

print(medias_top10)

## # A tibble: 5 × 11
##   Cluster Population.ages.0.14..total Mobile.cellular.subscr…¹ Population..total
##   <fct>                         <dbl>                    <dbl>             <dbl>
## 1 1                            -0.190                   -0.234            -0.260
## 2 2                            -0.333                   -0.379            -0.373
## 3 3                             0.226                    0.599             0.639
## 4 4                            -0.310                   -0.376            -0.367
## 5 5                             3.87                     3.59              3.51 
## # ℹ abbreviated name: ¹Mobile.cellular.subscriptions
## # ℹ 7 more variables: Population.ages.15.64..total <dbl>,
## #   Educational.attainment..at.least.completed.upper.secondary..population.25...total......cumulative. <dbl>,
## #   Educational.attainment..at.least.completed.lower.secondary..population.25...total......cumulative. <dbl>,
## #   LectureMean_Country <dbl>, Birth.rate..crude..per.1.000.people. <dbl>,
## #   Population.ages.0.14....of.total.population. <dbl>,
## #   Population.ages.65.and.above..total <dbl>

# Convertir columna Cluster en nombres de fila
matriz_heatmap <- medias_top10
rownames(matriz_heatmap) <- paste("Cluster", matriz_heatmap$Cluster)

## Warning: Setting row names on a tibble is deprecated.

matriz_heatmap$Cluster <- NULL
matriz_heatmap <- as.matrix(matriz_heatmap)

# Graficar heatmap
pheatmap(matriz_heatmap,
         cluster_rows = FALSE,
         cluster_cols = TRUE,
         main = "Top 10 variables más discriminantes entre clústeres")

Cluster 2018

emma

2025-05-20