Librerías

library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(haven)
library(ggplot2)
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.4.3
library(rnaturalearthdata)
## Warning: package 'rnaturalearthdata' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
## 
##     countries110
library(rnaturalearthhires)
library(cluster)
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(NbClust)
library(clValid)
## Warning: package 'clValid' was built under R version 4.4.3
library(grid)
library(gridExtra)
## 
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggsci)
library(missMDA)
## Warning: package 'missMDA' was built under R version 4.4.3
library(readxl)
library(mice)
## 
## Adjuntando el paquete: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind

Limpieza de datos

#EJECUTAR ESTE CÓDIGO UNA SOLA VEZ
datosPisaEstudiantes = read_sas("C:/Users/David/UPV/PROYII/proyecto/datosPisaEstudiantes.SAS7BDAT", NULL)

numNA = apply(datosPisaEstudiantes, 1, function(x) sum(is.na(x)))

percNA = round(100*apply(datosPisaEstudiantes, 1, function(x) mean(is.na(x))), 2)
tablaNA2 = data.frame(numNA, percNA)
summary(tablaNA2$numNA)

barplot(table(tablaNA2$percNA), xlab = "% Valores faltantes", ylab = "Número de casos",
        main = "Valores faltantes por estudiante")

datosPisaEstudiantes_filtrado <- datosPisaEstudiantes %>%
  select(which(colMeans(is.na(datosPisaEstudiantes)) <= 0.2))

#Filtrado de estudiantes puede que sea innecesario
datosPisaEstudiantes_filtrado = datosPisaEstudiantes_filtrado %>% filter(rowMeans(is.na(.)) <= 0.2)

save(datosPisaEstudiantes_filtrado, file="estudiantesPISAfiltrados.R")

Creación de notas medias por estudiante y por país

load("estudiantesPISAfiltrados.R")
NotasPlausibles = datosPisaEstudiantes_filtrado %>%
  select(matches("^PV|^CNT$|^CNTSCHID$"))
NotasMedias = NotasPlausibles %>% transmute(CNT = NotasPlausibles$CNT,
                                        CNTSCHID = NotasPlausibles$CNTSCHID,
                                        MathMean = rowMeans(NotasPlausibles[,3:12]),
                                        LectureMean = rowMeans(NotasPlausibles[,13:22]),
                                        ScienceMean = rowMeans(NotasPlausibles[,23:32]))

NotasMediasPais = NotasMedias %>% group_by(CNT) %>% summarise(
  MathMean_Country = mean(MathMean, na.rm = TRUE),
  MathSD = sd(MathMean, na.rm = TRUE),
  LectureMean_Country = mean(LectureMean, na.rm = TRUE),
  LectureSD = sd(LectureMean, na.rm = TRUE),
  ScienceMean_Country = mean(ScienceMean, na.rm = TRUE),
  ScienceSD = sd(ScienceMean, na.rm = TRUE))

#He quitado todos los View y puesto todos los textos para que se vean bien aunque hayan los gráficos de clústeres óptimos y de Silhouette. Falta cambiar nombres de algunos gráficos y poner alguna explicación más

Boxplot medias estudiantes

par(mar = c(5, 7, 4, 2))
boxplot(NotasMedias$MathMean, NotasMedias$LectureMean, NotasMedias$ScienceMean, 
        names = c("Matemáticas", "Lectura", "Ciencias"), 
        main = "Valores atípicos medias estudiantes", 
        col = c("red", "lightblue", "green"),
        horizontal = TRUE, cex.axis = 0.8)

summary(NotasMedias)
##      CNT               CNTSCHID           MathMean      LectureMean    
##  Length:562708      Min.   :  800001   Min.   :150.2   Min.   : 37.96  
##  Class :character   1st Qu.:21400219   1st Qu.:370.8   1st Qu.:365.57  
##  Mode  :character   Median :40000043   Median :438.2   Median :441.23  
##                     Mean   :43679973   Mean   :446.4   Mean   :443.54  
##                     3rd Qu.:68800107   3rd Qu.:514.6   3rd Qu.:519.47  
##                     Max.   :90100182   Max.   :843.3   Max.   :890.17  
##   ScienceMean    
##  Min.   : 51.34  
##  1st Qu.:379.33  
##  Median :449.85  
##  Mean   :455.78  
##  3rd Qu.:527.40  
##  Max.   :852.45

Boxplot medias países

par(mar = c(5, 7, 4, 2))
boxplot(
        NotasMediasPais$MathMean_Country, 
        NotasMediasPais$LectureMean_Country, 
        NotasMediasPais$ScienceMean_Country, 
        names = c("Matemáticas", "Lectura", "Ciencias"), 
        main = "Valores atípicos medias países", 
        col = c("red", "lightblue", "green"),
        horizontal = TRUE, cex.axis = 0.8)

summary(NotasMediasPais)
##      CNT            MathMean_Country     MathSD       LectureMean_Country
##  Length:76          Min.   :343.5    Min.   : 48.07   Min.   :337.7      
##  Class :character   1st Qu.:396.2    1st Qu.: 74.31   1st Qu.:406.0      
##  Mode  :character   Median :449.1    Median : 82.78   Median :448.5      
##                     Mean   :445.1    Mean   : 80.63   Mean   :442.6      
##                     3rd Qu.:483.6    3rd Qu.: 90.03   3rd Qu.:484.0      
##                     Max.   :574.6    Max.   :108.34   Max.   :543.2      
##    LectureSD      ScienceMean_Country   ScienceSD     
##  Min.   : 59.36   Min.   :356.2       Min.   : 55.49  
##  1st Qu.: 81.24   1st Qu.:411.8       1st Qu.: 76.93  
##  Median : 86.80   Median :461.2       Median : 86.68  
##  Mean   : 87.58   Mean   :454.4       Mean   : 84.59  
##  3rd Qu.: 96.22   3rd Qu.:497.6       3rd Qu.: 93.52  
##  Max.   :114.10   Max.   :561.5       Max.   :104.17

Identificar códigos que no hacen bien el join

world = ne_countries(scale = "medium", returnclass = "sf")
world <- left_join(world, NotasMediasPais, by = c("iso_a3" = "CNT"))
iso_validos = world$iso_a3[!is.na(world$MathMean_Country)]
iso_novalidos = setdiff(NotasMediasPais$CNT, iso_validos)
print(iso_novalidos)
## [1] "FRA" "KSV" "NOR" "QAZ" "QUR" "TAP"
#Poner este código cada vez que se cree world
world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"
#QAZ es Baku (Azerbaiyan) y QUR es algunas regiones de Ucrania

Mapa medias por país

world = ne_countries(scale = "medium", returnclass = "sf")

world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"

world <- left_join(world, NotasMediasPais, by = c("iso_a3" = "CNT"))

ggplot(data = world) +
  geom_sf(aes(fill = MathMean_Country), color = "white") +
  scale_fill_viridis_c(option = "magma", na.value = "gray90") +  
  labs(title = "Puntaje Medio en Matemáticas por País",
       fill = "Puntaje") +
  theme_minimal()

ggplot(data = world) +
  geom_sf(aes(fill = LectureMean_Country), color = "white") +
  scale_fill_viridis_c(option = "magma", na.value = "gray90") +  
  labs(title = "Puntaje Medio en Lectura por País",
       fill = "Puntaje") +
  theme_minimal()

ggplot(data = world) +
  geom_sf(aes(fill = ScienceMean_Country), color = "white") +
  scale_fill_viridis_c(option = "magma", na.value = "gray90") +  
  labs(title = "Puntaje Medio en Ciencia por País",
       fill = "Puntaje") +
  theme_minimal()

Heatmap Medias y Desviaciones típicas

midist = get_dist(NotasMediasPais[2:ncol(NotasMediasPais)], stand = FALSE, method = "euclidean")
fviz_dist(midist, show_labels = TRUE, lab_size = 0.3,
          gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

Estadístico Hopkins

#nrow(NotasMediasPais)
set.seed(100)
myN = c(20, 35, 50, 65)  # m
myhopkins = NULL
myseed = sample(1:1000, 10)
for (i in myN) {
  for (j in myseed) {
    tmp = get_clust_tendency(data = NotasMediasPais[,2:ncol(NotasMediasPais)], n = i, graph = FALSE, seed = j)
    myhopkins = c(myhopkins, tmp$hopkins_stat)
  }
}
summary(myhopkins)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7923  0.8240  0.8315  0.8330  0.8429  0.8730
#Hopkins cercano a 1 indica datos agrupados

Prueba de PCA

#Escalar por bloques?
res.pca = PCA(NotasMediasPais, 
              graph = FALSE, ncp = 2, quali.sup = 1)
fviz_pca_ind(res.pca, axes = c(1, 2), geom = c("point", "text"), 
             habillage = "CNT", repel = TRUE, labelsize = 2)

fviz_pca_var(res.pca, axes = c(1,2), repel = TRUE, col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))

Clustering por países

Visualización número de clústeres óptimos (Silhouette y SC intraclúster)

Ward

p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "silhouette", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "wss", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

Método Jerárquico de la media

p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "silhouette", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "wss", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

#En los métodos jerárquicos se puede meter la variable CNT y no da error, pero mejor quitarla para que nos diga warnings (en los métodos de partición no se puede incluir directamente)

En los jerárquicos cogemos 5 clústeres, ya que los más óptimos son 2, 4 o 5 clústeres, pero 2 es demasiado poco y 5 es un poco mejor que 4 en ambos criterios.

K-Medias

p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = kmeans, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = kmeans, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
grid.arrange(p1, p2, nrow = 1)

K-Medoides

p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = pam, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = pam, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
grid.arrange(p1, p2, nrow = 1)

Excluyendo la posibilidad de hacerlo con dos clústeres, con el método de k-medias nos salen 4 o 6 como la cantidad idonea de clústeres. Con el método de k-medoides nos sale otra vez 5 clústeres.

Creación de clusterings

clust1 <- hclust(midist, method="ward.D2")
grupos1 <- cutree(clust1, k=5)
table(grupos1)
## grupos1
##  1  2  3  4  5 
## 18 14 30  8  6
gruposA = cbind(grupos1, NotasMediasPais[,"CNT"])

clust2 <- hclust(midist, method="average")
grupos2 <- cutree(clust2, k=5)
table(grupos2)
## grupos2
##  1  2  3  4  5 
## 18 14 28  8  8
gruposB = cbind(grupos2, NotasMediasPais[,"CNT"])

set.seed(100)
clust3 <- kmeans(NotasMediasPais[,-1], centers = 4, nstart = 20)
table(clust3$cluster)
## 
##  1  2  3  4 
## 15 23 31  7
gruposC = cbind(clust3$cluster, NotasMediasPais[,"CNT"])
colnames(gruposC)[1] = "grupos3"

clust4 <- pam(NotasMediasPais, k = 5)
table(clust4$clustering)
## 
##  1  2  3  4  5 
## 13 24 17 15  7
gruposD = cbind(clust4$cluster, NotasMediasPais[,"CNT"])
colnames(gruposD)[1] = "grupos4"

grupos1 == grupos2
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
## [25]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [73]  TRUE  TRUE  TRUE  TRUE

Son prácticamente iguales los clusterings en los dos métodos jerárquicos, solo cambia en que incluye irlanda y estonia en el grupo de singapur.

Visualización de clústeres

world = ne_countries(scale = "large", returnclass = "sf")

world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"

world_clustered <- world %>%
  left_join(gruposA, by = c("iso_a3" = "CNT"))
world_clustered$grupos1 <- as.factor(world_clustered$grupos1)

ggplot(world_clustered) +
  geom_sf(aes(fill = grupos1), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set3", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países WARD k=5", fill = "Cluster")

world_clustered <- world %>%
  left_join(gruposB, by = c("iso_a3" = "CNT"))
world_clustered$grupos2 <- as.factor(world_clustered$grupos2)

ggplot(world_clustered) +
  geom_sf(aes(fill = grupos2), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set3", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países MEDIAS k=5", fill = "Cluster")

world_clustered <- world %>%
  left_join(gruposC, by = c("iso_a3" = "CNT"))
world_clustered$grupos3 <- as.factor(world_clustered$grupos3)

ggplot(world_clustered) +
  geom_sf(aes(fill = grupos3), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set3", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países KMEDIAS k=4", fill = "Cluster")

world_clustered <- world %>%
  left_join(gruposD, by = c("iso_a3" = "CNT"))
world_clustered$grupos4 <- as.factor(world_clustered$grupos4)

ggplot(world_clustered) +
  geom_sf(aes(fill = grupos4), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set3", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países KMEDOIDES k=5", fill = "Cluster")

Visualización del mejor clustering

colores = pal_npg("nrc")(6)
colores2 = pal_npg("nrc")(7)
par(mfrow = c(1,2))
plot(silhouette(grupos1, midist), col=colores, border=NA, main = "WARD")
plot(silhouette(grupos2, midist), col=colores, border=NA, main = "MEDIAS")

plot(silhouette(clust3$cluster, midist), col=colores, border=NA, main = "K-MEDIAS")
plot(silhouette(clust4$clustering, midist), col=colores2, border=NA, main = "K-MEDOIDES")

Según el coeficiente de Silhouette el mejor método es el de k-medias, así que lo escogemos. Además no tiene ningún individuo con un coeficiente de Silhouette negativo.

Validación del clustering

NotasNumericas = NotasMediasPais[,-1]
NotasNumericas = as.data.frame(NotasNumericas)

metodos = c("hierarchical","kmeans","pam")
validacion = suppressMessages(clValid(NotasNumericas, nClust = 4:6, metric = "euclidean", clMethods = metodos, validation = c("internal", "stability"),
method = "ward"))
## Warning in clValid(NotasNumericas, nClust = 4:6, metric = "euclidean",
## clMethods = metodos, : rownames for data not specified, using 1:nrow(data)
summary(validacion)
## 
## Clustering Methods:
##  hierarchical kmeans pam 
## 
## Cluster sizes:
##  4 5 6 
## 
## Validation Measures:
##                                  4       5       6
##                                                   
## hierarchical APN            0.2186  0.1967  0.1723
##              AD            49.3383 40.1190 35.3354
##              ADM           24.3513 15.5680 11.9886
##              FOM           14.2901 12.3023 10.9805
##              Connectivity  14.8532 19.9964 27.0885
##              Dunn           0.2218  0.2663  0.1502
##              Silhouette     0.4536  0.4568  0.3689
## kmeans       APN            0.0669  0.1233  0.0779
##              AD            41.2157 36.9759 31.8536
##              ADM            6.2354  9.1124  5.1827
##              FOM           13.8448 12.1483 10.9892
##              Connectivity  14.6960 20.6234 24.0770
##              Dunn           0.2335  0.2667  0.1871
##              Silhouette     0.4936  0.4656  0.3924
## pam          APN            0.1100  0.0395  0.1354
##              AD            45.1380 34.6045 33.3412
##              ADM           12.1841  2.8777  7.8905
##              FOM           14.3613 11.4065 11.0022
##              Connectivity  21.2111 22.8972 27.9671
##              Dunn           0.0503  0.2012  0.2667
##              Silhouette     0.4432  0.4618  0.4418
## 
## Optimal Scores:
## 
##              Score   Method       Clusters
## APN           0.0395 pam          5       
## AD           31.8536 kmeans       6       
## ADM           2.8777 pam          5       
## FOM          10.9805 hierarchical 6       
## Connectivity 14.6960 kmeans       4       
## Dunn          0.2667 kmeans       5       
## Silhouette    0.4936 kmeans       4

Considerando todo lo anterior junto con estos parámetros creemos que el mejor método sigue siendo k-means con 4 clústeres. Dando como parámetros más importantes el coeficiente de Silhouette, el índice de Dunn y la conectividad.

Visualización del Clúster final

world_clustered <- world %>%
  left_join(gruposC, by = c("iso_a3" = "CNT"))
world_clustered$grupos3 <- as.factor(world_clustered$grupos3)

ggplot(world_clustered) +
  geom_sf(aes(fill = grupos3), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set2", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países KMEDIAS k=4", fill = "Cluster")

PaisesCluster = merge(NotasMediasPais, gruposC, by = "CNT")

MediasCluster = PaisesCluster %>%
  group_by(grupos3) %>%
  summarise(
    nota_media_Mates = mean(MathMean_Country),
    nota_media_Lectura = mean(LectureMean_Country),
    nota_media_Ciencias = mean(ScienceMean_Country)
    )
MediasCluster$nota_media_General = rowMeans(MediasCluster[,2:ncol(MediasCluster)])
MediasCluster
## # A tibble: 4 × 5
##   grupos3 nota_media_Mates nota_media_Lectura nota_media_Ciencias
##     <int>            <dbl>              <dbl>               <dbl>
## 1       1             371.               362.                377.
## 2       2             417.               419.                428.
## 3       3             480.               483.                492.
## 4       4             541.               516.                538.
## # ℹ 1 more variable: nota_media_General <dbl>
MediasCluster2 <- MediasCluster %>%
  pivot_longer(
    cols = starts_with("nota"),
    names_to = "area",
    values_to = "nota"
  )

MediasCluster2$area <- gsub("nota_media_", "", MediasCluster2$area)

ggplot(MediasCluster2, aes(x = factor(grupos3), y = nota, fill = area)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Nota media por área y clúster",
    x = "Clúster",
    y = "Nota media",
    fill = "Área"
  ) +
  theme_minimal()

PaisesCluster %>%
  group_by(grupos3) %>%
  summarise(paises = paste(CNT, collapse = ", "))
## # A tibble: 4 × 2
##   grupos3 paises                                                                
##     <int> <chr>                                                                 
## 1       1 ALB, DOM, GEO, IDN, JOR, KSV, MAR, MKD, PAN, PHL, PSE, QAZ, SAU, SLV,…
## 2       2 ARE, ARG, BGR, BRA, BRN, COL, CRI, GRC, ISL, JAM, KAZ, MDA, MEX, MNE,…
## 3       3 AUS, AUT, BEL, CAN, CHE, CHL, CZE, DEU, DNK, ESP, FIN, FRA, GBR, HRV,…
## 4       4 EST, HKG, JPN, KOR, MAC, SGP, TAP

Las notas medias de los clústeres van de menos a más (el clúster 1 es el que menor nota tiene y el 4 es el que mayor nota tiene). Vemos que esta subida de notas ocurre en todas las áreas.

Clúster 1 = {Marruecos, Arabia Saudí, Uzbekistán, Indonesia, Filipinas, Georgia…}. Países subdesarrollados que no destacan por su educación.

Clúster 2 = {Latinoamérica, Balcanes, Kazajistán, Mongolia, Tailandia…}. Países subdesarrollados o en vías de desarrollo, tienen notas un poco mejor que el grupo anterior.

Clúster 3 = {Europa, Canadá, EEUU, Australia, Nueva Zelanda…}. Países desarrollados con buena educación.

Clúster 4 = {Estonia, Corea del Sur, Japón, Singapur, Macao, Hong Kong…}. Países desarrollados que suelen ser pequeños y destacan por su avanzado programa educativo.

Clustering España

Creación del df de medias por CCAA

ESP = datosPisaEstudiantes_filtrado[datosPisaEstudiantes_filtrado$CNT=="ESP",]
PlausiblesESP = ESP %>%
  select(matches("^PV|^CNTSCHID$|^REGION$"))
MediasESP = PlausiblesESP %>% transmute(CNTSCHID = PlausiblesESP$CNTSCHID,
                                        REGION = PlausiblesESP$REGION,
                                        MathMean = rowMeans(PlausiblesESP[,3:12]),
                                        LectureMean = rowMeans(PlausiblesESP[,13:22]),
                                        ScienceMean = rowMeans(PlausiblesESP[,23:32]))
MediasREG = MediasESP %>% group_by(REGION) %>% summarise(
                                                    MathMeanReg = mean(MathMean),
                                                    MathSDReg = sd(MathMean),
                                                    LectureMeanReg = mean(LectureMean),
                                                    LectureSDReg = sd(LectureMean),
                                                    ScienceMeanReg = mean(ScienceMean),
                                                    ScienceSDReg = sd(ScienceMean)
                                                    )
regions = ne_states(country = "Spain", returnclass = "sf")

#código del .SAS
cod_reg <- tibble(
  REGION = c(72401, 72402, 72403, 72404, 72405, 72406, 72407, 72408, 
             72409, 72410, 72411, 72412, 72413, 72414, 72415, 72416, 
             72417, 72418, 72419),
  REGION_NAME = c("Andalucía",
                  "Aragón",
                  "Asturias",
                  "Islas Baleares",
                  "Canary Is.",
                  "Cantabria",
                  "Castilla y León",
                  "Castilla-La Mancha",
                  "Cataluña",
                  "Extremadura",
                  "Galicia",
                  "La Rioja",
                  "Madrid",
                  "Murcia",
                  "Foral de Navarra",
                  "País Vasco",
                  "Valenciana",
                  "Ceuta",
                  "Melilla")
)

MediasREG = MediasREG %>% left_join(cod_reg, by = "REGION")
MediasREG = MediasREG %>% select(REGION_NAME, matches("^Math|^Lecture|^Science"))

Boxplot medias estudiantes de España

par(mar = c(5, 7, 4, 2))
boxplot(MediasESP$MathMean, MediasESP$LectureMean, MediasESP$ScienceMean, 
        names = c("Matemáticas", "Lectura", "Ciencias"), 
        main = "Valores atípicos medias estudiantes España", 
        col = c("red", "lightblue", "green"),
        horizontal = TRUE, cex.axis = 0.8)

summary(MediasESP)
##     CNTSCHID            REGION         MathMean      LectureMean   
##  Min.   :72400001   Min.   :72401   Min.   :201.9   Min.   :121.8  
##  1st Qu.:72400252   1st Qu.:72405   1st Qu.:428.4   1st Qu.:425.0  
##  Median :72400497   Median :72410   Median :485.8   Median :487.5  
##  Mean   :72400495   Mean   :72410   Mean   :483.3   Mean   :482.8  
##  3rd Qu.:72400741   3rd Qu.:72414   3rd Qu.:540.7   3rd Qu.:544.2  
##  Max.   :72400987   Max.   :72419   Max.   :773.5   Max.   :759.6  
##   ScienceMean   
##  Min.   :172.4  
##  1st Qu.:435.1  
##  Median :495.1  
##  Mean   :492.3  
##  3rd Qu.:551.7  
##  Max.   :767.6

Boxplot medias comunidades autónomas

par(mar = c(5, 7, 4, 2))
boxplot(
        MediasREG$MathMeanReg, 
        MediasREG$LectureMeanReg, 
        MediasREG$ScienceMeanReg, 
        names = c("Matemáticas", "Lectura", "Ciencias"), 
        main = "Valores atípicos medias Com. Autónomas", 
        col = c("red", "lightblue", "green"),
        horizontal = TRUE, cex.axis = 0.8)

summary(MediasREG)
##  REGION_NAME         MathMeanReg      MathSDReg     LectureMeanReg 
##  Length:19          Min.   :413.9   Min.   :75.25   Min.   :423.7  
##  Class :character   1st Qu.:469.7   1st Qu.:77.02   1st Qu.:472.1  
##  Mode  :character   Median :478.6   Median :78.42   Median :477.8  
##                     Mean   :476.8   Mean   :78.55   Mean   :477.8  
##                     3rd Qu.:496.6   3rd Qu.:79.64   3rd Qu.:491.5  
##                     Max.   :505.0   Max.   :82.86   Max.   :504.1  
##   LectureSDReg   ScienceMeanReg   ScienceSDReg  
##  Min.   :80.44   Min.   :427.5   Min.   :77.93  
##  1st Qu.:83.06   1st Qu.:479.6   1st Qu.:80.12  
##  Median :84.38   Median :488.4   Median :82.06  
##  Mean   :84.91   Mean   :486.8   Mean   :81.74  
##  3rd Qu.:85.96   3rd Qu.:505.2   3rd Qu.:83.46  
##  Max.   :92.72   Max.   :511.8   Max.   :86.38

HeatMap España

midist2 = get_dist(MediasREG[2:ncol(MediasREG)], stand = FALSE, method = "euclidean")
fviz_dist(midist2, show_labels = TRUE, lab_size = 0.3,
          gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

Estadístico Hopkins España

set.seed(100)
myN = c(5, 8, 10, 15)  # cambiando tamaños de muestra
myhopkins = NULL
myseed = sample(1:1000, 10)
for (i in myN) {
  for (j in myseed) {
    tmp = get_clust_tendency(data = MediasREG[,2:ncol(MediasREG)], n = i, graph = FALSE, seed = j)
    myhopkins = c(myhopkins, tmp$hopkins_stat)
  }
}
summary(myhopkins)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6624  0.7737  0.7947  0.7893  0.8136  0.8383

Mapa medias por Comunidad Autónoma

mapSPAIN = regions %>% left_join(MediasREG, by = c("region" = "REGION_NAME"))

ggplot(mapSPAIN) +
  geom_sf(aes(fill = MathMeanReg), color = NA) +
  scale_fill_viridis_c(option = "C") +
  theme_minimal() +
  labs(title = "Media en Matemáticas por Comunidad Autónoma", fill = "Media Matemáticas")

ggplot(mapSPAIN) +
  geom_sf(aes(fill = LectureMeanReg), color = NA) +
  scale_fill_viridis_c(option = "C") +
  theme_minimal() +
  labs(title = "Media en Lectura por Comunidad Autónoma", fill = "Media Lectura")

ggplot(mapSPAIN) +
  geom_sf(aes(fill = ScienceMeanReg), color = NA) +
  scale_fill_viridis_c(option = "C") +
  theme_minimal() +
  labs(title = "Media en Ciencias por Comunidad Autónoma", fill = "Media Ciencias")

Visualización número de clústeres óptimos (Silhouette y SC intraclúster)

Ward

p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "silhouette", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "wss", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

Método jerárquico de la media

p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "silhouette", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "wss", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

K-Means

p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = kmeans, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = kmeans, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
grid.arrange(p1, p2, nrow = 1)

K-Medoides

p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = pam, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = pam, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
grid.arrange(p1, p2, nrow = 1)

En todos los métodos parece que el número óptimo de clústeres es 3 basándonos en el criterio del codo en la SC intraclúster y en el criterio de maximizar el coeficiente de Silhouette.

Creación de clusterings

Eclust1 <- hclust(midist2, method="ward.D2")
Egrupos1 <- cutree(Eclust1, k=3)
table(Egrupos1)
## Egrupos1
##  1  2  3 
## 10  7  2
ESPgruposA = cbind(Egrupos1, MediasREG[,"REGION_NAME"])

Eclust2 <- hclust(midist2, method="average")
Egrupos2 <- cutree(Eclust2, k=3)
table(Egrupos2)
## Egrupos2
##  1  2  3 
## 10  7  2
ESPgruposB = cbind(Egrupos2, MediasREG[,"REGION_NAME"])

set.seed(100)
Eclust3 <- kmeans(MediasREG[,-1], centers = 3, nstart = 20)
table(Eclust3$cluster)
## 
## 1 2 3 
## 2 8 9
ESPgruposC = cbind(Eclust3$cluster, MediasREG[,"REGION_NAME"])
colnames(ESPgruposC)[1] = "ESPgrupos3"

Eclust4 <- pam(MediasREG, k = 3)
table(Eclust4$clustering)
## 
##  1  2  3 
## 10  7  2
ESPgruposD = cbind(Eclust4$cluster, MediasREG[,"REGION_NAME"])
colnames(ESPgruposD)[1] = "ESPgrupos4"

Visualización del mejor clustering

par(mfrow = c(1,2))
plot(silhouette(Egrupos1, midist2), col=colores, border=NA, main = "WARD")
plot(silhouette(Egrupos2, midist2), col=colores, border=NA, main = "MEDIAS")

plot(silhouette(Eclust3$cluster, midist2), col=colores, border=NA, main = "K-MEDIAS")
plot(silhouette(Eclust4$clustering, midist2), col=colores2, border=NA, main = "K-MEDOIDES")

Vemos que el mejor parece el método k-medias con k = 3 ya que no hay ningún elemento mal colocado y el coeficiente de Silhouette.

Validación del clustering

numESP = MediasREG[,-1]
numESP = as.data.frame(numESP)

validacion = suppressMessages(clValid(numESP, nClust = 3:5, metric = "euclidean", clMethods = metodos, validation = c("internal", "stability"),
method = "ward"))
## Warning in clValid(numESP, nClust = 3:5, metric = "euclidean", clMethods =
## metodos, : rownames for data not specified, using 1:nrow(data)
summary(validacion)
## 
## Clustering Methods:
##  hierarchical kmeans pam 
## 
## Cluster sizes:
##  3 4 5 
## 
## Validation Measures:
##                                  3       4       5
##                                                   
## hierarchical APN            0.0368  0.0959  0.1067
##              AD            15.1087 12.4892 10.7653
##              ADM            1.7603  2.9143  3.1940
##              FOM            5.2969  4.9283  4.9603
##              Connectivity  10.7206 19.1706 24.5230
##              Dunn           0.2775  0.4244  0.4244
##              Silhouette     0.5864  0.4660  0.4007
## kmeans       APN            0.0154  0.1076  0.1132
##              AD            14.2907 12.6164 10.5438
##              ADM            0.7204  3.2313  3.1357
##              FOM            5.0181  5.0983  4.9537
##              Connectivity  10.1579 19.1706 24.3552
##              Dunn           0.3815  0.4244  0.4483
##              Silhouette     0.6046  0.4660  0.4151
## pam          APN            0.0154  0.1094  0.0921
##              AD            14.2907 12.7162 10.2569
##              ADM            0.7204  3.4544  2.4605
##              FOM            5.0181  4.9580  4.4473
##              Connectivity  10.1579 19.1706 24.3552
##              Dunn           0.3815  0.4244  0.4483
##              Silhouette     0.6046  0.4660  0.4151
## 
## Optimal Scores:
## 
##              Score   Method Clusters
## APN           0.0154 kmeans 3       
## AD           10.2569 pam    5       
## ADM           0.7204 kmeans 3       
## FOM           4.4473 pam    5       
## Connectivity 10.1579 kmeans 3       
## Dunn          0.4483 kmeans 5       
## Silhouette    0.6046 kmeans 3

Siguiendo estos criterios parece que el clustering mejor sigue siendo el de k-medias con k = 3, aunque son bastante parecidos al haber solo 15 comunidades autónomas y 2 ciudades autónomas.

Visualización del Clúster final

#Intercambiamos los clústers 2 y 3 para que sea similar al de países y ordenados por nota. La columna de clúster ahora es ESPgrupos3_mod
ESPgruposC$ESPgrupos3_mod <- ESPgruposC$ESPgrupos3
ESPgruposC$ESPgrupos3_mod[ESPgruposC$ESPgrupos3 == 2] <- 99
ESPgruposC$ESPgrupos3_mod[ESPgruposC$ESPgrupos3 == 3] <- 2
ESPgruposC$ESPgrupos3_mod[ESPgruposC$ESPgrupos3_mod == 99] <- 3

mapSPAIN <- regions %>%
  left_join(ESPgruposC, by = c("region" = "REGION_NAME"))
mapSPAIN$ESPgrupos3_mod <- as.factor(mapSPAIN$ESPgrupos3_mod)

ggplot(mapSPAIN) +
  geom_sf(aes(fill = ESPgrupos3_mod), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set2", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de CCAA españolas KMEDIAS k=3", fill = "Cluster")

#Con k = 3 todos los clústers son prácticamente iguales

CCAACluster = merge(MediasREG, ESPgruposC, by = "REGION_NAME")

MediasClusterRegiones = CCAACluster %>%
  group_by(ESPgrupos3_mod) %>%
  summarise(
    nota_media_Mates = mean(MathMeanReg),
    nota_media_Lectura = mean(LectureMeanReg),
    nota_media_Ciencias = mean(ScienceMeanReg)
    )
MediasClusterRegiones$nota_media_General = rowMeans(MediasClusterRegiones[,2:ncol(MediasClusterRegiones)])
MediasClusterRegiones
## # A tibble: 3 × 5
##   ESPgrupos3_mod nota_media_Mates nota_media_Lectura nota_media_Ciencias
##            <dbl>            <dbl>              <dbl>               <dbl>
## 1              1             417.               424.                428.
## 2              2             472.               474.                483.
## 3              3             498.               495.                506.
## # ℹ 1 more variable: nota_media_General <dbl>
MediasClusterRegiones2 <- MediasClusterRegiones %>%
  pivot_longer(
    cols = starts_with("nota"),
    names_to = "area",
    values_to = "nota"
  )

MediasClusterRegiones2$area <- gsub("nota_media_", "", MediasClusterRegiones2$area)

ggplot(MediasClusterRegiones2, aes(x = factor(ESPgrupos3_mod), y = nota, fill = area)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Nota media por área y clúster",
    x = "Clúster",
    y = "Nota media",
    fill = "Área"
  ) +
  theme_minimal()

CCAACluster %>%
  group_by(ESPgrupos3_mod) %>%
  summarise(regiones = paste(REGION_NAME, collapse = ", "))
## # A tibble: 3 × 2
##   ESPgrupos3_mod regiones                                                       
##            <dbl> <chr>                                                          
## 1              1 Ceuta, Melilla                                                 
## 2              2 Andalucía, Canary Is., Castilla-La Mancha, Cataluña, Extremadu…
## 3              3 Aragón, Asturias, Cantabria, Castilla y León, Foral de Navarra…

Los clústeres ordenados de menor nota (en todas las áreas) a mayor son: Clúster 1 < Clúster 3 < Clúster 2.

Clúster 1 = {Ceuta, Melilla}. Ciudades autónomas situadas en África que están menos conectadas con el resto de España y suelen tener menos recursos.

Clúster 2 = {Galicia, Asturias, Cantabria, Castilla y León, Navarra, Aragón, La Rioja, Madrid}. Comunidades autónomas del norte de España, muchas suelen destacar en índices socieconómicos y tener desarrollo tecnológico.

Clúster 3 = {Islas Canarias, Andalucía, Comunidad Valenciana, Extremadura, Castilla-La Mancha, Murcia, Cataluña, País Vasco}. La mayoría de estas Comunidades autónomas suelen estar en el sur de España y estar un poco peor que las del clúster 2 en cuanto a índices socioeconómicos y tecnología.

Clustering Banco Mundial + NotasMediasPaís

escuelas <- read_sas("escuelas.SAS7BDAT", NULL)

bancomundial =load('bancomundial.R')
bancomundial <- datos_filtrado

bm_media_res<- merge(
  NotasMediasPais, 
  bancomundial, 
  by.x = "CNT",  # Columna de países en NotasMediasPais
  by.y = "Country.Code",  # Columna de países en bancomundial (ajusta según tu CSV)
  all.x = TRUE  # Conservar todos los países de PISA aunque falten datos del BM
)

write.csv(bm_media_res, file = "bm_media_res.csv", row.names = FALSE)

datos_numericos <- read_excel("datos_bm_res.xlsx")

datos = subset(datos_numericos,select = -c(CNT))
summary(datos)
##  MathMean_Country LectureMean_Country ScienceMean_Country
##  Min.   :343.5    Min.   :337.7       Min.   :356.2      
##  1st Qu.:396.2    1st Qu.:406.0       1st Qu.:411.8      
##  Median :449.1    Median :448.5       Median :461.2      
##  Mean   :445.1    Mean   :442.6       Mean   :454.4      
##  3rd Qu.:483.6    3rd Qu.:484.0       3rd Qu.:497.6      
##  Max.   :574.6    Max.   :543.2       Max.   :561.5      
##                                                          
##  Compulsory education, duration (years)
##  Min.   : 6.00                         
##  1st Qu.:10.00                         
##  Median :11.00                         
##  Mean   :10.93                         
##  3rd Qu.:12.00                         
##  Max.   :15.00                         
##  NA's   :18                            
##  Current education expenditure, total (% of total expenditure in public institutions)
##  Min.   :77.62                                                                       
##  1st Qu.:84.73                                                                       
##  Median :91.75                                                                       
##  Mean   :90.27                                                                       
##  3rd Qu.:96.85                                                                       
##  Max.   :99.34                                                                       
##  NA's   :69                                                                          
##  Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)
##  Min.   : 9.481                                                                                   
##  1st Qu.:19.822                                                                                   
##  Median :26.128                                                                                   
##  Mean   :26.834                                                                                   
##  3rd Qu.:33.694                                                                                   
##  Max.   :51.070                                                                                   
##  NA's   :23                                                                                       
##  Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)
##  Min.   :48.79                                                                                     
##  1st Qu.:75.92                                                                                     
##  Median :89.87                                                                                     
##  Mean   :84.01                                                                                     
##  3rd Qu.:96.46                                                                                     
##  Max.   :99.72                                                                                     
##  NA's   :18                                                                                        
##  Educational attainment, at least completed post-secondary, population 25+, total (%) (cumulative)
##  Min.   :12.11                                                                                    
##  1st Qu.:24.71                                                                                    
##  Median :34.66                                                                                    
##  Mean   :35.45                                                                                    
##  3rd Qu.:43.50                                                                                    
##  Max.   :67.27                                                                                    
##  NA's   :23                                                                                       
##  Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)
##  Min.   : 63.81                                                                                  
##  1st Qu.: 91.23                                                                                  
##  Median : 97.56                                                                                  
##  Mean   : 93.95                                                                                  
##  3rd Qu.: 99.21                                                                                  
##  Max.   :100.00                                                                                  
##  NA's   :18                                                                                      
##  Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)
##  Min.   :34.46                                                                                     
##  1st Qu.:55.53                                                                                     
##  Median :74.85                                                                                     
##  Mean   :69.24                                                                                     
##  3rd Qu.:81.59                                                                                     
##  Max.   :91.99                                                                                     
##  NA's   :18                                                                                        
##  Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)
##  Min.   : 0.7075                                                                                
##  1st Qu.: 3.9465                                                                                
##  Median :12.5948                                                                                
##  Mean   :11.0888                                                                                
##  3rd Qu.:15.8233                                                                                
##  Max.   :22.8423                                                                                
##  NA's   :25                                                                                     
##  Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)
##  Min.   :0.0000                                                                        
##  1st Qu.:0.2900                                                                        
##  Median :0.7576                                                                        
##  Mean   :0.9213                                                                        
##  3rd Qu.:1.3326                                                                        
##  Max.   :3.1390                                                                        
##  NA's   :20                                                                            
##  Government expenditure on education, total (% of GDP)
##  Min.   :0.8639                                       
##  1st Qu.:3.3393                                       
##  Median :4.1601                                       
##  Mean   :4.1761                                       
##  3rd Qu.:4.9824                                       
##  Max.   :7.1397                                       
##  NA's   :48                                           
##  Government expenditure on education, total (% of government expenditure)
##  Min.   : 7.45                                                           
##  1st Qu.:10.12                                                           
##  Median :13.93                                                           
##  Mean   :13.36                                                           
##  3rd Qu.:15.73                                                           
##  Max.   :22.36                                                           
##  NA's   :51                                                              
##  GNI per capita, Atlas method (current US$)
##  Min.   : 4010                             
##  1st Qu.:10942                             
##  Median :24895                             
##  Mean   :32341                             
##  3rd Qu.:53585                             
##  Max.   :95480                             
##  NA's   :18                                
##  GNI per capita, PPP (constant 2021 international $)
##  Min.   : 10003                                     
##  1st Qu.: 25668                                     
##  Median : 41695                                     
##  Mean   : 43316                                     
##  3rd Qu.: 58408                                     
##  Max.   :110892                                     
##  NA's   :25                                         
##  Labor force with advanced education (% of total working-age population with advanced education)
##  Min.   :71.14                                                                                  
##  1st Qu.:76.17                                                                                  
##  Median :78.58                                                                                  
##  Mean   :78.71                                                                                  
##  3rd Qu.:81.11                                                                                  
##  Max.   :90.08                                                                                  
##  NA's   :18                                                                                     
##  Labor force with basic education (% of total working-age population with basic education)
##  Min.   :17.17                                                                            
##  1st Qu.:30.58                                                                            
##  Median :45.48                                                                            
##  Mean   :44.98                                                                            
##  3rd Qu.:56.72                                                                            
##  Max.   :85.92                                                                            
##  NA's   :18                                                                               
##  Labor force with intermediate education (% of total working-age population with intermediate education)
##  Min.   :48.39                                                                                          
##  1st Qu.:61.32                                                                                          
##  Median :65.27                                                                                          
##  Mean   :65.33                                                                                          
##  3rd Qu.:69.00                                                                                          
##  Max.   :77.81                                                                                          
##  NA's   :18                                                                                             
##  Unemployment with advanced education (% of total labor force with advanced education)
##  Min.   : 0.794                                                                       
##  1st Qu.: 2.365                                                                       
##  Median : 3.533                                                                       
##  Mean   : 4.202                                                                       
##  3rd Qu.: 5.187                                                                       
##  Max.   :11.397                                                                       
##  NA's   :18                                                                           
##  Unemployment with basic education (% of total labor force with basic education)
##  Min.   : 0.421                                                                 
##  1st Qu.: 4.255                                                                 
##  Median : 7.901                                                                 
##  Mean   : 8.972                                                                 
##  3rd Qu.:11.917                                                                 
##  Max.   :40.621                                                                 
##  NA's   :18                                                                     
##  Unemployment with intermediate education (% of total labor force with intermediate education)
##  Min.   : 0.885                                                                               
##  1st Qu.: 3.977                                                                               
##  Median : 5.194                                                                               
##  Mean   : 6.154                                                                               
##  3rd Qu.: 7.560                                                                               
##  Max.   :16.066                                                                               
##  NA's   :18                                                                                   
##  Individuals using the Internet (% of population) Fixed broadband subscriptions
##  Min.   : 62.89                                   Min.   :    90200            
##  1st Qu.: 83.63                                   1st Qu.:  1087500            
##  Median : 89.11                                   Median :  3350000            
##  Mean   : 87.89                                   Mean   :  8885245            
##  3rd Qu.: 94.78                                   3rd Qu.:  9062500            
##  Max.   :100.00                                   Max.   :128000000            
##  NA's   :18                                       NA's   :18                   
##  Mobile cellular subscriptions Secure Internet servers GDP growth (annual %)
##  Min.   :   456864             Min.   :     518        Min.   :-5.015       
##  1st Qu.:  5760000             1st Qu.:   21833        1st Qu.: 2.581       
##  Median : 11450000             Median :  213266        Median : 4.191       
##  Mean   : 41068747             Mean   : 1739713        Mean   : 4.133       
##  3rd Qu.: 48150000             3rd Qu.:  766554        3rd Qu.: 5.300       
##  Max.   :373000000             Max.   :60194281        Max.   :10.770       
##  NA's   :18                    NA's   :18              NA's   :18           
##  GDP per capita (constant 2015 US$) Access to electricity (% of population)
##  Min.   : 3578                      Min.   : 94.80                         
##  1st Qu.:10302                      1st Qu.:100.00                         
##  Median :20611                      Median :100.00                         
##  Mean   :28843                      Mean   : 99.72                         
##  3rd Qu.:44483                      3rd Qu.:100.00                         
##  Max.   :99677                      Max.   :100.00                         
##  NA's   :18                         NA's   :18                             
##  Access to clean fuels and technologies for cooking (% of population)
##  Min.   : 54.20                                                      
##  1st Qu.: 95.10                                                      
##  Median :100.00                                                      
##  Mean   : 95.39                                                      
##  3rd Qu.:100.00                                                      
##  Max.   :100.00                                                      
##  NA's   :19                                                          
##  Population ages 0-14, total Population ages 15-64, total
##  Min.   :   70206            Min.   :   254073           
##  1st Qu.:  851109            1st Qu.:  3008344           
##  Median : 1527079            Median :  6386626           
##  Mean   : 6514125            Mean   : 21791933           
##  3rd Qu.: 6383518            3rd Qu.: 24254126           
##  Max.   :70471815            Max.   :217288142           
##  NA's   :18                  NA's   :18                  
##  Population ages 0-14 (% of total population)
##  Min.   :11.41                               
##  1st Qu.:15.08                               
##  Median :16.66                               
##  Mean   :18.21                               
##  3rd Qu.:20.07                               
##  Max.   :32.88                               
##  NA's   :18                                  
##  Population ages 15-64 (% of total population)
##  Min.   :59.89                                
##  1st Qu.:63.64                                
##  Median :65.45                                
##  Mean   :66.12                                
##  3rd Qu.:66.99                                
##  Max.   :81.88                                
##  NA's   :18                                   
##  Population ages 65 and above (% of total population)
##  Min.   : 1.64                                       
##  1st Qu.:10.11                                       
##  Median :17.53                                       
##  Mean   :15.67                                       
##  3rd Qu.:20.39                                       
##  Max.   :23.93                                       
##  NA's   :18                                          
##  Population ages 65 and above, total Population, total  
##  Min.   :   28241                    Min.   :   382003  
##  1st Qu.:  537674                    1st Qu.:  4571021  
##  Median : 1579097                    Median :  9859677  
##  Mean   : 4523469                    Mean   : 32829526  
##  3rd Qu.: 4605347                    3rd Qu.: 36290185  
##  Max.   :56388258                    Max.   :333271411  
##  NA's   :18                          NA's   :18         
##  Preprimary education, duration (years)
##  Min.   :1.000                         
##  1st Qu.:4.000                         
##  Median :6.000                         
##  Mean   :5.125                         
##  3rd Qu.:6.000                         
##  Max.   :7.000                         
##  NA's   :20                            
##  Proportion of seats held by women in national parliaments (%)
##  Min.   : 9.091                                               
##  1st Qu.:23.542                                               
##  Median :29.563                                               
##  Mean   :31.409                                               
##  3rd Qu.:40.446                                               
##  Max.   :50.000                                               
##  NA's   :18                                                   
##  Employment in industry (% of total employment) (modeled ILO estimate)
##  Min.   :12.68                                                        
##  1st Qu.:18.48                                                        
##  Median :21.20                                                        
##  Mean   :22.56                                                        
##  3rd Qu.:26.87                                                        
##  Max.   :36.89                                                        
##  NA's   :18                                                           
##  Employment in agriculture (% of total employment) (modeled ILO estimate)
##  Min.   : 0.09095                                                        
##  1st Qu.: 2.22190                                                        
##  Median : 4.31686                                                        
##  Mean   : 9.09093                                                        
##  3rd Qu.:13.21798                                                        
##  Max.   :55.38076                                                        
##  NA's   :18                                                              
##  Employment in services (% of total employment) (modeled ILO estimate)
##  Min.   :31.94                                                        
##  1st Qu.:61.77                                                        
##  Median :69.46                                                        
##  Mean   :68.35                                                        
##  3rd Qu.:77.82                                                        
##  Max.   :85.34                                                        
##  NA's   :18                                                           
##  Birth rate, crude (per 1,000 people)
##  Min.   : 4.900                      
##  1st Qu.: 8.800                      
##  Median : 9.894                      
##  Mean   :11.135                      
##  3rd Qu.:12.050                      
##  Max.   :21.618                      
##  NA's   :18
datos_imputados <- datos_numericos
datos_imputados[, -1] <- lapply(datos_imputados[, -1], function(x) {
  ifelse(is.na(x), mean(x, na.rm=TRUE),x)
})

NotasNumericas = datos_imputados[,-1]
NotasNumericas = as.data.frame(NotasNumericas)

metodos = c("hierarchical","kmeans","pam")
validacion = suppressMessages(clValid(NotasNumericas, nClust = 4:7, metric = "euclidean", clMethods = metodos, validation = c("internal", "stability"),
method = "ward"))
## Warning in clValid(NotasNumericas, nClust = 4:7, metric = "euclidean",
## clMethods = metodos, : rownames for data not specified, using 1:nrow(data)
summary(validacion)
## 
## Clustering Methods:
##  hierarchical kmeans pam 
## 
## Cluster sizes:
##  4 5 6 7 
## 
## Validation Measures:
##                                       4            5            6            7
##                                                                               
## hierarchical APN           1.100000e-03 1.100000e-03 1.220000e-02 9.500000e-03
##              AD            1.638221e+07 1.319447e+07 1.129872e+07 9.414332e+06
##              ADM           9.343188e+04 7.277224e+04 3.196984e+05 3.375925e+05
##              FOM           1.277323e+06 9.959701e+05 8.960475e+05 7.526625e+05
##              Connectivity  9.025000e+00 1.334130e+01 1.534130e+01 1.802140e+01
##              Dunn          9.990000e-02 9.990000e-02 2.042000e-01 2.970000e-02
##              Silhouette    6.995000e-01 7.235000e-01 7.099000e-01 5.196000e-01
## kmeans       APN           3.700000e-03 1.700000e-03 1.330000e-02 7.500000e-03
##              AD            1.598113e+07 1.289033e+07 1.101110e+07 8.921211e+06
##              ADM           3.411711e+05 1.158990e+05 3.957884e+05 2.816039e+05
##              FOM           1.258489e+06 1.023782e+06 9.283123e+05 7.991300e+05
##              Connectivity  1.386390e+01 1.798020e+01 1.998020e+01 1.895120e+01
##              Dunn          2.740000e-02 2.740000e-02 7.380000e-02 6.920000e-02
##              Silhouette    7.221000e-01 7.425000e-01 7.289000e-01 6.168000e-01
## pam          APN           6.000000e-04 1.500000e-03 1.100000e-03 1.600000e-03
##              AD            1.584959e+07 1.306305e+07 1.089023e+07 8.818984e+06
##              ADM           5.055155e+04 4.432437e+05 8.815363e+04 9.662234e+04
##              FOM           1.252413e+06 1.174879e+06 6.881431e+05 6.427945e+05
##              Connectivity  1.386390e+01 1.798020e+01 1.998020e+01 2.103290e+01
##              Dunn          2.740000e-02 2.740000e-02 7.380000e-02 4.570000e-02
##              Silhouette    7.221000e-01 7.425000e-01 7.289000e-01 6.132000e-01
## 
## Optimal Scores:
## 
##              Score        Method       Clusters
## APN                0.0006 pam          4       
## AD           8818984.2818 pam          7       
## ADM            50551.5506 pam          4       
## FOM           642794.4851 pam          7       
## Connectivity       9.0250 hierarchical 4       
## Dunn               0.2042 hierarchical 6       
## Silhouette         0.7425 kmeans       5

Ward

p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "silhouette", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "wss", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

Método jerárquico de la media

p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "silhouette", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "wss", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

K-Means

p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = kmeans, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = kmeans, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
grid.arrange(p1, p2, nrow = 1)

K-Medoides

p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = pam, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = pam, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
grid.arrange(p1, p2, nrow = 1)

5,7,5,4-6

midist3 = get_dist(NotasNumericas, stand = FALSE, method = "euclidean")

Aclust1 <- hclust(midist3, method="ward.D2")
Agrupos1 <- cutree(Aclust1, k=5)
table(Agrupos1)
## Agrupos1
##  1  2  3  4  5 
## 40 23  9  2  2
AgruposA = cbind(Agrupos1, datos_imputados[,"CNT"])

Aclust2 <- hclust(midist3, method="average")
Agrupos2 <- cutree(Aclust2, k=7)
table(Agrupos2)
## Agrupos2
##  1  2  3  4  5  6  7 
## 36 27  6  3  1  2  1
AgruposB = cbind(Agrupos2, datos_imputados[,"CNT"])

set.seed(100)
Aclust3 <- kmeans(NotasNumericas, centers = 5, nstart = 20)
table(Aclust3$cluster)
## 
##  1  2  3  4  5 
## 26 38  2  2  8
AgruposC = cbind(Aclust3$cluster, datos_imputados[,"CNT"])
colnames(AgruposC)[1] = "Agrupos3"

Aclust4 <- pam(NotasNumericas, k = 5)
table(Aclust4$clustering)
## 
##  1  2  3  4  5 
## 38 26  8  2  2
AgruposD = cbind(Aclust4$cluster, datos_imputados[,"CNT"])
colnames(AgruposD)[1] = "Agrupos4"

Visualización del mejor clustering

par(mfrow = c(1,2))
plot(silhouette(Agrupos1, midist3), col=colores, border=NA, main = "WARD")
plot(silhouette(Agrupos2, midist3), col=colores, border=NA, main = "MEDIAS")

plot(silhouette(Aclust3$cluster, midist3), col=colores, border=NA, main = "K-MEDIAS")
plot(silhouette(Aclust4$clustering, midist3), col=colores2, border=NA, main = "K-MEDOIDES")

clustAUX = Aclust3
gruposAUX = AgruposC

world = ne_countries(scale = "medium", returnclass = "sf")

world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"

world_clustered <- world %>%
  left_join(AgruposC, by = c("iso_a3" = "CNT"))
world_clustered$gruposAUX <- as.factor(world_clustered$Agrupos3)

ggplot(world_clustered) +
  geom_sf(aes(fill = gruposAUX), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set2", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países KMEDIAS k=5", fill = "Cluster")

PaisesCluster = merge(datos_imputados, gruposAUX, by = "CNT")

MediasCluster = PaisesCluster %>%
  group_by(Agrupos3) %>%
  summarise(
    nota_media_Mates = mean(MathMean_Country),
    nota_media_Lectura = mean(LectureMean_Country),
    nota_media_Ciencias = mean(ScienceMean_Country)
    )
MediasCluster$nota_media_General = rowMeans(MediasCluster[,2:ncol(MediasCluster)])
MediasCluster
## # A tibble: 5 × 5
##   Agrupos3 nota_media_Mates nota_media_Lectura nota_media_Ciencias
##      <int>            <dbl>              <dbl>               <dbl>
## 1        1             437.               429.                444.
## 2        2             452.               450.                461.
## 3        3             375.               381.                384.
## 4        4             423.               441.                450.
## 5        5             463.               466.                478.
## # ℹ 1 more variable: nota_media_General <dbl>
MediasCluster2 <- MediasCluster %>%
  pivot_longer(
    cols = starts_with("nota"),
    names_to = "area",
    values_to = "nota"
  )

MediasCluster2$area <- gsub("nota_media_", "", MediasCluster2$area)

ggplot(MediasCluster2, aes(x = factor(Agrupos3), y = nota, fill = area)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Nota media por área y clúster",
    x = "Clúster",
    y = "Nota media",
    fill = "Área"
  ) +
  theme_minimal()

# Combinar datos de PISA con Banco Mundial
datos_combi <- merge(
  NotasMediasPais, 
  bancomundial, 
  by.x = "CNT", 
  by.y = "Country.Code",
  all.x=TRUE
)

# 1. Limpieza y preparación de datos
bm_pisa_clean <- datos_combi %>%
  mutate(across(-CNT, ~ {
    x_char <- as.character(.x)
    as.numeric(ifelse(x_char %in% c("", "NA"), NA, x_char))
  })) %>%
  # Eliminar columnas con >50% NAs
  select(where(~ mean(is.na(.x)) <= 0.5)) %>%
  # Eliminar filas con NAs (alternativa: tidyr::fill() o imputación)
  filter(complete.cases(.))

# 2. Estandarización y clustering
set.seed(100)
bm_pisa_scaled <- bm_pisa_clean %>%
  select(-CNT) %>%
  scale() %>%
  as.data.frame()

clust_bm <- kmeans(bm_pisa_scaled, centers = 5, nstart = 20)

# 3. Asignar clusters a los datos originales
bm_cluster_data <- bm_pisa_clean %>%
  mutate(Cluster = as.factor(clust_bm$cluster))

# 4. Análisis de variables importantes (ajusta los patrones según tus datos)
vars_importantes <- names(bm_cluster_data) %>%
  grep("GDP|Internet|education|employment|GNI|PISA|math|science|reading", 
       ., value = TRUE, ignore.case = TRUE)
# 5. Estadísticas por cluster
medias_cluster <- bm_cluster_data %>%
  select(Cluster, all_of(vars_importantes)) %>%
  group_by(Cluster) %>%
  summarise(across(
    everything(),
    list(mean = ~mean(., na.rm = TRUE),
         sd = ~sd(., na.rm = TRUE))
  ))

# 6. Preparar para visualización
medias_plot <- medias_cluster %>%
  pivot_longer(
    cols = -Cluster,
    names_to = c("Variable", ".value"),
    names_sep = "_"
  )
## Warning: Expected 2 pieces. Additional pieces discarded in 4 rows [1, 2, 5, 6].
# 7. Visualización de indicadores
ggplot(medias_plot, aes(x = Cluster, y = mean, fill = Variable)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.2,
    position = position_dodge(0.8),
    color = "black"
  ) +
  labs(title = "Indicadores por Cluster") +
  theme_minimal()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_col()`).

# 1. Primero verifiquemos los nombres reales de las columnas
names(bm_cluster_data)
##  [1] "CNT"                                                                                                    
##  [2] "MathMean_Country"                                                                                       
##  [3] "MathSD"                                                                                                 
##  [4] "LectureMean_Country"                                                                                    
##  [5] "LectureSD"                                                                                              
##  [6] "ScienceMean_Country"                                                                                    
##  [7] "ScienceSD"                                                                                              
##  [8] "Compulsory education, duration (years)"                                                                 
##  [9] "Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)"      
## [10] "Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)"     
## [11] "Educational attainment, at least completed post-secondary, population 25+, total (%) (cumulative)"      
## [12] "Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)"       
## [13] "Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)"     
## [14] "Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)"        
## [15] "Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)"                 
## [16] "GNI per capita, Atlas method (current US$)"                                                             
## [17] "GNI per capita, PPP (constant 2021 international $)"                                                    
## [18] "Labor force with advanced education (% of total working-age population with advanced education)"        
## [19] "Labor force with basic education (% of total working-age population with basic education)"              
## [20] "Labor force with intermediate education (% of total working-age population with intermediate education)"
## [21] "Unemployment with advanced education (% of total labor force with advanced education)"                  
## [22] "Unemployment with basic education (% of total labor force with basic education)"                        
## [23] "Unemployment with intermediate education (% of total labor force with intermediate education)"          
## [24] "Individuals using the Internet (% of population)"                                                       
## [25] "Fixed broadband subscriptions"                                                                          
## [26] "Mobile cellular subscriptions"                                                                          
## [27] "Secure Internet servers"                                                                                
## [28] "GDP growth (annual %)"                                                                                  
## [29] "GDP per capita (constant 2015 US$)"                                                                     
## [30] "Access to electricity (% of population)"                                                                
## [31] "Access to clean fuels and technologies for cooking (% of population)"                                   
## [32] "Population ages 0-14, total"                                                                            
## [33] "Population ages 15-64, total"                                                                           
## [34] "Population ages 0-14 (% of total population)"                                                           
## [35] "Population ages 15-64 (% of total population)"                                                          
## [36] "Population ages 65 and above (% of total population)"                                                   
## [37] "Population ages 65 and above, total"                                                                    
## [38] "Population, total"                                                                                      
## [39] "Preprimary education, duration (years)"                                                                 
## [40] "Proportion of seats held by women in national parliaments (%)"                                          
## [41] "Employment in industry (% of total employment) (modeled ILO estimate)"                                  
## [42] "Employment in agriculture (% of total employment) (modeled ILO estimate)"                               
## [43] "Employment in services (% of total employment) (modeled ILO estimate)"                                  
## [44] "Birth rate, crude (per 1,000 people)"                                                                   
## [45] "Cluster"
# 2. Seleccionar variables basadas en los nombres reales
# (Estos son ejemplos - ajusta según los nombres reales en tus datos)
vars_importantes <- names(bm_cluster_data) %>%
  grep("GDP|Internet|education|employment|GNI", ., value = TRUE, ignore.case = TRUE)

# Mostrar las variables seleccionadas para confirmación
print(vars_importantes)
##  [1] "Compulsory education, duration (years)"                                                                 
##  [2] "Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)"      
##  [3] "Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)"     
##  [4] "Educational attainment, at least completed post-secondary, population 25+, total (%) (cumulative)"      
##  [5] "Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)"       
##  [6] "Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)"     
##  [7] "Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)"        
##  [8] "Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)"                 
##  [9] "GNI per capita, Atlas method (current US$)"                                                             
## [10] "GNI per capita, PPP (constant 2021 international $)"                                                    
## [11] "Labor force with advanced education (% of total working-age population with advanced education)"        
## [12] "Labor force with basic education (% of total working-age population with basic education)"              
## [13] "Labor force with intermediate education (% of total working-age population with intermediate education)"
## [14] "Unemployment with advanced education (% of total labor force with advanced education)"                  
## [15] "Unemployment with basic education (% of total labor force with basic education)"                        
## [16] "Unemployment with intermediate education (% of total labor force with intermediate education)"          
## [17] "Individuals using the Internet (% of population)"                                                       
## [18] "Secure Internet servers"                                                                                
## [19] "GDP growth (annual %)"                                                                                  
## [20] "GDP per capita (constant 2015 US$)"                                                                     
## [21] "Preprimary education, duration (years)"                                                                 
## [22] "Employment in industry (% of total employment) (modeled ILO estimate)"                                  
## [23] "Employment in agriculture (% of total employment) (modeled ILO estimate)"                               
## [24] "Employment in services (% of total employment) (modeled ILO estimate)"
# 3. Calcular estadísticas por cluster (versión robusta)
medias_cluster_bm <- bm_cluster_data %>%
  # Seleccionar solo columnas numéricas además del Cluster
  select(Cluster, where(is.numeric)) %>%
  group_by(Cluster) %>%
  summarise(across(
    where(is.numeric),
    list(mean = ~mean(., na.rm = TRUE),
         sd = ~sd(., na.rm = TRUE))
  ))  # <- Paréntesis faltante aquí

# 4. Preparar datos para visualización (versión segura)
medias_plot_bm <- medias_cluster_bm %>%
  pivot_longer(
    cols = -Cluster,
    names_to = c("Variable", ".value"),
    names_sep = "_"
  )
## Warning: Expected 2 pieces. Additional pieces discarded in 6 rows [1, 2, 5, 6,
## 9, 10].
# 5. Visualización
ggplot(medias_plot_bm, aes(x = Cluster, y = mean, fill = Variable)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.2,
    position = position_dodge(0.8),
    color = "black"
  ) +
  labs(
    title = "Indicadores del Banco Mundial por Cluster",
    x = "Cluster",
    y = "Valor promedio",
    fill = "Indicador"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 45,hjust=1))
## Warning: Removed 30 rows containing missing values or values outside the scale range
## (`geom_col()`).

PCA + Clustering

load('pca_data.RData')

gruposC = gruposC[-(which(gruposC$CNT %in% c("USA", "IDN", "PHL"))),]
pca_data$clusters = as.factor(gruposC$grupos3)


ggplot(pca_data, aes(x = Dim1, y = Dim2, label = label, color = clusters)) +
  geom_text(size = 3) +
  scale_color_manual(values = c("#228B22", "#FF8C00", "blue", "#D33682")) +
  labs(title = "PCA - Dimensiones 1 y 2", color = "Cluster") +
  theme_minimal()

ggplot(pca_data, aes(x = Dim3, y = Dim4, label = label, color = clusters)) +
  geom_text(size = 3) +
  scale_color_manual(values = c("#228B22", "#FF8C00", "blue", "#D33682")) +
  labs(title = "PCA - Dimensiones 3 y 4", color = "Cluster") +
  theme_minimal()