Librerías

library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(haven)
library(ggplot2)
library(rnaturalearth)

## Warning: package 'rnaturalearth' was built under R version 4.4.3

library(rnaturalearthdata)

## Warning: package 'rnaturalearthdata' was built under R version 4.4.3

## 
## Adjuntando el paquete: 'rnaturalearthdata'

## The following object is masked from 'package:rnaturalearth':
## 
##     countries110

library(rnaturalearthhires)
library(cluster)
library(FactoMineR)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(NbClust)
library(clValid)

## Warning: package 'clValid' was built under R version 4.4.3

library(grid)
library(gridExtra)

## 
## Adjuntando el paquete: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(ggsci)
library(missMDA)

## Warning: package 'missMDA' was built under R version 4.4.3

library(readxl)
library(mice)

## 
## Adjuntando el paquete: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

Limpieza de datos

#EJECUTAR ESTE CÓDIGO UNA SOLA VEZ
datosPisaEstudiantes = read_sas("C:/Users/David/UPV/PROYII/proyecto/datosPisaEstudiantes.SAS7BDAT", NULL)

numNA = apply(datosPisaEstudiantes, 1, function(x) sum(is.na(x)))

percNA = round(100*apply(datosPisaEstudiantes, 1, function(x) mean(is.na(x))), 2)
tablaNA2 = data.frame(numNA, percNA)
summary(tablaNA2$numNA)

barplot(table(tablaNA2$percNA), xlab = "% Valores faltantes", ylab = "Número de casos",
        main = "Valores faltantes por estudiante")

datosPisaEstudiantes_filtrado <- datosPisaEstudiantes %>%
  select(which(colMeans(is.na(datosPisaEstudiantes)) <= 0.2))

#Filtrado de estudiantes puede que sea innecesario
datosPisaEstudiantes_filtrado = datosPisaEstudiantes_filtrado %>% filter(rowMeans(is.na(.)) <= 0.2)

save(datosPisaEstudiantes_filtrado, file="estudiantesPISAfiltrados.R")

Creación de notas medias por estudiante y por país

load("estudiantesPISAfiltrados.R")
NotasPlausibles = datosPisaEstudiantes_filtrado %>%
  select(matches("^PV|^CNT$|^CNTSCHID$"))
NotasMedias = NotasPlausibles %>% transmute(CNT = NotasPlausibles$CNT,
                                        CNTSCHID = NotasPlausibles$CNTSCHID,
                                        MathMean = rowMeans(NotasPlausibles[,3:12]),
                                        LectureMean = rowMeans(NotasPlausibles[,13:22]),
                                        ScienceMean = rowMeans(NotasPlausibles[,23:32]))

NotasMediasPais = NotasMedias %>% group_by(CNT) %>% summarise(
  MathMean_Country = mean(MathMean, na.rm = TRUE),
  MathSD = sd(MathMean, na.rm = TRUE),
  LectureMean_Country = mean(LectureMean, na.rm = TRUE),
  LectureSD = sd(LectureMean, na.rm = TRUE),
  ScienceMean_Country = mean(ScienceMean, na.rm = TRUE),
  ScienceSD = sd(ScienceMean, na.rm = TRUE))

#He quitado todos los View y puesto todos los textos para que se vean bien aunque hayan los gráficos de clústeres óptimos y de Silhouette. Falta cambiar nombres de algunos gráficos y poner alguna explicación más

Boxplot medias estudiantes

par(mar = c(5, 7, 4, 2))
boxplot(NotasMedias$MathMean, NotasMedias$LectureMean, NotasMedias$ScienceMean, 
        names = c("Matemáticas", "Lectura", "Ciencias"), 
        main = "Valores atípicos medias estudiantes", 
        col = c("red", "lightblue", "green"),
        horizontal = TRUE, cex.axis = 0.8)

summary(NotasMedias)

##      CNT               CNTSCHID           MathMean      LectureMean    
##  Length:562708      Min.   :  800001   Min.   :150.2   Min.   : 37.96  
##  Class :character   1st Qu.:21400219   1st Qu.:370.8   1st Qu.:365.57  
##  Mode  :character   Median :40000043   Median :438.2   Median :441.23  
##                     Mean   :43679973   Mean   :446.4   Mean   :443.54  
##                     3rd Qu.:68800107   3rd Qu.:514.6   3rd Qu.:519.47  
##                     Max.   :90100182   Max.   :843.3   Max.   :890.17  
##   ScienceMean    
##  Min.   : 51.34  
##  1st Qu.:379.33  
##  Median :449.85  
##  Mean   :455.78  
##  3rd Qu.:527.40  
##  Max.   :852.45

Boxplot medias países

par(mar = c(5, 7, 4, 2))
boxplot(
        NotasMediasPais$MathMean_Country, 
        NotasMediasPais$LectureMean_Country, 
        NotasMediasPais$ScienceMean_Country, 
        names = c("Matemáticas", "Lectura", "Ciencias"), 
        main = "Valores atípicos medias países", 
        col = c("red", "lightblue", "green"),
        horizontal = TRUE, cex.axis = 0.8)

summary(NotasMediasPais)

##      CNT            MathMean_Country     MathSD       LectureMean_Country
##  Length:76          Min.   :343.5    Min.   : 48.07   Min.   :337.7      
##  Class :character   1st Qu.:396.2    1st Qu.: 74.31   1st Qu.:406.0      
##  Mode  :character   Median :449.1    Median : 82.78   Median :448.5      
##                     Mean   :445.1    Mean   : 80.63   Mean   :442.6      
##                     3rd Qu.:483.6    3rd Qu.: 90.03   3rd Qu.:484.0      
##                     Max.   :574.6    Max.   :108.34   Max.   :543.2      
##    LectureSD      ScienceMean_Country   ScienceSD     
##  Min.   : 59.36   Min.   :356.2       Min.   : 55.49  
##  1st Qu.: 81.24   1st Qu.:411.8       1st Qu.: 76.93  
##  Median : 86.80   Median :461.2       Median : 86.68  
##  Mean   : 87.58   Mean   :454.4       Mean   : 84.59  
##  3rd Qu.: 96.22   3rd Qu.:497.6       3rd Qu.: 93.52  
##  Max.   :114.10   Max.   :561.5       Max.   :104.17

Identificar códigos que no hacen bien el join

world = ne_countries(scale = "medium", returnclass = "sf")
world <- left_join(world, NotasMediasPais, by = c("iso_a3" = "CNT"))
iso_validos = world$iso_a3[!is.na(world$MathMean_Country)]
iso_novalidos = setdiff(NotasMediasPais$CNT, iso_validos)
print(iso_novalidos)

## [1] "FRA" "KSV" "NOR" "QAZ" "QUR" "TAP"

#Poner este código cada vez que se cree world
world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"
#QAZ es Baku (Azerbaiyan) y QUR es algunas regiones de Ucrania

Mapa medias por país

world = ne_countries(scale = "medium", returnclass = "sf")

world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"

world <- left_join(world, NotasMediasPais, by = c("iso_a3" = "CNT"))

ggplot(data = world) +
  geom_sf(aes(fill = MathMean_Country), color = "white") +
  scale_fill_viridis_c(option = "magma", na.value = "gray90") +  
  labs(title = "Puntaje Medio en Matemáticas por País",
       fill = "Puntaje") +
  theme_minimal()

ggplot(data = world) +
  geom_sf(aes(fill = LectureMean_Country), color = "white") +
  scale_fill_viridis_c(option = "magma", na.value = "gray90") +  
  labs(title = "Puntaje Medio en Lectura por País",
       fill = "Puntaje") +
  theme_minimal()

ggplot(data = world) +
  geom_sf(aes(fill = ScienceMean_Country), color = "white") +
  scale_fill_viridis_c(option = "magma", na.value = "gray90") +  
  labs(title = "Puntaje Medio en Ciencia por País",
       fill = "Puntaje") +
  theme_minimal()

Heatmap Medias y Desviaciones típicas

midist = get_dist(NotasMediasPais[2:ncol(NotasMediasPais)], stand = FALSE, method = "euclidean")
fviz_dist(midist, show_labels = TRUE, lab_size = 0.3,
          gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

Estadístico Hopkins

#nrow(NotasMediasPais)
set.seed(100)
myN = c(20, 35, 50, 65)  # m
myhopkins = NULL
myseed = sample(1:1000, 10)
for (i in myN) {
  for (j in myseed) {
    tmp = get_clust_tendency(data = NotasMediasPais[,2:ncol(NotasMediasPais)], n = i, graph = FALSE, seed = j)
    myhopkins = c(myhopkins, tmp$hopkins_stat)
  }
}
summary(myhopkins)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7923  0.8240  0.8315  0.8330  0.8429  0.8730

#Hopkins cercano a 1 indica datos agrupados

Prueba de PCA

#Escalar por bloques?
res.pca = PCA(NotasMediasPais, 
              graph = FALSE, ncp = 2, quali.sup = 1)
fviz_pca_ind(res.pca, axes = c(1, 2), geom = c("point", "text"), 
             habillage = "CNT", repel = TRUE, labelsize = 2)

fviz_pca_var(res.pca, axes = c(1,2), repel = TRUE, col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))

Clustering por países

Visualización número de clústeres óptimos (Silhouette y SC intraclúster)

Ward

p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "silhouette", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "wss", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

Método Jerárquico de la media

p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "silhouette", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "wss", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

#En los métodos jerárquicos se puede meter la variable CNT y no da error, pero mejor quitarla para que nos diga warnings (en los métodos de partición no se puede incluir directamente)

En los jerárquicos cogemos 5 clústeres, ya que los más óptimos son 2, 4 o 5 clústeres, pero 2 es demasiado poco y 5 es un poco mejor que 4 en ambos criterios.

K-Medias

p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = kmeans, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = kmeans, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
grid.arrange(p1, p2, nrow = 1)

K-Medoides

p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = pam, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = pam, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
grid.arrange(p1, p2, nrow = 1)

Excluyendo la posibilidad de hacerlo con dos clústeres, con el método de k-medias nos salen 4 o 6 como la cantidad idonea de clústeres. Con el método de k-medoides nos sale otra vez 5 clústeres.

Creación de clusterings

clust1 <- hclust(midist, method="ward.D2")
grupos1 <- cutree(clust1, k=5)
table(grupos1)

## grupos1
##  1  2  3  4  5 
## 18 14 30  8  6

gruposA = cbind(grupos1, NotasMediasPais[,"CNT"])

clust2 <- hclust(midist, method="average")
grupos2 <- cutree(clust2, k=5)
table(grupos2)

## grupos2
##  1  2  3  4  5 
## 18 14 28  8  8

gruposB = cbind(grupos2, NotasMediasPais[,"CNT"])

set.seed(100)
clust3 <- kmeans(NotasMediasPais[,-1], centers = 4, nstart = 20)
table(clust3$cluster)

## 
##  1  2  3  4 
## 15 23 31  7

gruposC = cbind(clust3$cluster, NotasMediasPais[,"CNT"])
colnames(gruposC)[1] = "grupos3"

clust4 <- pam(NotasMediasPais, k = 5)
table(clust4$clustering)

## 
##  1  2  3  4  5 
## 13 24 17 15  7

gruposD = cbind(clust4$cluster, NotasMediasPais[,"CNT"])
colnames(gruposD)[1] = "grupos4"

grupos1 == grupos2

##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
## [25]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [73]  TRUE  TRUE  TRUE  TRUE

Son prácticamente iguales los clusterings en los dos métodos jerárquicos, solo cambia en que incluye irlanda y estonia en el grupo de singapur.

Visualización de clústeres

world = ne_countries(scale = "large", returnclass = "sf")

world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"

world_clustered <- world %>%
  left_join(gruposA, by = c("iso_a3" = "CNT"))
world_clustered$grupos1 <- as.factor(world_clustered$grupos1)

ggplot(world_clustered) +
  geom_sf(aes(fill = grupos1), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set3", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países WARD k=5", fill = "Cluster")

world_clustered <- world %>%
  left_join(gruposB, by = c("iso_a3" = "CNT"))
world_clustered$grupos2 <- as.factor(world_clustered$grupos2)

ggplot(world_clustered) +
  geom_sf(aes(fill = grupos2), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set3", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países MEDIAS k=5", fill = "Cluster")

world_clustered <- world %>%
  left_join(gruposC, by = c("iso_a3" = "CNT"))
world_clustered$grupos3 <- as.factor(world_clustered$grupos3)

ggplot(world_clustered) +
  geom_sf(aes(fill = grupos3), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set3", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países KMEDIAS k=4", fill = "Cluster")

world_clustered <- world %>%
  left_join(gruposD, by = c("iso_a3" = "CNT"))
world_clustered$grupos4 <- as.factor(world_clustered$grupos4)

ggplot(world_clustered) +
  geom_sf(aes(fill = grupos4), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set3", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países KMEDOIDES k=5", fill = "Cluster")

Visualización del mejor clustering

colores = pal_npg("nrc")(6)
colores2 = pal_npg("nrc")(7)
par(mfrow = c(1,2))
plot(silhouette(grupos1, midist), col=colores, border=NA, main = "WARD")
plot(silhouette(grupos2, midist), col=colores, border=NA, main = "MEDIAS")

plot(silhouette(clust3$cluster, midist), col=colores, border=NA, main = "K-MEDIAS")
plot(silhouette(clust4$clustering, midist), col=colores2, border=NA, main = "K-MEDOIDES")

Según el coeficiente de Silhouette el mejor método es el de k-medias, así que lo escogemos. Además no tiene ningún individuo con un coeficiente de Silhouette negativo.

Validación del clustering

NotasNumericas = NotasMediasPais[,-1]
NotasNumericas = as.data.frame(NotasNumericas)

metodos = c("hierarchical","kmeans","pam")
validacion = suppressMessages(clValid(NotasNumericas, nClust = 4:6, metric = "euclidean", clMethods = metodos, validation = c("internal", "stability"),
method = "ward"))

## Warning in clValid(NotasNumericas, nClust = 4:6, metric = "euclidean",
## clMethods = metodos, : rownames for data not specified, using 1:nrow(data)

summary(validacion)

## 
## Clustering Methods:
##  hierarchical kmeans pam 
## 
## Cluster sizes:
##  4 5 6 
## 
## Validation Measures:
##                                  4       5       6
##                                                   
## hierarchical APN            0.2186  0.1967  0.1723
##              AD            49.3383 40.1190 35.3354
##              ADM           24.3513 15.5680 11.9886
##              FOM           14.2901 12.3023 10.9805
##              Connectivity  14.8532 19.9964 27.0885
##              Dunn           0.2218  0.2663  0.1502
##              Silhouette     0.4536  0.4568  0.3689
## kmeans       APN            0.0669  0.1233  0.0779
##              AD            41.2157 36.9759 31.8536
##              ADM            6.2354  9.1124  5.1827
##              FOM           13.8448 12.1483 10.9892
##              Connectivity  14.6960 20.6234 24.0770
##              Dunn           0.2335  0.2667  0.1871
##              Silhouette     0.4936  0.4656  0.3924
## pam          APN            0.1100  0.0395  0.1354
##              AD            45.1380 34.6045 33.3412
##              ADM           12.1841  2.8777  7.8905
##              FOM           14.3613 11.4065 11.0022
##              Connectivity  21.2111 22.8972 27.9671
##              Dunn           0.0503  0.2012  0.2667
##              Silhouette     0.4432  0.4618  0.4418
## 
## Optimal Scores:
## 
##              Score   Method       Clusters
## APN           0.0395 pam          5       
## AD           31.8536 kmeans       6       
## ADM           2.8777 pam          5       
## FOM          10.9805 hierarchical 6       
## Connectivity 14.6960 kmeans       4       
## Dunn          0.2667 kmeans       5       
## Silhouette    0.4936 kmeans       4

Considerando todo lo anterior junto con estos parámetros creemos que el mejor método sigue siendo k-means con 4 clústeres. Dando como parámetros más importantes el coeficiente de Silhouette, el índice de Dunn y la conectividad.

Visualización del Clúster final

world_clustered <- world %>%
  left_join(gruposC, by = c("iso_a3" = "CNT"))
world_clustered$grupos3 <- as.factor(world_clustered$grupos3)

ggplot(world_clustered) +
  geom_sf(aes(fill = grupos3), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set2", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países KMEDIAS k=4", fill = "Cluster")

PaisesCluster = merge(NotasMediasPais, gruposC, by = "CNT")

MediasCluster = PaisesCluster %>%
  group_by(grupos3) %>%
  summarise(
    nota_media_Mates = mean(MathMean_Country),
    nota_media_Lectura = mean(LectureMean_Country),
    nota_media_Ciencias = mean(ScienceMean_Country)
    )
MediasCluster$nota_media_General = rowMeans(MediasCluster[,2:ncol(MediasCluster)])
MediasCluster

## # A tibble: 4 × 5
##   grupos3 nota_media_Mates nota_media_Lectura nota_media_Ciencias
##     <int>            <dbl>              <dbl>               <dbl>
## 1       1             371.               362.                377.
## 2       2             417.               419.                428.
## 3       3             480.               483.                492.
## 4       4             541.               516.                538.
## # ℹ 1 more variable: nota_media_General <dbl>

MediasCluster2 <- MediasCluster %>%
  pivot_longer(
    cols = starts_with("nota"),
    names_to = "area",
    values_to = "nota"
  )

MediasCluster2$area <- gsub("nota_media_", "", MediasCluster2$area)

ggplot(MediasCluster2, aes(x = factor(grupos3), y = nota, fill = area)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Nota media por área y clúster",
    x = "Clúster",
    y = "Nota media",
    fill = "Área"
  ) +
  theme_minimal()

PaisesCluster %>%
  group_by(grupos3) %>%
  summarise(paises = paste(CNT, collapse = ", "))

## # A tibble: 4 × 2
##   grupos3 paises                                                                
##     <int> <chr>                                                                 
## 1       1 ALB, DOM, GEO, IDN, JOR, KSV, MAR, MKD, PAN, PHL, PSE, QAZ, SAU, SLV,…
## 2       2 ARE, ARG, BGR, BRA, BRN, COL, CRI, GRC, ISL, JAM, KAZ, MDA, MEX, MNE,…
## 3       3 AUS, AUT, BEL, CAN, CHE, CHL, CZE, DEU, DNK, ESP, FIN, FRA, GBR, HRV,…
## 4       4 EST, HKG, JPN, KOR, MAC, SGP, TAP

Las notas medias de los clústeres van de menos a más (el clúster 1 es el que menor nota tiene y el 4 es el que mayor nota tiene). Vemos que esta subida de notas ocurre en todas las áreas.

Clúster 1 = {Marruecos, Arabia Saudí, Uzbekistán, Indonesia, Filipinas, Georgia…}. Países subdesarrollados que no destacan por su educación.

Clúster 2 = {Latinoamérica, Balcanes, Kazajistán, Mongolia, Tailandia…}. Países subdesarrollados o en vías de desarrollo, tienen notas un poco mejor que el grupo anterior.

Clúster 3 = {Europa, Canadá, EEUU, Australia, Nueva Zelanda…}. Países desarrollados con buena educación.

Clúster 4 = {Estonia, Corea del Sur, Japón, Singapur, Macao, Hong Kong…}. Países desarrollados que suelen ser pequeños y destacan por su avanzado programa educativo.

Clustering España

Creación del df de medias por CCAA

ESP = datosPisaEstudiantes_filtrado[datosPisaEstudiantes_filtrado$CNT=="ESP",]
PlausiblesESP = ESP %>%
  select(matches("^PV|^CNTSCHID$|^REGION$"))
MediasESP = PlausiblesESP %>% transmute(CNTSCHID = PlausiblesESP$CNTSCHID,
                                        REGION = PlausiblesESP$REGION,
                                        MathMean = rowMeans(PlausiblesESP[,3:12]),
                                        LectureMean = rowMeans(PlausiblesESP[,13:22]),
                                        ScienceMean = rowMeans(PlausiblesESP[,23:32]))
MediasREG = MediasESP %>% group_by(REGION) %>% summarise(
                                                    MathMeanReg = mean(MathMean),
                                                    MathSDReg = sd(MathMean),
                                                    LectureMeanReg = mean(LectureMean),
                                                    LectureSDReg = sd(LectureMean),
                                                    ScienceMeanReg = mean(ScienceMean),
                                                    ScienceSDReg = sd(ScienceMean)
                                                    )
regions = ne_states(country = "Spain", returnclass = "sf")

#código del .SAS
cod_reg <- tibble(
  REGION = c(72401, 72402, 72403, 72404, 72405, 72406, 72407, 72408, 
             72409, 72410, 72411, 72412, 72413, 72414, 72415, 72416, 
             72417, 72418, 72419),
  REGION_NAME = c("Andalucía",
                  "Aragón",
                  "Asturias",
                  "Islas Baleares",
                  "Canary Is.",
                  "Cantabria",
                  "Castilla y León",
                  "Castilla-La Mancha",
                  "Cataluña",
                  "Extremadura",
                  "Galicia",
                  "La Rioja",
                  "Madrid",
                  "Murcia",
                  "Foral de Navarra",
                  "País Vasco",
                  "Valenciana",
                  "Ceuta",
                  "Melilla")
)

MediasREG = MediasREG %>% left_join(cod_reg, by = "REGION")
MediasREG = MediasREG %>% select(REGION_NAME, matches("^Math|^Lecture|^Science"))

Boxplot medias estudiantes de España

par(mar = c(5, 7, 4, 2))
boxplot(MediasESP$MathMean, MediasESP$LectureMean, MediasESP$ScienceMean, 
        names = c("Matemáticas", "Lectura", "Ciencias"), 
        main = "Valores atípicos medias estudiantes España", 
        col = c("red", "lightblue", "green"),
        horizontal = TRUE, cex.axis = 0.8)

summary(MediasESP)

##     CNTSCHID            REGION         MathMean      LectureMean   
##  Min.   :72400001   Min.   :72401   Min.   :201.9   Min.   :121.8  
##  1st Qu.:72400252   1st Qu.:72405   1st Qu.:428.4   1st Qu.:425.0  
##  Median :72400497   Median :72410   Median :485.8   Median :487.5  
##  Mean   :72400495   Mean   :72410   Mean   :483.3   Mean   :482.8  
##  3rd Qu.:72400741   3rd Qu.:72414   3rd Qu.:540.7   3rd Qu.:544.2  
##  Max.   :72400987   Max.   :72419   Max.   :773.5   Max.   :759.6  
##   ScienceMean   
##  Min.   :172.4  
##  1st Qu.:435.1  
##  Median :495.1  
##  Mean   :492.3  
##  3rd Qu.:551.7  
##  Max.   :767.6

Boxplot medias comunidades autónomas

par(mar = c(5, 7, 4, 2))
boxplot(
        MediasREG$MathMeanReg, 
        MediasREG$LectureMeanReg, 
        MediasREG$ScienceMeanReg, 
        names = c("Matemáticas", "Lectura", "Ciencias"), 
        main = "Valores atípicos medias Com. Autónomas", 
        col = c("red", "lightblue", "green"),
        horizontal = TRUE, cex.axis = 0.8)

summary(MediasREG)

##  REGION_NAME         MathMeanReg      MathSDReg     LectureMeanReg 
##  Length:19          Min.   :413.9   Min.   :75.25   Min.   :423.7  
##  Class :character   1st Qu.:469.7   1st Qu.:77.02   1st Qu.:472.1  
##  Mode  :character   Median :478.6   Median :78.42   Median :477.8  
##                     Mean   :476.8   Mean   :78.55   Mean   :477.8  
##                     3rd Qu.:496.6   3rd Qu.:79.64   3rd Qu.:491.5  
##                     Max.   :505.0   Max.   :82.86   Max.   :504.1  
##   LectureSDReg   ScienceMeanReg   ScienceSDReg  
##  Min.   :80.44   Min.   :427.5   Min.   :77.93  
##  1st Qu.:83.06   1st Qu.:479.6   1st Qu.:80.12  
##  Median :84.38   Median :488.4   Median :82.06  
##  Mean   :84.91   Mean   :486.8   Mean   :81.74  
##  3rd Qu.:85.96   3rd Qu.:505.2   3rd Qu.:83.46  
##  Max.   :92.72   Max.   :511.8   Max.   :86.38

HeatMap España

midist2 = get_dist(MediasREG[2:ncol(MediasREG)], stand = FALSE, method = "euclidean")
fviz_dist(midist2, show_labels = TRUE, lab_size = 0.3,
          gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

Estadístico Hopkins España

set.seed(100)
myN = c(5, 8, 10, 15)  # cambiando tamaños de muestra
myhopkins = NULL
myseed = sample(1:1000, 10)
for (i in myN) {
  for (j in myseed) {
    tmp = get_clust_tendency(data = MediasREG[,2:ncol(MediasREG)], n = i, graph = FALSE, seed = j)
    myhopkins = c(myhopkins, tmp$hopkins_stat)
  }
}
summary(myhopkins)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6624  0.7737  0.7947  0.7893  0.8136  0.8383

Mapa medias por Comunidad Autónoma

mapSPAIN = regions %>% left_join(MediasREG, by = c("region" = "REGION_NAME"))

ggplot(mapSPAIN) +
  geom_sf(aes(fill = MathMeanReg), color = NA) +
  scale_fill_viridis_c(option = "C") +
  theme_minimal() +
  labs(title = "Media en Matemáticas por Comunidad Autónoma", fill = "Media Matemáticas")

ggplot(mapSPAIN) +
  geom_sf(aes(fill = LectureMeanReg), color = NA) +
  scale_fill_viridis_c(option = "C") +
  theme_minimal() +
  labs(title = "Media en Lectura por Comunidad Autónoma", fill = "Media Lectura")

ggplot(mapSPAIN) +
  geom_sf(aes(fill = ScienceMeanReg), color = NA) +
  scale_fill_viridis_c(option = "C") +
  theme_minimal() +
  labs(title = "Media en Ciencias por Comunidad Autónoma", fill = "Media Ciencias")

Visualización número de clústeres óptimos (Silhouette y SC intraclúster)

Ward

p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "silhouette", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "wss", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

Método jerárquico de la media

p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "silhouette", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "wss", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

K-Means

p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = kmeans, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = kmeans, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
grid.arrange(p1, p2, nrow = 1)

K-Medoides

p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = pam, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = pam, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
grid.arrange(p1, p2, nrow = 1)

En todos los métodos parece que el número óptimo de clústeres es 3 basándonos en el criterio del codo en la SC intraclúster y en el criterio de maximizar el coeficiente de Silhouette.

Creación de clusterings

Eclust1 <- hclust(midist2, method="ward.D2")
Egrupos1 <- cutree(Eclust1, k=3)
table(Egrupos1)

## Egrupos1
##  1  2  3 
## 10  7  2

ESPgruposA = cbind(Egrupos1, MediasREG[,"REGION_NAME"])

Eclust2 <- hclust(midist2, method="average")
Egrupos2 <- cutree(Eclust2, k=3)
table(Egrupos2)

## Egrupos2
##  1  2  3 
## 10  7  2

ESPgruposB = cbind(Egrupos2, MediasREG[,"REGION_NAME"])

set.seed(100)
Eclust3 <- kmeans(MediasREG[,-1], centers = 3, nstart = 20)
table(Eclust3$cluster)

## 
## 1 2 3 
## 2 8 9

ESPgruposC = cbind(Eclust3$cluster, MediasREG[,"REGION_NAME"])
colnames(ESPgruposC)[1] = "ESPgrupos3"

Eclust4 <- pam(MediasREG, k = 3)
table(Eclust4$clustering)

## 
##  1  2  3 
## 10  7  2

ESPgruposD = cbind(Eclust4$cluster, MediasREG[,"REGION_NAME"])
colnames(ESPgruposD)[1] = "ESPgrupos4"

Visualización del mejor clustering

par(mfrow = c(1,2))
plot(silhouette(Egrupos1, midist2), col=colores, border=NA, main = "WARD")
plot(silhouette(Egrupos2, midist2), col=colores, border=NA, main = "MEDIAS")

plot(silhouette(Eclust3$cluster, midist2), col=colores, border=NA, main = "K-MEDIAS")
plot(silhouette(Eclust4$clustering, midist2), col=colores2, border=NA, main = "K-MEDOIDES")

Vemos que el mejor parece el método k-medias con k = 3 ya que no hay ningún elemento mal colocado y el coeficiente de Silhouette.

Validación del clustering

numESP = MediasREG[,-1]
numESP = as.data.frame(numESP)

validacion = suppressMessages(clValid(numESP, nClust = 3:5, metric = "euclidean", clMethods = metodos, validation = c("internal", "stability"),
method = "ward"))

## Warning in clValid(numESP, nClust = 3:5, metric = "euclidean", clMethods =
## metodos, : rownames for data not specified, using 1:nrow(data)

summary(validacion)

## 
## Clustering Methods:
##  hierarchical kmeans pam 
## 
## Cluster sizes:
##  3 4 5 
## 
## Validation Measures:
##                                  3       4       5
##                                                   
## hierarchical APN            0.0368  0.0959  0.1067
##              AD            15.1087 12.4892 10.7653
##              ADM            1.7603  2.9143  3.1940
##              FOM            5.2969  4.9283  4.9603
##              Connectivity  10.7206 19.1706 24.5230
##              Dunn           0.2775  0.4244  0.4244
##              Silhouette     0.5864  0.4660  0.4007
## kmeans       APN            0.0154  0.1076  0.1132
##              AD            14.2907 12.6164 10.5438
##              ADM            0.7204  3.2313  3.1357
##              FOM            5.0181  5.0983  4.9537
##              Connectivity  10.1579 19.1706 24.3552
##              Dunn           0.3815  0.4244  0.4483
##              Silhouette     0.6046  0.4660  0.4151
## pam          APN            0.0154  0.1094  0.0921
##              AD            14.2907 12.7162 10.2569
##              ADM            0.7204  3.4544  2.4605
##              FOM            5.0181  4.9580  4.4473
##              Connectivity  10.1579 19.1706 24.3552
##              Dunn           0.3815  0.4244  0.4483
##              Silhouette     0.6046  0.4660  0.4151
## 
## Optimal Scores:
## 
##              Score   Method Clusters
## APN           0.0154 kmeans 3       
## AD           10.2569 pam    5       
## ADM           0.7204 kmeans 3       
## FOM           4.4473 pam    5       
## Connectivity 10.1579 kmeans 3       
## Dunn          0.4483 kmeans 5       
## Silhouette    0.6046 kmeans 3

Siguiendo estos criterios parece que el clustering mejor sigue siendo el de k-medias con k = 3, aunque son bastante parecidos al haber solo 15 comunidades autónomas y 2 ciudades autónomas.

Visualización del Clúster final

#Intercambiamos los clústers 2 y 3 para que sea similar al de países y ordenados por nota. La columna de clúster ahora es ESPgrupos3_mod
ESPgruposC$ESPgrupos3_mod <- ESPgruposC$ESPgrupos3
ESPgruposC$ESPgrupos3_mod[ESPgruposC$ESPgrupos3 == 2] <- 99
ESPgruposC$ESPgrupos3_mod[ESPgruposC$ESPgrupos3 == 3] <- 2
ESPgruposC$ESPgrupos3_mod[ESPgruposC$ESPgrupos3_mod == 99] <- 3

mapSPAIN <- regions %>%
  left_join(ESPgruposC, by = c("region" = "REGION_NAME"))
mapSPAIN$ESPgrupos3_mod <- as.factor(mapSPAIN$ESPgrupos3_mod)

ggplot(mapSPAIN) +
  geom_sf(aes(fill = ESPgrupos3_mod), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set2", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de CCAA españolas KMEDIAS k=3", fill = "Cluster")

#Con k = 3 todos los clústers son prácticamente iguales

CCAACluster = merge(MediasREG, ESPgruposC, by = "REGION_NAME")

MediasClusterRegiones = CCAACluster %>%
  group_by(ESPgrupos3_mod) %>%
  summarise(
    nota_media_Mates = mean(MathMeanReg),
    nota_media_Lectura = mean(LectureMeanReg),
    nota_media_Ciencias = mean(ScienceMeanReg)
    )
MediasClusterRegiones$nota_media_General = rowMeans(MediasClusterRegiones[,2:ncol(MediasClusterRegiones)])
MediasClusterRegiones

## # A tibble: 3 × 5
##   ESPgrupos3_mod nota_media_Mates nota_media_Lectura nota_media_Ciencias
##            <dbl>            <dbl>              <dbl>               <dbl>
## 1              1             417.               424.                428.
## 2              2             472.               474.                483.
## 3              3             498.               495.                506.
## # ℹ 1 more variable: nota_media_General <dbl>

MediasClusterRegiones2 <- MediasClusterRegiones %>%
  pivot_longer(
    cols = starts_with("nota"),
    names_to = "area",
    values_to = "nota"
  )

MediasClusterRegiones2$area <- gsub("nota_media_", "", MediasClusterRegiones2$area)

ggplot(MediasClusterRegiones2, aes(x = factor(ESPgrupos3_mod), y = nota, fill = area)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Nota media por área y clúster",
    x = "Clúster",
    y = "Nota media",
    fill = "Área"
  ) +
  theme_minimal()

CCAACluster %>%
  group_by(ESPgrupos3_mod) %>%
  summarise(regiones = paste(REGION_NAME, collapse = ", "))

## # A tibble: 3 × 2
##   ESPgrupos3_mod regiones                                                       
##            <dbl> <chr>                                                          
## 1              1 Ceuta, Melilla                                                 
## 2              2 Andalucía, Canary Is., Castilla-La Mancha, Cataluña, Extremadu…
## 3              3 Aragón, Asturias, Cantabria, Castilla y León, Foral de Navarra…

Los clústeres ordenados de menor nota (en todas las áreas) a mayor son: Clúster 1 < Clúster 3 < Clúster 2.

Clúster 1 = {Ceuta, Melilla}. Ciudades autónomas situadas en África que están menos conectadas con el resto de España y suelen tener menos recursos.

Clúster 2 = {Galicia, Asturias, Cantabria, Castilla y León, Navarra, Aragón, La Rioja, Madrid}. Comunidades autónomas del norte de España, muchas suelen destacar en índices socieconómicos y tener desarrollo tecnológico.

Clúster 3 = {Islas Canarias, Andalucía, Comunidad Valenciana, Extremadura, Castilla-La Mancha, Murcia, Cataluña, País Vasco}. La mayoría de estas Comunidades autónomas suelen estar en el sur de España y estar un poco peor que las del clúster 2 en cuanto a índices socioeconómicos y tecnología.

Clustering Banco Mundial + NotasMediasPaís

escuelas <- read_sas("escuelas.SAS7BDAT", NULL)

bancomundial =load('bancomundial.R')
bancomundial <- datos_filtrado

bm_media_res<- merge(
  NotasMediasPais, 
  bancomundial, 
  by.x = "CNT",  # Columna de países en NotasMediasPais
  by.y = "Country.Code",  # Columna de países en bancomundial (ajusta según tu CSV)
  all.x = TRUE  # Conservar todos los países de PISA aunque falten datos del BM
)

write.csv(bm_media_res, file = "bm_media_res.csv", row.names = FALSE)

datos_numericos <- read_excel("datos_bm_res.xlsx")

datos = subset(datos_numericos,select = -c(CNT))
summary(datos)

##  MathMean_Country LectureMean_Country ScienceMean_Country
##  Min.   :343.5    Min.   :337.7       Min.   :356.2      
##  1st Qu.:396.2    1st Qu.:406.0       1st Qu.:411.8      
##  Median :449.1    Median :448.5       Median :461.2      
##  Mean   :445.1    Mean   :442.6       Mean   :454.4      
##  3rd Qu.:483.6    3rd Qu.:484.0       3rd Qu.:497.6      
##  Max.   :574.6    Max.   :543.2       Max.   :561.5      
##                                                          
##  Compulsory education, duration (years)
##  Min.   : 6.00                         
##  1st Qu.:10.00                         
##  Median :11.00                         
##  Mean   :10.93                         
##  3rd Qu.:12.00                         
##  Max.   :15.00                         
##  NA's   :18                            
##  Current education expenditure, total (% of total expenditure in public institutions)
##  Min.   :77.62                                                                       
##  1st Qu.:84.73                                                                       
##  Median :91.75                                                                       
##  Mean   :90.27                                                                       
##  3rd Qu.:96.85                                                                       
##  Max.   :99.34                                                                       
##  NA's   :69                                                                          
##  Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)
##  Min.   : 9.481                                                                                   
##  1st Qu.:19.822                                                                                   
##  Median :26.128                                                                                   
##  Mean   :26.834                                                                                   
##  3rd Qu.:33.694                                                                                   
##  Max.   :51.070                                                                                   
##  NA's   :23                                                                                       
##  Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)
##  Min.   :48.79                                                                                     
##  1st Qu.:75.92                                                                                     
##  Median :89.87                                                                                     
##  Mean   :84.01                                                                                     
##  3rd Qu.:96.46                                                                                     
##  Max.   :99.72                                                                                     
##  NA's   :18                                                                                        
##  Educational attainment, at least completed post-secondary, population 25+, total (%) (cumulative)
##  Min.   :12.11                                                                                    
##  1st Qu.:24.71                                                                                    
##  Median :34.66                                                                                    
##  Mean   :35.45                                                                                    
##  3rd Qu.:43.50                                                                                    
##  Max.   :67.27                                                                                    
##  NA's   :23                                                                                       
##  Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)
##  Min.   : 63.81                                                                                  
##  1st Qu.: 91.23                                                                                  
##  Median : 97.56                                                                                  
##  Mean   : 93.95                                                                                  
##  3rd Qu.: 99.21                                                                                  
##  Max.   :100.00                                                                                  
##  NA's   :18                                                                                      
##  Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)
##  Min.   :34.46                                                                                     
##  1st Qu.:55.53                                                                                     
##  Median :74.85                                                                                     
##  Mean   :69.24                                                                                     
##  3rd Qu.:81.59                                                                                     
##  Max.   :91.99                                                                                     
##  NA's   :18                                                                                        
##  Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)
##  Min.   : 0.7075                                                                                
##  1st Qu.: 3.9465                                                                                
##  Median :12.5948                                                                                
##  Mean   :11.0888                                                                                
##  3rd Qu.:15.8233                                                                                
##  Max.   :22.8423                                                                                
##  NA's   :25                                                                                     
##  Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)
##  Min.   :0.0000                                                                        
##  1st Qu.:0.2900                                                                        
##  Median :0.7576                                                                        
##  Mean   :0.9213                                                                        
##  3rd Qu.:1.3326                                                                        
##  Max.   :3.1390                                                                        
##  NA's   :20                                                                            
##  Government expenditure on education, total (% of GDP)
##  Min.   :0.8639                                       
##  1st Qu.:3.3393                                       
##  Median :4.1601                                       
##  Mean   :4.1761                                       
##  3rd Qu.:4.9824                                       
##  Max.   :7.1397                                       
##  NA's   :48                                           
##  Government expenditure on education, total (% of government expenditure)
##  Min.   : 7.45                                                           
##  1st Qu.:10.12                                                           
##  Median :13.93                                                           
##  Mean   :13.36                                                           
##  3rd Qu.:15.73                                                           
##  Max.   :22.36                                                           
##  NA's   :51                                                              
##  GNI per capita, Atlas method (current US$)
##  Min.   : 4010                             
##  1st Qu.:10942                             
##  Median :24895                             
##  Mean   :32341                             
##  3rd Qu.:53585                             
##  Max.   :95480                             
##  NA's   :18                                
##  GNI per capita, PPP (constant 2021 international $)
##  Min.   : 10003                                     
##  1st Qu.: 25668                                     
##  Median : 41695                                     
##  Mean   : 43316                                     
##  3rd Qu.: 58408                                     
##  Max.   :110892                                     
##  NA's   :25                                         
##  Labor force with advanced education (% of total working-age population with advanced education)
##  Min.   :71.14                                                                                  
##  1st Qu.:76.17                                                                                  
##  Median :78.58                                                                                  
##  Mean   :78.71                                                                                  
##  3rd Qu.:81.11                                                                                  
##  Max.   :90.08                                                                                  
##  NA's   :18                                                                                     
##  Labor force with basic education (% of total working-age population with basic education)
##  Min.   :17.17                                                                            
##  1st Qu.:30.58                                                                            
##  Median :45.48                                                                            
##  Mean   :44.98                                                                            
##  3rd Qu.:56.72                                                                            
##  Max.   :85.92                                                                            
##  NA's   :18                                                                               
##  Labor force with intermediate education (% of total working-age population with intermediate education)
##  Min.   :48.39                                                                                          
##  1st Qu.:61.32                                                                                          
##  Median :65.27                                                                                          
##  Mean   :65.33                                                                                          
##  3rd Qu.:69.00                                                                                          
##  Max.   :77.81                                                                                          
##  NA's   :18                                                                                             
##  Unemployment with advanced education (% of total labor force with advanced education)
##  Min.   : 0.794                                                                       
##  1st Qu.: 2.365                                                                       
##  Median : 3.533                                                                       
##  Mean   : 4.202                                                                       
##  3rd Qu.: 5.187                                                                       
##  Max.   :11.397                                                                       
##  NA's   :18                                                                           
##  Unemployment with basic education (% of total labor force with basic education)
##  Min.   : 0.421                                                                 
##  1st Qu.: 4.255                                                                 
##  Median : 7.901                                                                 
##  Mean   : 8.972                                                                 
##  3rd Qu.:11.917                                                                 
##  Max.   :40.621                                                                 
##  NA's   :18                                                                     
##  Unemployment with intermediate education (% of total labor force with intermediate education)
##  Min.   : 0.885                                                                               
##  1st Qu.: 3.977                                                                               
##  Median : 5.194                                                                               
##  Mean   : 6.154                                                                               
##  3rd Qu.: 7.560                                                                               
##  Max.   :16.066                                                                               
##  NA's   :18                                                                                   
##  Individuals using the Internet (% of population) Fixed broadband subscriptions
##  Min.   : 62.89                                   Min.   :    90200            
##  1st Qu.: 83.63                                   1st Qu.:  1087500            
##  Median : 89.11                                   Median :  3350000            
##  Mean   : 87.89                                   Mean   :  8885245            
##  3rd Qu.: 94.78                                   3rd Qu.:  9062500            
##  Max.   :100.00                                   Max.   :128000000            
##  NA's   :18                                       NA's   :18                   
##  Mobile cellular subscriptions Secure Internet servers GDP growth (annual %)
##  Min.   :   456864             Min.   :     518        Min.   :-5.015       
##  1st Qu.:  5760000             1st Qu.:   21833        1st Qu.: 2.581       
##  Median : 11450000             Median :  213266        Median : 4.191       
##  Mean   : 41068747             Mean   : 1739713        Mean   : 4.133       
##  3rd Qu.: 48150000             3rd Qu.:  766554        3rd Qu.: 5.300       
##  Max.   :373000000             Max.   :60194281        Max.   :10.770       
##  NA's   :18                    NA's   :18              NA's   :18           
##  GDP per capita (constant 2015 US$) Access to electricity (% of population)
##  Min.   : 3578                      Min.   : 94.80                         
##  1st Qu.:10302                      1st Qu.:100.00                         
##  Median :20611                      Median :100.00                         
##  Mean   :28843                      Mean   : 99.72                         
##  3rd Qu.:44483                      3rd Qu.:100.00                         
##  Max.   :99677                      Max.   :100.00                         
##  NA's   :18                         NA's   :18                             
##  Access to clean fuels and technologies for cooking (% of population)
##  Min.   : 54.20                                                      
##  1st Qu.: 95.10                                                      
##  Median :100.00                                                      
##  Mean   : 95.39                                                      
##  3rd Qu.:100.00                                                      
##  Max.   :100.00                                                      
##  NA's   :19                                                          
##  Population ages 0-14, total Population ages 15-64, total
##  Min.   :   70206            Min.   :   254073           
##  1st Qu.:  851109            1st Qu.:  3008344           
##  Median : 1527079            Median :  6386626           
##  Mean   : 6514125            Mean   : 21791933           
##  3rd Qu.: 6383518            3rd Qu.: 24254126           
##  Max.   :70471815            Max.   :217288142           
##  NA's   :18                  NA's   :18                  
##  Population ages 0-14 (% of total population)
##  Min.   :11.41                               
##  1st Qu.:15.08                               
##  Median :16.66                               
##  Mean   :18.21                               
##  3rd Qu.:20.07                               
##  Max.   :32.88                               
##  NA's   :18                                  
##  Population ages 15-64 (% of total population)
##  Min.   :59.89                                
##  1st Qu.:63.64                                
##  Median :65.45                                
##  Mean   :66.12                                
##  3rd Qu.:66.99                                
##  Max.   :81.88                                
##  NA's   :18                                   
##  Population ages 65 and above (% of total population)
##  Min.   : 1.64                                       
##  1st Qu.:10.11                                       
##  Median :17.53                                       
##  Mean   :15.67                                       
##  3rd Qu.:20.39                                       
##  Max.   :23.93                                       
##  NA's   :18                                          
##  Population ages 65 and above, total Population, total  
##  Min.   :   28241                    Min.   :   382003  
##  1st Qu.:  537674                    1st Qu.:  4571021  
##  Median : 1579097                    Median :  9859677  
##  Mean   : 4523469                    Mean   : 32829526  
##  3rd Qu.: 4605347                    3rd Qu.: 36290185  
##  Max.   :56388258                    Max.   :333271411  
##  NA's   :18                          NA's   :18         
##  Preprimary education, duration (years)
##  Min.   :1.000                         
##  1st Qu.:4.000                         
##  Median :6.000                         
##  Mean   :5.125                         
##  3rd Qu.:6.000                         
##  Max.   :7.000                         
##  NA's   :20                            
##  Proportion of seats held by women in national parliaments (%)
##  Min.   : 9.091                                               
##  1st Qu.:23.542                                               
##  Median :29.563                                               
##  Mean   :31.409                                               
##  3rd Qu.:40.446                                               
##  Max.   :50.000                                               
##  NA's   :18                                                   
##  Employment in industry (% of total employment) (modeled ILO estimate)
##  Min.   :12.68                                                        
##  1st Qu.:18.48                                                        
##  Median :21.20                                                        
##  Mean   :22.56                                                        
##  3rd Qu.:26.87                                                        
##  Max.   :36.89                                                        
##  NA's   :18                                                           
##  Employment in agriculture (% of total employment) (modeled ILO estimate)
##  Min.   : 0.09095                                                        
##  1st Qu.: 2.22190                                                        
##  Median : 4.31686                                                        
##  Mean   : 9.09093                                                        
##  3rd Qu.:13.21798                                                        
##  Max.   :55.38076                                                        
##  NA's   :18                                                              
##  Employment in services (% of total employment) (modeled ILO estimate)
##  Min.   :31.94                                                        
##  1st Qu.:61.77                                                        
##  Median :69.46                                                        
##  Mean   :68.35                                                        
##  3rd Qu.:77.82                                                        
##  Max.   :85.34                                                        
##  NA's   :18                                                           
##  Birth rate, crude (per 1,000 people)
##  Min.   : 4.900                      
##  1st Qu.: 8.800                      
##  Median : 9.894                      
##  Mean   :11.135                      
##  3rd Qu.:12.050                      
##  Max.   :21.618                      
##  NA's   :18

datos_imputados <- datos_numericos
datos_imputados[, -1] <- lapply(datos_imputados[, -1], function(x) {
  ifelse(is.na(x), mean(x, na.rm=TRUE),x)
})

NotasNumericas = datos_imputados[,-1]
NotasNumericas = as.data.frame(NotasNumericas)

metodos = c("hierarchical","kmeans","pam")
validacion = suppressMessages(clValid(NotasNumericas, nClust = 4:7, metric = "euclidean", clMethods = metodos, validation = c("internal", "stability"),
method = "ward"))

## Warning in clValid(NotasNumericas, nClust = 4:7, metric = "euclidean",
## clMethods = metodos, : rownames for data not specified, using 1:nrow(data)

summary(validacion)

## 
## Clustering Methods:
##  hierarchical kmeans pam 
## 
## Cluster sizes:
##  4 5 6 7 
## 
## Validation Measures:
##                                       4            5            6            7
##                                                                               
## hierarchical APN           1.100000e-03 1.100000e-03 1.220000e-02 9.500000e-03
##              AD            1.638221e+07 1.319447e+07 1.129872e+07 9.414332e+06
##              ADM           9.343188e+04 7.277224e+04 3.196984e+05 3.375925e+05
##              FOM           1.277323e+06 9.959701e+05 8.960475e+05 7.526625e+05
##              Connectivity  9.025000e+00 1.334130e+01 1.534130e+01 1.802140e+01
##              Dunn          9.990000e-02 9.990000e-02 2.042000e-01 2.970000e-02
##              Silhouette    6.995000e-01 7.235000e-01 7.099000e-01 5.196000e-01
## kmeans       APN           3.700000e-03 1.700000e-03 1.330000e-02 7.500000e-03
##              AD            1.598113e+07 1.289033e+07 1.101110e+07 8.921211e+06
##              ADM           3.411711e+05 1.158990e+05 3.957884e+05 2.816039e+05
##              FOM           1.258489e+06 1.023782e+06 9.283123e+05 7.991300e+05
##              Connectivity  1.386390e+01 1.798020e+01 1.998020e+01 1.895120e+01
##              Dunn          2.740000e-02 2.740000e-02 7.380000e-02 6.920000e-02
##              Silhouette    7.221000e-01 7.425000e-01 7.289000e-01 6.168000e-01
## pam          APN           6.000000e-04 1.500000e-03 1.100000e-03 1.600000e-03
##              AD            1.584959e+07 1.306305e+07 1.089023e+07 8.818984e+06
##              ADM           5.055155e+04 4.432437e+05 8.815363e+04 9.662234e+04
##              FOM           1.252413e+06 1.174879e+06 6.881431e+05 6.427945e+05
##              Connectivity  1.386390e+01 1.798020e+01 1.998020e+01 2.103290e+01
##              Dunn          2.740000e-02 2.740000e-02 7.380000e-02 4.570000e-02
##              Silhouette    7.221000e-01 7.425000e-01 7.289000e-01 6.132000e-01
## 
## Optimal Scores:
## 
##              Score        Method       Clusters
## APN                0.0006 pam          4       
## AD           8818984.2818 pam          7       
## ADM            50551.5506 pam          4       
## FOM           642794.4851 pam          7       
## Connectivity       9.0250 hierarchical 4       
## Dunn               0.2042 hierarchical 6       
## Silhouette         0.7425 kmeans       5

Ward

p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "silhouette", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "wss", 
                  hc_method = "ward.D2", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

Método jerárquico de la media

p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "silhouette", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "wss", 
                  hc_method = "average", k.max = 10, verbose = FALSE, 
                  hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)

K-Means

p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = kmeans, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = kmeans, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "K-means")
grid.arrange(p1, p2, nrow = 1)

K-Medoides

p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = pam, method = "silhouette", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = pam, method = "wss", 
             k.max = 10, verbose = FALSE) +
  labs(title = "Numero optimo de clusters")
grid.arrange(p1, p2, nrow = 1)

5,7,5,4-6

midist3 = get_dist(NotasNumericas, stand = FALSE, method = "euclidean")

Aclust1 <- hclust(midist3, method="ward.D2")
Agrupos1 <- cutree(Aclust1, k=5)
table(Agrupos1)

## Agrupos1
##  1  2  3  4  5 
## 40 23  9  2  2

AgruposA = cbind(Agrupos1, datos_imputados[,"CNT"])

Aclust2 <- hclust(midist3, method="average")
Agrupos2 <- cutree(Aclust2, k=7)
table(Agrupos2)

## Agrupos2
##  1  2  3  4  5  6  7 
## 36 27  6  3  1  2  1

AgruposB = cbind(Agrupos2, datos_imputados[,"CNT"])

set.seed(100)
Aclust3 <- kmeans(NotasNumericas, centers = 5, nstart = 20)
table(Aclust3$cluster)

## 
##  1  2  3  4  5 
## 26 38  2  2  8

AgruposC = cbind(Aclust3$cluster, datos_imputados[,"CNT"])
colnames(AgruposC)[1] = "Agrupos3"

Aclust4 <- pam(NotasNumericas, k = 5)
table(Aclust4$clustering)

## 
##  1  2  3  4  5 
## 38 26  8  2  2

AgruposD = cbind(Aclust4$cluster, datos_imputados[,"CNT"])
colnames(AgruposD)[1] = "Agrupos4"

Visualización del mejor clustering

par(mfrow = c(1,2))
plot(silhouette(Agrupos1, midist3), col=colores, border=NA, main = "WARD")
plot(silhouette(Agrupos2, midist3), col=colores, border=NA, main = "MEDIAS")

plot(silhouette(Aclust3$cluster, midist3), col=colores, border=NA, main = "K-MEDIAS")
plot(silhouette(Aclust4$clustering, midist3), col=colores2, border=NA, main = "K-MEDOIDES")

clustAUX = Aclust3
gruposAUX = AgruposC

world = ne_countries(scale = "medium", returnclass = "sf")

world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"

world_clustered <- world %>%
  left_join(AgruposC, by = c("iso_a3" = "CNT"))
world_clustered$gruposAUX <- as.factor(world_clustered$Agrupos3)

ggplot(world_clustered) +
  geom_sf(aes(fill = gruposAUX), color = NA, size = 0.1) +
  scale_fill_brewer(palette = "Set2", na.value = "grey90") +
  theme_minimal() +
  labs(title = "Mapa de Clústeres de Países KMEDIAS k=5", fill = "Cluster")

PaisesCluster = merge(datos_imputados, gruposAUX, by = "CNT")

MediasCluster = PaisesCluster %>%
  group_by(Agrupos3) %>%
  summarise(
    nota_media_Mates = mean(MathMean_Country),
    nota_media_Lectura = mean(LectureMean_Country),
    nota_media_Ciencias = mean(ScienceMean_Country)
    )
MediasCluster$nota_media_General = rowMeans(MediasCluster[,2:ncol(MediasCluster)])
MediasCluster

## # A tibble: 5 × 5
##   Agrupos3 nota_media_Mates nota_media_Lectura nota_media_Ciencias
##      <int>            <dbl>              <dbl>               <dbl>
## 1        1             437.               429.                444.
## 2        2             452.               450.                461.
## 3        3             375.               381.                384.
## 4        4             423.               441.                450.
## 5        5             463.               466.                478.
## # ℹ 1 more variable: nota_media_General <dbl>

MediasCluster2 <- MediasCluster %>%
  pivot_longer(
    cols = starts_with("nota"),
    names_to = "area",
    values_to = "nota"
  )

MediasCluster2$area <- gsub("nota_media_", "", MediasCluster2$area)

ggplot(MediasCluster2, aes(x = factor(Agrupos3), y = nota, fill = area)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Nota media por área y clúster",
    x = "Clúster",
    y = "Nota media",
    fill = "Área"
  ) +
  theme_minimal()

# Combinar datos de PISA con Banco Mundial
datos_combi <- merge(
  NotasMediasPais, 
  bancomundial, 
  by.x = "CNT", 
  by.y = "Country.Code",
  all.x=TRUE
)

# 1. Limpieza y preparación de datos
bm_pisa_clean <- datos_combi %>%
  mutate(across(-CNT, ~ {
    x_char <- as.character(.x)
    as.numeric(ifelse(x_char %in% c("", "NA"), NA, x_char))
  })) %>%
  # Eliminar columnas con >50% NAs
  select(where(~ mean(is.na(.x)) <= 0.5)) %>%
  # Eliminar filas con NAs (alternativa: tidyr::fill() o imputación)
  filter(complete.cases(.))

# 2. Estandarización y clustering
set.seed(100)
bm_pisa_scaled <- bm_pisa_clean %>%
  select(-CNT) %>%
  scale() %>%
  as.data.frame()

clust_bm <- kmeans(bm_pisa_scaled, centers = 5, nstart = 20)

# 3. Asignar clusters a los datos originales
bm_cluster_data <- bm_pisa_clean %>%
  mutate(Cluster = as.factor(clust_bm$cluster))

# 4. Análisis de variables importantes (ajusta los patrones según tus datos)
vars_importantes <- names(bm_cluster_data) %>%
  grep("GDP|Internet|education|employment|GNI|PISA|math|science|reading", 
       ., value = TRUE, ignore.case = TRUE)
# 5. Estadísticas por cluster
medias_cluster <- bm_cluster_data %>%
  select(Cluster, all_of(vars_importantes)) %>%
  group_by(Cluster) %>%
  summarise(across(
    everything(),
    list(mean = ~mean(., na.rm = TRUE),
         sd = ~sd(., na.rm = TRUE))
  ))

# 6. Preparar para visualización
medias_plot <- medias_cluster %>%
  pivot_longer(
    cols = -Cluster,
    names_to = c("Variable", ".value"),
    names_sep = "_"
  )

## Warning: Expected 2 pieces. Additional pieces discarded in 4 rows [1, 2, 5, 6].

# 7. Visualización de indicadores
ggplot(medias_plot, aes(x = Cluster, y = mean, fill = Variable)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.2,
    position = position_dodge(0.8),
    color = "black"
  ) +
  labs(title = "Indicadores por Cluster") +
  theme_minimal()

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_col()`).

# 1. Primero verifiquemos los nombres reales de las columnas
names(bm_cluster_data)

##  [1] "CNT"                                                                                                    
##  [2] "MathMean_Country"                                                                                       
##  [3] "MathSD"                                                                                                 
##  [4] "LectureMean_Country"                                                                                    
##  [5] "LectureSD"                                                                                              
##  [6] "ScienceMean_Country"                                                                                    
##  [7] "ScienceSD"                                                                                              
##  [8] "Compulsory education, duration (years)"                                                                 
##  [9] "Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)"      
## [10] "Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)"     
## [11] "Educational attainment, at least completed post-secondary, population 25+, total (%) (cumulative)"      
## [12] "Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)"       
## [13] "Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)"     
## [14] "Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)"        
## [15] "Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)"                 
## [16] "GNI per capita, Atlas method (current US$)"                                                             
## [17] "GNI per capita, PPP (constant 2021 international $)"                                                    
## [18] "Labor force with advanced education (% of total working-age population with advanced education)"        
## [19] "Labor force with basic education (% of total working-age population with basic education)"              
## [20] "Labor force with intermediate education (% of total working-age population with intermediate education)"
## [21] "Unemployment with advanced education (% of total labor force with advanced education)"                  
## [22] "Unemployment with basic education (% of total labor force with basic education)"                        
## [23] "Unemployment with intermediate education (% of total labor force with intermediate education)"          
## [24] "Individuals using the Internet (% of population)"                                                       
## [25] "Fixed broadband subscriptions"                                                                          
## [26] "Mobile cellular subscriptions"                                                                          
## [27] "Secure Internet servers"                                                                                
## [28] "GDP growth (annual %)"                                                                                  
## [29] "GDP per capita (constant 2015 US$)"                                                                     
## [30] "Access to electricity (% of population)"                                                                
## [31] "Access to clean fuels and technologies for cooking (% of population)"                                   
## [32] "Population ages 0-14, total"                                                                            
## [33] "Population ages 15-64, total"                                                                           
## [34] "Population ages 0-14 (% of total population)"                                                           
## [35] "Population ages 15-64 (% of total population)"                                                          
## [36] "Population ages 65 and above (% of total population)"                                                   
## [37] "Population ages 65 and above, total"                                                                    
## [38] "Population, total"                                                                                      
## [39] "Preprimary education, duration (years)"                                                                 
## [40] "Proportion of seats held by women in national parliaments (%)"                                          
## [41] "Employment in industry (% of total employment) (modeled ILO estimate)"                                  
## [42] "Employment in agriculture (% of total employment) (modeled ILO estimate)"                               
## [43] "Employment in services (% of total employment) (modeled ILO estimate)"                                  
## [44] "Birth rate, crude (per 1,000 people)"                                                                   
## [45] "Cluster"

# 2. Seleccionar variables basadas en los nombres reales
# (Estos son ejemplos - ajusta según los nombres reales en tus datos)
vars_importantes <- names(bm_cluster_data) %>%
  grep("GDP|Internet|education|employment|GNI", ., value = TRUE, ignore.case = TRUE)

# Mostrar las variables seleccionadas para confirmación
print(vars_importantes)

##  [1] "Compulsory education, duration (years)"                                                                 
##  [2] "Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)"      
##  [3] "Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)"     
##  [4] "Educational attainment, at least completed post-secondary, population 25+, total (%) (cumulative)"      
##  [5] "Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)"       
##  [6] "Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)"     
##  [7] "Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)"        
##  [8] "Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)"                 
##  [9] "GNI per capita, Atlas method (current US$)"                                                             
## [10] "GNI per capita, PPP (constant 2021 international $)"                                                    
## [11] "Labor force with advanced education (% of total working-age population with advanced education)"        
## [12] "Labor force with basic education (% of total working-age population with basic education)"              
## [13] "Labor force with intermediate education (% of total working-age population with intermediate education)"
## [14] "Unemployment with advanced education (% of total labor force with advanced education)"                  
## [15] "Unemployment with basic education (% of total labor force with basic education)"                        
## [16] "Unemployment with intermediate education (% of total labor force with intermediate education)"          
## [17] "Individuals using the Internet (% of population)"                                                       
## [18] "Secure Internet servers"                                                                                
## [19] "GDP growth (annual %)"                                                                                  
## [20] "GDP per capita (constant 2015 US$)"                                                                     
## [21] "Preprimary education, duration (years)"                                                                 
## [22] "Employment in industry (% of total employment) (modeled ILO estimate)"                                  
## [23] "Employment in agriculture (% of total employment) (modeled ILO estimate)"                               
## [24] "Employment in services (% of total employment) (modeled ILO estimate)"

# 3. Calcular estadísticas por cluster (versión robusta)
medias_cluster_bm <- bm_cluster_data %>%
  # Seleccionar solo columnas numéricas además del Cluster
  select(Cluster, where(is.numeric)) %>%
  group_by(Cluster) %>%
  summarise(across(
    where(is.numeric),
    list(mean = ~mean(., na.rm = TRUE),
         sd = ~sd(., na.rm = TRUE))
  ))  # <- Paréntesis faltante aquí

# 4. Preparar datos para visualización (versión segura)
medias_plot_bm <- medias_cluster_bm %>%
  pivot_longer(
    cols = -Cluster,
    names_to = c("Variable", ".value"),
    names_sep = "_"
  )

## Warning: Expected 2 pieces. Additional pieces discarded in 6 rows [1, 2, 5, 6,
## 9, 10].

# 5. Visualización
ggplot(medias_plot_bm, aes(x = Cluster, y = mean, fill = Variable)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.2,
    position = position_dodge(0.8),
    color = "black"
  ) +
  labs(
    title = "Indicadores del Banco Mundial por Cluster",
    x = "Cluster",
    y = "Valor promedio",
    fill = "Indicador"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 45,hjust=1))

## Warning: Removed 30 rows containing missing values or values outside the scale range
## (`geom_col()`).

PCA + Clustering

load('pca_data.RData')

gruposC = gruposC[-(which(gruposC$CNT %in% c("USA", "IDN", "PHL"))),]
pca_data$clusters = as.factor(gruposC$grupos3)


ggplot(pca_data, aes(x = Dim1, y = Dim2, label = label, color = clusters)) +
  geom_text(size = 3) +
  scale_color_manual(values = c("#228B22", "#FF8C00", "blue", "#D33682")) +
  labs(title = "PCA - Dimensiones 1 y 2", color = "Cluster") +
  theme_minimal()

ggplot(pca_data, aes(x = Dim3, y = Dim4, label = label, color = clusters)) +
  geom_text(size = 3) +
  scale_color_manual(values = c("#228B22", "#FF8C00", "blue", "#D33682")) +
  labs(title = "PCA - Dimensiones 3 y 4", color = "Cluster") +
  theme_minimal()

ProyectoClustering

David

2025-02-21

Librerías

Limpieza de datos

Creación de notas medias por estudiante y por país

Boxplot medias estudiantes

Boxplot medias países

Identificar códigos que no hacen bien el join

Mapa medias por país

Heatmap Medias y Desviaciones típicas

Estadístico Hopkins

Prueba de PCA

Clustering por países

Visualización número de clústeres óptimos (Silhouette y SC intraclúster)

Ward

Método Jerárquico de la media

K-Medias

K-Medoides

Creación de clusterings

Visualización de clústeres

Visualización del mejor clustering

Validación del clustering

Visualización del Clúster final

Clustering España

Creación del df de medias por CCAA

Boxplot medias estudiantes de España

Boxplot medias comunidades autónomas

HeatMap España

Estadístico Hopkins España

Mapa medias por Comunidad Autónoma

Visualización número de clústeres óptimos (Silhouette y SC intraclúster)

Ward

Método jerárquico de la media

K-Means

K-Medoides

Creación de clusterings

Visualización del mejor clustering

Validación del clustering

Visualización del Clúster final

Clustering Banco Mundial + NotasMediasPaís

Ward

Método jerárquico de la media

K-Means

K-Medoides

Visualización del mejor clustering

PCA + Clustering