library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(haven)
library(ggplot2)
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.4.3
library(rnaturalearthdata)
## Warning: package 'rnaturalearthdata' was built under R version 4.4.3
##
## Adjuntando el paquete: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
##
## countries110
library(rnaturalearthhires)
library(cluster)
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(NbClust)
library(clValid)
## Warning: package 'clValid' was built under R version 4.4.3
library(grid)
library(gridExtra)
##
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggsci)
library(missMDA)
## Warning: package 'missMDA' was built under R version 4.4.3
library(readxl)
library(mice)
##
## Adjuntando el paquete: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
#EJECUTAR ESTE CÓDIGO UNA SOLA VEZ
datosPisaEstudiantes = read_sas("C:/Users/David/UPV/PROYII/proyecto/datosPisaEstudiantes.SAS7BDAT", NULL)
numNA = apply(datosPisaEstudiantes, 1, function(x) sum(is.na(x)))
percNA = round(100*apply(datosPisaEstudiantes, 1, function(x) mean(is.na(x))), 2)
tablaNA2 = data.frame(numNA, percNA)
summary(tablaNA2$numNA)
barplot(table(tablaNA2$percNA), xlab = "% Valores faltantes", ylab = "Número de casos",
main = "Valores faltantes por estudiante")
datosPisaEstudiantes_filtrado <- datosPisaEstudiantes %>%
select(which(colMeans(is.na(datosPisaEstudiantes)) <= 0.2))
#Filtrado de estudiantes puede que sea innecesario
datosPisaEstudiantes_filtrado = datosPisaEstudiantes_filtrado %>% filter(rowMeans(is.na(.)) <= 0.2)
save(datosPisaEstudiantes_filtrado, file="estudiantesPISAfiltrados.R")
load("estudiantesPISAfiltrados.R")
NotasPlausibles = datosPisaEstudiantes_filtrado %>%
select(matches("^PV|^CNT$|^CNTSCHID$"))
NotasMedias = NotasPlausibles %>% transmute(CNT = NotasPlausibles$CNT,
CNTSCHID = NotasPlausibles$CNTSCHID,
MathMean = rowMeans(NotasPlausibles[,3:12]),
LectureMean = rowMeans(NotasPlausibles[,13:22]),
ScienceMean = rowMeans(NotasPlausibles[,23:32]))
NotasMediasPais = NotasMedias %>% group_by(CNT) %>% summarise(
MathMean_Country = mean(MathMean, na.rm = TRUE),
MathSD = sd(MathMean, na.rm = TRUE),
LectureMean_Country = mean(LectureMean, na.rm = TRUE),
LectureSD = sd(LectureMean, na.rm = TRUE),
ScienceMean_Country = mean(ScienceMean, na.rm = TRUE),
ScienceSD = sd(ScienceMean, na.rm = TRUE))
#He quitado todos los View y puesto todos los textos para que se vean bien aunque hayan los gráficos de clústeres óptimos y de Silhouette. Falta cambiar nombres de algunos gráficos y poner alguna explicación más
par(mar = c(5, 7, 4, 2))
boxplot(NotasMedias$MathMean, NotasMedias$LectureMean, NotasMedias$ScienceMean,
names = c("Matemáticas", "Lectura", "Ciencias"),
main = "Valores atípicos medias estudiantes",
col = c("red", "lightblue", "green"),
horizontal = TRUE, cex.axis = 0.8)
summary(NotasMedias)
## CNT CNTSCHID MathMean LectureMean
## Length:562708 Min. : 800001 Min. :150.2 Min. : 37.96
## Class :character 1st Qu.:21400219 1st Qu.:370.8 1st Qu.:365.57
## Mode :character Median :40000043 Median :438.2 Median :441.23
## Mean :43679973 Mean :446.4 Mean :443.54
## 3rd Qu.:68800107 3rd Qu.:514.6 3rd Qu.:519.47
## Max. :90100182 Max. :843.3 Max. :890.17
## ScienceMean
## Min. : 51.34
## 1st Qu.:379.33
## Median :449.85
## Mean :455.78
## 3rd Qu.:527.40
## Max. :852.45
par(mar = c(5, 7, 4, 2))
boxplot(
NotasMediasPais$MathMean_Country,
NotasMediasPais$LectureMean_Country,
NotasMediasPais$ScienceMean_Country,
names = c("Matemáticas", "Lectura", "Ciencias"),
main = "Valores atípicos medias países",
col = c("red", "lightblue", "green"),
horizontal = TRUE, cex.axis = 0.8)
summary(NotasMediasPais)
## CNT MathMean_Country MathSD LectureMean_Country
## Length:76 Min. :343.5 Min. : 48.07 Min. :337.7
## Class :character 1st Qu.:396.2 1st Qu.: 74.31 1st Qu.:406.0
## Mode :character Median :449.1 Median : 82.78 Median :448.5
## Mean :445.1 Mean : 80.63 Mean :442.6
## 3rd Qu.:483.6 3rd Qu.: 90.03 3rd Qu.:484.0
## Max. :574.6 Max. :108.34 Max. :543.2
## LectureSD ScienceMean_Country ScienceSD
## Min. : 59.36 Min. :356.2 Min. : 55.49
## 1st Qu.: 81.24 1st Qu.:411.8 1st Qu.: 76.93
## Median : 86.80 Median :461.2 Median : 86.68
## Mean : 87.58 Mean :454.4 Mean : 84.59
## 3rd Qu.: 96.22 3rd Qu.:497.6 3rd Qu.: 93.52
## Max. :114.10 Max. :561.5 Max. :104.17
world = ne_countries(scale = "medium", returnclass = "sf")
world <- left_join(world, NotasMediasPais, by = c("iso_a3" = "CNT"))
iso_validos = world$iso_a3[!is.na(world$MathMean_Country)]
iso_novalidos = setdiff(NotasMediasPais$CNT, iso_validos)
print(iso_novalidos)
## [1] "FRA" "KSV" "NOR" "QAZ" "QUR" "TAP"
#Poner este código cada vez que se cree world
world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"
#QAZ es Baku (Azerbaiyan) y QUR es algunas regiones de Ucrania
world = ne_countries(scale = "medium", returnclass = "sf")
world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"
world <- left_join(world, NotasMediasPais, by = c("iso_a3" = "CNT"))
ggplot(data = world) +
geom_sf(aes(fill = MathMean_Country), color = "white") +
scale_fill_viridis_c(option = "magma", na.value = "gray90") +
labs(title = "Puntaje Medio en Matemáticas por País",
fill = "Puntaje") +
theme_minimal()
ggplot(data = world) +
geom_sf(aes(fill = LectureMean_Country), color = "white") +
scale_fill_viridis_c(option = "magma", na.value = "gray90") +
labs(title = "Puntaje Medio en Lectura por País",
fill = "Puntaje") +
theme_minimal()
ggplot(data = world) +
geom_sf(aes(fill = ScienceMean_Country), color = "white") +
scale_fill_viridis_c(option = "magma", na.value = "gray90") +
labs(title = "Puntaje Medio en Ciencia por País",
fill = "Puntaje") +
theme_minimal()
midist = get_dist(NotasMediasPais[2:ncol(NotasMediasPais)], stand = FALSE, method = "euclidean")
fviz_dist(midist, show_labels = TRUE, lab_size = 0.3,
gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
#nrow(NotasMediasPais)
set.seed(100)
myN = c(20, 35, 50, 65) # m
myhopkins = NULL
myseed = sample(1:1000, 10)
for (i in myN) {
for (j in myseed) {
tmp = get_clust_tendency(data = NotasMediasPais[,2:ncol(NotasMediasPais)], n = i, graph = FALSE, seed = j)
myhopkins = c(myhopkins, tmp$hopkins_stat)
}
}
summary(myhopkins)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7923 0.8240 0.8315 0.8330 0.8429 0.8730
#Hopkins cercano a 1 indica datos agrupados
#Escalar por bloques?
res.pca = PCA(NotasMediasPais,
graph = FALSE, ncp = 2, quali.sup = 1)
fviz_pca_ind(res.pca, axes = c(1, 2), geom = c("point", "text"),
habillage = "CNT", repel = TRUE, labelsize = 2)
fviz_pca_var(res.pca, axes = c(1,2), repel = TRUE, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))
p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "silhouette",
hc_method = "ward.D2", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "wss",
hc_method = "ward.D2", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)
p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "silhouette",
hc_method = "average", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = hcut, method = "wss",
hc_method = "average", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)
#En los métodos jerárquicos se puede meter la variable CNT y no da error, pero mejor quitarla para que nos diga warnings (en los métodos de partición no se puede incluir directamente)
En los jerárquicos cogemos 5 clústeres, ya que los más óptimos son 2, 4 o 5 clústeres, pero 2 es demasiado poco y 5 es un poco mejor que 4 en ambos criterios.
p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = kmeans, method = "silhouette",
k.max = 10, verbose = FALSE) +
labs(title = "K-means")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = kmeans, method = "wss",
k.max = 10, verbose = FALSE) +
labs(title = "K-means")
grid.arrange(p1, p2, nrow = 1)
p1 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = pam, method = "silhouette",
k.max = 10, verbose = FALSE) +
labs(title = "Numero optimo de clusters")
p2 = fviz_nbclust(x = NotasMediasPais[,-1], FUNcluster = pam, method = "wss",
k.max = 10, verbose = FALSE) +
labs(title = "Numero optimo de clusters")
grid.arrange(p1, p2, nrow = 1)
Excluyendo la posibilidad de hacerlo con dos clústeres, con el método de k-medias nos salen 4 o 6 como la cantidad idonea de clústeres. Con el método de k-medoides nos sale otra vez 5 clústeres.
clust1 <- hclust(midist, method="ward.D2")
grupos1 <- cutree(clust1, k=5)
table(grupos1)
## grupos1
## 1 2 3 4 5
## 18 14 30 8 6
gruposA = cbind(grupos1, NotasMediasPais[,"CNT"])
clust2 <- hclust(midist, method="average")
grupos2 <- cutree(clust2, k=5)
table(grupos2)
## grupos2
## 1 2 3 4 5
## 18 14 28 8 8
gruposB = cbind(grupos2, NotasMediasPais[,"CNT"])
set.seed(100)
clust3 <- kmeans(NotasMediasPais[,-1], centers = 4, nstart = 20)
table(clust3$cluster)
##
## 1 2 3 4
## 15 23 31 7
gruposC = cbind(clust3$cluster, NotasMediasPais[,"CNT"])
colnames(gruposC)[1] = "grupos3"
clust4 <- pam(NotasMediasPais, k = 5)
table(clust4$clustering)
##
## 1 2 3 4 5
## 13 24 17 15 7
gruposD = cbind(clust4$cluster, NotasMediasPais[,"CNT"])
colnames(gruposD)[1] = "grupos4"
grupos1 == grupos2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [25] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [73] TRUE TRUE TRUE TRUE
Son prácticamente iguales los clusterings en los dos métodos jerárquicos, solo cambia en que incluye irlanda y estonia en el grupo de singapur.
world = ne_countries(scale = "large", returnclass = "sf")
world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"
world_clustered <- world %>%
left_join(gruposA, by = c("iso_a3" = "CNT"))
world_clustered$grupos1 <- as.factor(world_clustered$grupos1)
ggplot(world_clustered) +
geom_sf(aes(fill = grupos1), color = NA, size = 0.1) +
scale_fill_brewer(palette = "Set3", na.value = "grey90") +
theme_minimal() +
labs(title = "Mapa de Clústeres de Países WARD k=5", fill = "Cluster")
world_clustered <- world %>%
left_join(gruposB, by = c("iso_a3" = "CNT"))
world_clustered$grupos2 <- as.factor(world_clustered$grupos2)
ggplot(world_clustered) +
geom_sf(aes(fill = grupos2), color = NA, size = 0.1) +
scale_fill_brewer(palette = "Set3", na.value = "grey90") +
theme_minimal() +
labs(title = "Mapa de Clústeres de Países MEDIAS k=5", fill = "Cluster")
world_clustered <- world %>%
left_join(gruposC, by = c("iso_a3" = "CNT"))
world_clustered$grupos3 <- as.factor(world_clustered$grupos3)
ggplot(world_clustered) +
geom_sf(aes(fill = grupos3), color = NA, size = 0.1) +
scale_fill_brewer(palette = "Set3", na.value = "grey90") +
theme_minimal() +
labs(title = "Mapa de Clústeres de Países KMEDIAS k=4", fill = "Cluster")
world_clustered <- world %>%
left_join(gruposD, by = c("iso_a3" = "CNT"))
world_clustered$grupos4 <- as.factor(world_clustered$grupos4)
ggplot(world_clustered) +
geom_sf(aes(fill = grupos4), color = NA, size = 0.1) +
scale_fill_brewer(palette = "Set3", na.value = "grey90") +
theme_minimal() +
labs(title = "Mapa de Clústeres de Países KMEDOIDES k=5", fill = "Cluster")
colores = pal_npg("nrc")(6)
colores2 = pal_npg("nrc")(7)
par(mfrow = c(1,2))
plot(silhouette(grupos1, midist), col=colores, border=NA, main = "WARD")
plot(silhouette(grupos2, midist), col=colores, border=NA, main = "MEDIAS")
plot(silhouette(clust3$cluster, midist), col=colores, border=NA, main = "K-MEDIAS")
plot(silhouette(clust4$clustering, midist), col=colores2, border=NA, main = "K-MEDOIDES")
Según el coeficiente de Silhouette el mejor método es el de k-medias, así que lo escogemos. Además no tiene ningún individuo con un coeficiente de Silhouette negativo.
NotasNumericas = NotasMediasPais[,-1]
NotasNumericas = as.data.frame(NotasNumericas)
metodos = c("hierarchical","kmeans","pam")
validacion = suppressMessages(clValid(NotasNumericas, nClust = 4:6, metric = "euclidean", clMethods = metodos, validation = c("internal", "stability"),
method = "ward"))
## Warning in clValid(NotasNumericas, nClust = 4:6, metric = "euclidean",
## clMethods = metodos, : rownames for data not specified, using 1:nrow(data)
summary(validacion)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 4 5 6
##
## Validation Measures:
## 4 5 6
##
## hierarchical APN 0.2186 0.1967 0.1723
## AD 49.3383 40.1190 35.3354
## ADM 24.3513 15.5680 11.9886
## FOM 14.2901 12.3023 10.9805
## Connectivity 14.8532 19.9964 27.0885
## Dunn 0.2218 0.2663 0.1502
## Silhouette 0.4536 0.4568 0.3689
## kmeans APN 0.0669 0.1233 0.0779
## AD 41.2157 36.9759 31.8536
## ADM 6.2354 9.1124 5.1827
## FOM 13.8448 12.1483 10.9892
## Connectivity 14.6960 20.6234 24.0770
## Dunn 0.2335 0.2667 0.1871
## Silhouette 0.4936 0.4656 0.3924
## pam APN 0.1100 0.0395 0.1354
## AD 45.1380 34.6045 33.3412
## ADM 12.1841 2.8777 7.8905
## FOM 14.3613 11.4065 11.0022
## Connectivity 21.2111 22.8972 27.9671
## Dunn 0.0503 0.2012 0.2667
## Silhouette 0.4432 0.4618 0.4418
##
## Optimal Scores:
##
## Score Method Clusters
## APN 0.0395 pam 5
## AD 31.8536 kmeans 6
## ADM 2.8777 pam 5
## FOM 10.9805 hierarchical 6
## Connectivity 14.6960 kmeans 4
## Dunn 0.2667 kmeans 5
## Silhouette 0.4936 kmeans 4
Considerando todo lo anterior junto con estos parámetros creemos que el mejor método sigue siendo k-means con 4 clústeres. Dando como parámetros más importantes el coeficiente de Silhouette, el índice de Dunn y la conectividad.
world_clustered <- world %>%
left_join(gruposC, by = c("iso_a3" = "CNT"))
world_clustered$grupos3 <- as.factor(world_clustered$grupos3)
ggplot(world_clustered) +
geom_sf(aes(fill = grupos3), color = NA, size = 0.1) +
scale_fill_brewer(palette = "Set2", na.value = "grey90") +
theme_minimal() +
labs(title = "Mapa de Clústeres de Países KMEDIAS k=4", fill = "Cluster")
PaisesCluster = merge(NotasMediasPais, gruposC, by = "CNT")
MediasCluster = PaisesCluster %>%
group_by(grupos3) %>%
summarise(
nota_media_Mates = mean(MathMean_Country),
nota_media_Lectura = mean(LectureMean_Country),
nota_media_Ciencias = mean(ScienceMean_Country)
)
MediasCluster$nota_media_General = rowMeans(MediasCluster[,2:ncol(MediasCluster)])
MediasCluster
## # A tibble: 4 × 5
## grupos3 nota_media_Mates nota_media_Lectura nota_media_Ciencias
## <int> <dbl> <dbl> <dbl>
## 1 1 371. 362. 377.
## 2 2 417. 419. 428.
## 3 3 480. 483. 492.
## 4 4 541. 516. 538.
## # ℹ 1 more variable: nota_media_General <dbl>
MediasCluster2 <- MediasCluster %>%
pivot_longer(
cols = starts_with("nota"),
names_to = "area",
values_to = "nota"
)
MediasCluster2$area <- gsub("nota_media_", "", MediasCluster2$area)
ggplot(MediasCluster2, aes(x = factor(grupos3), y = nota, fill = area)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Nota media por área y clúster",
x = "Clúster",
y = "Nota media",
fill = "Área"
) +
theme_minimal()
PaisesCluster %>%
group_by(grupos3) %>%
summarise(paises = paste(CNT, collapse = ", "))
## # A tibble: 4 × 2
## grupos3 paises
## <int> <chr>
## 1 1 ALB, DOM, GEO, IDN, JOR, KSV, MAR, MKD, PAN, PHL, PSE, QAZ, SAU, SLV,…
## 2 2 ARE, ARG, BGR, BRA, BRN, COL, CRI, GRC, ISL, JAM, KAZ, MDA, MEX, MNE,…
## 3 3 AUS, AUT, BEL, CAN, CHE, CHL, CZE, DEU, DNK, ESP, FIN, FRA, GBR, HRV,…
## 4 4 EST, HKG, JPN, KOR, MAC, SGP, TAP
Las notas medias de los clústeres van de menos a más (el clúster 1 es el que menor nota tiene y el 4 es el que mayor nota tiene). Vemos que esta subida de notas ocurre en todas las áreas.
Clúster 1 = {Marruecos, Arabia Saudí, Uzbekistán, Indonesia, Filipinas, Georgia…}. Países subdesarrollados que no destacan por su educación.
Clúster 2 = {Latinoamérica, Balcanes, Kazajistán, Mongolia, Tailandia…}. Países subdesarrollados o en vías de desarrollo, tienen notas un poco mejor que el grupo anterior.
Clúster 3 = {Europa, Canadá, EEUU, Australia, Nueva Zelanda…}. Países desarrollados con buena educación.
Clúster 4 = {Estonia, Corea del Sur, Japón, Singapur, Macao, Hong Kong…}. Países desarrollados que suelen ser pequeños y destacan por su avanzado programa educativo.
ESP = datosPisaEstudiantes_filtrado[datosPisaEstudiantes_filtrado$CNT=="ESP",]
PlausiblesESP = ESP %>%
select(matches("^PV|^CNTSCHID$|^REGION$"))
MediasESP = PlausiblesESP %>% transmute(CNTSCHID = PlausiblesESP$CNTSCHID,
REGION = PlausiblesESP$REGION,
MathMean = rowMeans(PlausiblesESP[,3:12]),
LectureMean = rowMeans(PlausiblesESP[,13:22]),
ScienceMean = rowMeans(PlausiblesESP[,23:32]))
MediasREG = MediasESP %>% group_by(REGION) %>% summarise(
MathMeanReg = mean(MathMean),
MathSDReg = sd(MathMean),
LectureMeanReg = mean(LectureMean),
LectureSDReg = sd(LectureMean),
ScienceMeanReg = mean(ScienceMean),
ScienceSDReg = sd(ScienceMean)
)
regions = ne_states(country = "Spain", returnclass = "sf")
#código del .SAS
cod_reg <- tibble(
REGION = c(72401, 72402, 72403, 72404, 72405, 72406, 72407, 72408,
72409, 72410, 72411, 72412, 72413, 72414, 72415, 72416,
72417, 72418, 72419),
REGION_NAME = c("Andalucía",
"Aragón",
"Asturias",
"Islas Baleares",
"Canary Is.",
"Cantabria",
"Castilla y León",
"Castilla-La Mancha",
"Cataluña",
"Extremadura",
"Galicia",
"La Rioja",
"Madrid",
"Murcia",
"Foral de Navarra",
"País Vasco",
"Valenciana",
"Ceuta",
"Melilla")
)
MediasREG = MediasREG %>% left_join(cod_reg, by = "REGION")
MediasREG = MediasREG %>% select(REGION_NAME, matches("^Math|^Lecture|^Science"))
par(mar = c(5, 7, 4, 2))
boxplot(MediasESP$MathMean, MediasESP$LectureMean, MediasESP$ScienceMean,
names = c("Matemáticas", "Lectura", "Ciencias"),
main = "Valores atípicos medias estudiantes España",
col = c("red", "lightblue", "green"),
horizontal = TRUE, cex.axis = 0.8)
summary(MediasESP)
## CNTSCHID REGION MathMean LectureMean
## Min. :72400001 Min. :72401 Min. :201.9 Min. :121.8
## 1st Qu.:72400252 1st Qu.:72405 1st Qu.:428.4 1st Qu.:425.0
## Median :72400497 Median :72410 Median :485.8 Median :487.5
## Mean :72400495 Mean :72410 Mean :483.3 Mean :482.8
## 3rd Qu.:72400741 3rd Qu.:72414 3rd Qu.:540.7 3rd Qu.:544.2
## Max. :72400987 Max. :72419 Max. :773.5 Max. :759.6
## ScienceMean
## Min. :172.4
## 1st Qu.:435.1
## Median :495.1
## Mean :492.3
## 3rd Qu.:551.7
## Max. :767.6
par(mar = c(5, 7, 4, 2))
boxplot(
MediasREG$MathMeanReg,
MediasREG$LectureMeanReg,
MediasREG$ScienceMeanReg,
names = c("Matemáticas", "Lectura", "Ciencias"),
main = "Valores atípicos medias Com. Autónomas",
col = c("red", "lightblue", "green"),
horizontal = TRUE, cex.axis = 0.8)
summary(MediasREG)
## REGION_NAME MathMeanReg MathSDReg LectureMeanReg
## Length:19 Min. :413.9 Min. :75.25 Min. :423.7
## Class :character 1st Qu.:469.7 1st Qu.:77.02 1st Qu.:472.1
## Mode :character Median :478.6 Median :78.42 Median :477.8
## Mean :476.8 Mean :78.55 Mean :477.8
## 3rd Qu.:496.6 3rd Qu.:79.64 3rd Qu.:491.5
## Max. :505.0 Max. :82.86 Max. :504.1
## LectureSDReg ScienceMeanReg ScienceSDReg
## Min. :80.44 Min. :427.5 Min. :77.93
## 1st Qu.:83.06 1st Qu.:479.6 1st Qu.:80.12
## Median :84.38 Median :488.4 Median :82.06
## Mean :84.91 Mean :486.8 Mean :81.74
## 3rd Qu.:85.96 3rd Qu.:505.2 3rd Qu.:83.46
## Max. :92.72 Max. :511.8 Max. :86.38
midist2 = get_dist(MediasREG[2:ncol(MediasREG)], stand = FALSE, method = "euclidean")
fviz_dist(midist2, show_labels = TRUE, lab_size = 0.3,
gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
set.seed(100)
myN = c(5, 8, 10, 15) # cambiando tamaños de muestra
myhopkins = NULL
myseed = sample(1:1000, 10)
for (i in myN) {
for (j in myseed) {
tmp = get_clust_tendency(data = MediasREG[,2:ncol(MediasREG)], n = i, graph = FALSE, seed = j)
myhopkins = c(myhopkins, tmp$hopkins_stat)
}
}
summary(myhopkins)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6624 0.7737 0.7947 0.7893 0.8136 0.8383
mapSPAIN = regions %>% left_join(MediasREG, by = c("region" = "REGION_NAME"))
ggplot(mapSPAIN) +
geom_sf(aes(fill = MathMeanReg), color = NA) +
scale_fill_viridis_c(option = "C") +
theme_minimal() +
labs(title = "Media en Matemáticas por Comunidad Autónoma", fill = "Media Matemáticas")
ggplot(mapSPAIN) +
geom_sf(aes(fill = LectureMeanReg), color = NA) +
scale_fill_viridis_c(option = "C") +
theme_minimal() +
labs(title = "Media en Lectura por Comunidad Autónoma", fill = "Media Lectura")
ggplot(mapSPAIN) +
geom_sf(aes(fill = ScienceMeanReg), color = NA) +
scale_fill_viridis_c(option = "C") +
theme_minimal() +
labs(title = "Media en Ciencias por Comunidad Autónoma", fill = "Media Ciencias")
p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "silhouette",
hc_method = "ward.D2", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "wss",
hc_method = "ward.D2", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)
p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "silhouette",
hc_method = "average", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = hcut, method = "wss",
hc_method = "average", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)
p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = kmeans, method = "silhouette",
k.max = 10, verbose = FALSE) +
labs(title = "K-means")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = kmeans, method = "wss",
k.max = 10, verbose = FALSE) +
labs(title = "K-means")
grid.arrange(p1, p2, nrow = 1)
p1 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = pam, method = "silhouette",
k.max = 10, verbose = FALSE) +
labs(title = "Numero optimo de clusters")
p2 = fviz_nbclust(x = MediasREG[,-1], FUNcluster = pam, method = "wss",
k.max = 10, verbose = FALSE) +
labs(title = "Numero optimo de clusters")
grid.arrange(p1, p2, nrow = 1)
En todos los métodos parece que el número óptimo de clústeres es 3 basándonos en el criterio del codo en la SC intraclúster y en el criterio de maximizar el coeficiente de Silhouette.
Eclust1 <- hclust(midist2, method="ward.D2")
Egrupos1 <- cutree(Eclust1, k=3)
table(Egrupos1)
## Egrupos1
## 1 2 3
## 10 7 2
ESPgruposA = cbind(Egrupos1, MediasREG[,"REGION_NAME"])
Eclust2 <- hclust(midist2, method="average")
Egrupos2 <- cutree(Eclust2, k=3)
table(Egrupos2)
## Egrupos2
## 1 2 3
## 10 7 2
ESPgruposB = cbind(Egrupos2, MediasREG[,"REGION_NAME"])
set.seed(100)
Eclust3 <- kmeans(MediasREG[,-1], centers = 3, nstart = 20)
table(Eclust3$cluster)
##
## 1 2 3
## 2 8 9
ESPgruposC = cbind(Eclust3$cluster, MediasREG[,"REGION_NAME"])
colnames(ESPgruposC)[1] = "ESPgrupos3"
Eclust4 <- pam(MediasREG, k = 3)
table(Eclust4$clustering)
##
## 1 2 3
## 10 7 2
ESPgruposD = cbind(Eclust4$cluster, MediasREG[,"REGION_NAME"])
colnames(ESPgruposD)[1] = "ESPgrupos4"
par(mfrow = c(1,2))
plot(silhouette(Egrupos1, midist2), col=colores, border=NA, main = "WARD")
plot(silhouette(Egrupos2, midist2), col=colores, border=NA, main = "MEDIAS")
plot(silhouette(Eclust3$cluster, midist2), col=colores, border=NA, main = "K-MEDIAS")
plot(silhouette(Eclust4$clustering, midist2), col=colores2, border=NA, main = "K-MEDOIDES")
Vemos que el mejor parece el método k-medias con k = 3 ya que no hay ningún elemento mal colocado y el coeficiente de Silhouette.
numESP = MediasREG[,-1]
numESP = as.data.frame(numESP)
validacion = suppressMessages(clValid(numESP, nClust = 3:5, metric = "euclidean", clMethods = metodos, validation = c("internal", "stability"),
method = "ward"))
## Warning in clValid(numESP, nClust = 3:5, metric = "euclidean", clMethods =
## metodos, : rownames for data not specified, using 1:nrow(data)
summary(validacion)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 3 4 5
##
## Validation Measures:
## 3 4 5
##
## hierarchical APN 0.0368 0.0959 0.1067
## AD 15.1087 12.4892 10.7653
## ADM 1.7603 2.9143 3.1940
## FOM 5.2969 4.9283 4.9603
## Connectivity 10.7206 19.1706 24.5230
## Dunn 0.2775 0.4244 0.4244
## Silhouette 0.5864 0.4660 0.4007
## kmeans APN 0.0154 0.1076 0.1132
## AD 14.2907 12.6164 10.5438
## ADM 0.7204 3.2313 3.1357
## FOM 5.0181 5.0983 4.9537
## Connectivity 10.1579 19.1706 24.3552
## Dunn 0.3815 0.4244 0.4483
## Silhouette 0.6046 0.4660 0.4151
## pam APN 0.0154 0.1094 0.0921
## AD 14.2907 12.7162 10.2569
## ADM 0.7204 3.4544 2.4605
## FOM 5.0181 4.9580 4.4473
## Connectivity 10.1579 19.1706 24.3552
## Dunn 0.3815 0.4244 0.4483
## Silhouette 0.6046 0.4660 0.4151
##
## Optimal Scores:
##
## Score Method Clusters
## APN 0.0154 kmeans 3
## AD 10.2569 pam 5
## ADM 0.7204 kmeans 3
## FOM 4.4473 pam 5
## Connectivity 10.1579 kmeans 3
## Dunn 0.4483 kmeans 5
## Silhouette 0.6046 kmeans 3
Siguiendo estos criterios parece que el clustering mejor sigue siendo el de k-medias con k = 3, aunque son bastante parecidos al haber solo 15 comunidades autónomas y 2 ciudades autónomas.
#Intercambiamos los clústers 2 y 3 para que sea similar al de países y ordenados por nota. La columna de clúster ahora es ESPgrupos3_mod
ESPgruposC$ESPgrupos3_mod <- ESPgruposC$ESPgrupos3
ESPgruposC$ESPgrupos3_mod[ESPgruposC$ESPgrupos3 == 2] <- 99
ESPgruposC$ESPgrupos3_mod[ESPgruposC$ESPgrupos3 == 3] <- 2
ESPgruposC$ESPgrupos3_mod[ESPgruposC$ESPgrupos3_mod == 99] <- 3
mapSPAIN <- regions %>%
left_join(ESPgruposC, by = c("region" = "REGION_NAME"))
mapSPAIN$ESPgrupos3_mod <- as.factor(mapSPAIN$ESPgrupos3_mod)
ggplot(mapSPAIN) +
geom_sf(aes(fill = ESPgrupos3_mod), color = NA, size = 0.1) +
scale_fill_brewer(palette = "Set2", na.value = "grey90") +
theme_minimal() +
labs(title = "Mapa de Clústeres de CCAA españolas KMEDIAS k=3", fill = "Cluster")
#Con k = 3 todos los clústers son prácticamente iguales
CCAACluster = merge(MediasREG, ESPgruposC, by = "REGION_NAME")
MediasClusterRegiones = CCAACluster %>%
group_by(ESPgrupos3_mod) %>%
summarise(
nota_media_Mates = mean(MathMeanReg),
nota_media_Lectura = mean(LectureMeanReg),
nota_media_Ciencias = mean(ScienceMeanReg)
)
MediasClusterRegiones$nota_media_General = rowMeans(MediasClusterRegiones[,2:ncol(MediasClusterRegiones)])
MediasClusterRegiones
## # A tibble: 3 × 5
## ESPgrupos3_mod nota_media_Mates nota_media_Lectura nota_media_Ciencias
## <dbl> <dbl> <dbl> <dbl>
## 1 1 417. 424. 428.
## 2 2 472. 474. 483.
## 3 3 498. 495. 506.
## # ℹ 1 more variable: nota_media_General <dbl>
MediasClusterRegiones2 <- MediasClusterRegiones %>%
pivot_longer(
cols = starts_with("nota"),
names_to = "area",
values_to = "nota"
)
MediasClusterRegiones2$area <- gsub("nota_media_", "", MediasClusterRegiones2$area)
ggplot(MediasClusterRegiones2, aes(x = factor(ESPgrupos3_mod), y = nota, fill = area)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Nota media por área y clúster",
x = "Clúster",
y = "Nota media",
fill = "Área"
) +
theme_minimal()
CCAACluster %>%
group_by(ESPgrupos3_mod) %>%
summarise(regiones = paste(REGION_NAME, collapse = ", "))
## # A tibble: 3 × 2
## ESPgrupos3_mod regiones
## <dbl> <chr>
## 1 1 Ceuta, Melilla
## 2 2 Andalucía, Canary Is., Castilla-La Mancha, Cataluña, Extremadu…
## 3 3 Aragón, Asturias, Cantabria, Castilla y León, Foral de Navarra…
Los clústeres ordenados de menor nota (en todas las áreas) a mayor son: Clúster 1 < Clúster 3 < Clúster 2.
Clúster 1 = {Ceuta, Melilla}. Ciudades autónomas situadas en África que están menos conectadas con el resto de España y suelen tener menos recursos.
Clúster 2 = {Galicia, Asturias, Cantabria, Castilla y León, Navarra, Aragón, La Rioja, Madrid}. Comunidades autónomas del norte de España, muchas suelen destacar en índices socieconómicos y tener desarrollo tecnológico.
Clúster 3 = {Islas Canarias, Andalucía, Comunidad Valenciana, Extremadura, Castilla-La Mancha, Murcia, Cataluña, País Vasco}. La mayoría de estas Comunidades autónomas suelen estar en el sur de España y estar un poco peor que las del clúster 2 en cuanto a índices socioeconómicos y tecnología.
escuelas <- read_sas("escuelas.SAS7BDAT", NULL)
bancomundial =load('bancomundial.R')
bancomundial <- datos_filtrado
bm_media_res<- merge(
NotasMediasPais,
bancomundial,
by.x = "CNT", # Columna de países en NotasMediasPais
by.y = "Country.Code", # Columna de países en bancomundial (ajusta según tu CSV)
all.x = TRUE # Conservar todos los países de PISA aunque falten datos del BM
)
write.csv(bm_media_res, file = "bm_media_res.csv", row.names = FALSE)
datos_numericos <- read_excel("datos_bm_res.xlsx")
datos = subset(datos_numericos,select = -c(CNT))
summary(datos)
## MathMean_Country LectureMean_Country ScienceMean_Country
## Min. :343.5 Min. :337.7 Min. :356.2
## 1st Qu.:396.2 1st Qu.:406.0 1st Qu.:411.8
## Median :449.1 Median :448.5 Median :461.2
## Mean :445.1 Mean :442.6 Mean :454.4
## 3rd Qu.:483.6 3rd Qu.:484.0 3rd Qu.:497.6
## Max. :574.6 Max. :543.2 Max. :561.5
##
## Compulsory education, duration (years)
## Min. : 6.00
## 1st Qu.:10.00
## Median :11.00
## Mean :10.93
## 3rd Qu.:12.00
## Max. :15.00
## NA's :18
## Current education expenditure, total (% of total expenditure in public institutions)
## Min. :77.62
## 1st Qu.:84.73
## Median :91.75
## Mean :90.27
## 3rd Qu.:96.85
## Max. :99.34
## NA's :69
## Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)
## Min. : 9.481
## 1st Qu.:19.822
## Median :26.128
## Mean :26.834
## 3rd Qu.:33.694
## Max. :51.070
## NA's :23
## Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)
## Min. :48.79
## 1st Qu.:75.92
## Median :89.87
## Mean :84.01
## 3rd Qu.:96.46
## Max. :99.72
## NA's :18
## Educational attainment, at least completed post-secondary, population 25+, total (%) (cumulative)
## Min. :12.11
## 1st Qu.:24.71
## Median :34.66
## Mean :35.45
## 3rd Qu.:43.50
## Max. :67.27
## NA's :23
## Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)
## Min. : 63.81
## 1st Qu.: 91.23
## Median : 97.56
## Mean : 93.95
## 3rd Qu.: 99.21
## Max. :100.00
## NA's :18
## Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)
## Min. :34.46
## 1st Qu.:55.53
## Median :74.85
## Mean :69.24
## 3rd Qu.:81.59
## Max. :91.99
## NA's :18
## Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)
## Min. : 0.7075
## 1st Qu.: 3.9465
## Median :12.5948
## Mean :11.0888
## 3rd Qu.:15.8233
## Max. :22.8423
## NA's :25
## Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)
## Min. :0.0000
## 1st Qu.:0.2900
## Median :0.7576
## Mean :0.9213
## 3rd Qu.:1.3326
## Max. :3.1390
## NA's :20
## Government expenditure on education, total (% of GDP)
## Min. :0.8639
## 1st Qu.:3.3393
## Median :4.1601
## Mean :4.1761
## 3rd Qu.:4.9824
## Max. :7.1397
## NA's :48
## Government expenditure on education, total (% of government expenditure)
## Min. : 7.45
## 1st Qu.:10.12
## Median :13.93
## Mean :13.36
## 3rd Qu.:15.73
## Max. :22.36
## NA's :51
## GNI per capita, Atlas method (current US$)
## Min. : 4010
## 1st Qu.:10942
## Median :24895
## Mean :32341
## 3rd Qu.:53585
## Max. :95480
## NA's :18
## GNI per capita, PPP (constant 2021 international $)
## Min. : 10003
## 1st Qu.: 25668
## Median : 41695
## Mean : 43316
## 3rd Qu.: 58408
## Max. :110892
## NA's :25
## Labor force with advanced education (% of total working-age population with advanced education)
## Min. :71.14
## 1st Qu.:76.17
## Median :78.58
## Mean :78.71
## 3rd Qu.:81.11
## Max. :90.08
## NA's :18
## Labor force with basic education (% of total working-age population with basic education)
## Min. :17.17
## 1st Qu.:30.58
## Median :45.48
## Mean :44.98
## 3rd Qu.:56.72
## Max. :85.92
## NA's :18
## Labor force with intermediate education (% of total working-age population with intermediate education)
## Min. :48.39
## 1st Qu.:61.32
## Median :65.27
## Mean :65.33
## 3rd Qu.:69.00
## Max. :77.81
## NA's :18
## Unemployment with advanced education (% of total labor force with advanced education)
## Min. : 0.794
## 1st Qu.: 2.365
## Median : 3.533
## Mean : 4.202
## 3rd Qu.: 5.187
## Max. :11.397
## NA's :18
## Unemployment with basic education (% of total labor force with basic education)
## Min. : 0.421
## 1st Qu.: 4.255
## Median : 7.901
## Mean : 8.972
## 3rd Qu.:11.917
## Max. :40.621
## NA's :18
## Unemployment with intermediate education (% of total labor force with intermediate education)
## Min. : 0.885
## 1st Qu.: 3.977
## Median : 5.194
## Mean : 6.154
## 3rd Qu.: 7.560
## Max. :16.066
## NA's :18
## Individuals using the Internet (% of population) Fixed broadband subscriptions
## Min. : 62.89 Min. : 90200
## 1st Qu.: 83.63 1st Qu.: 1087500
## Median : 89.11 Median : 3350000
## Mean : 87.89 Mean : 8885245
## 3rd Qu.: 94.78 3rd Qu.: 9062500
## Max. :100.00 Max. :128000000
## NA's :18 NA's :18
## Mobile cellular subscriptions Secure Internet servers GDP growth (annual %)
## Min. : 456864 Min. : 518 Min. :-5.015
## 1st Qu.: 5760000 1st Qu.: 21833 1st Qu.: 2.581
## Median : 11450000 Median : 213266 Median : 4.191
## Mean : 41068747 Mean : 1739713 Mean : 4.133
## 3rd Qu.: 48150000 3rd Qu.: 766554 3rd Qu.: 5.300
## Max. :373000000 Max. :60194281 Max. :10.770
## NA's :18 NA's :18 NA's :18
## GDP per capita (constant 2015 US$) Access to electricity (% of population)
## Min. : 3578 Min. : 94.80
## 1st Qu.:10302 1st Qu.:100.00
## Median :20611 Median :100.00
## Mean :28843 Mean : 99.72
## 3rd Qu.:44483 3rd Qu.:100.00
## Max. :99677 Max. :100.00
## NA's :18 NA's :18
## Access to clean fuels and technologies for cooking (% of population)
## Min. : 54.20
## 1st Qu.: 95.10
## Median :100.00
## Mean : 95.39
## 3rd Qu.:100.00
## Max. :100.00
## NA's :19
## Population ages 0-14, total Population ages 15-64, total
## Min. : 70206 Min. : 254073
## 1st Qu.: 851109 1st Qu.: 3008344
## Median : 1527079 Median : 6386626
## Mean : 6514125 Mean : 21791933
## 3rd Qu.: 6383518 3rd Qu.: 24254126
## Max. :70471815 Max. :217288142
## NA's :18 NA's :18
## Population ages 0-14 (% of total population)
## Min. :11.41
## 1st Qu.:15.08
## Median :16.66
## Mean :18.21
## 3rd Qu.:20.07
## Max. :32.88
## NA's :18
## Population ages 15-64 (% of total population)
## Min. :59.89
## 1st Qu.:63.64
## Median :65.45
## Mean :66.12
## 3rd Qu.:66.99
## Max. :81.88
## NA's :18
## Population ages 65 and above (% of total population)
## Min. : 1.64
## 1st Qu.:10.11
## Median :17.53
## Mean :15.67
## 3rd Qu.:20.39
## Max. :23.93
## NA's :18
## Population ages 65 and above, total Population, total
## Min. : 28241 Min. : 382003
## 1st Qu.: 537674 1st Qu.: 4571021
## Median : 1579097 Median : 9859677
## Mean : 4523469 Mean : 32829526
## 3rd Qu.: 4605347 3rd Qu.: 36290185
## Max. :56388258 Max. :333271411
## NA's :18 NA's :18
## Preprimary education, duration (years)
## Min. :1.000
## 1st Qu.:4.000
## Median :6.000
## Mean :5.125
## 3rd Qu.:6.000
## Max. :7.000
## NA's :20
## Proportion of seats held by women in national parliaments (%)
## Min. : 9.091
## 1st Qu.:23.542
## Median :29.563
## Mean :31.409
## 3rd Qu.:40.446
## Max. :50.000
## NA's :18
## Employment in industry (% of total employment) (modeled ILO estimate)
## Min. :12.68
## 1st Qu.:18.48
## Median :21.20
## Mean :22.56
## 3rd Qu.:26.87
## Max. :36.89
## NA's :18
## Employment in agriculture (% of total employment) (modeled ILO estimate)
## Min. : 0.09095
## 1st Qu.: 2.22190
## Median : 4.31686
## Mean : 9.09093
## 3rd Qu.:13.21798
## Max. :55.38076
## NA's :18
## Employment in services (% of total employment) (modeled ILO estimate)
## Min. :31.94
## 1st Qu.:61.77
## Median :69.46
## Mean :68.35
## 3rd Qu.:77.82
## Max. :85.34
## NA's :18
## Birth rate, crude (per 1,000 people)
## Min. : 4.900
## 1st Qu.: 8.800
## Median : 9.894
## Mean :11.135
## 3rd Qu.:12.050
## Max. :21.618
## NA's :18
datos_imputados <- datos_numericos
datos_imputados[, -1] <- lapply(datos_imputados[, -1], function(x) {
ifelse(is.na(x), mean(x, na.rm=TRUE),x)
})
NotasNumericas = datos_imputados[,-1]
NotasNumericas = as.data.frame(NotasNumericas)
metodos = c("hierarchical","kmeans","pam")
validacion = suppressMessages(clValid(NotasNumericas, nClust = 4:7, metric = "euclidean", clMethods = metodos, validation = c("internal", "stability"),
method = "ward"))
## Warning in clValid(NotasNumericas, nClust = 4:7, metric = "euclidean",
## clMethods = metodos, : rownames for data not specified, using 1:nrow(data)
summary(validacion)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 4 5 6 7
##
## Validation Measures:
## 4 5 6 7
##
## hierarchical APN 1.100000e-03 1.100000e-03 1.220000e-02 9.500000e-03
## AD 1.638221e+07 1.319447e+07 1.129872e+07 9.414332e+06
## ADM 9.343188e+04 7.277224e+04 3.196984e+05 3.375925e+05
## FOM 1.277323e+06 9.959701e+05 8.960475e+05 7.526625e+05
## Connectivity 9.025000e+00 1.334130e+01 1.534130e+01 1.802140e+01
## Dunn 9.990000e-02 9.990000e-02 2.042000e-01 2.970000e-02
## Silhouette 6.995000e-01 7.235000e-01 7.099000e-01 5.196000e-01
## kmeans APN 3.700000e-03 1.700000e-03 1.330000e-02 7.500000e-03
## AD 1.598113e+07 1.289033e+07 1.101110e+07 8.921211e+06
## ADM 3.411711e+05 1.158990e+05 3.957884e+05 2.816039e+05
## FOM 1.258489e+06 1.023782e+06 9.283123e+05 7.991300e+05
## Connectivity 1.386390e+01 1.798020e+01 1.998020e+01 1.895120e+01
## Dunn 2.740000e-02 2.740000e-02 7.380000e-02 6.920000e-02
## Silhouette 7.221000e-01 7.425000e-01 7.289000e-01 6.168000e-01
## pam APN 6.000000e-04 1.500000e-03 1.100000e-03 1.600000e-03
## AD 1.584959e+07 1.306305e+07 1.089023e+07 8.818984e+06
## ADM 5.055155e+04 4.432437e+05 8.815363e+04 9.662234e+04
## FOM 1.252413e+06 1.174879e+06 6.881431e+05 6.427945e+05
## Connectivity 1.386390e+01 1.798020e+01 1.998020e+01 2.103290e+01
## Dunn 2.740000e-02 2.740000e-02 7.380000e-02 4.570000e-02
## Silhouette 7.221000e-01 7.425000e-01 7.289000e-01 6.132000e-01
##
## Optimal Scores:
##
## Score Method Clusters
## APN 0.0006 pam 4
## AD 8818984.2818 pam 7
## ADM 50551.5506 pam 4
## FOM 642794.4851 pam 7
## Connectivity 9.0250 hierarchical 4
## Dunn 0.2042 hierarchical 6
## Silhouette 0.7425 kmeans 5
p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "silhouette",
hc_method = "ward.D2", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "wss",
hc_method = "ward.D2", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)
p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "silhouette",
hc_method = "average", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = hcut, method = "wss",
hc_method = "average", k.max = 10, verbose = FALSE,
hc_metric = "euclidean") + labs(title = "Num. optimo clusters")
grid.arrange(p1, p2, nrow = 1)
p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = kmeans, method = "silhouette",
k.max = 10, verbose = FALSE) +
labs(title = "K-means")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = kmeans, method = "wss",
k.max = 10, verbose = FALSE) +
labs(title = "K-means")
grid.arrange(p1, p2, nrow = 1)
p1 = fviz_nbclust(x = NotasNumericas, FUNcluster = pam, method = "silhouette",
k.max = 10, verbose = FALSE) +
labs(title = "Numero optimo de clusters")
p2 = fviz_nbclust(x = NotasNumericas, FUNcluster = pam, method = "wss",
k.max = 10, verbose = FALSE) +
labs(title = "Numero optimo de clusters")
grid.arrange(p1, p2, nrow = 1)
5,7,5,4-6
midist3 = get_dist(NotasNumericas, stand = FALSE, method = "euclidean")
Aclust1 <- hclust(midist3, method="ward.D2")
Agrupos1 <- cutree(Aclust1, k=5)
table(Agrupos1)
## Agrupos1
## 1 2 3 4 5
## 40 23 9 2 2
AgruposA = cbind(Agrupos1, datos_imputados[,"CNT"])
Aclust2 <- hclust(midist3, method="average")
Agrupos2 <- cutree(Aclust2, k=7)
table(Agrupos2)
## Agrupos2
## 1 2 3 4 5 6 7
## 36 27 6 3 1 2 1
AgruposB = cbind(Agrupos2, datos_imputados[,"CNT"])
set.seed(100)
Aclust3 <- kmeans(NotasNumericas, centers = 5, nstart = 20)
table(Aclust3$cluster)
##
## 1 2 3 4 5
## 26 38 2 2 8
AgruposC = cbind(Aclust3$cluster, datos_imputados[,"CNT"])
colnames(AgruposC)[1] = "Agrupos3"
Aclust4 <- pam(NotasNumericas, k = 5)
table(Aclust4$clustering)
##
## 1 2 3 4 5
## 38 26 8 2 2
AgruposD = cbind(Aclust4$cluster, datos_imputados[,"CNT"])
colnames(AgruposD)[1] = "Agrupos4"
par(mfrow = c(1,2))
plot(silhouette(Agrupos1, midist3), col=colores, border=NA, main = "WARD")
plot(silhouette(Agrupos2, midist3), col=colores, border=NA, main = "MEDIAS")
plot(silhouette(Aclust3$cluster, midist3), col=colores, border=NA, main = "K-MEDIAS")
plot(silhouette(Aclust4$clustering, midist3), col=colores2, border=NA, main = "K-MEDOIDES")
clustAUX = Aclust3
gruposAUX = AgruposC
world = ne_countries(scale = "medium", returnclass = "sf")
world$iso_a3[world$admin == "France"] = "FRA"
world$iso_a3[world$admin == "Norway"] = "NOR"
world$iso_a3[world$admin == "Kosovo"] = "KSV"
world$iso_a3[world$admin == "Taiwan"] = "TAP"
world_clustered <- world %>%
left_join(AgruposC, by = c("iso_a3" = "CNT"))
world_clustered$gruposAUX <- as.factor(world_clustered$Agrupos3)
ggplot(world_clustered) +
geom_sf(aes(fill = gruposAUX), color = NA, size = 0.1) +
scale_fill_brewer(palette = "Set2", na.value = "grey90") +
theme_minimal() +
labs(title = "Mapa de Clústeres de Países KMEDIAS k=5", fill = "Cluster")
PaisesCluster = merge(datos_imputados, gruposAUX, by = "CNT")
MediasCluster = PaisesCluster %>%
group_by(Agrupos3) %>%
summarise(
nota_media_Mates = mean(MathMean_Country),
nota_media_Lectura = mean(LectureMean_Country),
nota_media_Ciencias = mean(ScienceMean_Country)
)
MediasCluster$nota_media_General = rowMeans(MediasCluster[,2:ncol(MediasCluster)])
MediasCluster
## # A tibble: 5 × 5
## Agrupos3 nota_media_Mates nota_media_Lectura nota_media_Ciencias
## <int> <dbl> <dbl> <dbl>
## 1 1 437. 429. 444.
## 2 2 452. 450. 461.
## 3 3 375. 381. 384.
## 4 4 423. 441. 450.
## 5 5 463. 466. 478.
## # ℹ 1 more variable: nota_media_General <dbl>
MediasCluster2 <- MediasCluster %>%
pivot_longer(
cols = starts_with("nota"),
names_to = "area",
values_to = "nota"
)
MediasCluster2$area <- gsub("nota_media_", "", MediasCluster2$area)
ggplot(MediasCluster2, aes(x = factor(Agrupos3), y = nota, fill = area)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Nota media por área y clúster",
x = "Clúster",
y = "Nota media",
fill = "Área"
) +
theme_minimal()
# Combinar datos de PISA con Banco Mundial
datos_combi <- merge(
NotasMediasPais,
bancomundial,
by.x = "CNT",
by.y = "Country.Code",
all.x=TRUE
)
# 1. Limpieza y preparación de datos
bm_pisa_clean <- datos_combi %>%
mutate(across(-CNT, ~ {
x_char <- as.character(.x)
as.numeric(ifelse(x_char %in% c("", "NA"), NA, x_char))
})) %>%
# Eliminar columnas con >50% NAs
select(where(~ mean(is.na(.x)) <= 0.5)) %>%
# Eliminar filas con NAs (alternativa: tidyr::fill() o imputación)
filter(complete.cases(.))
# 2. Estandarización y clustering
set.seed(100)
bm_pisa_scaled <- bm_pisa_clean %>%
select(-CNT) %>%
scale() %>%
as.data.frame()
clust_bm <- kmeans(bm_pisa_scaled, centers = 5, nstart = 20)
# 3. Asignar clusters a los datos originales
bm_cluster_data <- bm_pisa_clean %>%
mutate(Cluster = as.factor(clust_bm$cluster))
# 4. Análisis de variables importantes (ajusta los patrones según tus datos)
vars_importantes <- names(bm_cluster_data) %>%
grep("GDP|Internet|education|employment|GNI|PISA|math|science|reading",
., value = TRUE, ignore.case = TRUE)
# 5. Estadísticas por cluster
medias_cluster <- bm_cluster_data %>%
select(Cluster, all_of(vars_importantes)) %>%
group_by(Cluster) %>%
summarise(across(
everything(),
list(mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE))
))
# 6. Preparar para visualización
medias_plot <- medias_cluster %>%
pivot_longer(
cols = -Cluster,
names_to = c("Variable", ".value"),
names_sep = "_"
)
## Warning: Expected 2 pieces. Additional pieces discarded in 4 rows [1, 2, 5, 6].
# 7. Visualización de indicadores
ggplot(medias_plot, aes(x = Cluster, y = mean, fill = Variable)) +
geom_col(position = position_dodge(0.8), width = 0.7) +
geom_errorbar(
aes(ymin = mean - sd, ymax = mean + sd),
width = 0.2,
position = position_dodge(0.8),
color = "black"
) +
labs(title = "Indicadores por Cluster") +
theme_minimal()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_col()`).
# 1. Primero verifiquemos los nombres reales de las columnas
names(bm_cluster_data)
## [1] "CNT"
## [2] "MathMean_Country"
## [3] "MathSD"
## [4] "LectureMean_Country"
## [5] "LectureSD"
## [6] "ScienceMean_Country"
## [7] "ScienceSD"
## [8] "Compulsory education, duration (years)"
## [9] "Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)"
## [10] "Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)"
## [11] "Educational attainment, at least completed post-secondary, population 25+, total (%) (cumulative)"
## [12] "Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)"
## [13] "Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)"
## [14] "Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)"
## [15] "Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)"
## [16] "GNI per capita, Atlas method (current US$)"
## [17] "GNI per capita, PPP (constant 2021 international $)"
## [18] "Labor force with advanced education (% of total working-age population with advanced education)"
## [19] "Labor force with basic education (% of total working-age population with basic education)"
## [20] "Labor force with intermediate education (% of total working-age population with intermediate education)"
## [21] "Unemployment with advanced education (% of total labor force with advanced education)"
## [22] "Unemployment with basic education (% of total labor force with basic education)"
## [23] "Unemployment with intermediate education (% of total labor force with intermediate education)"
## [24] "Individuals using the Internet (% of population)"
## [25] "Fixed broadband subscriptions"
## [26] "Mobile cellular subscriptions"
## [27] "Secure Internet servers"
## [28] "GDP growth (annual %)"
## [29] "GDP per capita (constant 2015 US$)"
## [30] "Access to electricity (% of population)"
## [31] "Access to clean fuels and technologies for cooking (% of population)"
## [32] "Population ages 0-14, total"
## [33] "Population ages 15-64, total"
## [34] "Population ages 0-14 (% of total population)"
## [35] "Population ages 15-64 (% of total population)"
## [36] "Population ages 65 and above (% of total population)"
## [37] "Population ages 65 and above, total"
## [38] "Population, total"
## [39] "Preprimary education, duration (years)"
## [40] "Proportion of seats held by women in national parliaments (%)"
## [41] "Employment in industry (% of total employment) (modeled ILO estimate)"
## [42] "Employment in agriculture (% of total employment) (modeled ILO estimate)"
## [43] "Employment in services (% of total employment) (modeled ILO estimate)"
## [44] "Birth rate, crude (per 1,000 people)"
## [45] "Cluster"
# 2. Seleccionar variables basadas en los nombres reales
# (Estos son ejemplos - ajusta según los nombres reales en tus datos)
vars_importantes <- names(bm_cluster_data) %>%
grep("GDP|Internet|education|employment|GNI", ., value = TRUE, ignore.case = TRUE)
# Mostrar las variables seleccionadas para confirmación
print(vars_importantes)
## [1] "Compulsory education, duration (years)"
## [2] "Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)"
## [3] "Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)"
## [4] "Educational attainment, at least completed post-secondary, population 25+, total (%) (cumulative)"
## [5] "Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)"
## [6] "Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)"
## [7] "Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)"
## [8] "Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)"
## [9] "GNI per capita, Atlas method (current US$)"
## [10] "GNI per capita, PPP (constant 2021 international $)"
## [11] "Labor force with advanced education (% of total working-age population with advanced education)"
## [12] "Labor force with basic education (% of total working-age population with basic education)"
## [13] "Labor force with intermediate education (% of total working-age population with intermediate education)"
## [14] "Unemployment with advanced education (% of total labor force with advanced education)"
## [15] "Unemployment with basic education (% of total labor force with basic education)"
## [16] "Unemployment with intermediate education (% of total labor force with intermediate education)"
## [17] "Individuals using the Internet (% of population)"
## [18] "Secure Internet servers"
## [19] "GDP growth (annual %)"
## [20] "GDP per capita (constant 2015 US$)"
## [21] "Preprimary education, duration (years)"
## [22] "Employment in industry (% of total employment) (modeled ILO estimate)"
## [23] "Employment in agriculture (% of total employment) (modeled ILO estimate)"
## [24] "Employment in services (% of total employment) (modeled ILO estimate)"
# 3. Calcular estadísticas por cluster (versión robusta)
medias_cluster_bm <- bm_cluster_data %>%
# Seleccionar solo columnas numéricas además del Cluster
select(Cluster, where(is.numeric)) %>%
group_by(Cluster) %>%
summarise(across(
where(is.numeric),
list(mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE))
)) # <- Paréntesis faltante aquí
# 4. Preparar datos para visualización (versión segura)
medias_plot_bm <- medias_cluster_bm %>%
pivot_longer(
cols = -Cluster,
names_to = c("Variable", ".value"),
names_sep = "_"
)
## Warning: Expected 2 pieces. Additional pieces discarded in 6 rows [1, 2, 5, 6,
## 9, 10].
# 5. Visualización
ggplot(medias_plot_bm, aes(x = Cluster, y = mean, fill = Variable)) +
geom_col(position = position_dodge(0.8), width = 0.7) +
geom_errorbar(
aes(ymin = mean - sd, ymax = mean + sd),
width = 0.2,
position = position_dodge(0.8),
color = "black"
) +
labs(
title = "Indicadores del Banco Mundial por Cluster",
x = "Cluster",
y = "Valor promedio",
fill = "Indicador"
) +
theme_minimal() +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 45,hjust=1))
## Warning: Removed 30 rows containing missing values or values outside the scale range
## (`geom_col()`).
load('pca_data.RData')
gruposC = gruposC[-(which(gruposC$CNT %in% c("USA", "IDN", "PHL"))),]
pca_data$clusters = as.factor(gruposC$grupos3)
ggplot(pca_data, aes(x = Dim1, y = Dim2, label = label, color = clusters)) +
geom_text(size = 3) +
scale_color_manual(values = c("#228B22", "#FF8C00", "blue", "#D33682")) +
labs(title = "PCA - Dimensiones 1 y 2", color = "Cluster") +
theme_minimal()
ggplot(pca_data, aes(x = Dim3, y = Dim4, label = label, color = clusters)) +
geom_text(size = 3) +
scale_color_manual(values = c("#228B22", "#FF8C00", "blue", "#D33682")) +
labs(title = "PCA - Dimensiones 3 y 4", color = "Cluster") +
theme_minimal()