library(haven)
library(gridExtra)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(tidyr)
library(sf)
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(countrycode)
## Warning: package 'countrycode' was built under R version 4.4.3
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.4.3
library(rnaturalearthdata)
## Warning: package 'rnaturalearthdata' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
## 
##     countries110
escuelas <- read_sas("CY08MSP_SCH_QQQ.SAS7BDAT", NULL)
load("estudiantesPISAfiltrados.R")
estudiantes <- datosPisaEstudiantes_filtrado
rm(datosPisaEstudiantes_filtrado)

CARGA DATOS

library(dplyr)
estudiantes_interes <- estudiantes %>%
  select(matches("^PV|^CNT$|^CNTSCHID$|^CNTSTUID$|^ST004D01T$"))

estudiantes_interes = estudiantes_interes %>% transmute(CNT = estudiantes_interes$CNT,
                                        CNTSCHID = estudiantes_interes$CNTSCHID,
                                        CNTSTUID = estudiantes_interes$CNTSTUID,
                                        Gender = estudiantes_interes$ST004D01T,
                                        MathMean = rowMeans(estudiantes_interes[,4:13]),
                                        LectureMean = rowMeans(estudiantes_interes[,14:23]),
                                        ScienceMean = rowMeans(estudiantes_interes[,24:33]))



escuelas_interes <- escuelas %>% select(c("CNTSCHID", "SC013Q01TA", "SC004Q02TA", "SC175Q01JA"))

donde_estudia <- left_join(escuelas_interes,estudiantes_interes, by = "CNTSCHID")
colnames(donde_estudia)[2] = "Tipo_escuela"

donde_estudia$TotalMean = rowMeans(donde_estudia %>% select(c("MathMean", "LectureMean", "ScienceMean")))

media_genero = donde_estudia %>%
  group_by(CNT, Gender) %>%
  summarise(
    MathMean = mean(MathMean, na.rm = TRUE),
    LectureMean = mean(LectureMean, na.rm = TRUE),
    ScienceMean = mean(ScienceMean, na.rm = TRUE),
    TotalMean = mean(TotalMean, na.rm = TRUE),
    .groups = "drop"
  )

media_genero = na.omit(media_genero)
numerical_cols <- c("MathMean", "LectureMean", "ScienceMean", "TotalMean")
media_genero_escalado <- media_genero
media_genero_escalado[numerical_cols] = scale(media_genero_escalado[numerical_cols], center = TRUE, scale = TRUE)

Relación e interacción entre Género y Tipo de escuela

tabla_contingencia <- table(donde_estudia$Gender, donde_estudia$Tipo_escuela)
print(tabla_contingencia)
##    
##          1      2
##   1 200637  49387
##   2 197290  49415
prueba_chi2 <- chisq.test(tabla_contingencia)
print(prueba_chi2)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tabla_contingencia
## X-squared = 5.9661, df = 1, p-value = 0.01458
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.4.3
CramerV(tabla_contingencia)
## [1] 0.003470713

Resultado: p-value = 0.01458. Interpretación:Esto significa que hay una relación estadísticamente significativa entre el género y el tipo de escuela, es decir que no son independientes.

Resultado: V de Cramer = 0.00347. Interpretación: La asociación es extremadamente débil, prácticamente insignificante en términos prácticos, a pesar de ser estadísticamente significativa debido al gran tamaño de la muestra.

manova_model <- manova(cbind(MathMean, LectureMean, ScienceMean) ~ Gender * Tipo_escuela, data = donde_estudia)
summary(manova_model)
##                         Df   Pillai approx F num Df den Df    Pr(>F)    
## Gender                   1 0.094919  17364.4      3 496723 < 2.2e-16 ***
## Tipo_escuela             1 0.021795   3689.0      3 496723 < 2.2e-16 ***
## Gender:Tipo_escuela      1 0.000061     10.1      3 496723 1.268e-06 ***
## Residuals           496725                                              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print("----------------------------------------------------------------")
## [1] "----------------------------------------------------------------"
summary.aov(manova_model)
##  Response MathMean :
##                         Df     Sum Sq  Mean Sq   F value Pr(>F)    
## Gender                   1    5886381  5886381  791.0690 <2e-16 ***
## Tipo_escuela             1   72633040 72633040 9761.1332 <2e-16 ***
## Gender:Tipo_escuela      1       7184     7184    0.9655 0.3258    
## Residuals           496725 3696153484     7441                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response LectureMean :
##                         Df     Sum Sq   Mean Sq    F value Pr(>F)    
## Gender                   1   50530038  50530038  5091.2689 <2e-16 ***
## Tipo_escuela             1  108428808 108428808 10924.9910 <2e-16 ***
## Gender:Tipo_escuela      1      14907     14907     1.5019 0.2204    
## Residuals           496725 4929917065      9925                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response ScienceMean :
##                         Df     Sum Sq  Mean Sq    F value Pr(>F)    
## Gender                   1    1060804  1060804   110.9100 <2e-16 ***
## Tipo_escuela             1   97264869 97264869 10169.3097 <2e-16 ***
## Gender:Tipo_escuela      1       3649     3649     0.3815 0.5368    
## Residuals           496725 4750951022     9565                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 67119 observations deleted due to missingness

Se encontró un efecto significativo tanto para Gender como para Tipo_escuela, y lo más importante, una interacción significativa (p-value = 1.268e-06 ***) entre Gender y Tipo_escuela en el rendimiento general. Es decir, el efecto del género en las notas no es el mismo en los diferentes tipos de escuela, y viceversa. Dicho de otro modo, la diferencia en el rendimiento entre chicos y chicas depende de si están en una escuela pública o privada, y el efecto del tipo de escuela en el rendimiento es diferente para cada género.

summary.aov(manova_model) (ANOVA univariados): Los ANOVA univariados por asignatura no mostraron interacciones significativas, lo cual es una sutil pero importante diferencia.

Esto puede deberse a que el MANOVA busca diferencias en el “perfil” o “vector” de medias: No se enfoca en si hay una interacción para cada nota por separado, sino si hay un patrón de interacción que afecta al conjunto de notas. Es posible que el efecto combinado o una interacción muy sutil a través de las correlaciones entre MathMean, LectureMean y ScienceMean sea significativa cuando se analizan juntas.

library(ggplot2)
library(dplyr)

donde_estudia_largo <- donde_estudia %>%
  filter(
    !is.na(Gender),
    !is.na(Tipo_escuela),
    !is.na(MathMean),
    !is.na(LectureMean),
    !is.na(ScienceMean)
  ) %>%
  mutate(
    Gender = as.factor(Gender),
    Tipo_escuela = as.factor(Tipo_escuela)
  ) %>%
  pivot_longer(
    cols = c(MathMean, LectureMean, ScienceMean),
    names_to = "asignatura",
    values_to = "nota"
  )

# Calcular medias e IC por grupo (Gender x Tipo_escuela x asignatura)
summary_data <- donde_estudia_largo %>%
  group_by(Gender, Tipo_escuela, asignatura) %>%
  summarise(
    mean_nota = mean(nota, na.rm = TRUE),
    sd_nota = sd(nota, na.rm = TRUE),
    n = n(),
    se = sd_nota / sqrt(n),
    lower = mean_nota - qt(0.975, n-1) * se,
    upper = mean_nota + qt(0.975, n-1) * se
  ) %>%
  ungroup()
## `summarise()` has grouped output by 'Gender', 'Tipo_escuela'. You can override
## using the `.groups` argument.
# Gráfico de interacción
ggplot(summary_data, aes(x = Tipo_escuela, y = mean_nota, color = Gender, group = Gender)) +
  geom_point(size = 3) +
  geom_line(size = 1) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
  facet_wrap(~ asignatura, scales = "free_y") +
  labs(
    title = "Gráfico de interacción: género y tipo de escuela sobre notas medias",
    x = "Tipo de escuela",
    y = "Nota media"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(tidyr)
library(dplyr)
library(ggplot2)

ggplot(donde_estudia_largo, aes(x = Tipo_escuela, y = nota, fill = Gender)) +
  geom_boxplot(aes(group = interaction(Tipo_escuela, Gender))) +
  facet_wrap(~ asignatura, scales = "free_y") +
  labs(
    title = "Distribución de notas por tipo de escuela y género",
    x = "Tipo de escuela", y = "Nota media"
  ) +
  theme_minimal()

Impacto que tiene el Género en los resultados de las notas

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
datos_cor <- media_genero_escalado %>%  
  select(Gender, MathMean, LectureMean, ScienceMean, TotalMean)

matriz_cor <- cor(datos_cor, use = "complete.obs")

corrplot(matriz_cor, method = "circle", type = "upper", addCoef.col = "orange", 
         tl.col = "black", tl.cex = 0.9)

media_genero$Gender <- as.factor(media_genero$Gender)

# Gráfico con densidad y líneas de media
medias <- aggregate(LectureMean ~ Gender, data = media_genero, FUN = mean)
p1 <- ggplot(media_genero, aes(x = LectureMean, color = Gender)) +
  geom_density(size = 1.5) +
  geom_vline(data = medias, aes(xintercept = LectureMean, color = Gender),
             linetype = "dashed", size = 1) +
  labs(title = "Distribución de LectureMean por Género",
       x = "LectureMean",
       y = "Densidad") +
  theme_minimal()

medias <- aggregate(MathMean ~ Gender, data = media_genero, FUN = mean)
p2 <- ggplot(media_genero, aes(x = MathMean, color = Gender)) +
  geom_density(size = 1.5) +
  geom_vline(data = medias, aes(xintercept = MathMean, color = Gender),
             linetype = "dashed", size = 1) +
  labs(title = "Distribución de MathMean por Género",
       x = "MathMean",
       y = "Densidad") +
  theme_minimal()

medias <- aggregate(ScienceMean ~ Gender, data = media_genero, FUN = mean)
p3 <- ggplot(media_genero, aes(x = ScienceMean, color = Gender)) +
  geom_density(size = 1.5) +
  geom_vline(data = medias, aes(xintercept = ScienceMean, color = Gender),
             linetype = "dashed", size = 1) +
  labs(title = "Distribución de ScienceMean por Género",
       x = "ScienceMean",
       y = "Densidad") +
  theme_minimal()

medias <- aggregate(TotalMean ~ Gender, data = media_genero, FUN = mean)
p4 <- ggplot(media_genero, aes(x = TotalMean, color = Gender)) +
  geom_density(size = 1.5) +
  geom_vline(data = medias, aes(xintercept = TotalMean, color = Gender),
             linetype = "dashed", size = 1) +
  labs(title = "Distribución de TotalMean por Género",
       x = "TotalMean",
       y = "Densidad") +
  theme_minimal()

grid.arrange(p1, p2,p3,p4, nrow = 2)

media_genero$Gender <- as.numeric(media_genero$Gender)
media_genero$Gender <- media_genero$Gender - 1
modelo <- glm(Gender ~ LectureMean, data = media_genero,  family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = Gender ~ LectureMean, family = binomial, data = media_genero)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  3.305673   1.379785   2.396   0.0166 *
## LectureMean -0.007462   0.003089  -2.416   0.0157 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 210.72  on 151  degrees of freedom
## Residual deviance: 204.63  on 150  degrees of freedom
## AIC: 208.63
## 
## Number of Fisher Scoring iterations: 4
media_genero$prob <- predict(modelo, type = "response")
library(ggplot2)

p1 <- ggplot(media_genero, aes(x = LectureMean, y = Gender)) +
  geom_jitter(height = 0.05, width = 0, alpha = 0.5) +  # puntos observados
  geom_line(aes(y = prob), color = "red", size = 1) +   # curva logĂ­stica ajustada
  labs(title = "Modelo Logístico del Género en la nota Lectura (0 = Femenino y 1 = Masculino)", x = "x", y = "Probabilidad de éxito") +
  theme_minimal()

modelo <- glm(Gender ~ MathMean, data = media_genero,  family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = Gender ~ MathMean, family = binomial, data = media_genero)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.886520   1.323573  -0.670    0.503
## MathMean     0.002212   0.003278   0.675    0.500
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 210.72  on 151  degrees of freedom
## Residual deviance: 210.26  on 150  degrees of freedom
## AIC: 214.26
## 
## Number of Fisher Scoring iterations: 3
media_genero$prob <- predict(modelo, type = "response")

p2 <- ggplot(media_genero, aes(x = MathMean, y = Gender)) +
  geom_jitter(height = 0.05, width = 0, alpha = 0.5) +  # puntos observados
  geom_line(aes(y = prob), color = "red", size = 1) +   # curva logĂ­stica ajustada
  labs(title = "Modelo Logístico del Género en la nota Matemáticas", x = "x", y = "Probabilidad de éxito") +
  theme_minimal()

modelo <- glm(Gender ~ ScienceMean, data = media_genero,  family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = Gender ~ ScienceMean, family = binomial, data = media_genero)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.768900   1.389679   0.553    0.580
## ScienceMean -0.001697   0.003046  -0.557    0.577
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 210.72  on 151  degrees of freedom
## Residual deviance: 210.41  on 150  degrees of freedom
## AIC: 214.41
## 
## Number of Fisher Scoring iterations: 3
media_genero$prob <- predict(modelo, type = "response")

p3 <- ggplot(media_genero, aes(x = ScienceMean, y = Gender)) +
  geom_jitter(height = 0.05, width = 0, alpha = 0.5) +  # puntos observados
  geom_line(aes(y = prob), color = "red", size = 1) +   # curva logĂ­stica ajustada
  labs(title = "Modelo Logístico del Género en la nota Ciencia", x = "x", y = "Probabilidad de éxito") +
  theme_minimal()

modelo <- glm(Gender ~ TotalMean, data = media_genero,  family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = Gender ~ TotalMean, family = binomial, data = media_genero)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1.142375   1.371318   0.833    0.405
## TotalMean   -0.002643   0.003150  -0.839    0.401
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 210.72  on 151  degrees of freedom
## Residual deviance: 210.01  on 150  degrees of freedom
## AIC: 214.01
## 
## Number of Fisher Scoring iterations: 3
media_genero$prob <- predict(modelo, type = "response")

p4 <- ggplot(media_genero, aes(x = TotalMean, y = Gender)) +
  geom_jitter(height = 0.05, width = 0, alpha = 0.5) +  # puntos observados
  geom_line(aes(y = prob), color = "red", size = 1) +   # curva logĂ­stica ajustada
  labs(title = "Modelo Logístico del Género en la nota Total", x = "x", y = "Probabilidad de éxito") +
  theme_minimal()

grid.arrange(p1, p2,p3,p4, nrow = 2,name = "HOLAA")

media_genero$Gender <- media_genero$Gender + 1
media_genero$Gender <- media_genero$Gender - 1
modelo <- glm(Gender ~ MathMean +LectureMean + ScienceMean, data = media_genero,  family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = Gender ~ MathMean + LectureMean + ScienceMean, 
##     family = binomial, data = media_genero)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.123497   2.074025  -0.542    0.588    
## MathMean     0.001051   0.020331   0.052    0.959    
## LectureMean -0.212090   0.034413  -6.163 7.14e-10 ***
## ScienceMean  0.208687   0.044064   4.736 2.18e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 210.72  on 151  degrees of freedom
## Residual deviance: 112.75  on 148  degrees of freedom
## AIC: 120.75
## 
## Number of Fisher Scoring iterations: 5
media_genero$Gender <- media_genero$Gender + 1
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Adjuntando el paquete: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
media_genero$Gender <- media_genero$Gender - 1
# Predicciones del modelo
modelo <- glm(Gender ~ MathMean + LectureMean + ScienceMean, data = media_genero,  family = binomial)
predicciones <- predict(modelo, type = "response")
roc_curve <- roc(media_genero$Gender, predicciones)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve, main = "Curva ROC", col = "blue", print.auc = TRUE)

roc_obj <- roc(media_genero$Gender, predicciones)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj, print.auc = TRUE, auc.polygon = TRUE, grid = TRUE)

print(roc_obj)
## 
## Call:
## roc.default(response = media_genero$Gender, predictor = predicciones)
## 
## Data: predicciones in 76 controls (media_genero$Gender 0) < 76 cases (media_genero$Gender 1).
## Area under the curve: 0.9108
media_genero$Gender <- media_genero$Gender + 1

Resultado: AUC = 0.9108. Interpretación: El modelo tiene un excelente poder discriminatorio para predecir el género de un estudiante basándose en sus notas.

Impacto que tiene el Tipo de escuela en los resultados de las notas

library(dplyr)
media_escuela = donde_estudia %>%
  group_by(CNT, Tipo_escuela) %>%
  summarise(
    MathMean = mean(MathMean, na.rm = TRUE),
    LectureMean = mean(LectureMean, na.rm = TRUE),
    ScienceMean = mean(ScienceMean, na.rm = TRUE),
    TotalMean = mean(TotalMean, na.rm = TRUE),
    .groups = "drop"
  )

colnames(media_escuela)[2] = "Tipo_escuela"

media_escuela = na.omit(media_escuela)

media_escuela_escalado <- media_escuela
media_escuela_escalado[numerical_cols] = scale(media_escuela_escalado[numerical_cols], center = TRUE, scale = TRUE)

datos_cor2 <- media_escuela_escalado %>%
  select(Tipo_escuela, MathMean, LectureMean, ScienceMean, TotalMean)

# Calculamos la matriz de correlaciĂłn
matriz_cor2 <- cor(datos_cor2, use = "complete.obs")

# Graficamos
corrplot(matriz_cor2, method = "circle", type = "upper",  addCoef.col = "orange", 
         tl.col = "black", tl.cex = 0.9)

media_escuela$Tipo_escuela <- as.factor(media_escuela$Tipo_escuela)

# Gráfico con densidad y líneas de media
medias <- aggregate(LectureMean ~ Tipo_escuela, data = media_escuela, FUN = mean)
p1 <- ggplot(media_escuela, aes(x = LectureMean, color = Tipo_escuela)) +
  geom_density(size = 1.5) +
  geom_vline(data = medias, aes(xintercept = LectureMean, color = Tipo_escuela),
             linetype = "dashed", size = 1) +
  labs(title = "DistribuciĂłn de LectureMean por Tipo",
       x = "LectureMean",
       y = "Densidad") +
  theme_minimal()

medias <- aggregate(MathMean ~ Tipo_escuela, data = media_escuela, FUN = mean)
p2 <- ggplot(media_escuela, aes(x = MathMean, color = Tipo_escuela)) +
  geom_density(size = 1.5) +
  geom_vline(data = medias, aes(xintercept = MathMean, color = Tipo_escuela),
             linetype = "dashed", size = 1) +
  labs(title = "DistribuciĂłn de MathMean por Tipo",
       x = "MathMean",
       y = "Densidad") +
  theme_minimal()

medias <- aggregate(ScienceMean ~ Tipo_escuela, data = media_escuela, FUN = mean)
p3 <- ggplot(media_escuela, aes(x = ScienceMean, color = Tipo_escuela)) +
  geom_density(size = 1.5) +
  geom_vline(data = medias, aes(xintercept = ScienceMean, color = Tipo_escuela),
             linetype = "dashed", size = 1) +
  labs(title = "DistribuciĂłn de ScienceMean por Tipo",
       x = "ScienceMean",
       y = "Densidad") +
  theme_minimal()

medias <- aggregate(TotalMean ~ Tipo_escuela, data = media_escuela, FUN = mean)
p4 <- ggplot(media_escuela, aes(x = TotalMean, color = Tipo_escuela)) +
  geom_density(size = 1.5) +
  geom_vline(data = medias, aes(xintercept = TotalMean, color = Tipo_escuela),
             linetype = "dashed", size = 1) +
  labs(title = "DistribuciĂłn de TotalMean por Tipo",
       x = "TotalMean",
       y = "Densidad") +
  theme_minimal()

grid.arrange(p1, p2,p3,p4, nrow = 2)

media_escuela$Tipo_escuela <- as.numeric(media_escuela$Tipo_escuela)
media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela - 1
modelo <- glm(Tipo_escuela ~ LectureMean, data = media_escuela,  family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = Tipo_escuela ~ LectureMean, family = binomial, 
##     data = media_escuela)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.238462   1.528115  -3.428 0.000608 ***
## LectureMean  0.011621   0.003361   3.458 0.000544 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 191.31  on 137  degrees of freedom
## Residual deviance: 178.09  on 136  degrees of freedom
## AIC: 182.09
## 
## Number of Fisher Scoring iterations: 4
media_escuela$prob <- predict(modelo, type = "response")

p1 <- ggplot(media_escuela, aes(x = LectureMean, y = Tipo_escuela)) +
  geom_jitter(height = 0.05, width = 0, alpha = 0.5) +  # puntos observados
  geom_line(aes(y = prob), color = "red", size = 1) +   # curva logĂ­stica ajustada
  labs(title = "Modelo Logístico del tipo de escuela en la nota Lectura", x = "x", y = "Probabilidad de éxito") +
  theme_minimal()

modelo <- glm(Tipo_escuela ~ MathMean, data = media_escuela,  family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = Tipo_escuela ~ MathMean, family = binomial, data = media_escuela)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -4.932361   1.513037  -3.260  0.00111 **
## MathMean     0.012111   0.003688   3.284  0.00102 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 191.31  on 137  degrees of freedom
## Residual deviance: 179.40  on 136  degrees of freedom
## AIC: 183.4
## 
## Number of Fisher Scoring iterations: 4
media_escuela$prob <- predict(modelo, type = "response")

p2 <- ggplot(media_escuela, aes(x = MathMean, y = Tipo_escuela)) +
  geom_jitter(height = 0.05, width = 0, alpha = 0.5) +  # puntos observados
  geom_line(aes(y = prob), color = "red", size = 1) +   # curva logĂ­stica ajustada
  labs(title = "Modelo Logístico del tipo de escuela en la nota Matemáticas", x = "x", y = "Probabilidad de éxito") +
  theme_minimal()

modelo <- glm(Tipo_escuela ~ ScienceMean, data = media_escuela,  family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = Tipo_escuela ~ ScienceMean, family = binomial, 
##     data = media_escuela)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.256354   1.573796  -3.340 0.000838 ***
## ScienceMean  0.011399   0.003388   3.365 0.000765 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 191.31  on 137  degrees of freedom
## Residual deviance: 178.82  on 136  degrees of freedom
## AIC: 182.82
## 
## Number of Fisher Scoring iterations: 4
media_escuela$prob <- predict(modelo, type = "response")

p3 <- ggplot(media_escuela, aes(x = ScienceMean, y = Tipo_escuela)) +
  geom_jitter(height = 0.05, width = 0, alpha = 0.5) +  # puntos observados
  geom_line(aes(y = prob), color = "red", size = 1) +   # curva logĂ­stica ajustada
  labs(title = "Modelo Logístico del tipo de escuela en la nota Ciencia", x = "x", y = "Probabilidad de éxito") +
  theme_minimal()

modelo <- glm(Tipo_escuela ~ TotalMean, data = media_escuela,  family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = Tipo_escuela ~ TotalMean, family = binomial, data = media_escuela)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.281018   1.559852  -3.386 0.000710 ***
## TotalMean    0.012010   0.003521   3.411 0.000646 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 191.31  on 137  degrees of freedom
## Residual deviance: 178.44  on 136  degrees of freedom
## AIC: 182.44
## 
## Number of Fisher Scoring iterations: 4
media_escuela$prob <- predict(modelo, type = "response")

p4 <- ggplot(media_escuela, aes(x = TotalMean, y = Tipo_escuela)) +
  geom_jitter(height = 0.05, width = 0, alpha = 0.5) +  # puntos observados
  geom_line(aes(y = prob), color = "red", size = 1) +   # curva logĂ­stica ajustada
  labs(title = "Modelo Logístico del tipo de escuela en la nota Total", x = "x", y = "Probabilidad de éxito") +
  theme_minimal()

grid.arrange(p1, p2,p3,p4, nrow = 2,name = "HOLAA")

media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela + 1
media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela - 1
modelo <- glm(Tipo_escuela ~ MathMean + LectureMean + ScienceMean, data = media_escuela,  family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = Tipo_escuela ~ MathMean + LectureMean + ScienceMean, 
##     family = binomial, data = media_escuela)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -5.064704   1.581397  -3.203  0.00136 **
## MathMean     0.007697   0.014414   0.534  0.59334   
## LectureMean  0.022091   0.021090   1.047  0.29488   
## ScienceMean -0.017411   0.028370  -0.614  0.53941   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 191.31  on 137  degrees of freedom
## Residual deviance: 177.70  on 134  degrees of freedom
## AIC: 185.7
## 
## Number of Fisher Scoring iterations: 4
media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela + 1
media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela - 1
# Predicciones del modelo
modelo <- glm(Tipo_escuela ~ MathMean + LectureMean + ScienceMean, data = media_escuela,  family = binomial)
predicciones <- predict(modelo, type = "response")
roc_curve <- roc(media_escuela$Tipo_escuela, predicciones)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve, main = "Curva ROC", col = "blue", print.auc = TRUE)

roc_obj <- roc(media_escuela$Tipo_escuela, predicciones)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj, print.auc = TRUE, auc.polygon = TRUE, grid = TRUE)

print(roc_obj)
## 
## Call:
## roc.default(response = media_escuela$Tipo_escuela, predictor = predicciones)
## 
## Data: predicciones in 69 controls (media_escuela$Tipo_escuela 0) < 69 cases (media_escuela$Tipo_escuela 1).
## Area under the curve: 0.6763
media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela + 1
GanadorGenero <- media_genero %>%
  group_by(CNT) %>%
  slice_max(order_by = TotalMean, n = 1, with_ties = FALSE) %>%
  mutate(Gender = ifelse(Gender == 1, "Female", "Male")) %>%
  select(CNT, Gender)

library(countrycode)

GanadorGenero$country_name <- countrycode(GanadorGenero$CNT,
                                          origin = "iso3c", 
                                          destination = "country.name")
## Warning: Some values were not matched unambiguously: KSV, QAZ, QUR, TAP
codigos_extra <- c("KSV" = "Kosovo",
                   "QAZ" = "Kazakhstan",
                   "QUR" = "Qatar",
                   "TAP" = "Chinese Taipei")

GanadorGenero$country_name[is.na(GanadorGenero$country_name)] <-
  codigos_extra[GanadorGenero$CNT[is.na(GanadorGenero$country_name)]]

library(sf)

# Obtener el mapa mundial
world <- ne_countries(scale = "medium", returnclass = "sf")

# Unir con los datos de género ganador
map_data <- left_join(world, GanadorGenero, by = c("name" = "country_name"))

ggplot(data = map_data) +
  geom_sf(aes(fill = Gender)) +
  scale_fill_manual(values = c("Female" = "#FF69B4", "Male" = "#6495ED")) +
  labs(title = "Género con mayor rendimiento promedio por país",
       fill = "Género") +
  theme_minimal()

library(tidyr)
mean_total_gender_by_country <- donde_estudia %>%
  group_by(CNT, Gender) %>%
  summarise(
    MeanTotalMean = mean(TotalMean, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(CNT) %>%
  filter(n_distinct(Gender) == 2) %>%
  ungroup()

gender_means_wide <- mean_total_gender_by_country %>%
  pivot_wider(names_from = Gender, values_from = MeanTotalMean, names_prefix = "Gender_") %>%
  rename(MeanFemale = Gender_1, MeanMale = Gender_2) # Assuming 1 is Female and 2 is Male based on your earlier code

percentage_difference_by_country <- gender_means_wide %>%
  mutate(
    PercentageDiff_Male_vs_Female = ((MeanMale - MeanFemale) / MeanFemale) * 100
  )

print(percentage_difference_by_country)
## # A tibble: 72 Ă— 4
##    CNT   MeanFemale MeanMale PercentageDiff_Male_vs_Female
##    <chr>      <dbl>    <dbl>                         <dbl>
##  1 ALB         372.     348.                        -6.48 
##  2 ARE         428.     412.                        -3.85 
##  3 ARG         395.     398.                         0.899
##  4 AUT         474.     478.                         0.871
##  5 BEL         480.     477.                        -0.586
##  6 BGR         414.     399.                        -3.62 
##  7 BRA         390.     392.                         0.599
##  8 BRN         432.     414.                        -4.14 
##  9 CHE         483.     479.                        -0.867
## 10 CHL         437.     447.                         2.24 
## # ℹ 62 more rows
print(percentage_difference_by_country %>% arrange(desc(PercentageDiff_Male_vs_Female)))
## # A tibble: 72 Ă— 4
##    CNT   MeanFemale MeanMale PercentageDiff_Male_vs_Female
##    <chr>      <dbl>    <dbl>                         <dbl>
##  1 CRI         386.     395.                         2.40 
##  2 CHL         437.     447.                         2.24 
##  3 PER         390.     397.                         1.85 
##  4 MEX         392.     397.                         1.29 
##  5 ARG         395.     398.                         0.899
##  6 AUT         474.     478.                         0.871
##  7 URY         414.     417.                         0.837
##  8 COL         398.     401.                         0.833
##  9 BRA         390.     392.                         0.599
## 10 ISR         456.     457.                         0.353
## # ℹ 62 more rows
print(percentage_difference_by_country %>% arrange(PercentageDiff_Male_vs_Female))
## # A tibble: 72 Ă— 4
##    CNT   MeanFemale MeanMale PercentageDiff_Male_vs_Female
##    <chr>      <dbl>    <dbl>                         <dbl>
##  1 PSE         363.     333.                         -8.16
##  2 JOR         362.     333.                         -8.00
##  3 ALB         372.     348.                         -6.48
##  4 JAM         391.     368.                         -5.91
##  5 PHL         351.     330.                         -5.89
##  6 MYS         401.     383.                         -4.50
##  7 QAT         419.     401.                         -4.43
##  8 BRN         432.     414.                         -4.14
##  9 SVN         469.     450.                         -3.93
## 10 ARE         428.     412.                         -3.85
## # ℹ 62 more rows
summary(percentage_difference_by_country)
##      CNT              MeanFemale       MeanMale    
##  Length:72          Min.   :344.2   Min.   :330.2  
##  Class :character   1st Qu.:390.7   1st Qu.:388.1  
##  Mode  :character   Median :432.2   Median :425.7  
##                     Mean   :433.2   Mean   :425.8  
##                     3rd Qu.:470.5   3rd Qu.:469.0  
##                     Max.   :541.7   Max.   :540.5  
##  PercentageDiff_Male_vs_Female
##  Min.   :-8.1616              
##  1st Qu.:-3.2008              
##  Median :-1.6412              
##  Mean   :-1.7904              
##  3rd Qu.:-0.2339              
##  Max.   : 2.4043
GanadorGeneroMasculino <- media_genero %>%
  group_by(CNT) %>%
  slice_max(order_by = TotalMean, n = 1, with_ties = FALSE) %>%
  filter(Gender == 2) %>% 
  mutate(GeneroGanador = "Masculino") %>%
  select(CNT, GeneroGanador, TotalMean)

GanadorGeneroMasculino$country_name <- countrycode(GanadorGeneroMasculino$CNT,
                                                  origin = "iso3c",
                                                  destination = "country.name")

codigos_extra <- c("KSV" = "Kosovo",
                   "QAZ" = "Kazakhstan",
                   "QUR" = "Qatar",
                   "TAP" = "Taiwan")

GanadorGeneroMasculino$country_name[is.na(GanadorGeneroMasculino$country_name)] <-
  codigos_extra[GanadorGeneroMasculino$CNT[is.na(GanadorGeneroMasculino$country_name)]]

nrow(GanadorGeneroMasculino)
## [1] 15
GanadorGeneroMasculino$country_name
##  [1] "Argentina"       "Austria"         "Brazil"          "Chile"          
##  [5] "Colombia"        "Costa Rica"      "Germany"         "United Kingdom" 
##  [9] "Hungary"         "Israel"          "Italy"           "Macao SAR China"
## [13] "Mexico"          "Peru"            "Uruguay"

El género femenino es el más abundante en los países, en total hay 61 de 76 (80,26%)

GanadorEscuela <- media_escuela %>%
  group_by(CNT) %>%
  slice_max(order_by = TotalMean, n = 1, with_ties = FALSE) %>%
  mutate(Tipo_escuela = ifelse(Tipo_escuela == 1, "PĂşblica", "Privada")) %>%
  select(CNT, Tipo_escuela)

library(countrycode)

GanadorEscuela$country_name <- countrycode(GanadorEscuela$CNT,
                                          origin = "iso3c", 
                                          destination = "country.name")
## Warning: Some values were not matched unambiguously: KSV, QAZ, QUR, TAP
codigos_extra <- c("KSV" = "Kosovo",
                   "QAZ" = "Kazakhstan",
                   "QUR" = "Qatar",
                   "TAP" = "Chinese Taipei")

GanadorEscuela$country_name[is.na(GanadorEscuela$country_name)] <-
  codigos_extra[GanadorEscuela$CNT[is.na(GanadorEscuela$country_name)]]

library(sf)

world <- ne_countries(scale = "medium", returnclass = "sf")

map_data <- left_join(world, GanadorEscuela, by = c("name" = "country_name"))

ggplot(data = map_data) +
  geom_sf(aes(fill = Tipo_escuela)) +
  scale_fill_manual(values = c("PĂşblica" = "#44e36e", "Privada" = "#54b2e8")) +
  labs(title = "Tipo de escuela con mayor rendimiento promedio por paĂ­s",
       fill = "Tipo de escuela") +
  theme_minimal()

GanadorEscuelaPublica <- media_escuela %>%
  group_by(CNT) %>%
  slice_max(order_by = TotalMean, n = 1, with_ties = FALSE) %>%
  filter(Tipo_escuela == 1) %>%  # Solo las que ganan siendo pĂşblicas
  mutate(EscuelaGanadora = "PĂşblica") %>%
  select(CNT, EscuelaGanadora, TotalMean)

# Primero traducimos los paĂ­ses conocidos
GanadorEscuelaPublica$country_name <- countrycode(GanadorEscuelaPublica$CNT,
                                                  origin = "iso3c",
                                                  destination = "country.name")
## Warning: Some values were not matched unambiguously: TAP
# Después corregimos los que quedaron como NA manualmente
codigos_extra <- c("KSV" = "Kosovo",
                   "QAZ" = "Kazakhstan",
                   "QUR" = "Qatar",
                   "TAP" = "Taiwan")

GanadorEscuelaPublica$country_name[is.na(GanadorEscuelaPublica$country_name)] <-
  codigos_extra[GanadorEscuelaPublica$CNT[is.na(GanadorEscuelaPublica$country_name)]]

nrow(GanadorEscuelaPublica)
## [1] 11
GanadorEscuelaPublica$country_name
##  [1] "Hungary"    "Indonesia"  "Italy"      "Japan"      "Kazakhstan"
##  [6] "Poland"     "Romania"    "Serbia"     "Taiwan"     "Thailand"  
## [11] "Turkey"

La escuela privada es la predominante en los paĂ­ses, en total hay 58 de 69 (84,06%).

mean_total_school_type_by_country <- donde_estudia %>%
  group_by(CNT, Tipo_escuela) %>%
  summarise(
    MeanTotalMean = mean(TotalMean, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(CNT) %>%
  filter(n_distinct(Tipo_escuela) == 2) %>%
  ungroup()

school_type_means_wide <- mean_total_school_type_by_country %>%
  pivot_wider(names_from = Tipo_escuela, values_from = MeanTotalMean, names_prefix = "SchoolType_") %>%
  rename(MeanPublic = SchoolType_1, MeanPrivate = SchoolType_2) # Assuming 1 is Public and 2 is Private

percentage_difference_school_type_by_country_table <- school_type_means_wide %>%
  mutate(
    PercentageDiff_Private_vs_Public = ((MeanPrivate - MeanPublic) / MeanPublic) * 100
  )

percentage_difference_school_type_by_country_table$country_name <- countrycode(percentage_difference_school_type_by_country_table$CNT,
                                                                                 origin = "iso3c",
                                                                                 destination = "country.name")
## Warning: Some values were not matched unambiguously: TAP
codigos_extra <- c("KSV" = "Kosovo",
                   "QAZ" = "Kazakhstan",
                   "QUR" = "Qatar",
                   "TAP" = "Chinese Taipei")

percentage_difference_school_type_by_country_table$country_name[is.na(percentage_difference_school_type_by_country_table$country_name)] <-
  codigos_extra[percentage_difference_school_type_by_country_table$CNT[is.na(percentage_difference_school_type_by_country_table$country_name)]]

percentage_difference_school_type_by_country_table <- percentage_difference_school_type_by_country_table %>%
  select(CNT, country_name, MeanPublic, MeanPrivate, PercentageDiff_Private_vs_Public) %>%
  arrange(desc(PercentageDiff_Private_vs_Public)) # Arrange to see the largest differences first

print(percentage_difference_school_type_by_country_table)
## # A tibble: 23 Ă— 5
##    CNT   country_name            MeanPublic MeanPrivate PercentageDiff_Private…¹
##    <chr> <chr>                        <dbl>       <dbl>                    <dbl>
##  1 PHL   Philippines                   330.        394.                     19.5
##  2 QAT   Qatar                         374.        447.                     19.5
##  3 BRN   Brunei                        410.        488.                     19.0
##  4 URY   Uruguay                       405.        475.                     17.5
##  5 UZB   Uzbekistan                    341.        398.                     16.5
##  6 PSE   Palestinian Territories       347.        404.                     16.3
##  7 PER   Peru                          379.        439.                     15.8
##  8 MEX   Mexico                        388.        442.                     13.8
##  9 JOR   Jordan                        342.        384.                     12.3
## 10 NZL   New Zealand                   478.        535.                     12.1
## # ℹ 13 more rows
## # ℹ abbreviated name: ¹​PercentageDiff_Private_vs_Public
summary(percentage_difference_school_type_by_country_table)
##      CNT            country_name         MeanPublic     MeanPrivate   
##  Length:23          Length:23          Min.   :329.8   Min.   :364.5  
##  Class :character   Class :character   1st Qu.:376.3   1st Qu.:406.2  
##  Mode  :character   Mode  :character   Median :404.6   Median :441.8  
##                                        Mean   :422.7   Mean   :449.7  
##                                        3rd Qu.:480.9   3rd Qu.:483.5  
##                                        Max.   :539.9   Max.   :556.3  
##  PercentageDiff_Private_vs_Public
##  Min.   :-7.196                  
##  1st Qu.:-1.950                  
##  Median : 7.170                  
##  Mean   : 7.166                  
##  3rd Qu.:16.056                  
##  Max.   :19.506
media_genero$Gender <- ifelse(media_genero$Gender == 1, "Female", "Male")

diferencias <- media_genero %>%
  select(CNT, Gender, TotalMean) %>%
  pivot_wider(names_from = Gender, values_from = TotalMean) %>%
  mutate(Diff = Female - Male)

diferencias$country_name <- countrycode(diferencias$CNT,
                                        origin = "iso3c",
                                        destination = "country.name")
## Warning: Some values were not matched unambiguously: KSV, QAZ, QUR, TAP
codigos_extra <- c("KSV" = "Kosovo",
                   "QAZ" = "Kazakhstan",
                   "QUR" = "Qatar",
                   "TAP" = "Chinese Taipei")

diferencias$country_name[is.na(diferencias$country_name)] <-
  codigos_extra[diferencias$CNT[is.na(diferencias$country_name)]]

diferencias <- diferencias %>%
  mutate(Diff_adj = ifelse(abs(Diff) < 0.1, NA, Diff))

# Obtener mapa del mundo
world <- ne_countries(scale = "medium", returnclass = "sf")

# Unir mapa con los datos de diferencias
map_data <- left_join(world, diferencias, by = c("name" = "country_name"))

# Dibujar el mapa
ggplot(data = map_data) +
  geom_sf(aes(fill = scale(Diff_adj,center=TRUE,scale=TRUE)), color = "gray80") +
  scale_fill_gradient2(
    low = "#6495ED",      # Azul: chicos mejor
    mid = "white",        # Blanco: igualdad
    high = "#FF69B4",     # Rosado: chicas mejor
    midpoint = 0,
    na.value = "gray90",  # PaĂ­ses con diferencia menor a 0.1
    name = "Diferencia\n(Chicas - Chicos)"
  ) +
  labs(
    title = "Diferencia de rendimiento promedio por paĂ­s",
    subtitle = "Valores positivos: mejor desempeño de chicas\nGris: diferencias menores a ±0.1",
    caption = "Fuente: Datos de media_genero"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    legend.position = "right"
  )

mean_total_school_type_by_country <- donde_estudia %>%
  group_by(CNT, Tipo_escuela) %>%
  summarise(
    MeanTotalMean = mean(TotalMean, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(CNT) %>%
  ungroup()

school_type_means_wide <- mean_total_school_type_by_country %>%
  pivot_wider(names_from = Tipo_escuela,
              values_from = MeanTotalMean,
              names_prefix = "SchoolType_") %>%
  rename(MeanPublic = SchoolType_1, MeanPrivate = SchoolType_2)

percentage_difference <- school_type_means_wide %>%
  mutate(
    DiffPercent = ((MeanPrivate - MeanPublic) / MeanPublic) * 100
  )

percentage_difference$country_name <- countrycode(percentage_difference$CNT,
                                                  origin = "iso3c",
                                                  destination = "country.name")
## Warning: Some values were not matched unambiguously: KSV, QAZ, QUR, TAP
codigos_extra <- c("KSV" = "Kosovo",
                   "QAZ" = "Kazakhstan",
                   "QUR" = "Qatar",
                   "TAP" = "Chinese Taipei")

percentage_difference$country_name[is.na(percentage_difference$country_name)] <-
  codigos_extra[percentage_difference$CNT[is.na(percentage_difference$country_name)]]

percentage_difference <- percentage_difference %>%
  mutate(DiffAdj = ifelse(abs(DiffPercent) < 1, NA, DiffPercent))

world <- ne_countries(scale = "medium", returnclass = "sf")

map_data <- left_join(world, percentage_difference, by = c("name" = "country_name"))

ggplot(data = map_data) +
  geom_sf(aes(fill = DiffAdj), color = "gray80") +
  scale_fill_gradient2(
    low = "#44e36e",       # Verde: pĂşblica mejor
    mid = "white",         # Blanco: igualdad
    high = "#f75467",      # Rojo: privada mejor
    midpoint = 0,
    na.value = "gray90",   # Diferencias < 1% en gris
    name = "% diferencia\n(Privada - PĂşblica)"
  ) +
  labs(
    title = "Diferencia porcentual de rendimiento entre escuelas privadas y pĂşblicas",
    subtitle = "Valores positivos indican mejor rendimiento en escuelas privadas\nGris: diferencia menor al 1%",
    caption = "Fuente: Datos de donde_estudia"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 9)
  )