library(haven)
library(gridExtra)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(tidyr)
library(sf)
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(countrycode)
## Warning: package 'countrycode' was built under R version 4.4.3
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.4.3
library(rnaturalearthdata)
## Warning: package 'rnaturalearthdata' was built under R version 4.4.3
##
## Adjuntando el paquete: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
##
## countries110
escuelas <- read_sas("CY08MSP_SCH_QQQ.SAS7BDAT", NULL)
load("estudiantesPISAfiltrados.R")
estudiantes <- datosPisaEstudiantes_filtrado
rm(datosPisaEstudiantes_filtrado)
library(dplyr)
estudiantes_interes <- estudiantes %>%
select(matches("^PV|^CNT$|^CNTSCHID$|^CNTSTUID$|^ST004D01T$"))
estudiantes_interes = estudiantes_interes %>% transmute(CNT = estudiantes_interes$CNT,
CNTSCHID = estudiantes_interes$CNTSCHID,
CNTSTUID = estudiantes_interes$CNTSTUID,
Gender = estudiantes_interes$ST004D01T,
MathMean = rowMeans(estudiantes_interes[,4:13]),
LectureMean = rowMeans(estudiantes_interes[,14:23]),
ScienceMean = rowMeans(estudiantes_interes[,24:33]))
escuelas_interes <- escuelas %>% select(c("CNTSCHID", "SC013Q01TA", "SC004Q02TA", "SC175Q01JA"))
donde_estudia <- left_join(escuelas_interes,estudiantes_interes, by = "CNTSCHID")
colnames(donde_estudia)[2] = "Tipo_escuela"
donde_estudia$TotalMean = rowMeans(donde_estudia %>% select(c("MathMean", "LectureMean", "ScienceMean")))
media_genero = donde_estudia %>%
group_by(CNT, Gender) %>%
summarise(
MathMean = mean(MathMean, na.rm = TRUE),
LectureMean = mean(LectureMean, na.rm = TRUE),
ScienceMean = mean(ScienceMean, na.rm = TRUE),
TotalMean = mean(TotalMean, na.rm = TRUE),
.groups = "drop"
)
media_genero = na.omit(media_genero)
numerical_cols <- c("MathMean", "LectureMean", "ScienceMean", "TotalMean")
media_genero_escalado <- media_genero
media_genero_escalado[numerical_cols] = scale(media_genero_escalado[numerical_cols], center = TRUE, scale = TRUE)
tabla_contingencia <- table(donde_estudia$Gender, donde_estudia$Tipo_escuela)
print(tabla_contingencia)
##
## 1 2
## 1 200637 49387
## 2 197290 49415
prueba_chi2 <- chisq.test(tabla_contingencia)
print(prueba_chi2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: tabla_contingencia
## X-squared = 5.9661, df = 1, p-value = 0.01458
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.4.3
CramerV(tabla_contingencia)
## [1] 0.003470713
Resultado: p-value = 0.01458. InterpretaciĂłn:Esto significa que hay una relaciĂłn estadĂsticamente significativa entre el gĂ©nero y el tipo de escuela, es decir que no son independientes.
Resultado: V de Cramer = 0.00347. InterpretaciĂłn: La asociaciĂłn es extremadamente dĂ©bil, prácticamente insignificante en tĂ©rminos prácticos, a pesar de ser estadĂsticamente significativa debido al gran tamaño de la muestra.
manova_model <- manova(cbind(MathMean, LectureMean, ScienceMean) ~ Gender * Tipo_escuela, data = donde_estudia)
summary(manova_model)
## Df Pillai approx F num Df den Df Pr(>F)
## Gender 1 0.094919 17364.4 3 496723 < 2.2e-16 ***
## Tipo_escuela 1 0.021795 3689.0 3 496723 < 2.2e-16 ***
## Gender:Tipo_escuela 1 0.000061 10.1 3 496723 1.268e-06 ***
## Residuals 496725
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print("----------------------------------------------------------------")
## [1] "----------------------------------------------------------------"
summary.aov(manova_model)
## Response MathMean :
## Df Sum Sq Mean Sq F value Pr(>F)
## Gender 1 5886381 5886381 791.0690 <2e-16 ***
## Tipo_escuela 1 72633040 72633040 9761.1332 <2e-16 ***
## Gender:Tipo_escuela 1 7184 7184 0.9655 0.3258
## Residuals 496725 3696153484 7441
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response LectureMean :
## Df Sum Sq Mean Sq F value Pr(>F)
## Gender 1 50530038 50530038 5091.2689 <2e-16 ***
## Tipo_escuela 1 108428808 108428808 10924.9910 <2e-16 ***
## Gender:Tipo_escuela 1 14907 14907 1.5019 0.2204
## Residuals 496725 4929917065 9925
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response ScienceMean :
## Df Sum Sq Mean Sq F value Pr(>F)
## Gender 1 1060804 1060804 110.9100 <2e-16 ***
## Tipo_escuela 1 97264869 97264869 10169.3097 <2e-16 ***
## Gender:Tipo_escuela 1 3649 3649 0.3815 0.5368
## Residuals 496725 4750951022 9565
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## 67119 observations deleted due to missingness
Se encontró un efecto significativo tanto para Gender como para Tipo_escuela, y lo más importante, una interacción significativa (p-value = 1.268e-06 ***) entre Gender y Tipo_escuela en el rendimiento general. Es decir, el efecto del género en las notas no es el mismo en los diferentes tipos de escuela, y viceversa. Dicho de otro modo, la diferencia en el rendimiento entre chicos y chicas depende de si están en una escuela pública o privada, y el efecto del tipo de escuela en el rendimiento es diferente para cada género.
summary.aov(manova_model) (ANOVA univariados): Los ANOVA univariados por asignatura no mostraron interacciones significativas, lo cual es una sutil pero importante diferencia.
Esto puede deberse a que el MANOVA busca diferencias en el “perfil” o “vector” de medias: No se enfoca en si hay una interacción para cada nota por separado, sino si hay un patrón de interacción que afecta al conjunto de notas. Es posible que el efecto combinado o una interacción muy sutil a través de las correlaciones entre MathMean, LectureMean y ScienceMean sea significativa cuando se analizan juntas.
library(ggplot2)
library(dplyr)
donde_estudia_largo <- donde_estudia %>%
filter(
!is.na(Gender),
!is.na(Tipo_escuela),
!is.na(MathMean),
!is.na(LectureMean),
!is.na(ScienceMean)
) %>%
mutate(
Gender = as.factor(Gender),
Tipo_escuela = as.factor(Tipo_escuela)
) %>%
pivot_longer(
cols = c(MathMean, LectureMean, ScienceMean),
names_to = "asignatura",
values_to = "nota"
)
# Calcular medias e IC por grupo (Gender x Tipo_escuela x asignatura)
summary_data <- donde_estudia_largo %>%
group_by(Gender, Tipo_escuela, asignatura) %>%
summarise(
mean_nota = mean(nota, na.rm = TRUE),
sd_nota = sd(nota, na.rm = TRUE),
n = n(),
se = sd_nota / sqrt(n),
lower = mean_nota - qt(0.975, n-1) * se,
upper = mean_nota + qt(0.975, n-1) * se
) %>%
ungroup()
## `summarise()` has grouped output by 'Gender', 'Tipo_escuela'. You can override
## using the `.groups` argument.
# Gráfico de interacción
ggplot(summary_data, aes(x = Tipo_escuela, y = mean_nota, color = Gender, group = Gender)) +
geom_point(size = 3) +
geom_line(size = 1) +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
facet_wrap(~ asignatura, scales = "free_y") +
labs(
title = "Gráfico de interacción: género y tipo de escuela sobre notas medias",
x = "Tipo de escuela",
y = "Nota media"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(tidyr)
library(dplyr)
library(ggplot2)
ggplot(donde_estudia_largo, aes(x = Tipo_escuela, y = nota, fill = Gender)) +
geom_boxplot(aes(group = interaction(Tipo_escuela, Gender))) +
facet_wrap(~ asignatura, scales = "free_y") +
labs(
title = "Distribución de notas por tipo de escuela y género",
x = "Tipo de escuela", y = "Nota media"
) +
theme_minimal()
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
datos_cor <- media_genero_escalado %>%
select(Gender, MathMean, LectureMean, ScienceMean, TotalMean)
matriz_cor <- cor(datos_cor, use = "complete.obs")
corrplot(matriz_cor, method = "circle", type = "upper", addCoef.col = "orange",
tl.col = "black", tl.cex = 0.9)
media_genero$Gender <- as.factor(media_genero$Gender)
# Gráfico con densidad y lĂneas de media
medias <- aggregate(LectureMean ~ Gender, data = media_genero, FUN = mean)
p1 <- ggplot(media_genero, aes(x = LectureMean, color = Gender)) +
geom_density(size = 1.5) +
geom_vline(data = medias, aes(xintercept = LectureMean, color = Gender),
linetype = "dashed", size = 1) +
labs(title = "Distribución de LectureMean por Género",
x = "LectureMean",
y = "Densidad") +
theme_minimal()
medias <- aggregate(MathMean ~ Gender, data = media_genero, FUN = mean)
p2 <- ggplot(media_genero, aes(x = MathMean, color = Gender)) +
geom_density(size = 1.5) +
geom_vline(data = medias, aes(xintercept = MathMean, color = Gender),
linetype = "dashed", size = 1) +
labs(title = "Distribución de MathMean por Género",
x = "MathMean",
y = "Densidad") +
theme_minimal()
medias <- aggregate(ScienceMean ~ Gender, data = media_genero, FUN = mean)
p3 <- ggplot(media_genero, aes(x = ScienceMean, color = Gender)) +
geom_density(size = 1.5) +
geom_vline(data = medias, aes(xintercept = ScienceMean, color = Gender),
linetype = "dashed", size = 1) +
labs(title = "Distribución de ScienceMean por Género",
x = "ScienceMean",
y = "Densidad") +
theme_minimal()
medias <- aggregate(TotalMean ~ Gender, data = media_genero, FUN = mean)
p4 <- ggplot(media_genero, aes(x = TotalMean, color = Gender)) +
geom_density(size = 1.5) +
geom_vline(data = medias, aes(xintercept = TotalMean, color = Gender),
linetype = "dashed", size = 1) +
labs(title = "Distribución de TotalMean por Género",
x = "TotalMean",
y = "Densidad") +
theme_minimal()
grid.arrange(p1, p2,p3,p4, nrow = 2)
media_genero$Gender <- as.numeric(media_genero$Gender)
media_genero$Gender <- media_genero$Gender - 1
modelo <- glm(Gender ~ LectureMean, data = media_genero, family = binomial)
summary(modelo)
##
## Call:
## glm(formula = Gender ~ LectureMean, family = binomial, data = media_genero)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.305673 1.379785 2.396 0.0166 *
## LectureMean -0.007462 0.003089 -2.416 0.0157 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 210.72 on 151 degrees of freedom
## Residual deviance: 204.63 on 150 degrees of freedom
## AIC: 208.63
##
## Number of Fisher Scoring iterations: 4
media_genero$prob <- predict(modelo, type = "response")
library(ggplot2)
p1 <- ggplot(media_genero, aes(x = LectureMean, y = Gender)) +
geom_jitter(height = 0.05, width = 0, alpha = 0.5) + # puntos observados
geom_line(aes(y = prob), color = "red", size = 1) + # curva logĂstica ajustada
labs(title = "Modelo LogĂstico del GĂ©nero en la nota Lectura (0 = Femenino y 1 = Masculino)", x = "x", y = "Probabilidad de Ă©xito") +
theme_minimal()
modelo <- glm(Gender ~ MathMean, data = media_genero, family = binomial)
summary(modelo)
##
## Call:
## glm(formula = Gender ~ MathMean, family = binomial, data = media_genero)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.886520 1.323573 -0.670 0.503
## MathMean 0.002212 0.003278 0.675 0.500
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 210.72 on 151 degrees of freedom
## Residual deviance: 210.26 on 150 degrees of freedom
## AIC: 214.26
##
## Number of Fisher Scoring iterations: 3
media_genero$prob <- predict(modelo, type = "response")
p2 <- ggplot(media_genero, aes(x = MathMean, y = Gender)) +
geom_jitter(height = 0.05, width = 0, alpha = 0.5) + # puntos observados
geom_line(aes(y = prob), color = "red", size = 1) + # curva logĂstica ajustada
labs(title = "Modelo LogĂstico del GĂ©nero en la nota Matemáticas", x = "x", y = "Probabilidad de Ă©xito") +
theme_minimal()
modelo <- glm(Gender ~ ScienceMean, data = media_genero, family = binomial)
summary(modelo)
##
## Call:
## glm(formula = Gender ~ ScienceMean, family = binomial, data = media_genero)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.768900 1.389679 0.553 0.580
## ScienceMean -0.001697 0.003046 -0.557 0.577
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 210.72 on 151 degrees of freedom
## Residual deviance: 210.41 on 150 degrees of freedom
## AIC: 214.41
##
## Number of Fisher Scoring iterations: 3
media_genero$prob <- predict(modelo, type = "response")
p3 <- ggplot(media_genero, aes(x = ScienceMean, y = Gender)) +
geom_jitter(height = 0.05, width = 0, alpha = 0.5) + # puntos observados
geom_line(aes(y = prob), color = "red", size = 1) + # curva logĂstica ajustada
labs(title = "Modelo LogĂstico del GĂ©nero en la nota Ciencia", x = "x", y = "Probabilidad de Ă©xito") +
theme_minimal()
modelo <- glm(Gender ~ TotalMean, data = media_genero, family = binomial)
summary(modelo)
##
## Call:
## glm(formula = Gender ~ TotalMean, family = binomial, data = media_genero)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.142375 1.371318 0.833 0.405
## TotalMean -0.002643 0.003150 -0.839 0.401
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 210.72 on 151 degrees of freedom
## Residual deviance: 210.01 on 150 degrees of freedom
## AIC: 214.01
##
## Number of Fisher Scoring iterations: 3
media_genero$prob <- predict(modelo, type = "response")
p4 <- ggplot(media_genero, aes(x = TotalMean, y = Gender)) +
geom_jitter(height = 0.05, width = 0, alpha = 0.5) + # puntos observados
geom_line(aes(y = prob), color = "red", size = 1) + # curva logĂstica ajustada
labs(title = "Modelo LogĂstico del GĂ©nero en la nota Total", x = "x", y = "Probabilidad de Ă©xito") +
theme_minimal()
grid.arrange(p1, p2,p3,p4, nrow = 2,name = "HOLAA")
media_genero$Gender <- media_genero$Gender + 1
media_genero$Gender <- media_genero$Gender - 1
modelo <- glm(Gender ~ MathMean +LectureMean + ScienceMean, data = media_genero, family = binomial)
summary(modelo)
##
## Call:
## glm(formula = Gender ~ MathMean + LectureMean + ScienceMean,
## family = binomial, data = media_genero)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.123497 2.074025 -0.542 0.588
## MathMean 0.001051 0.020331 0.052 0.959
## LectureMean -0.212090 0.034413 -6.163 7.14e-10 ***
## ScienceMean 0.208687 0.044064 4.736 2.18e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 210.72 on 151 degrees of freedom
## Residual deviance: 112.75 on 148 degrees of freedom
## AIC: 120.75
##
## Number of Fisher Scoring iterations: 5
media_genero$Gender <- media_genero$Gender + 1
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Adjuntando el paquete: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
media_genero$Gender <- media_genero$Gender - 1
# Predicciones del modelo
modelo <- glm(Gender ~ MathMean + LectureMean + ScienceMean, data = media_genero, family = binomial)
predicciones <- predict(modelo, type = "response")
roc_curve <- roc(media_genero$Gender, predicciones)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve, main = "Curva ROC", col = "blue", print.auc = TRUE)
roc_obj <- roc(media_genero$Gender, predicciones)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj, print.auc = TRUE, auc.polygon = TRUE, grid = TRUE)
print(roc_obj)
##
## Call:
## roc.default(response = media_genero$Gender, predictor = predicciones)
##
## Data: predicciones in 76 controls (media_genero$Gender 0) < 76 cases (media_genero$Gender 1).
## Area under the curve: 0.9108
media_genero$Gender <- media_genero$Gender + 1
Resultado: AUC = 0.9108. Interpretación: El modelo tiene un excelente poder discriminatorio para predecir el género de un estudiante basándose en sus notas.
library(dplyr)
media_escuela = donde_estudia %>%
group_by(CNT, Tipo_escuela) %>%
summarise(
MathMean = mean(MathMean, na.rm = TRUE),
LectureMean = mean(LectureMean, na.rm = TRUE),
ScienceMean = mean(ScienceMean, na.rm = TRUE),
TotalMean = mean(TotalMean, na.rm = TRUE),
.groups = "drop"
)
colnames(media_escuela)[2] = "Tipo_escuela"
media_escuela = na.omit(media_escuela)
media_escuela_escalado <- media_escuela
media_escuela_escalado[numerical_cols] = scale(media_escuela_escalado[numerical_cols], center = TRUE, scale = TRUE)
datos_cor2 <- media_escuela_escalado %>%
select(Tipo_escuela, MathMean, LectureMean, ScienceMean, TotalMean)
# Calculamos la matriz de correlaciĂłn
matriz_cor2 <- cor(datos_cor2, use = "complete.obs")
# Graficamos
corrplot(matriz_cor2, method = "circle", type = "upper", addCoef.col = "orange",
tl.col = "black", tl.cex = 0.9)
media_escuela$Tipo_escuela <- as.factor(media_escuela$Tipo_escuela)
# Gráfico con densidad y lĂneas de media
medias <- aggregate(LectureMean ~ Tipo_escuela, data = media_escuela, FUN = mean)
p1 <- ggplot(media_escuela, aes(x = LectureMean, color = Tipo_escuela)) +
geom_density(size = 1.5) +
geom_vline(data = medias, aes(xintercept = LectureMean, color = Tipo_escuela),
linetype = "dashed", size = 1) +
labs(title = "DistribuciĂłn de LectureMean por Tipo",
x = "LectureMean",
y = "Densidad") +
theme_minimal()
medias <- aggregate(MathMean ~ Tipo_escuela, data = media_escuela, FUN = mean)
p2 <- ggplot(media_escuela, aes(x = MathMean, color = Tipo_escuela)) +
geom_density(size = 1.5) +
geom_vline(data = medias, aes(xintercept = MathMean, color = Tipo_escuela),
linetype = "dashed", size = 1) +
labs(title = "DistribuciĂłn de MathMean por Tipo",
x = "MathMean",
y = "Densidad") +
theme_minimal()
medias <- aggregate(ScienceMean ~ Tipo_escuela, data = media_escuela, FUN = mean)
p3 <- ggplot(media_escuela, aes(x = ScienceMean, color = Tipo_escuela)) +
geom_density(size = 1.5) +
geom_vline(data = medias, aes(xintercept = ScienceMean, color = Tipo_escuela),
linetype = "dashed", size = 1) +
labs(title = "DistribuciĂłn de ScienceMean por Tipo",
x = "ScienceMean",
y = "Densidad") +
theme_minimal()
medias <- aggregate(TotalMean ~ Tipo_escuela, data = media_escuela, FUN = mean)
p4 <- ggplot(media_escuela, aes(x = TotalMean, color = Tipo_escuela)) +
geom_density(size = 1.5) +
geom_vline(data = medias, aes(xintercept = TotalMean, color = Tipo_escuela),
linetype = "dashed", size = 1) +
labs(title = "DistribuciĂłn de TotalMean por Tipo",
x = "TotalMean",
y = "Densidad") +
theme_minimal()
grid.arrange(p1, p2,p3,p4, nrow = 2)
media_escuela$Tipo_escuela <- as.numeric(media_escuela$Tipo_escuela)
media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela - 1
modelo <- glm(Tipo_escuela ~ LectureMean, data = media_escuela, family = binomial)
summary(modelo)
##
## Call:
## glm(formula = Tipo_escuela ~ LectureMean, family = binomial,
## data = media_escuela)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.238462 1.528115 -3.428 0.000608 ***
## LectureMean 0.011621 0.003361 3.458 0.000544 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 191.31 on 137 degrees of freedom
## Residual deviance: 178.09 on 136 degrees of freedom
## AIC: 182.09
##
## Number of Fisher Scoring iterations: 4
media_escuela$prob <- predict(modelo, type = "response")
p1 <- ggplot(media_escuela, aes(x = LectureMean, y = Tipo_escuela)) +
geom_jitter(height = 0.05, width = 0, alpha = 0.5) + # puntos observados
geom_line(aes(y = prob), color = "red", size = 1) + # curva logĂstica ajustada
labs(title = "Modelo LogĂstico del tipo de escuela en la nota Lectura", x = "x", y = "Probabilidad de Ă©xito") +
theme_minimal()
modelo <- glm(Tipo_escuela ~ MathMean, data = media_escuela, family = binomial)
summary(modelo)
##
## Call:
## glm(formula = Tipo_escuela ~ MathMean, family = binomial, data = media_escuela)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.932361 1.513037 -3.260 0.00111 **
## MathMean 0.012111 0.003688 3.284 0.00102 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 191.31 on 137 degrees of freedom
## Residual deviance: 179.40 on 136 degrees of freedom
## AIC: 183.4
##
## Number of Fisher Scoring iterations: 4
media_escuela$prob <- predict(modelo, type = "response")
p2 <- ggplot(media_escuela, aes(x = MathMean, y = Tipo_escuela)) +
geom_jitter(height = 0.05, width = 0, alpha = 0.5) + # puntos observados
geom_line(aes(y = prob), color = "red", size = 1) + # curva logĂstica ajustada
labs(title = "Modelo LogĂstico del tipo de escuela en la nota Matemáticas", x = "x", y = "Probabilidad de Ă©xito") +
theme_minimal()
modelo <- glm(Tipo_escuela ~ ScienceMean, data = media_escuela, family = binomial)
summary(modelo)
##
## Call:
## glm(formula = Tipo_escuela ~ ScienceMean, family = binomial,
## data = media_escuela)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.256354 1.573796 -3.340 0.000838 ***
## ScienceMean 0.011399 0.003388 3.365 0.000765 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 191.31 on 137 degrees of freedom
## Residual deviance: 178.82 on 136 degrees of freedom
## AIC: 182.82
##
## Number of Fisher Scoring iterations: 4
media_escuela$prob <- predict(modelo, type = "response")
p3 <- ggplot(media_escuela, aes(x = ScienceMean, y = Tipo_escuela)) +
geom_jitter(height = 0.05, width = 0, alpha = 0.5) + # puntos observados
geom_line(aes(y = prob), color = "red", size = 1) + # curva logĂstica ajustada
labs(title = "Modelo LogĂstico del tipo de escuela en la nota Ciencia", x = "x", y = "Probabilidad de Ă©xito") +
theme_minimal()
modelo <- glm(Tipo_escuela ~ TotalMean, data = media_escuela, family = binomial)
summary(modelo)
##
## Call:
## glm(formula = Tipo_escuela ~ TotalMean, family = binomial, data = media_escuela)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.281018 1.559852 -3.386 0.000710 ***
## TotalMean 0.012010 0.003521 3.411 0.000646 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 191.31 on 137 degrees of freedom
## Residual deviance: 178.44 on 136 degrees of freedom
## AIC: 182.44
##
## Number of Fisher Scoring iterations: 4
media_escuela$prob <- predict(modelo, type = "response")
p4 <- ggplot(media_escuela, aes(x = TotalMean, y = Tipo_escuela)) +
geom_jitter(height = 0.05, width = 0, alpha = 0.5) + # puntos observados
geom_line(aes(y = prob), color = "red", size = 1) + # curva logĂstica ajustada
labs(title = "Modelo LogĂstico del tipo de escuela en la nota Total", x = "x", y = "Probabilidad de Ă©xito") +
theme_minimal()
grid.arrange(p1, p2,p3,p4, nrow = 2,name = "HOLAA")
media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela + 1
media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela - 1
modelo <- glm(Tipo_escuela ~ MathMean + LectureMean + ScienceMean, data = media_escuela, family = binomial)
summary(modelo)
##
## Call:
## glm(formula = Tipo_escuela ~ MathMean + LectureMean + ScienceMean,
## family = binomial, data = media_escuela)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.064704 1.581397 -3.203 0.00136 **
## MathMean 0.007697 0.014414 0.534 0.59334
## LectureMean 0.022091 0.021090 1.047 0.29488
## ScienceMean -0.017411 0.028370 -0.614 0.53941
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 191.31 on 137 degrees of freedom
## Residual deviance: 177.70 on 134 degrees of freedom
## AIC: 185.7
##
## Number of Fisher Scoring iterations: 4
media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela + 1
media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela - 1
# Predicciones del modelo
modelo <- glm(Tipo_escuela ~ MathMean + LectureMean + ScienceMean, data = media_escuela, family = binomial)
predicciones <- predict(modelo, type = "response")
roc_curve <- roc(media_escuela$Tipo_escuela, predicciones)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve, main = "Curva ROC", col = "blue", print.auc = TRUE)
roc_obj <- roc(media_escuela$Tipo_escuela, predicciones)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj, print.auc = TRUE, auc.polygon = TRUE, grid = TRUE)
print(roc_obj)
##
## Call:
## roc.default(response = media_escuela$Tipo_escuela, predictor = predicciones)
##
## Data: predicciones in 69 controls (media_escuela$Tipo_escuela 0) < 69 cases (media_escuela$Tipo_escuela 1).
## Area under the curve: 0.6763
media_escuela$Tipo_escuela <- media_escuela$Tipo_escuela + 1
GanadorGenero <- media_genero %>%
group_by(CNT) %>%
slice_max(order_by = TotalMean, n = 1, with_ties = FALSE) %>%
mutate(Gender = ifelse(Gender == 1, "Female", "Male")) %>%
select(CNT, Gender)
library(countrycode)
GanadorGenero$country_name <- countrycode(GanadorGenero$CNT,
origin = "iso3c",
destination = "country.name")
## Warning: Some values were not matched unambiguously: KSV, QAZ, QUR, TAP
codigos_extra <- c("KSV" = "Kosovo",
"QAZ" = "Kazakhstan",
"QUR" = "Qatar",
"TAP" = "Chinese Taipei")
GanadorGenero$country_name[is.na(GanadorGenero$country_name)] <-
codigos_extra[GanadorGenero$CNT[is.na(GanadorGenero$country_name)]]
library(sf)
# Obtener el mapa mundial
world <- ne_countries(scale = "medium", returnclass = "sf")
# Unir con los datos de género ganador
map_data <- left_join(world, GanadorGenero, by = c("name" = "country_name"))
ggplot(data = map_data) +
geom_sf(aes(fill = Gender)) +
scale_fill_manual(values = c("Female" = "#FF69B4", "Male" = "#6495ED")) +
labs(title = "GĂ©nero con mayor rendimiento promedio por paĂs",
fill = "Género") +
theme_minimal()
library(tidyr)
mean_total_gender_by_country <- donde_estudia %>%
group_by(CNT, Gender) %>%
summarise(
MeanTotalMean = mean(TotalMean, na.rm = TRUE),
.groups = "drop"
) %>%
group_by(CNT) %>%
filter(n_distinct(Gender) == 2) %>%
ungroup()
gender_means_wide <- mean_total_gender_by_country %>%
pivot_wider(names_from = Gender, values_from = MeanTotalMean, names_prefix = "Gender_") %>%
rename(MeanFemale = Gender_1, MeanMale = Gender_2) # Assuming 1 is Female and 2 is Male based on your earlier code
percentage_difference_by_country <- gender_means_wide %>%
mutate(
PercentageDiff_Male_vs_Female = ((MeanMale - MeanFemale) / MeanFemale) * 100
)
print(percentage_difference_by_country)
## # A tibble: 72 Ă— 4
## CNT MeanFemale MeanMale PercentageDiff_Male_vs_Female
## <chr> <dbl> <dbl> <dbl>
## 1 ALB 372. 348. -6.48
## 2 ARE 428. 412. -3.85
## 3 ARG 395. 398. 0.899
## 4 AUT 474. 478. 0.871
## 5 BEL 480. 477. -0.586
## 6 BGR 414. 399. -3.62
## 7 BRA 390. 392. 0.599
## 8 BRN 432. 414. -4.14
## 9 CHE 483. 479. -0.867
## 10 CHL 437. 447. 2.24
## # ℹ 62 more rows
print(percentage_difference_by_country %>% arrange(desc(PercentageDiff_Male_vs_Female)))
## # A tibble: 72 Ă— 4
## CNT MeanFemale MeanMale PercentageDiff_Male_vs_Female
## <chr> <dbl> <dbl> <dbl>
## 1 CRI 386. 395. 2.40
## 2 CHL 437. 447. 2.24
## 3 PER 390. 397. 1.85
## 4 MEX 392. 397. 1.29
## 5 ARG 395. 398. 0.899
## 6 AUT 474. 478. 0.871
## 7 URY 414. 417. 0.837
## 8 COL 398. 401. 0.833
## 9 BRA 390. 392. 0.599
## 10 ISR 456. 457. 0.353
## # ℹ 62 more rows
print(percentage_difference_by_country %>% arrange(PercentageDiff_Male_vs_Female))
## # A tibble: 72 Ă— 4
## CNT MeanFemale MeanMale PercentageDiff_Male_vs_Female
## <chr> <dbl> <dbl> <dbl>
## 1 PSE 363. 333. -8.16
## 2 JOR 362. 333. -8.00
## 3 ALB 372. 348. -6.48
## 4 JAM 391. 368. -5.91
## 5 PHL 351. 330. -5.89
## 6 MYS 401. 383. -4.50
## 7 QAT 419. 401. -4.43
## 8 BRN 432. 414. -4.14
## 9 SVN 469. 450. -3.93
## 10 ARE 428. 412. -3.85
## # ℹ 62 more rows
summary(percentage_difference_by_country)
## CNT MeanFemale MeanMale
## Length:72 Min. :344.2 Min. :330.2
## Class :character 1st Qu.:390.7 1st Qu.:388.1
## Mode :character Median :432.2 Median :425.7
## Mean :433.2 Mean :425.8
## 3rd Qu.:470.5 3rd Qu.:469.0
## Max. :541.7 Max. :540.5
## PercentageDiff_Male_vs_Female
## Min. :-8.1616
## 1st Qu.:-3.2008
## Median :-1.6412
## Mean :-1.7904
## 3rd Qu.:-0.2339
## Max. : 2.4043
GanadorGeneroMasculino <- media_genero %>%
group_by(CNT) %>%
slice_max(order_by = TotalMean, n = 1, with_ties = FALSE) %>%
filter(Gender == 2) %>%
mutate(GeneroGanador = "Masculino") %>%
select(CNT, GeneroGanador, TotalMean)
GanadorGeneroMasculino$country_name <- countrycode(GanadorGeneroMasculino$CNT,
origin = "iso3c",
destination = "country.name")
codigos_extra <- c("KSV" = "Kosovo",
"QAZ" = "Kazakhstan",
"QUR" = "Qatar",
"TAP" = "Taiwan")
GanadorGeneroMasculino$country_name[is.na(GanadorGeneroMasculino$country_name)] <-
codigos_extra[GanadorGeneroMasculino$CNT[is.na(GanadorGeneroMasculino$country_name)]]
nrow(GanadorGeneroMasculino)
## [1] 15
GanadorGeneroMasculino$country_name
## [1] "Argentina" "Austria" "Brazil" "Chile"
## [5] "Colombia" "Costa Rica" "Germany" "United Kingdom"
## [9] "Hungary" "Israel" "Italy" "Macao SAR China"
## [13] "Mexico" "Peru" "Uruguay"
El gĂ©nero femenino es el más abundante en los paĂses, en total hay 61 de 76 (80,26%)
GanadorEscuela <- media_escuela %>%
group_by(CNT) %>%
slice_max(order_by = TotalMean, n = 1, with_ties = FALSE) %>%
mutate(Tipo_escuela = ifelse(Tipo_escuela == 1, "PĂşblica", "Privada")) %>%
select(CNT, Tipo_escuela)
library(countrycode)
GanadorEscuela$country_name <- countrycode(GanadorEscuela$CNT,
origin = "iso3c",
destination = "country.name")
## Warning: Some values were not matched unambiguously: KSV, QAZ, QUR, TAP
codigos_extra <- c("KSV" = "Kosovo",
"QAZ" = "Kazakhstan",
"QUR" = "Qatar",
"TAP" = "Chinese Taipei")
GanadorEscuela$country_name[is.na(GanadorEscuela$country_name)] <-
codigos_extra[GanadorEscuela$CNT[is.na(GanadorEscuela$country_name)]]
library(sf)
world <- ne_countries(scale = "medium", returnclass = "sf")
map_data <- left_join(world, GanadorEscuela, by = c("name" = "country_name"))
ggplot(data = map_data) +
geom_sf(aes(fill = Tipo_escuela)) +
scale_fill_manual(values = c("PĂşblica" = "#44e36e", "Privada" = "#54b2e8")) +
labs(title = "Tipo de escuela con mayor rendimiento promedio por paĂs",
fill = "Tipo de escuela") +
theme_minimal()
GanadorEscuelaPublica <- media_escuela %>%
group_by(CNT) %>%
slice_max(order_by = TotalMean, n = 1, with_ties = FALSE) %>%
filter(Tipo_escuela == 1) %>% # Solo las que ganan siendo pĂşblicas
mutate(EscuelaGanadora = "PĂşblica") %>%
select(CNT, EscuelaGanadora, TotalMean)
# Primero traducimos los paĂses conocidos
GanadorEscuelaPublica$country_name <- countrycode(GanadorEscuelaPublica$CNT,
origin = "iso3c",
destination = "country.name")
## Warning: Some values were not matched unambiguously: TAP
# Después corregimos los que quedaron como NA manualmente
codigos_extra <- c("KSV" = "Kosovo",
"QAZ" = "Kazakhstan",
"QUR" = "Qatar",
"TAP" = "Taiwan")
GanadorEscuelaPublica$country_name[is.na(GanadorEscuelaPublica$country_name)] <-
codigos_extra[GanadorEscuelaPublica$CNT[is.na(GanadorEscuelaPublica$country_name)]]
nrow(GanadorEscuelaPublica)
## [1] 11
GanadorEscuelaPublica$country_name
## [1] "Hungary" "Indonesia" "Italy" "Japan" "Kazakhstan"
## [6] "Poland" "Romania" "Serbia" "Taiwan" "Thailand"
## [11] "Turkey"
La escuela privada es la predominante en los paĂses, en total hay 58 de 69 (84,06%).
mean_total_school_type_by_country <- donde_estudia %>%
group_by(CNT, Tipo_escuela) %>%
summarise(
MeanTotalMean = mean(TotalMean, na.rm = TRUE),
.groups = "drop"
) %>%
group_by(CNT) %>%
filter(n_distinct(Tipo_escuela) == 2) %>%
ungroup()
school_type_means_wide <- mean_total_school_type_by_country %>%
pivot_wider(names_from = Tipo_escuela, values_from = MeanTotalMean, names_prefix = "SchoolType_") %>%
rename(MeanPublic = SchoolType_1, MeanPrivate = SchoolType_2) # Assuming 1 is Public and 2 is Private
percentage_difference_school_type_by_country_table <- school_type_means_wide %>%
mutate(
PercentageDiff_Private_vs_Public = ((MeanPrivate - MeanPublic) / MeanPublic) * 100
)
percentage_difference_school_type_by_country_table$country_name <- countrycode(percentage_difference_school_type_by_country_table$CNT,
origin = "iso3c",
destination = "country.name")
## Warning: Some values were not matched unambiguously: TAP
codigos_extra <- c("KSV" = "Kosovo",
"QAZ" = "Kazakhstan",
"QUR" = "Qatar",
"TAP" = "Chinese Taipei")
percentage_difference_school_type_by_country_table$country_name[is.na(percentage_difference_school_type_by_country_table$country_name)] <-
codigos_extra[percentage_difference_school_type_by_country_table$CNT[is.na(percentage_difference_school_type_by_country_table$country_name)]]
percentage_difference_school_type_by_country_table <- percentage_difference_school_type_by_country_table %>%
select(CNT, country_name, MeanPublic, MeanPrivate, PercentageDiff_Private_vs_Public) %>%
arrange(desc(PercentageDiff_Private_vs_Public)) # Arrange to see the largest differences first
print(percentage_difference_school_type_by_country_table)
## # A tibble: 23 Ă— 5
## CNT country_name MeanPublic MeanPrivate PercentageDiff_Private…¹
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 PHL Philippines 330. 394. 19.5
## 2 QAT Qatar 374. 447. 19.5
## 3 BRN Brunei 410. 488. 19.0
## 4 URY Uruguay 405. 475. 17.5
## 5 UZB Uzbekistan 341. 398. 16.5
## 6 PSE Palestinian Territories 347. 404. 16.3
## 7 PER Peru 379. 439. 15.8
## 8 MEX Mexico 388. 442. 13.8
## 9 JOR Jordan 342. 384. 12.3
## 10 NZL New Zealand 478. 535. 12.1
## # ℹ 13 more rows
## # ℹ abbreviated name: ¹​PercentageDiff_Private_vs_Public
summary(percentage_difference_school_type_by_country_table)
## CNT country_name MeanPublic MeanPrivate
## Length:23 Length:23 Min. :329.8 Min. :364.5
## Class :character Class :character 1st Qu.:376.3 1st Qu.:406.2
## Mode :character Mode :character Median :404.6 Median :441.8
## Mean :422.7 Mean :449.7
## 3rd Qu.:480.9 3rd Qu.:483.5
## Max. :539.9 Max. :556.3
## PercentageDiff_Private_vs_Public
## Min. :-7.196
## 1st Qu.:-1.950
## Median : 7.170
## Mean : 7.166
## 3rd Qu.:16.056
## Max. :19.506
media_genero$Gender <- ifelse(media_genero$Gender == 1, "Female", "Male")
diferencias <- media_genero %>%
select(CNT, Gender, TotalMean) %>%
pivot_wider(names_from = Gender, values_from = TotalMean) %>%
mutate(Diff = Female - Male)
diferencias$country_name <- countrycode(diferencias$CNT,
origin = "iso3c",
destination = "country.name")
## Warning: Some values were not matched unambiguously: KSV, QAZ, QUR, TAP
codigos_extra <- c("KSV" = "Kosovo",
"QAZ" = "Kazakhstan",
"QUR" = "Qatar",
"TAP" = "Chinese Taipei")
diferencias$country_name[is.na(diferencias$country_name)] <-
codigos_extra[diferencias$CNT[is.na(diferencias$country_name)]]
diferencias <- diferencias %>%
mutate(Diff_adj = ifelse(abs(Diff) < 0.1, NA, Diff))
# Obtener mapa del mundo
world <- ne_countries(scale = "medium", returnclass = "sf")
# Unir mapa con los datos de diferencias
map_data <- left_join(world, diferencias, by = c("name" = "country_name"))
# Dibujar el mapa
ggplot(data = map_data) +
geom_sf(aes(fill = scale(Diff_adj,center=TRUE,scale=TRUE)), color = "gray80") +
scale_fill_gradient2(
low = "#6495ED", # Azul: chicos mejor
mid = "white", # Blanco: igualdad
high = "#FF69B4", # Rosado: chicas mejor
midpoint = 0,
na.value = "gray90", # PaĂses con diferencia menor a 0.1
name = "Diferencia\n(Chicas - Chicos)"
) +
labs(
title = "Diferencia de rendimiento promedio por paĂs",
subtitle = "Valores positivos: mejor desempeño de chicas\nGris: diferencias menores a ±0.1",
caption = "Fuente: Datos de media_genero"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16),
legend.position = "right"
)
mean_total_school_type_by_country <- donde_estudia %>%
group_by(CNT, Tipo_escuela) %>%
summarise(
MeanTotalMean = mean(TotalMean, na.rm = TRUE),
.groups = "drop"
) %>%
group_by(CNT) %>%
ungroup()
school_type_means_wide <- mean_total_school_type_by_country %>%
pivot_wider(names_from = Tipo_escuela,
values_from = MeanTotalMean,
names_prefix = "SchoolType_") %>%
rename(MeanPublic = SchoolType_1, MeanPrivate = SchoolType_2)
percentage_difference <- school_type_means_wide %>%
mutate(
DiffPercent = ((MeanPrivate - MeanPublic) / MeanPublic) * 100
)
percentage_difference$country_name <- countrycode(percentage_difference$CNT,
origin = "iso3c",
destination = "country.name")
## Warning: Some values were not matched unambiguously: KSV, QAZ, QUR, TAP
codigos_extra <- c("KSV" = "Kosovo",
"QAZ" = "Kazakhstan",
"QUR" = "Qatar",
"TAP" = "Chinese Taipei")
percentage_difference$country_name[is.na(percentage_difference$country_name)] <-
codigos_extra[percentage_difference$CNT[is.na(percentage_difference$country_name)]]
percentage_difference <- percentage_difference %>%
mutate(DiffAdj = ifelse(abs(DiffPercent) < 1, NA, DiffPercent))
world <- ne_countries(scale = "medium", returnclass = "sf")
map_data <- left_join(world, percentage_difference, by = c("name" = "country_name"))
ggplot(data = map_data) +
geom_sf(aes(fill = DiffAdj), color = "gray80") +
scale_fill_gradient2(
low = "#44e36e", # Verde: pĂşblica mejor
mid = "white", # Blanco: igualdad
high = "#f75467", # Rojo: privada mejor
midpoint = 0,
na.value = "gray90", # Diferencias < 1% en gris
name = "% diferencia\n(Privada - PĂşblica)"
) +
labs(
title = "Diferencia porcentual de rendimiento entre escuelas privadas y pĂşblicas",
subtitle = "Valores positivos indican mejor rendimiento en escuelas privadas\nGris: diferencia menor al 1%",
caption = "Fuente: Datos de donde_estudia"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 15),
legend.title = element_text(size = 10),
legend.text = element_text(size = 9)
)