library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(broom)
## Warning: package 'broom' was built under R version 4.5.2
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.5.2
library(readr)
## Warning: package 'readr' was built under R version 4.5.2
library(nortest)
## Warning: package 'nortest' was built under R version 4.5.2
Datos
# Load the semicolon-delimited CSV into `regresion`; the column-type message
# is switched off.
regresion <- read_delim(
  file = "regresion.csv",
  delim = ";",
  show_col_types = FALSE
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
# Column-by-column overview (types, quartiles, NA counts) of the loaded data.
summary(regresion)
## ESTU_GENERO PERIODO ESTU_TIENEETNIA ESTU_DEPTO_RESIDE
## Length:546212 Min. :20194 Length:546212 Length:546212
## Class :character 1st Qu.:20194 Class :character Class :character
## Mode :character Median :20194 Mode :character Mode :character
## Mean :20194
## 3rd Qu.:20194
## Max. :20194
##
## ESTU_COD_RESIDE_DEPTO FAMI_ESTRATOVIVIENDA COLE_NATURALEZA
## Length:546212 Length:546212 Length:546212
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## COLE_CALENDARIO PUNT_LECTURA_CRITICA PERCENTIL_LECTURA_CRITICA
## Length:546212 Min. : 0.00 Min. : 1.00
## Class :character 1st Qu.: 45.00 1st Qu.: 25.00
## Mode :character Median : 52.00 Median : 50.00
## Mean : 52.16 Mean : 50.11
## 3rd Qu.: 60.00 3rd Qu.: 75.00
## Max. :100.00 Max. :100.00
##
## DESEMP_LECTURA_CRITICA PUNT_MATEMATICAS PERCENTIL_MATEMATICAS
## Min. :1.000 Min. : 0.00 Min. : 1.00
## 1st Qu.:2.000 1st Qu.: 42.00 1st Qu.: 25.00
## Median :3.000 Median : 51.00 Median : 50.00
## Mean :2.613 Mean : 50.61 Mean : 50.11
## 3rd Qu.:3.000 3rd Qu.: 59.00 3rd Qu.: 75.00
## Max. :4.000 Max. :100.00 Max. :100.00
##
## DESEMP_MATEMATICAS PUNT_C_NATURALES PERCENTIL_C_NATURALES DESEMP_C_NATURALES
## Min. :1.000 Min. : 0.00 Min. : 1.00 Min. :1.000
## 1st Qu.:2.000 1st Qu.: 40.00 1st Qu.: 25.00 1st Qu.:1.000
## Median :3.000 Median : 48.00 Median : 50.00 Median :2.000
## Mean :2.441 Mean : 48.23 Mean : 50.16 Mean :2.021
## 3rd Qu.:3.000 3rd Qu.: 56.00 3rd Qu.: 75.00 3rd Qu.:3.000
## Max. :4.000 Max. :100.00 Max. :100.00 Max. :4.000
##
## PUNT_SOCIALES_CIUDADANAS PERCENTIL_SOCIALES_CIUDADANAS
## Min. : 0.00 Min. : 1.00
## 1st Qu.: 37.00 1st Qu.: 25.00
## Median : 45.00 Median : 50.00
## Mean : 46.22 Mean : 50.18
## 3rd Qu.: 55.00 3rd Qu.: 75.00
## Max. :100.00 Max. :100.00
##
## DESEMP_SOCIALES_CIUDADANAS PUNT_INGLES PERCENTIL_INGLES
## Min. :1.000 Min. : 0.00 Min. : 1.0
## 1st Qu.:1.000 1st Qu.: 39.00 1st Qu.: 25.0
## Median :2.000 Median : 48.00 Median : 50.0
## Mean :1.902 Mean : 48.42 Mean : 50.1
## 3rd Qu.:2.000 3rd Qu.: 56.00 3rd Qu.: 75.0
## Max. :4.000 Max. :100.00 Max. :100.0
## NA's :19
## DESEMP_INGLES PUNT_GLOBAL PERCENTIL_GLOBAL
## Length:546212 Min. : 0.0 Min. : 1.00
## Class :character 1st Qu.:207.0 1st Qu.: 25.00
## Mode :character Median :243.0 Median : 50.00
## Mean :246.2 Mean : 49.99
## 3rd Qu.:282.0 3rd Qu.: 75.00
## Max. :477.0 Max. :100.00
## NA's :19
Independencia regresion <- read_delim("regresion.csv", delim = ";", show_col_types = FALSE)
# Pearson correlation between the global score and each subject score.
cor(regresion$PUNT_GLOBAL, regresion$PUNT_MATEMATICAS)
## [1] 0.8997013
cor(regresion$PUNT_GLOBAL, regresion$PUNT_LECTURA_CRITICA)
## [1] 0.8984713
cor(regresion$PUNT_GLOBAL, regresion$PUNT_C_NATURALES)
## [1] 0.9137711
cor(regresion$PUNT_GLOBAL, regresion$PUNT_SOCIALES_CIUDADANAS)
## [1] 0.9103628
# PUNT_INGLES has missing values (19 NA's in the summary above) and cor()
# returns NA when any pair is incomplete, so restrict the computation to the
# complete pairs.
cor(regresion$PUNT_GLOBAL, regresion$PUNT_INGLES, use = "complete.obs")
## (stale output: without `use` this call printed `[1] NA`; re-knit to record the value)
# Read the CSV into `regresion` again.
# NOTE(review): same file and arguments as the earlier load; this looks
# redundant unless the object was changed in between - confirm.
regresion <- read_delim(
  file = "regresion.csv",
  delim = ";",
  show_col_types = FALSE
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
Normalidad
# Histogram of the response variable, used as a visual check of its
# distribution.
hist(
  x = regresion$PUNT_GLOBAL,
  col = "lightblue",
  border = "white",
  main = "Histograma de PUNT_GLOBAL",
  xlab = "Puntaje global"
)
Linealidad
# Base-graphics scatter plots of PUNT_GLOBAL (y) against each subject score
# (x), one plot per predictor, each with its own point colour. The predictor
# name doubles as the x-axis label and as the suffix of the title.
invisible(Map(
  function(materia, color) {
    plot(
      x = regresion[[materia]],
      y = regresion$PUNT_GLOBAL,
      main = paste("PUNT_GLOBAL vs", materia),
      xlab = materia,
      ylab = "PUNT_GLOBAL",
      pch = 19,
      col = color
    )
  },
  c(
    "PUNT_MATEMATICAS",
    "PUNT_LECTURA_CRITICA",
    "PUNT_C_NATURALES",
    "PUNT_SOCIALES_CIUDADANAS",
    "PUNT_INGLES"
  ),
  c("blue", "darkgreen", "purple", "brown", "red")
))
Modelo # Modelo de regresión múltiple
# Multiple linear regression of the global score on the five subject scores.
# Rows with a missing value in any model variable are dropped by lm().
# NOTE(review): the summary below reports R-squared = 1 and residuals within
# +/- 0.5, which suggests PUNT_GLOBAL is computed (up to rounding) from these
# same five scores - confirm before interpreting the coefficients.
modelo <- lm(
  formula = PUNT_GLOBAL ~ PUNT_MATEMATICAS + PUNT_LECTURA_CRITICA +
    PUNT_C_NATURALES + PUNT_SOCIALES_CIUDADANAS + PUNT_INGLES,
  data = regresion
)
# Coefficient table, residual quantiles and goodness-of-fit statistics.
summary(modelo)
##
## Call:
## lm(formula = PUNT_GLOBAL ~ PUNT_MATEMATICAS + PUNT_LECTURA_CRITICA +
## PUNT_C_NATURALES + PUNT_SOCIALES_CIUDADANAS + PUNT_INGLES,
## data = regresion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4671 -0.2312 0.0003 0.2317 0.4680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.213e-03 2.051e-03 0.591 0.554
## PUNT_MATEMATICAS 1.154e+00 5.883e-05 19612.848 <2e-16 ***
## PUNT_LECTURA_CRITICA 1.154e+00 6.825e-05 16907.547 <2e-16 ***
## PUNT_C_NATURALES 1.154e+00 7.135e-05 16169.935 <2e-16 ***
## PUNT_SOCIALES_CIUDADANAS 1.154e+00 6.073e-05 19000.196 <2e-16 ***
## PUNT_INGLES 3.846e-01 4.886e-05 7872.792 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2878 on 546187 degrees of freedom
## (19 observations deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 3.483e+09 on 5 and 546187 DF, p-value: < 2.2e-16
Modelo \[ Y = \beta_0 + \beta_1 (\text{PUNT\_MATEMATICAS}) + \beta_2 (\text{PUNT\_LECTURA\_CRITICA}) + \beta_3 (\text{PUNT\_C\_NATURALES}) + \beta_4 (\text{PUNT\_SOCIALES\_CIUDADANAS}) + \beta_5 (\text{PUNT\_INGLES}) \]
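\[ Y = 0.001213 + 1.154 (\text{PUNT\_MATEMATICAS}) + 1.154 (\text{PUNT\_LECTURA\_CRITICA}) + 1.154 (\text{PUNT\_C\_NATURALES}) + 1.154 (\text{PUNT\_SOCIALES\_CIUDADANAS}) + 0.3846 (\text{PUNT\_INGLES}) + e_i \] Homocedasticidad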
# Draw the four default lm diagnostic plots (residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage) on a single 2 x 2 page, then go back
# to one plot per page.
par(mfrow = c(2, 2))
plot(modelo, which = c(1, 2, 3, 5))
par(mfrow = c(1, 1))
Gráfico del modelo
library(ggplot2)
# Names of the predictor columns to plot against PUNT_GLOBAL.
variables <- c(
  "PUNT_MATEMATICAS",
  "PUNT_LECTURA_CRITICA",
  "PUNT_C_NATURALES",
  "PUNT_SOCIALES_CIUDADANAS",
  "PUNT_INGLES"
)
# One scatter plot with a fitted straight line (and its confidence band) per
# predictor. print() is required because ggplot objects are not auto-printed
# inside a for loop.
for (v in variables) {
  print(
    # aes_string() is deprecated; the `.data` pronoun selects the column
    # whose name is stored in `v`.
    ggplot(regresion, aes(x = .data[[v]], y = PUNT_GLOBAL)) +
      geom_point(alpha = 0.4) +
      # Stating the formula avoids the "using formula = 'y ~ x'" message.
      geom_smooth(method = "lm", formula = y ~ x, se = TRUE) +
      labs(
        title = paste("Relación entre PUNT_GLOBAL y", v),
        x = v,
        y = "Puntaje Global"
      )
  )
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 19 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_point()`).
Línea de tendencia
library(ggplot2)
# Names of the predictor columns to plot against PUNT_GLOBAL.
variables <- c(
  "PUNT_MATEMATICAS",
  "PUNT_LECTURA_CRITICA",
  "PUNT_C_NATURALES",
  "PUNT_SOCIALES_CIUDADANAS",
  "PUNT_INGLES"
)
# Plots are printed as they are built and also kept in `graficos`, keyed by
# predictor name, so they can be reused later.
graficos <- list()
for (v in variables) {
  # aes_string() is deprecated, so the column is selected with the `.data`
  # pronoun. `!!v` injects the current column name when the plot is created;
  # a plain `.data[[v]]` would be looked up only when the plot is drawn, so
  # every plot stored in `graficos` would end up using the last value of `v`.
  p <- ggplot(regresion, aes(x = .data[[!!v]], y = PUNT_GLOBAL)) +
    geom_point(alpha = 0.4) +
    # `linewidth` replaces `size` for lines (ggplot2 >= 3.4.0); the explicit
    # formula avoids the "using formula = 'y ~ x'" message.
    geom_smooth(method = "lm", formula = y ~ x, se = TRUE, linewidth = 1.2) +
    theme_bw() +
    labs(
      title = paste("PUNT_GLOBAL en función de", v),
      x = v,
      y = "PUNT_GLOBAL"
    )
  graficos[[v]] <- p
  print(p)
}
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 19 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Builds a text layer holding the symbolic model equation.
# NOTE(review): the layer is not added to any plot with `+`, so evaluating it
# only prints the layer description (see the output that follows) and nothing
# is drawn. It was presumably meant to be appended to one of the ggplot
# objects - confirm which one, and check that x = 400 / y = 200 fall inside
# that plot's axis ranges.
annotate("text",
x = 400,
y = 200,
label = "Y = β0 + β1*MATE + β2*LECT + β3*CNAT + β4*SOC + β5*ING",
size = 5)
## mapping: x = ~x, y = ~y
## geom_text: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity
# Sequential analysis-of-variance table for the fitted regression: each row
# is the sum of squares added by that term after the terms listed above it.
anova(object = modelo)
## Analysis of Variance Table
##
## Response: PUNT_GLOBAL
## Df Sum Sq Mean Sq F value Pr(>F)
## PUNT_MATEMATICAS 1 1167382207 1167382207 1.4096e+10 < 2.2e-16 ***
## PUNT_LECTURA_CRITICA 1 176005727 176005727 2.1252e+09 < 2.2e-16 ***
## PUNT_C_NATURALES 1 58831308 58831308 7.1038e+08 < 2.2e-16 ***
## PUNT_SOCIALES_CIUDADANAS 1 34795470 34795470 4.2015e+08 < 2.2e-16 ***
## PUNT_INGLES 1 5133068 5133068 6.1981e+07 < 2.2e-16 ***
## Residuals 546187 45234 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Normality of the model residuals, checked with the Lilliefors
# (Kolmogorov-Smirnov) test from the nortest package.
library(nortest)
residuos <- residuals(modelo) # residuals of the fitted model `modelo`
lillie.test(residuos)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: residuos
## D = 0.093639, p-value < 2.2e-16
Prueba de homocedasticidad
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.2
## Cargando paquete requerido: zoo
##
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Breusch-Pagan test for heteroscedasticity of the regression residuals
# (studentized version, which is the function's default).
bptest(formula = modelo, studentize = TRUE)
##
## studentized Breusch-Pagan test
##
## data: modelo
## BP = 13.553, df = 5, p-value = 0.01872
# One-way ANOVA of the global score by student gender.
# ESTU_GENERO has three levels in this data ("-", "F" and "M"), hence the
# 2 degrees of freedom reported for the factor.
anova_genero <- aov(
  formula = PUNT_GLOBAL ~ ESTU_GENERO,
  data = regresion
)
summary(anova_genero)
## Df Sum Sq Mean Sq F value Pr(>F)
## ESTU_GENERO 2 1.031e+07 5155982 1967 <2e-16 ***
## Residuals 546209 1.432e+09 2622
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggplot2)
# Box plot of the global score for each gender category; the boxes are
# filled by the same category.
ggplot(
  data = regresion,
  mapping = aes(x = ESTU_GENERO, y = PUNT_GLOBAL, fill = ESTU_GENERO)
) +
  geom_boxplot() +
  labs(
    title = "Puntaje global por género",
    x = "Género",
    y = "Puntaje global"
  ) +
  theme_bw()
library(nortest)
# Lilliefors normality test of the global score, run separately within each
# gender group.
by(
  data = regresion,
  INDICES = regresion$ESTU_GENERO,
  FUN = function(x) {
    lillie.test(x$PUNT_GLOBAL)
  }
)
## regresion$ESTU_GENERO: -
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: x$PUNT_GLOBAL
## D = 0.060151, p-value = 0.3499
##
## ------------------------------------------------------------
## regresion$ESTU_GENERO: F
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: x$PUNT_GLOBAL
## D = 0.038571, p-value < 2.2e-16
##
## ------------------------------------------------------------
## regresion$ESTU_GENERO: M
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: x$PUNT_GLOBAL
## D = 0.031232, p-value < 2.2e-16
# Normal Q-Q plot of the global score for every gender category, laid out on
# a 2 x 3 grid; the layout is set back to a single panel afterwards.
niveles <- unique(regresion$ESTU_GENERO)
par(mfrow = c(2, 3))
for (n in niveles) {
  # Scores of the current category, reused for the points and the line.
  puntajes_nivel <- regresion$PUNT_GLOBAL[regresion$ESTU_GENERO == n]
  qqnorm(puntajes_nivel, main = paste("QQ plot -", n))
  qqline(puntajes_nivel)
}
par(mfrow = c(1, 1))
# Homogeneity of variances of the global score across gender groups.
# Bartlett's test first:
stats::bartlett.test(PUNT_GLOBAL ~ ESTU_GENERO, data = regresion)
##
## Bartlett test of homogeneity of variances
##
## data: PUNT_GLOBAL by ESTU_GENERO
## Bartlett's K-squared = 547.12, df = 2, p-value < 2.2e-16
# Fligner-Killeen test, a rank-based check of the same hypothesis:
stats::fligner.test(PUNT_GLOBAL ~ ESTU_GENERO, data = regresion)
##
## Fligner-Killeen test of homogeneity of variances
##
## data: PUNT_GLOBAL by ESTU_GENERO
## Fligner-Killeen:med chi-squared = 691.7, df = 2, p-value < 2.2e-16
# TukeyHSD(anova_genero)
# plot(TukeyHSD(anova_genero))