library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(broom)
## Warning: package 'broom' was built under R version 4.5.2
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.5.2
library(readr)
## Warning: package 'readr' was built under R version 4.5.2
library(nortest)
## Warning: package 'nortest' was built under R version 4.5.2

Datos

# Read the semicolon-delimited file into `regresion`; no col_types are given,
# and the column-type message is silenced with show_col_types = FALSE.
# NOTE(review): the call reports parsing issues (warning below) — inspect them
# with problems(regresion) before trusting the affected columns.
regresion <- read_delim("regresion.csv", delim = ";", show_col_types = FALSE)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# Per-column summary: five-number summary and mean for numeric columns,
# length/class for character columns.
summary(regresion)
##  ESTU_GENERO           PERIODO      ESTU_TIENEETNIA    ESTU_DEPTO_RESIDE 
##  Length:546212      Min.   :20194   Length:546212      Length:546212     
##  Class :character   1st Qu.:20194   Class :character   Class :character  
##  Mode  :character   Median :20194   Mode  :character   Mode  :character  
##                     Mean   :20194                                        
##                     3rd Qu.:20194                                        
##                     Max.   :20194                                        
##                                                                          
##  ESTU_COD_RESIDE_DEPTO FAMI_ESTRATOVIVIENDA COLE_NATURALEZA   
##  Length:546212         Length:546212        Length:546212     
##  Class :character      Class :character     Class :character  
##  Mode  :character      Mode  :character     Mode  :character  
##                                                               
##                                                               
##                                                               
##                                                               
##  COLE_CALENDARIO    PUNT_LECTURA_CRITICA PERCENTIL_LECTURA_CRITICA
##  Length:546212      Min.   :  0.00       Min.   :  1.00           
##  Class :character   1st Qu.: 45.00       1st Qu.: 25.00           
##  Mode  :character   Median : 52.00       Median : 50.00           
##                     Mean   : 52.16       Mean   : 50.11           
##                     3rd Qu.: 60.00       3rd Qu.: 75.00           
##                     Max.   :100.00       Max.   :100.00           
##                                                                   
##  DESEMP_LECTURA_CRITICA PUNT_MATEMATICAS PERCENTIL_MATEMATICAS
##  Min.   :1.000          Min.   :  0.00   Min.   :  1.00       
##  1st Qu.:2.000          1st Qu.: 42.00   1st Qu.: 25.00       
##  Median :3.000          Median : 51.00   Median : 50.00       
##  Mean   :2.613          Mean   : 50.61   Mean   : 50.11       
##  3rd Qu.:3.000          3rd Qu.: 59.00   3rd Qu.: 75.00       
##  Max.   :4.000          Max.   :100.00   Max.   :100.00       
##                                                               
##  DESEMP_MATEMATICAS PUNT_C_NATURALES PERCENTIL_C_NATURALES DESEMP_C_NATURALES
##  Min.   :1.000      Min.   :  0.00   Min.   :  1.00        Min.   :1.000     
##  1st Qu.:2.000      1st Qu.: 40.00   1st Qu.: 25.00        1st Qu.:1.000     
##  Median :3.000      Median : 48.00   Median : 50.00        Median :2.000     
##  Mean   :2.441      Mean   : 48.23   Mean   : 50.16        Mean   :2.021     
##  3rd Qu.:3.000      3rd Qu.: 56.00   3rd Qu.: 75.00        3rd Qu.:3.000     
##  Max.   :4.000      Max.   :100.00   Max.   :100.00        Max.   :4.000     
##                                                                              
##  PUNT_SOCIALES_CIUDADANAS PERCENTIL_SOCIALES_CIUDADANAS
##  Min.   :  0.00           Min.   :  1.00               
##  1st Qu.: 37.00           1st Qu.: 25.00               
##  Median : 45.00           Median : 50.00               
##  Mean   : 46.22           Mean   : 50.18               
##  3rd Qu.: 55.00           3rd Qu.: 75.00               
##  Max.   :100.00           Max.   :100.00               
##                                                        
##  DESEMP_SOCIALES_CIUDADANAS  PUNT_INGLES     PERCENTIL_INGLES
##  Min.   :1.000              Min.   :  0.00   Min.   :  1.0   
##  1st Qu.:1.000              1st Qu.: 39.00   1st Qu.: 25.0   
##  Median :2.000              Median : 48.00   Median : 50.0   
##  Mean   :1.902              Mean   : 48.42   Mean   : 50.1   
##  3rd Qu.:2.000              3rd Qu.: 56.00   3rd Qu.: 75.0   
##  Max.   :4.000              Max.   :100.00   Max.   :100.0   
##                             NA's   :19                       
##  DESEMP_INGLES       PUNT_GLOBAL    PERCENTIL_GLOBAL
##  Length:546212      Min.   :  0.0   Min.   :  1.00  
##  Class :character   1st Qu.:207.0   1st Qu.: 25.00  
##  Mode  :character   Median :243.0   Median : 50.00  
##                     Mean   :246.2   Mean   : 49.99  
##                     3rd Qu.:282.0   3rd Qu.: 75.00  
##                     Max.   :477.0   Max.   :100.00  
##                                     NA's   :19

Independencia

regresion <- read_delim("regresion.csv", delim = ";", show_col_types = FALSE)

# Pearson correlation between the global score and each subject score.
cor(regresion$PUNT_GLOBAL, regresion$PUNT_MATEMATICAS)
## [1] 0.8997013
cor(regresion$PUNT_GLOBAL, regresion$PUNT_LECTURA_CRITICA)
## [1] 0.8984713
cor(regresion$PUNT_GLOBAL, regresion$PUNT_C_NATURALES)
## [1] 0.9137711
cor(regresion$PUNT_GLOBAL, regresion$PUNT_SOCIALES_CIUDADANAS)
## [1] 0.9103628
# PUNT_INGLES contains missing values (summary() reports 19 NA's), and cor()
# returns NA under its default use = "everything". Compute the coefficient on
# the rows where both scores are present instead.
cor(regresion$PUNT_GLOBAL, regresion$PUNT_INGLES, use = "complete.obs")
# NOTE(review): this repeats the initial load of "regresion.csv" with the same
# arguments and nothing in between modifies `regresion`; it looks redundant
# (and re-triggers the parsing warning) — confirm and remove.
regresion <- read_delim("regresion.csv", delim = ";", show_col_types = FALSE)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

Normalidad

# Histogram of the global score, used as a visual check of its distribution.
hist(
  x = regresion[["PUNT_GLOBAL"]],
  col = "lightblue",
  border = "white",
  xlab = "Puntaje global",
  main = "Histograma de PUNT_GLOBAL"
)

Linealidad

# Scatter plots of the global score against each subject score, one colour per
# predictor, drawn in the order listed below. local() keeps the lookup table
# and the loop variable out of the global environment.
local({
  color_por_predictor <- c(
    PUNT_MATEMATICAS = "blue",
    PUNT_LECTURA_CRITICA = "darkgreen",
    PUNT_C_NATURALES = "purple",
    PUNT_SOCIALES_CIUDADANAS = "brown",
    PUNT_INGLES = "red"
  )

  for (predictor in names(color_por_predictor)) {
    plot(
      regresion[[predictor]], regresion$PUNT_GLOBAL,
      main = paste("PUNT_GLOBAL vs", predictor),
      xlab = predictor,
      ylab = "PUNT_GLOBAL",
      pch = 19, col = color_por_predictor[[predictor]]
    )
  }
})

Modelo

# Modelo de regresión múltiple

# Multiple linear regression of the global score on the five subject scores.
# Rows with a missing value in any model variable are dropped by lm(); the
# summary below reports 19 such observations.
# NOTE(review): the fit reports R-squared = 1 with residuals within +/- 0.5,
# which suggests PUNT_GLOBAL is (up to rounding) a fixed weighted sum of these
# five scores — confirm; if so, the p-values here carry no real information.
modelo <- lm(PUNT_GLOBAL ~ 
               PUNT_MATEMATICAS +
               PUNT_LECTURA_CRITICA +
               PUNT_C_NATURALES +
               PUNT_SOCIALES_CIUDADANAS +
               PUNT_INGLES,
             data = regresion)

# Coefficient table, residual quantiles, R-squared and overall F-test.
summary(modelo)
## 
## Call:
## lm(formula = PUNT_GLOBAL ~ PUNT_MATEMATICAS + PUNT_LECTURA_CRITICA + 
##     PUNT_C_NATURALES + PUNT_SOCIALES_CIUDADANAS + PUNT_INGLES, 
##     data = regresion)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4671 -0.2312  0.0003  0.2317  0.4680 
## 
## Coefficients:
##                           Estimate Std. Error   t value Pr(>|t|)    
## (Intercept)              1.213e-03  2.051e-03     0.591    0.554    
## PUNT_MATEMATICAS         1.154e+00  5.883e-05 19612.848   <2e-16 ***
## PUNT_LECTURA_CRITICA     1.154e+00  6.825e-05 16907.547   <2e-16 ***
## PUNT_C_NATURALES         1.154e+00  7.135e-05 16169.935   <2e-16 ***
## PUNT_SOCIALES_CIUDADANAS 1.154e+00  6.073e-05 19000.196   <2e-16 ***
## PUNT_INGLES              3.846e-01  4.886e-05  7872.792   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2878 on 546187 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 3.483e+09 on 5 and 546187 DF,  p-value: < 2.2e-16

Modelo \[ Y = \beta_0 + \beta_1 (\text{PUNT\_MATEMATICAS}) + \beta_2 (\text{PUNT\_LECTURA\_CRITICA}) + \beta_3 (\text{PUNT\_C\_NATURALES}) + \beta_4 (\text{PUNT\_SOCIALES\_CIUDADANAS}) + \beta_5 (\text{PUNT\_INGLES}) \]

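Modelo ajustado (coeficientes estimados en `summary(modelo)`):

\[ Y = 0.001213 + 1.154(\text{PUNT\_MATEMATICAS}) + 1.154(\text{PUNT\_LECTURA\_CRITICA}) + 1.154(\text{PUNT\_C\_NATURALES}) + 1.154(\text{PUNT\_SOCIALES\_CIUDADANAS}) + 0.3846(\text{PUNT\_INGLES}) + e_i \]

Homocedasticidad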

# Draw the default diagnostic plots of the fitted model in a 2 x 2 grid.
# par() returns the previous value of the settings it changes; keeping it lets
# us restore whatever layout was active before, instead of assuming 1 x 1.
par_previo <- par(mfrow = c(2, 2))
plot(modelo)

par(par_previo)

Gráfico del modelo

library(ggplot2)

# Predictors of the regression model (the five subject scores).
variables <- c(
  "PUNT_MATEMATICAS",
  "PUNT_LECTURA_CRITICA",
  "PUNT_C_NATURALES",
  "PUNT_SOCIALES_CIUDADANAS",
  "PUNT_INGLES"
)


# One scatter plot per predictor with a simple least-squares line and its
# confidence band. print() is required because ggplot objects are not
# auto-printed inside a for loop.
for (v in variables) {
  print(
    # `.data[[v]]` selects the column whose name is stored in `v`; it replaces
    # aes_string(), which has been deprecated since ggplot2 3.0.0.
    ggplot(regresion, aes(x = .data[[v]], y = PUNT_GLOBAL)) +
      geom_point(alpha = 0.4) +
      geom_smooth(method = "lm", se = TRUE) +
      labs(
        title = paste("Relación entre PUNT_GLOBAL y", v),
        x = v,
        y = "Puntaje Global"
      )
  )
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 19 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_point()`).

Línea de tendencia

library(ggplot2)

# Predictors of the regression model (the five subject scores).
variables <- c(
  "PUNT_MATEMATICAS",
  "PUNT_LECTURA_CRITICA",
  "PUNT_C_NATURALES",
  "PUNT_SOCIALES_CIUDADANAS",
  "PUNT_INGLES"
)

# Plots are stored by predictor name so they can be reused after the loop.
graficos <- list()

for (v in variables) {
  # `.data[[v]]` replaces the deprecated aes_string(); `linewidth` replaces
  # `size` for line geoms, which has been deprecated since ggplot2 3.4.0.
  p <- ggplot(regresion, aes(x = .data[[v]], y = PUNT_GLOBAL)) +
    geom_point(alpha = 0.4) +
    geom_smooth(method = "lm", se = TRUE, linewidth = 1.2) +
    theme_bw() +
    labs(
      title = paste("PUNT_GLOBAL en función de", v),
      x = v,
      y = "PUNT_GLOBAL"
    )

  graficos[[v]] <- p
  print(p)
}
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 19 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Text layer holding the model equation.
# NOTE(review): this layer is created but never added to a plot with `+`, so
# nothing is drawn — the call only prints the layer object (see output below).
# It presumably belongs on one of the plots above; confirm which one, and check
# that x = 400 fits that plot's x scale.
annotate("text",
         x = 400,
         y = 200,
         label = "Y = β0 + β1*MATE + β2*LECT + β3*CNAT + β4*SOC + β5*ING",
         size = 5)
## mapping: x = ~x, y = ~y 
## geom_text: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity
# Analysis-of-variance table for the fitted model. Terms are added
# sequentially in formula order, so each row's sum of squares depends on the
# terms listed before it.
anova(modelo)
## Analysis of Variance Table
## 
## Response: PUNT_GLOBAL
##                              Df     Sum Sq    Mean Sq    F value    Pr(>F)    
## PUNT_MATEMATICAS              1 1167382207 1167382207 1.4096e+10 < 2.2e-16 ***
## PUNT_LECTURA_CRITICA          1  176005727  176005727 2.1252e+09 < 2.2e-16 ***
## PUNT_C_NATURALES              1   58831308   58831308 7.1038e+08 < 2.2e-16 ***
## PUNT_SOCIALES_CIUDADANAS      1   34795470   34795470 4.2015e+08 < 2.2e-16 ***
## PUNT_INGLES                   1    5133068    5133068 6.1981e+07 < 2.2e-16 ***
## Residuals                546187      45234          0                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Lilliefors (Kolmogorov-Smirnov) normality test on the model residuals.
library(nortest)

# The name `residuos` is kept because it is echoed in the test's "data:" line.
residuos <- residuals(modelo)

lillie.test(residuos)
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  residuos
## D = 0.093639, p-value < 2.2e-16

Prueba de homocedasticidad

library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.2
## Cargando paquete requerido: zoo
## 
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Studentized Breusch-Pagan test on the fitted model, run with the function's
# default arguments.
bptest(modelo)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo
## BP = 13.553, df = 5, p-value = 0.01872
# One-way ANOVA of the global score by student gender.
# NOTE(review): the results below show three ESTU_GENERO categories (Df = 2),
# one of them labelled "-"; presumably that is a missing-value code — confirm
# whether those rows should be excluded before comparing genders.
anova_genero <- aov(PUNT_GLOBAL ~ ESTU_GENERO, data = regresion)

# ANOVA table: between-group and residual sums of squares and the F-test.
summary(anova_genero)
##                 Df    Sum Sq Mean Sq F value Pr(>F)    
## ESTU_GENERO      2 1.031e+07 5155982    1967 <2e-16 ***
## Residuals   546209 1.432e+09    2622                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggplot2)

# Box plot of the global score for each ESTU_GENERO category, filled by
# category.
ggplot(
  data = regresion,
  mapping = aes(x = ESTU_GENERO, y = PUNT_GLOBAL, fill = ESTU_GENERO)
) +
  geom_boxplot() +
  ggtitle("Puntaje global por género") +
  xlab("Género") +
  ylab("Puntaje global") +
  theme_bw()

library(nortest)

# Lilliefors normality test of PUNT_GLOBAL within each ESTU_GENERO category.
# by() splits the data frame on the grouping vector and applies the function
# to each subset; the result is printed one group at a time.
by(regresion, regresion$ESTU_GENERO,
   function(x){ lillie.test(x$PUNT_GLOBAL) })
## regresion$ESTU_GENERO: -
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  x$PUNT_GLOBAL
## D = 0.060151, p-value = 0.3499
## 
## ------------------------------------------------------------ 
## regresion$ESTU_GENERO: F
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  x$PUNT_GLOBAL
## D = 0.038571, p-value < 2.2e-16
## 
## ------------------------------------------------------------ 
## regresion$ESTU_GENERO: M
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  x$PUNT_GLOBAL
## D = 0.031232, p-value < 2.2e-16
# Normal Q-Q plot of PUNT_GLOBAL for every ESTU_GENERO category, in order of
# first appearance in the data.
niveles <- unique(regresion$ESTU_GENERO)

# par() returns the previous value of the settings it changes; keeping it lets
# us restore the earlier layout afterwards instead of assuming it was 1 x 1.
par_previo <- par(mfrow = c(2, 3))

for (n in niveles) {
  # Subset once per category and reuse it for both the points and the line.
  puntajes_nivel <- regresion$PUNT_GLOBAL[regresion$ESTU_GENERO == n]
  qqnorm(puntajes_nivel, main = paste("QQ plot -", n))
  qqline(puntajes_nivel)
}

par(par_previo)

# Bartlett test of homogeneity of variances of PUNT_GLOBAL across the
# ESTU_GENERO groups.
bartlett.test(PUNT_GLOBAL ~ ESTU_GENERO, data = regresion)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  PUNT_GLOBAL by ESTU_GENERO
## Bartlett's K-squared = 547.12, df = 2, p-value < 2.2e-16
# Fligner-Killeen test of the same hypothesis, run as a second check on the
# equal-variance assumption.
fligner.test(PUNT_GLOBAL ~ ESTU_GENERO, data = regresion)
## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  PUNT_GLOBAL by ESTU_GENERO
## Fligner-Killeen:med chi-squared = 691.7, df = 2, p-value < 2.2e-16

# Post-hoc pairwise comparisons (currently disabled). TukeyHSD() must be given
# the fitted aov object, which this script stores as `anova_genero`; `anova`
# is a base R function, not a fitted model.
#TukeyHSD(anova_genero)

#plot(TukeyHSD(anova_genero))