# Load the required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(knitr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(ggplot2)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
library(Metrics)
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following object is masked from 'package:Metrics':
##
## auc
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# IMPORT THE DATASET
df <- read.csv("student-mat.csv")
# DATA SUMMARY
summary(df)
## school sex age address
## Length:395 Length:395 Min. :15.0 Length:395
## Class :character Class :character 1st Qu.:16.0 Class :character
## Mode :character Mode :character Median :17.0 Mode :character
## Mean :16.7
## 3rd Qu.:18.0
## Max. :22.0
## famsize Pstatus Medu Fedu
## Length:395 Length:395 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:2.000 1st Qu.:2.000
## Mode :character Mode :character Median :3.000 Median :2.000
## Mean :2.749 Mean :2.522
## 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :4.000 Max. :4.000
## Mjob Fjob reason guardian
## Length:395 Length:395 Length:395 Length:395
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## traveltime studytime failures schoolsup
## Min. :1.000 Min. :1.000 Min. :0.0000 Length:395
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 Class :character
## Median :1.000 Median :2.000 Median :0.0000 Mode :character
## Mean :1.448 Mean :2.035 Mean :0.3342
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :4.000 Max. :4.000 Max. :3.0000
## famsup paid activities nursery
## Length:395 Length:395 Length:395 Length:395
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## higher internet romantic famrel
## Length:395 Length:395 Length:395 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:4.000
## Mode :character Mode :character Mode :character Median :4.000
## Mean :3.944
## 3rd Qu.:5.000
## Max. :5.000
## freetime goout Dalc Walc
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
## Median :3.000 Median :3.000 Median :1.000 Median :2.000
## Mean :3.235 Mean :3.109 Mean :1.481 Mean :2.291
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## health absences G1 G2
## Min. :1.000 Min. : 0.000 Min. : 3.00 Min. : 0.00
## 1st Qu.:3.000 1st Qu.: 0.000 1st Qu.: 8.00 1st Qu.: 9.00
## Median :4.000 Median : 4.000 Median :11.00 Median :11.00
## Mean :3.554 Mean : 5.709 Mean :10.91 Mean :10.71
## 3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:13.00 3rd Qu.:13.00
## Max. :5.000 Max. :75.000 Max. :19.00 Max. :19.00
## G3
## Min. : 0.00
## 1st Qu.: 8.00
## Median :11.00
## Mean :10.42
## 3rd Qu.:14.00
## Max. :20.00
The dataset initially contains 395 observations across 33 variables, of which 17 are character variables and 16 are numeric. Among the 16 numeric variables, most can be identified as binary or discrete.
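The variable-type split is easy to confirm directly (a one-line sketch):
# Verify the split: 17 character + 16 numeric (integer) columns = 33 variables
table(sapply(df, class))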
# CATEGORICAL VARIABLES
categorical_cols <- sapply(df, is.character)
categorical_cols <- names(df[categorical_cols])
categorical_cols
## [1] "school" "sex" "address" "famsize" "Pstatus"
## [6] "Mjob" "Fjob" "reason" "guardian" "schoolsup"
## [11] "famsup" "paid" "activities" "nursery" "higher"
## [16] "internet" "romantic"
# NUMERIC VARIABLES
numerical_cols <- sapply(df, is.numeric)
numerical_cols <- names(df[numerical_cols])
numerical_cols
## [1] "age" "Medu" "Fedu" "traveltime" "studytime"
## [6] "failures" "famrel" "freetime" "goout" "Dalc"
## [11] "Walc" "health" "absences" "G1" "G2"
## [16] "G3"
# CORRELATION MATRIX
cor_matrix <- cor(df[numerical_cols], use = "complete.obs")
cor_melted <- melt(cor_matrix)
font_size <- 6
ggplot(data = cor_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "purple", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab",
                       name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = font_size),
        axis.text.y = element_text(size = font_size)) +
  coord_fixed() +
  labs(x = '', y = '') +
  geom_text(aes(label = sprintf("%.2f", value)), size = 3, check_overlap = TRUE)
The correlation heatmap shows a very high correlation between G1 and G2, and both are strongly correlated with the final grade G3; in other words, the first- and second-period grades are the best numeric predictors of a student's final grade. The associations of Fedu (father's education) and absences with G3 are much weaker by comparison.
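To make that reading precise, the numeric variables can be ranked by their correlation with G3 straight from the cor_matrix object computed above (a minimal sketch):
# Correlations of every numeric variable with the final grade, strongest first
g3_cor <- cor_matrix[setdiff(rownames(cor_matrix), "G3"), "G3"]
sort(g3_cor, decreasing = TRUE)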
# MISSING DATA
md.pattern(df, plot = TRUE, rotate.names = TRUE)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason guardian
## 395 1 1 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0 0 0
## traveltime studytime failures schoolsup famsup paid activities nursery
## 395 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## higher internet romantic famrel freetime goout Dalc Walc health absences G1
## 395 1 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0 0
## G2 G3
## 395 1 1 0
## 0 0 0
This check shows there are no missing values: the dataset is 100% complete.
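As a quick cross-check that does not depend on mice, the missing values can be counted per column directly:
# Count NA values per column; every entry should be 0
colSums(is.na(df))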
# MULTICOLLINEARITY CHECK (VIF)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
modelo <- lm(G3 ~ ., data = df)
vif_valores <- vif(modelo)
kable(vif_valores, caption = "VIF values")
| | GVIF | Df | GVIF^(1/(2*Df)) |
|---|---|---|---|
| school | 1.510967 | 1 | 1.229214 |
| sex | 1.486968 | 1 | 1.219413 |
| age | 1.803293 | 1 | 1.342867 |
| address | 1.387985 | 1 | 1.178128 |
| famsize | 1.153276 | 1 | 1.073907 |
| Pstatus | 1.145493 | 1 | 1.070277 |
| Medu | 2.940226 | 1 | 1.714709 |
| Fedu | 2.141010 | 1 | 1.463219 |
| Mjob | 3.430704 | 4 | 1.166602 |
| Fjob | 2.301555 | 4 | 1.109820 |
| reason | 1.553683 | 3 | 1.076202 |
| guardian | 1.736473 | 2 | 1.147934 |
| traveltime | 1.320973 | 1 | 1.149336 |
| studytime | 1.395833 | 1 | 1.181454 |
| failures | 1.563186 | 1 | 1.250274 |
| schoolsup | 1.255074 | 1 | 1.120301 |
| famsup | 1.304026 | 1 | 1.141940 |
| paid | 1.338698 | 1 | 1.157021 |
| activities | 1.158682 | 1 | 1.076421 |
| nursery | 1.151348 | 1 | 1.073009 |
| higher | 1.315791 | 1 | 1.147080 |
| internet | 1.257752 | 1 | 1.121495 |
| romantic | 1.174382 | 1 | 1.083689 |
| famrel | 1.141814 | 1 | 1.068557 |
| freetime | 1.321398 | 1 | 1.149521 |
| goout | 1.496482 | 1 | 1.223308 |
| Dalc | 2.028513 | 1 | 1.424259 |
| Walc | 2.389544 | 1 | 1.545815 |
| health | 1.179266 | 1 | 1.085940 |
| absences | 1.256253 | 1 | 1.120827 |
| G1 | 4.673491 | 1 | 2.161826 |
| G2 | 4.409261 | 1 | 2.099824 |
Running the multicollinearity analysis with the usual rule of removing variables whose VIF exceeds 5, none of the predictors turn out to be concerning, so none are removed. The highest values belong to G1 and G2, but they stay below the threshold and do not compromise the analysis.
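Because the model contains factors, vif() returned generalized VIFs, so the threshold check can be written explicitly; a sketch over the vif_valores matrix from above, using the squared GVIF^(1/(2*Df)) as the analogue of the usual VIF > 5 rule:
# Flag predictors whose squared GVIF^(1/(2*Df)) exceeds 5; an empty result means nothing to drop
flagged <- vif_valores[vif_valores[, "GVIF^(1/(2*Df))"]^2 > 5, , drop = FALSE]
flagged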
# DESCRIPTION OF THE TARGET VARIABLE
ggplot(data = df, aes(x = G3)) +
  geom_bar(fill = "purple") +
  labs(title = "FINAL GRADE", x = "", y = "Count")
Looking at the frequencies of the target variable, the final grade, most students obtained a final grade of 10 and only a small minority reached the top grade of 20. More worryingly, over 20 students obtained a final grade of 0; this is precisely where the value of this study lies, in identifying which variables drive those failing grades.
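That claim about failing grades is easy to verify numerically on the same df:
# Frequency of each final grade, plus an explicit count of students with G3 == 0
table(df$G3)
sum(df$G3 == 0)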
# BOXPLOTS OF NUMERIC VARIABLES VS G3
for (col in numerical_cols) {
  # factor(G3) groups the boxplots by grade; aes_string() is deprecated,
  # so columns are referenced through the .data pronoun instead
  p <- ggplot(df, aes(x = factor(G3), y = .data[[col]])) +
    geom_boxplot() +
    coord_flip() +
    labs(title = paste("Boxplot of", col, "vs G3"), y = col, x = "G3")
  print(p)
}
The boxplots support the following observations. G3 shows a fairly symmetric distribution. G2 has an outlier at 0, i.e. a value far from where the data are concentrated: only a handful of students scored 0 in the second period. For absences, several outliers appear above 20, meaning most students did not miss class more than 20 times over the academic year. The absences variable is right-skewed: its upper whisker is much longer than the lower one, which is barely distinguishable.
# CATEGORICAL VARIABLES VS G3
color_palette <- rainbow(21)  # one color per possible grade (0-20)
for (col in categorical_cols) {
  df_percent <- df %>%
    group_by(!!sym(col)) %>%
    count(G3) %>%
    mutate(perc = n / sum(n) * 100) %>%
    mutate(G3 = as.factor(G3))
  # aes_string() is deprecated; reference the loop column via the .data pronoun
  p <- ggplot(df_percent, aes(x = .data[[col]], y = perc, fill = G3)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_fill_manual(values = color_palette) +
    labs(title = paste("Percentage of G3 by", col), x = col, y = "Percentage (%)") +
    scale_y_continuous(labels = scales::percent_format())
  print(p)
}
From the plots of each categorical variable against G3 we conclude that students with internet access obtained better grades than those without; students in a romantic relationship tend to do worse academically than single students; the school with the better average is GP; and male students tend to obtain slightly better grades than female students.
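Those visual impressions can be quantified with group means; a short sketch for two of the variables mentioned (the same pattern applies to school and sex):
# Mean final grade by internet access and by relationship status
df %>% group_by(internet) %>% summarise(mean_G3 = mean(G3))
df %>% group_by(romantic) %>% summarise(mean_G3 = mean(G3))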
# MODEL TRAINING
library(caTools)
set.seed(123)
split <- sample.split(df$G3, SplitRatio = 0.8)
train_set <- subset(df, split == TRUE)
test_set <- subset(df, split == FALSE)
# Scaling the numeric columns
train_set[sapply(train_set, is.numeric)] <- scale(train_set[sapply(train_set, is.numeric)])
test_set[sapply(test_set, is.numeric)] <- scale(test_set[sapply(test_set, is.numeric)])
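Note that scaling the test set with its own statistics lets test-set information leak into the evaluation. A common alternative, sketched below with caret's preProcess (already loaded), derives the centering and scaling parameters from the training set only; it would replace the two scale() calls above:
# A sketch: center/scale both sets using training-set parameters only (avoids leakage)
num_cols <- sapply(train_set, is.numeric)
pp <- preProcess(train_set[, num_cols], method = c("center", "scale"))
train_set[, num_cols] <- predict(pp, train_set[, num_cols])
test_set[, num_cols] <- predict(pp, test_set[, num_cols])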
# MULTIPLE LINEAR REGRESSION
model <- lm(G3 ~ ., data = train_set)
summary(model)
##
## Call:
## lm(formula = G3 ~ ., data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.65574 -0.10381 0.03539 0.19914 1.04165
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.021991 0.206956 0.106 0.91545
## schoolMS 0.113963 0.097077 1.174 0.24143
## sexM 0.031303 0.056466 0.554 0.57977
## age -0.039789 0.030817 -1.291 0.19773
## addressU 0.061712 0.065914 0.936 0.34996
## famsizeLE3 0.020444 0.055335 0.369 0.71207
## PstatusT -0.031171 0.083628 -0.373 0.70963
## Medu 0.019539 0.039805 0.491 0.62391
## Fedu -0.031341 0.034177 -0.917 0.35993
## Mjobhealth -0.009108 0.121100 -0.075 0.94010
## Mjobother 0.019596 0.080443 0.244 0.80772
## Mjobservices -0.006203 0.089298 -0.069 0.94467
## Mjobteacher -0.055898 0.117153 -0.477 0.63364
## Fjobhealth 0.119748 0.155572 0.770 0.44212
## Fjobother -0.033647 0.110777 -0.304 0.76156
## Fjobservices -0.063808 0.115696 -0.552 0.58173
## Fjobteacher 0.018542 0.139886 0.133 0.89464
## reasonhome -0.038045 0.063220 -0.602 0.54781
## reasonother 0.018248 0.091750 0.199 0.84250
## reasonreputation 0.065414 0.063750 1.026 0.30574
## guardianmother 0.048665 0.059911 0.812 0.41733
## guardianother -0.012184 0.113804 -0.107 0.91482
## traveltime 0.024065 0.027266 0.883 0.37822
## studytime -0.038746 0.028184 -1.375 0.17032
## failures -0.048807 0.029159 -1.674 0.09530 .
## schoolsupyes 0.056306 0.076309 0.738 0.46122
## famsupyes 0.042879 0.054351 0.789 0.43084
## paidyes 0.035660 0.054739 0.651 0.51530
## activitiesyes -0.078738 0.050430 -1.561 0.11960
## nurseryyes -0.057035 0.062013 -0.920 0.35852
## higheryes -0.019043 0.113438 -0.168 0.86681
## internetyes -0.024747 0.073447 -0.337 0.73642
## romanticyes -0.029391 0.053426 -0.550 0.58268
## famrel 0.086680 0.025284 3.428 0.00070 ***
## freetime 0.027142 0.026097 1.040 0.29923
## goout -0.015477 0.028350 -0.546 0.58555
## Dalc -0.053533 0.031807 -1.683 0.09349 .
## Walc 0.071068 0.034673 2.050 0.04134 *
## health 0.003538 0.025478 0.139 0.88965
## absences 0.079783 0.026382 3.024 0.00273 **
## G1 0.131227 0.049750 2.638 0.00882 **
## G2 0.787773 0.048311 16.306 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4086 on 276 degrees of freedom
## Multiple R-squared: 0.8546, Adjusted R-squared: 0.833
## F-statistic: 39.57 on 41 and 276 DF, p-value: < 2.2e-16
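For reference, the predictors that reach significance at the 5% level can be pulled straight from the model summary (a minimal sketch):
# Extract the coefficients with p-values below 0.05 from the fitted linear model
coefs <- summary(model)$coefficients
coefs[coefs[, "Pr(>|t|)"] < 0.05, ]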
predictions <- predict(model, newdata = test_set)
# Evaluate the model on the test set
rmse <- sqrt(mean((test_set$G3 - predictions)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.448409485079193"
# Adjusted R-squared
SST <- sum((test_set$G3 - mean(test_set$G3))^2)
SSR <- sum((predictions - mean(test_set$G3))^2)
R2_adj <- 1 - (1 - SSR/SST) * ((nrow(test_set) - 1) / (nrow(test_set) - length(model$coefficients) - 1))
print(paste("Adjusted R-squared:", R2_adj))
## [1] "Adjusted R-squared: 0.712093899191313"
# SCATTER PLOT OF PREDICTIONS VS ACTUAL VALUES
plot(test_set$G3, predictions, main = "Predictions vs Actual Values",
     xlab = "Actual Values", ylab = "Predictions")
abline(0, 1, col = "purple")
# RANDOM FOREST
rf_model <- randomForest(G3 ~ ., data = train_set, importance = TRUE, na.action = na.omit)
print(rf_model)
##
## Call:
## randomForest(formula = G3 ~ ., data = train_set, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 0.151575
## % Var explained: 84.79
predictions <- predict(rf_model, test_set)
rmse <- sqrt(mean((predictions - test_set$G3)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.382827566549953"
r_squared <- postResample(pred = predictions, obs = test_set$G3)[["Rsquared"]]
print(paste("R-squared:", r_squared))
## [1] "R-squared: 0.859160162266376"
# Scatter plot of predictions vs actual values
plot(test_set$G3, predictions, main = "Random Forest Predictions vs Actual Values",
     xlab = "Actual Values", ylab = "Predictions", pch = 19, col = "blue")
abline(0, 1, col = "red", lwd = 2)
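Because rf_model was fitted with importance = TRUE, the forest's variable importances can be inspected directly; a short sketch that supports the discussion of influential variables in the conclusions below:
# Rank predictors by permutation importance (%IncMSE) and plot them
imp <- importance(rf_model)
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]
varImpPlot(rf_model, main = "Random Forest variable importance")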
# SVM
svr_model <- svm(G3 ~ ., data = train_set, type = "eps-regression")
predictions <- predict(svr_model, test_set)
mse <- mse(test_set$G3, predictions)
print(paste("MSE:", mse))
## [1] "MSE: 0.225481560376798"
rmse <- rmse(test_set$G3, predictions)
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.474848986917734"
mae <- mae(test_set$G3, predictions)
print(paste("MAE:", mae))
## [1] "MAE: 0.306513554265111"
# Computing R^2 manually from the residual and total sums of squares
ss_res <- sum((test_set$G3 - predictions)^2)
ss_tot <- sum((test_set$G3 - mean(test_set$G3))^2)
r_squared <- 1 - ss_res / ss_tot
print(paste("R-squared:", r_squared))
## [1] "R-squared: 0.771551576986665"
# Scatter plot of predictions vs actual values
plot(test_set$G3, predictions, main = "SVM Predictions vs Actual Values",
     xlab = "Actual Values", ylab = "Predictions", pch = 19, col = "blue")
abline(0, 1, col = "red", lwd = 2)
Since the response variable (G3, the final grade) is a continuous numeric variable, this is a regression problem rather than a classification one, so three regression models were fitted in order to choose the one that best predicts the academic performance of secondary-school students from two schools in Portugal. Comparing the metrics, the model that best predicts academic performance is the RANDOM FOREST: it has the lowest RMSE (0.38) and the highest R-squared (0.86), and it explains 84.79% of the variability, outperforming the other models. It is also worth noting that the variables with the greatest influence on the final grade are family relations (famrel), absences, G1, and G2.
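For reference, the hold-out metrics printed above can be collected into a single comparison table (a sketch, with values rounded from the output):
# Side-by-side comparison of the three models on the test set
comparison <- data.frame(
  Model = c("Linear regression", "Random Forest", "SVM"),
  RMSE = c(0.448, 0.383, 0.475),
  R2 = c(0.712, 0.859, 0.772)
)
kable(comparison, caption = "Hold-out metrics by model")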