# Load the required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(knitr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(ggplot2)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
library(Metrics)
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following object is masked from 'package:Metrics':
##
## auc
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# IMPORT THE DATASET
df <- read.csv("student-mat.csv")
# DATA SUMMARY
summary(df)
## school sex age address
## Length:395 Length:395 Min. :15.0 Length:395
## Class :character Class :character 1st Qu.:16.0 Class :character
## Mode :character Mode :character Median :17.0 Mode :character
## Mean :16.7
## 3rd Qu.:18.0
## Max. :22.0
## famsize Pstatus Medu Fedu
## Length:395 Length:395 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:2.000 1st Qu.:2.000
## Mode :character Mode :character Median :3.000 Median :2.000
## Mean :2.749 Mean :2.522
## 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :4.000 Max. :4.000
## Mjob Fjob reason guardian
## Length:395 Length:395 Length:395 Length:395
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## traveltime studytime failures schoolsup
## Min. :1.000 Min. :1.000 Min. :0.0000 Length:395
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 Class :character
## Median :1.000 Median :2.000 Median :0.0000 Mode :character
## Mean :1.448 Mean :2.035 Mean :0.3342
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :4.000 Max. :4.000 Max. :3.0000
## famsup paid activities nursery
## Length:395 Length:395 Length:395 Length:395
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## higher internet romantic famrel
## Length:395 Length:395 Length:395 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:4.000
## Mode :character Mode :character Mode :character Median :4.000
## Mean :3.944
## 3rd Qu.:5.000
## Max. :5.000
## freetime goout Dalc Walc
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
## Median :3.000 Median :3.000 Median :1.000 Median :2.000
## Mean :3.235 Mean :3.109 Mean :1.481 Mean :2.291
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## health absences G1 G2
## Min. :1.000 Min. : 0.000 Min. : 3.00 Min. : 0.00
## 1st Qu.:3.000 1st Qu.: 0.000 1st Qu.: 8.00 1st Qu.: 9.00
## Median :4.000 Median : 4.000 Median :11.00 Median :11.00
## Mean :3.554 Mean : 5.709 Mean :10.91 Mean :10.71
## 3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:13.00 3rd Qu.:13.00
## Max. :5.000 Max. :75.000 Max. :19.00 Max. :19.00
## G3
## Min. : 0.00
## 1st Qu.: 8.00
## Median :11.00
## Mean :10.42
## 3rd Qu.:14.00
## Max. :20.00
The dataset initially contains 395 observations across 33 variables, of which 17 are character variables and 16 are numeric. Among the 16 numeric variables, most can be identified as binary or discrete.
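The variable-type split is easy to confirm directly (a one-line sketch):
# Verify the split: 17 character + 16 numeric (integer) columns = 33 variables
table(sapply(df, class))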
# CATEGORICAL VARIABLES
categorical_cols <- sapply(df, is.character)
categorical_cols <- names(df[categorical_cols])
categorical_cols
## [1] "school" "sex" "address" "famsize" "Pstatus"
## [6] "Mjob" "Fjob" "reason" "guardian" "schoolsup"
## [11] "famsup" "paid" "activities" "nursery" "higher"
## [16] "internet" "romantic"
# NUMERIC VARIABLES
numerical_cols <- sapply(df, is.numeric)
numerical_cols <- names(df[numerical_cols])
numerical_cols
## [1] "age" "Medu" "Fedu" "traveltime" "studytime"
## [6] "failures" "famrel" "freetime" "goout" "Dalc"
## [11] "Walc" "health" "absences" "G1" "G2"
## [16] "G3"
# CORRELATION MATRIX
cor_matrix <- cor(df[numerical_cols], use = "complete.obs")
cor_melted <- melt(cor_matrix)
font_size <- 6
ggplot(data = cor_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "purple", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab",
                       name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = font_size),
        axis.text.y = element_text(size = font_size)) +
  coord_fixed() +
  labs(x = '', y = '') +
  geom_text(aes(label = sprintf("%.2f", value)), size = 3, check_overlap = TRUE)
The correlation heatmap shows a very high correlation between G1 and G2, and both are strongly correlated with the final grade G3; in other words, the first- and second-period grades are the best numeric predictors of a student's final grade. The associations of Fedu (father's education) and absences with G3 are much weaker by comparison.
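To make that reading precise, the numeric variables can be ranked by their correlation with G3 straight from the cor_matrix object computed above (a minimal sketch):
# Correlations of every numeric variable with the final grade, strongest first
g3_cor <- cor_matrix[setdiff(rownames(cor_matrix), "G3"), "G3"]
sort(g3_cor, decreasing = TRUE)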
# MISSING DATA
md.pattern(df, plot = TRUE, rotate.names = TRUE)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason guardian
## 395 1 1 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0 0 0
## traveltime studytime failures schoolsup famsup paid activities nursery
## 395 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## higher internet romantic famrel freetime goout Dalc Walc health absences G1
## 395 1 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0 0
## G2 G3
## 395 1 1 0
## 0 0 0
This check shows there are no missing values: the dataset is 100% complete.
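As a quick cross-check that does not depend on mice, the missing values can be counted per column directly:
# Count NA values per column; every entry should be 0
colSums(is.na(df))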
# MULTICOLLINEARITY CHECK (VIF)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
modelo <- lm(G3 ~ ., data = df)
vif_valores <- vif(modelo)
kable(vif_valores, caption = "VIF values")
| | GVIF | Df | GVIF^(1/(2*Df)) |
|---|---|---|---|
| school | 1.510967 | 1 | 1.229214 |
| sex | 1.486968 | 1 | 1.219413 |
| age | 1.803293 | 1 | 1.342867 |
| address | 1.387985 | 1 | 1.178128 |
| famsize | 1.153276 | 1 | 1.073907 |
| Pstatus | 1.145493 | 1 | 1.070277 |
| Medu | 2.940226 | 1 | 1.714709 |
| Fedu | 2.141010 | 1 | 1.463219 |
| Mjob | 3.430704 | 4 | 1.166602 |
| Fjob | 2.301555 | 4 | 1.109820 |
| reason | 1.553683 | 3 | 1.076202 |
| guardian | 1.736473 | 2 | 1.147934 |
| traveltime | 1.320973 | 1 | 1.149336 |
| studytime | 1.395833 | 1 | 1.181454 |
| failures | 1.563186 | 1 | 1.250274 |
| schoolsup | 1.255074 | 1 | 1.120301 |
| famsup | 1.304026 | 1 | 1.141940 |
| paid | 1.338698 | 1 | 1.157021 |
| activities | 1.158682 | 1 | 1.076421 |
| nursery | 1.151348 | 1 | 1.073009 |
| higher | 1.315791 | 1 | 1.147080 |
| internet | 1.257752 | 1 | 1.121495 |
| romantic | 1.174382 | 1 | 1.083689 |
| famrel | 1.141814 | 1 | 1.068557 |
| freetime | 1.321398 | 1 | 1.149521 |
| goout | 1.496482 | 1 | 1.223308 |
| Dalc | 2.028513 | 1 | 1.424259 |
| Walc | 2.389544 | 1 | 1.545815 |
| health | 1.179266 | 1 | 1.085940 |
| absences | 1.256253 | 1 | 1.120827 |
| G1 | 4.673491 | 1 | 2.161826 |
| G2 | 4.409261 | 1 | 2.099824 |
Running the multicollinearity analysis with the usual rule of removing variables whose VIF exceeds 5, none of the predictors turn out to be concerning, so none are removed. The highest values belong to G1 and G2, but they stay below the threshold and do not compromise the analysis.
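Because the model contains factors, vif() returned generalized VIFs, so the threshold check can be written explicitly; a sketch over the vif_valores matrix from above, using the squared GVIF^(1/(2*Df)) as the analogue of the usual VIF > 5 rule:
# Flag predictors whose squared GVIF^(1/(2*Df)) exceeds 5; an empty result means nothing to drop
flagged <- vif_valores[vif_valores[, "GVIF^(1/(2*Df))"]^2 > 5, , drop = FALSE]
flagged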
# DESCRIPTION OF THE TARGET VARIABLE
ggplot(data = df, aes(x = G3)) +
  geom_bar(fill = "purple") +
  labs(title = "FINAL GRADE", x = "", y = "Count")
Looking at the frequencies of the target variable, the final grade, most students obtained a final grade of 10 and only a small minority reached the top grade of 20. More worryingly, over 20 students obtained a final grade of 0; this is precisely where the value of this study lies, in identifying which variables drive those failing grades.
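That claim about failing grades is easy to verify numerically on the same df:
# Frequency of each final grade, plus an explicit count of students with G3 == 0
table(df$G3)
sum(df$G3 == 0)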
# BOXPLOTS OF NUMERIC VARIABLES VS G3
for (col in numerical_cols) {
  # factor(G3) groups the boxplots by grade; aes_string() is deprecated,
  # so columns are referenced through the .data pronoun instead
  p <- ggplot(df, aes(x = factor(G3), y = .data[[col]])) +
    geom_boxplot() +
    coord_flip() +
    labs(title = paste("Boxplot of", col, "vs G3"), y = col, x = "G3")
  print(p)
}
The boxplots support the following observations. G3 shows a fairly symmetric distribution. G2 has an outlier at 0, i.e. a value far from where the data are concentrated: only a handful of students scored 0 in the second period. For absences, several outliers appear above 20, meaning most students did not miss class more than 20 times over the academic year. The absences variable is right-skewed: its upper whisker is much longer than the lower one, which is barely distinguishable.
# CATEGORICAL VARIABLES VS G3
color_palette <- rainbow(21)  # one color per possible grade (0-20)
for (col in categorical_cols) {
  df_percent <- df %>%
    group_by(!!sym(col)) %>%
    count(G3) %>%
    mutate(perc = n / sum(n) * 100) %>%
    mutate(G3 = as.factor(G3))
  # aes_string() is deprecated; reference the loop column via the .data pronoun
  p <- ggplot(df_percent, aes(x = .data[[col]], y = perc, fill = G3)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_fill_manual(values = color_palette) +
    labs(title = paste("Percentage of G3 by", col), x = col, y = "Percentage (%)") +
    scale_y_continuous(labels = scales::percent_format())
  print(p)
}
From the plots of each categorical variable against G3 we conclude that students with internet access obtained better grades than those without; students in a romantic relationship tend to do worse academically than single students; the school with the better average is GP; and male students tend to obtain slightly better grades than female students.
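Those visual impressions can be quantified with group means; a short sketch for two of the variables mentioned (the same pattern applies to school and sex):
# Mean final grade by internet access and by relationship status
df %>% group_by(internet) %>% summarise(mean_G3 = mean(G3))
df %>% group_by(romantic) %>% summarise(mean_G3 = mean(G3))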
# MODEL TRAINING
library(caTools)
set.seed(123)
split <- sample.split(df$G3, SplitRatio = 0.8)
train_set <- subset(df, split == TRUE)
test_set <- subset(df, split == FALSE)
# Scaling the numeric columns
train_set[sapply(train_set, is.numeric)] <- scale(train_set[sapply(train_set, is.numeric)])
test_set[sapply(test_set, is.numeric)] <- scale(test_set[sapply(test_set, is.numeric)])
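Note that scaling the test set with its own statistics lets test-set information leak into the evaluation. A common alternative, sketched below with caret's preProcess (already loaded), derives the centering and scaling parameters from the training set only; it would replace the two scale() calls above:
# A sketch: center/scale both sets using training-set parameters only (avoids leakage)
num_cols <- sapply(train_set, is.numeric)
pp <- preProcess(train_set[, num_cols], method = c("center", "scale"))
train_set[, num_cols] <- predict(pp, train_set[, num_cols])
test_set[, num_cols] <- predict(pp, test_set[, num_cols])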
# MULTIPLE LINEAR REGRESSION
model <- lm(G3 ~ ., data = train_set)
summary(model)
##
## Call:
## lm(formula = G3 ~ ., data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.65574 -0.10381 0.03539 0.19914 1.04165
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.021991 0.206956 0.106 0.91545
## schoolMS 0.113963 0.097077 1.174 0.24143
## sexM 0.031303 0.056466 0.554 0.57977
## age -0.039789 0.030817 -1.291 0.19773
## addressU 0.061712 0.065914 0.936 0.34996
## famsizeLE3 0.020444 0.055335 0.369 0.71207
## PstatusT -0.031171 0.083628 -0.373 0.70963
## Medu 0.019539 0.039805 0.491 0.62391
## Fedu -0.031341 0.034177 -0.917 0.35993
## Mjobhealth -0.009108 0.121100 -0.075 0.94010
## Mjobother 0.019596 0.080443 0.244 0.80772
## Mjobservices -0.006203 0.089298 -0.069 0.94467
## Mjobteacher -0.055898 0.117153 -0.477 0.63364
## Fjobhealth 0.119748 0.155572 0.770 0.44212
## Fjobother -0.033647 0.110777 -0.304 0.76156
## Fjobservices -0.063808 0.115696 -0.552 0.58173
## Fjobteacher 0.018542 0.139886 0.133 0.89464
## reasonhome -0.038045 0.063220 -0.602 0.54781
## reasonother 0.018248 0.091750 0.199 0.84250
## reasonreputation 0.065414 0.063750 1.026 0.30574
## guardianmother 0.048665 0.059911 0.812 0.41733
## guardianother -0.012184 0.113804 -0.107 0.91482
## traveltime 0.024065 0.027266 0.883 0.37822
## studytime -0.038746 0.028184 -1.375 0.17032
## failures -0.048807 0.029159 -1.674 0.09530 .
## schoolsupyes 0.056306 0.076309 0.738 0.46122
## famsupyes 0.042879 0.054351 0.789 0.43084
## paidyes 0.035660 0.054739 0.651 0.51530
## activitiesyes -0.078738 0.050430 -1.561 0.11960
## nurseryyes -0.057035 0.062013 -0.920 0.35852
## higheryes -0.019043 0.113438 -0.168 0.86681
## internetyes -0.024747 0.073447 -0.337 0.73642
## romanticyes -0.029391 0.053426 -0.550 0.58268
## famrel 0.086680 0.025284 3.428 0.00070 ***
## freetime 0.027142 0.026097 1.040 0.29923
## goout -0.015477 0.028350 -0.546 0.58555
## Dalc -0.053533 0.031807 -1.683 0.09349 .
## Walc 0.071068 0.034673 2.050 0.04134 *
## health 0.003538 0.025478 0.139 0.88965
## absences 0.079783 0.026382 3.024 0.00273 **
## G1 0.131227 0.049750 2.638 0.00882 **
## G2 0.787773 0.048311 16.306 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4086 on 276 degrees of freedom
## Multiple R-squared: 0.8546, Adjusted R-squared: 0.833
## F-statistic: 39.57 on 41 and 276 DF, p-value: < 2.2e-16
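For reference, the predictors that reach significance at the 5% level can be pulled straight from the model summary (a minimal sketch):
# Extract the coefficients with p-values below 0.05 from the fitted linear model
coefs <- summary(model)$coefficients
coefs[coefs[, "Pr(>|t|)"] < 0.05, ]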
predictions <- predict(model, newdata = test_set)
# Evaluate the model on the test set
rmse <- sqrt(mean((test_set$G3 - predictions)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.448409485079193"
# Adjusted R-squared
SST <- sum((test_set$G3 - mean(test_set$G3))^2)
SSR <- sum((predictions - mean(test_set$G3))^2)
R2_adj <- 1 - (1 - SSR/SST) * ((nrow(test_set) - 1) / (nrow(test_set) - length(model$coefficients) - 1))
print(paste("Adjusted R-squared:", R2_adj))
## [1] "Adjusted R-squared: 0.712093899191313"
# SCATTER PLOT OF PREDICTIONS VS ACTUAL VALUES
plot(test_set$G3, predictions, main = "Predictions vs Actual Values",
     xlab = "Actual Values", ylab = "Predictions")
abline(0, 1, col = "purple")
# RANDOM FOREST
rf_model <- randomForest(G3 ~ ., data = train_set, importance = TRUE, na.action = na.omit)
print(rf_model)
##
## Call:
## randomForest(formula = G3 ~ ., data = train_set, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 0.151575
## % Var explained: 84.79
predictions <- predict(rf_model, test_set)
rmse <- sqrt(mean((predictions - test_set$G3)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.382827566549953"
r_squared <- postResample(pred = predictions, obs = test_set$G3)[["Rsquared"]]
print(paste("R-squared:", r_squared))
## [1] "R-squared: 0.859160162266376"
# Scatter plot of predictions vs actual values
plot(test_set$G3, predictions, main = "Random Forest Predictions vs Actual Values",
     xlab = "Actual Values", ylab = "Predictions", pch = 19, col = "blue")
abline(0, 1, col = "red", lwd = 2)
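Because rf_model was fitted with importance = TRUE, the forest's variable importances can be inspected directly; a short sketch that supports the discussion of influential variables in the conclusions below:
# Rank predictors by permutation importance (%IncMSE) and plot them
imp <- importance(rf_model)
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]
varImpPlot(rf_model, main = "Random Forest variable importance")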
# SVM
svr_model <- svm(G3 ~ ., data = train_set, type = "eps-regression")
predictions <- predict(svr_model, test_set)
mse <- mse(test_set$G3, predictions)
print(paste("MSE:", mse))
## [1] "MSE: 0.225481560376798"
rmse <- rmse(test_set$G3, predictions)
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.474848986917734"
mae <- mae(test_set$G3, predictions)
print(paste("MAE:", mae))
## [1] "MAE: 0.306513554265111"
# Computing R^2 manually from the residual and total sums of squares
ss_res <- sum((test_set$G3 - predictions)^2)
ss_tot <- sum((test_set$G3 - mean(test_set$G3))^2)
r_squared <- 1 - ss_res / ss_tot
print(paste("R-squared:", r_squared))
## [1] "R-squared: 0.771551576986665"
# Scatter plot of predictions vs actual values
plot(test_set$G3, predictions, main = "SVM Predictions vs Actual Values",
     xlab = "Actual Values", ylab = "Predictions", pch = 19, col = "blue")
abline(0, 1, col = "red", lwd = 2)
Since the response variable (G3, the final grade) is a continuous numeric variable, this is a regression problem rather than a classification one, so three regression models were fitted in order to choose the one that best predicts the academic performance of secondary-school students from two schools in Portugal. Comparing the metrics, the model that best predicts academic performance is the RANDOM FOREST: it has the lowest RMSE (0.38) and the highest R-squared (0.86), and it explains 84.79% of the variability, outperforming the other models. It is also worth noting that the variables with the greatest influence on the final grade are family relations (famrel), absences, G1, and G2.
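For reference, the hold-out metrics printed above can be collected into a single comparison table (a sketch, with values rounded from the output):
# Side-by-side comparison of the three models on the test set
comparison <- data.frame(
  Model = c("Linear regression", "Random Forest", "SVM"),
  RMSE = c(0.448, 0.383, 0.475),
  R2 = c(0.712, 0.859, 0.772)
)
kable(comparison, caption = "Hold-out metrics by model")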