# Load the required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(knitr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(ggplot2)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
library(Metrics)
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following object is masked from 'package:Metrics':
##
## auc
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# IMPORT THE DATASET
df <- read.csv("student-mat.csv")
# Statistical summary of the variable of interest
resumen_estadistico <- summary(df$G3)
print(resumen_estadistico)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 8.00 11.00 10.42 14.00 20.00
The dataset initially contains 395 observations across 33 variables, of which 17 are character variables and 16 are numeric. Among the 16 numeric variables, most turn out to be binary or discrete.
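These figures can be verified directly from the data frame; a minimal sketch (read.csv imports the numeric columns as integers):
dim(df)                  # 395 rows, 33 columns
table(sapply(df, class)) # counts of character vs. integer columns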
# CATEGORICAL VARIABLES
categorical_cols <- sapply(df, is.character)
categorical_cols <- names(df[categorical_cols])
categorical_cols
## [1] "school" "sex" "address" "famsize" "Pstatus"
## [6] "Mjob" "Fjob" "reason" "guardian" "schoolsup"
## [11] "famsup" "paid" "activities" "nursery" "higher"
## [16] "internet" "romantic"
# NUMERIC VARIABLES
numerical_cols <- sapply(df, is.numeric)
numerical_cols <- names(df[numerical_cols])
numerical_cols
## [1] "age" "Medu" "Fedu" "traveltime" "studytime"
## [6] "failures" "famrel" "freetime" "goout" "Dalc"
## [11] "Walc" "health" "absences" "G1" "G2"
## [16] "G3"
summary(df[numerical_cols])
## age Medu Fedu traveltime studytime
## Min. :15.0 Min. :0.000 Min. :0.000 Min. :1.000 Min. :1.000
## 1st Qu.:16.0 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
## Median :17.0 Median :3.000 Median :2.000 Median :1.000 Median :2.000
## Mean :16.7 Mean :2.749 Mean :2.522 Mean :1.448 Mean :2.035
## 3rd Qu.:18.0 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :22.0 Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
## failures famrel freetime goout
## Min. :0.0000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :3.000 Median :3.000
## Mean :0.3342 Mean :3.944 Mean :3.235 Mean :3.109
## 3rd Qu.:0.0000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :3.0000 Max. :5.000 Max. :5.000 Max. :5.000
## Dalc Walc health absences
## Min. :1.000 Min. :1.000 Min. :1.000 Min. : 0.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:3.000 1st Qu.: 0.000
## Median :1.000 Median :2.000 Median :4.000 Median : 4.000
## Mean :1.481 Mean :2.291 Mean :3.554 Mean : 5.709
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.: 8.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :75.000
## G1 G2 G3
## Min. : 3.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 8.00 1st Qu.: 9.00 1st Qu.: 8.00
## Median :11.00 Median :11.00 Median :11.00
## Mean :10.91 Mean :10.71 Mean :10.42
## 3rd Qu.:13.00 3rd Qu.:13.00 3rd Qu.:14.00
## Max. :19.00 Max. :19.00 Max. :20.00
# COUNT AND PROPORTION TABLE FOR THE CATEGORICAL VARIABLES
# Initialize a list to store the data frames
list_df <- list()
for (col in categorical_cols) {
  # Build a count table
  conteo <- table(df[[col]])
  # Convert the count table into a data frame
  df_conteo <- as.data.frame(conteo)
  # Rename the columns
  names(df_conteo) <- c("Value", "Count")
  # Compute the proportions
  df_conteo$Proportion <- df_conteo$Count / sum(df_conteo$Count)
  # Add a column with the variable name
  df_conteo$Variable <- col
  # Append the data frame to the list
  list_df[[col]] <- df_conteo
}
# Combine all the data frames into one
final_df <- do.call(rbind, list_df)
# View the final data frame
print(final_df)
## Value Count Proportion Variable
## school.1 GP 349 0.88354430 school
## school.2 MS 46 0.11645570 school
## sex.1 F 208 0.52658228 sex
## sex.2 M 187 0.47341772 sex
## address.1 R 88 0.22278481 address
## address.2 U 307 0.77721519 address
## famsize.1 GT3 281 0.71139241 famsize
## famsize.2 LE3 114 0.28860759 famsize
## Pstatus.1 A 41 0.10379747 Pstatus
## Pstatus.2 T 354 0.89620253 Pstatus
## Mjob.1 at_home 59 0.14936709 Mjob
## Mjob.2 health 34 0.08607595 Mjob
## Mjob.3 other 141 0.35696203 Mjob
## Mjob.4 services 103 0.26075949 Mjob
## Mjob.5 teacher 58 0.14683544 Mjob
## Fjob.1 at_home 20 0.05063291 Fjob
## Fjob.2 health 18 0.04556962 Fjob
## Fjob.3 other 217 0.54936709 Fjob
## Fjob.4 services 111 0.28101266 Fjob
## Fjob.5 teacher 29 0.07341772 Fjob
## reason.1 course 145 0.36708861 reason
## reason.2 home 109 0.27594937 reason
## reason.3 other 36 0.09113924 reason
## reason.4 reputation 105 0.26582278 reason
## guardian.1 father 90 0.22784810 guardian
## guardian.2 mother 273 0.69113924 guardian
## guardian.3 other 32 0.08101266 guardian
## schoolsup.1 no 344 0.87088608 schoolsup
## schoolsup.2 yes 51 0.12911392 schoolsup
## famsup.1 no 153 0.38734177 famsup
## famsup.2 yes 242 0.61265823 famsup
## paid.1 no 214 0.54177215 paid
## paid.2 yes 181 0.45822785 paid
## activities.1 no 194 0.49113924 activities
## activities.2 yes 201 0.50886076 activities
## nursery.1 no 81 0.20506329 nursery
## nursery.2 yes 314 0.79493671 nursery
## higher.1 no 20 0.05063291 higher
## higher.2 yes 375 0.94936709 higher
## internet.1 no 66 0.16708861 internet
## internet.2 yes 329 0.83291139 internet
## romantic.1 no 263 0.66582278 romantic
## romantic.2 yes 132 0.33417722 romantic
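For reference, the same count-and-proportion table can be built more compactly with the tidyverse verbs already loaded above; a sketch (the name final_df_tidy is illustrative):
final_df_tidy <- df %>%
  pivot_longer(all_of(categorical_cols), names_to = "Variable", values_to = "Value") %>%
  count(Variable, Value, name = "Count") %>%
  group_by(Variable) %>%
  mutate(Proportion = Count / sum(Count)) %>%
  ungroup()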
# CORRELATION MATRIX
cor_matrix <- cor(df[numerical_cols], use = "complete.obs")
cor_melted <- melt(cor_matrix)
font_size <- 6
ggplot(data = cor_melted, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = "purple", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = font_size),
axis.text.y = element_text(size = font_size)) +
coord_fixed() +
labs(x = '', y = '') +
geom_text(aes(label = sprintf("%.2f", value)), size = 3, check_overlap = TRUE)
The correlation heatmap shows a high correlation between the variables G1 and G2, and in turn between these and G3; some association is also visible for Fedu and absences. That is, the first- and second-period grades are highly correlated with the student's final grade, and family background and school absences appear related to it as well.
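To make that reading precise, the correlations of every numeric variable with G3 can be listed directly; a minimal sketch:
# Correlation of each numeric variable with the final grade, sorted
cor_g3 <- cor(df[numerical_cols], use = "complete.obs")[, "G3"]
sort(cor_g3, decreasing = TRUE)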
# MISSING DATA
md.pattern(df, plot = TRUE, rotate.names = TRUE)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason guardian
## 395 1 1 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0 0 0
## traveltime studytime failures schoolsup famsup paid activities nursery
## 395 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## higher internet romantic famrel freetime goout Dalc Walc health absences G1
## 395 1 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0 0
## G2 G3
## 395 1 1 0
## 0 0 0
In this analysis we find that there are no missing values, which means the dataset is 100% complete.
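A simpler cross-check that does not require mice; a minimal sketch:
colSums(is.na(df)) # missing values per column (all zero here)
sum(is.na(df))     # total missing values in the data frame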
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
modelo <- lm(G3 ~ ., data = df)
# Compute the VIF values
vif_valores <- vif(modelo)
# Build a data frame with the variable names and their VIFs
vif_df <- data.frame(VIF = vif_valores)
# Print the data frame
kable(vif_df, caption = "VIF values")
| | GVIF | Df | GVIF^(1/(2*Df)) |
|---|---|---|---|
| school | 1.510967 | 1 | 1.229214 |
| sex | 1.486968 | 1 | 1.219413 |
| age | 1.803293 | 1 | 1.342867 |
| address | 1.387985 | 1 | 1.178128 |
| famsize | 1.153276 | 1 | 1.073907 |
| Pstatus | 1.145493 | 1 | 1.070277 |
| Medu | 2.940226 | 1 | 1.714709 |
| Fedu | 2.141010 | 1 | 1.463219 |
| Mjob | 3.430704 | 4 | 1.166602 |
| Fjob | 2.301555 | 4 | 1.109820 |
| reason | 1.553683 | 3 | 1.076202 |
| guardian | 1.736473 | 2 | 1.147934 |
| traveltime | 1.320973 | 1 | 1.149336 |
| studytime | 1.395833 | 1 | 1.181454 |
| failures | 1.563186 | 1 | 1.250274 |
| schoolsup | 1.255074 | 1 | 1.120301 |
| famsup | 1.304026 | 1 | 1.141940 |
| paid | 1.338698 | 1 | 1.157021 |
| activities | 1.158682 | 1 | 1.076421 |
| nursery | 1.151348 | 1 | 1.073009 |
| higher | 1.315791 | 1 | 1.147080 |
| internet | 1.257752 | 1 | 1.121495 |
| romantic | 1.174382 | 1 | 1.083689 |
| famrel | 1.141814 | 1 | 1.068557 |
| freetime | 1.321398 | 1 | 1.149521 |
| goout | 1.496482 | 1 | 1.223308 |
| Dalc | 2.028513 | 1 | 1.424259 |
| Walc | 2.389544 | 1 | 1.545815 |
| health | 1.179266 | 1 | 1.085940 |
| absences | 1.256253 | 1 | 1.120827 |
| G1 | 4.673491 | 1 | 2.161826 |
| G2 | 4.409261 | 1 | 2.099824 |
Carrying out the multicollinearity analysis, and taking as our rule that variables with a VIF greater than 5 would be removed, we conclude that none of the variables is worrying: no VIF exceeds 5, so none is dropped. The highest values belong to G1 and G2, but they do not compromise the analysis.
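Had any predictor crossed the threshold, the adjusted column GVIF^(1/(2*Df)) is the one to filter on; for terms with more than one degree of freedom the usual convention is to compare it against the square root of the cutoff. A sketch using the cutoff of 5 adopted above:
# Flag terms whose adjusted GVIF exceeds sqrt(5)
gvif_adj <- vif_valores[, "GVIF^(1/(2*Df))"]
names(gvif_adj[gvif_adj > sqrt(5)]) # character(0): nothing to drop here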
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
p <- plot_ly(data = df, x = ~G3, type = "histogram",
             marker = list(color = "purple")) %>% # marker is a trace attribute, so it belongs in plot_ly(), not in layout()
  layout(title = "FINAL GRADE",
         xaxis = list(title = ""),
         yaxis = list(title = "Count"))
p
Looking at the target variable, the final grade, and analyzing its frequencies, we see that most students obtained a final grade of around 10 and only a minority reached the top grade of 20. More worryingly, over 20 students obtained a 0 as their final grade, and it is precisely there that the importance of this study lies: identifying which variables drive those failing grades.
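Both tails of the distribution can be counted exactly; a quick sketch:
sum(df$G3 == 0)  # students whose final grade is zero
sum(df$G3 == 20) # students who reached the top grade
table(df$G3)     # full frequency table of the final grade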
# BOXPLOTS: NUMERIC VARIABLES VS G3
for (col in numerical_cols) {
  p1 <- plot_ly(df, y = ~get(col), x = ~G3, type = "box", name = col) %>%
    layout(title = paste("Boxplot of", col, "vs G3"),
           yaxis = list(title = col),
           xaxis = list(title = "G3"))
  # Display the plot in an interactive R environment such as RStudio
  print(p1)
}
p1
The boxplot analysis yields the following results: the distribution of G3 is fairly symmetric. G2 has an outlier at 0, i.e., a value far from where the bulk of the data is concentrated; only a minimal number of students scored 0 in the second period. For the number of absences, several outliers appear above 20 absences, meaning most students did not miss class more than 20 times during the academic year. The absences variable is right-skewed, since the right whisker is much longer than the left one, so much so that the left whisker can barely be distinguished.
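The outlier claims can be checked numerically; a minimal sketch:
sum(df$G2 == 0)       # students with a second-period grade of zero
sum(df$absences > 20) # students with more than 20 absences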
# CATEGORICAL VARIABLES VS G3
color_palette <- rainbow(21)
for (col in categorical_cols) {
  df_percent <- df %>%
    group_by(!!sym(col)) %>%
    count(G3) %>%
    group_by(!!sym(col)) %>%
    mutate(perc = n / sum(n) * 100) %>%
    mutate(G3 = as.factor(G3))
  p <- plot_ly(df_percent, x = ~get(col), y = ~perc, type = 'bar', name = ~G3, marker = list(color = ~G3)) %>%
    layout(yaxis = list(title = 'Percentage (%)', tickformat = ',.0%'),
           xaxis = list(title = col),
           barmode = 'stack',
           title = paste("Percentage of G3 by", col))
  # Display the plot in an interactive R environment such as RStudio
  print(p)
}
p
p
Analyzing the charts above, which plot each categorical variable against G3, we conclude that students with internet access obtained better grades than those without, that students in a romantic relationship tend to do worse academically than single students, that the school with the best average is GP, and that male students tend to obtain slightly better grades than female students.
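These conclusions can be cross-checked with group means of G3; a sketch for two of the variables:
df %>% group_by(internet) %>% summarise(mean_G3 = mean(G3))
df %>% group_by(romantic) %>% summarise(mean_G3 = mean(G3))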
# MODEL TRAINING
library(caTools)
set.seed(123)
split <- sample.split(df$G3, SplitRatio = 0.8)
train_set <- subset(df, split == TRUE)
test_set <- subset(df, split == FALSE)
# Scale the numeric columns
train_set[sapply(train_set, is.numeric)] <- scale(train_set[sapply(train_set, is.numeric)])
test_set[sapply(test_set, is.numeric)] <- scale(test_set[sapply(test_set, is.numeric)])
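Note that the snippet above scales each split with its own mean and standard deviation, which lets test-set information shape the transformation. A leakage-free alternative (a sketch of what could be done instead, not what was run here) learns the parameters on the training split only; the preProcess argument passed to train() further below already works this way internally for the predictors:
# Learn centering/scaling on the training split and apply it to both splits
num_cols <- sapply(train_set, is.numeric)
pp <- preProcess(train_set[num_cols], method = c("center", "scale"))
train_scaled <- predict(pp, train_set[num_cols])
test_scaled <- predict(pp, test_set[num_cols])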
# Separate the target variable from the predictors
y_train <- train_set$G3
x_train <- train_set[, names(train_set) != "G3"]
y_test <- test_set$G3
x_test <- test_set[, names(test_set) != "G3"]
# Set up the training control
train_control <- trainControl(method = "cv", number = 10)
# Define the grid of values for alpha and lambda
grid <- expand.grid(
  alpha = 0, # 0 for ridge regression
  lambda = 10^seq(-3, 3, length = 100)
)
# Fit the ridge regression
ridge_model <- train(
  x = x_train,
  y = y_train,
  method = "glmnet",
  trControl = train_control,
  tuneGrid = grid,
  preProcess = c("center", "scale"), # center and scale the predictors
  family = "gaussian" # G3 is treated as a continuous response
)
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## (the two warnings above repeat for each cross-validation fold)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
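The repeated 'NAs introduced by coercion' warnings arise because the character columns are passed straight to glmnet, which expects a numeric matrix; caret ends up ignoring them, as the 'ignore (17)' note in the output below confirms. One way to keep those predictors would be to one-hot encode them first; a sketch using caret's dummyVars (an assumption about the intended preprocessing, not something run in this report):
# Convert character columns to factors, then one-hot encode them
x_train_f <- data.frame(lapply(x_train, function(v) if (is.character(v)) factor(v) else v))
x_test_f <- data.frame(lapply(x_test, function(v) if (is.character(v)) factor(v) else v))
dv <- dummyVars(~ ., data = x_train_f, fullRank = TRUE)
x_train_num <- predict(dv, newdata = x_train_f)
x_test_num <- predict(dv, newdata = x_test_f)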
# View the results
print(ridge_model)
## glmnet
##
## 318 samples
## 32 predictor
##
## Pre-processing: centered (15), scaled (15), ignore (17)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 285, 287, 287, 286, 286, 286, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 1.000000e-03 0.4087647 0.8306149 0.2623796
## 1.149757e-03 0.4087647 0.8306149 0.2623796
## 1.321941e-03 0.4087647 0.8306149 0.2623796
## 1.519911e-03 0.4087647 0.8306149 0.2623796
## 1.747528e-03 0.4087647 0.8306149 0.2623796
## 2.009233e-03 0.4087647 0.8306149 0.2623796
## 2.310130e-03 0.4087647 0.8306149 0.2623796
## 2.656088e-03 0.4087647 0.8306149 0.2623796
## 3.053856e-03 0.4087647 0.8306149 0.2623796
## 3.511192e-03 0.4087647 0.8306149 0.2623796
## 4.037017e-03 0.4087647 0.8306149 0.2623796
## 4.641589e-03 0.4087647 0.8306149 0.2623796
## 5.336699e-03 0.4087647 0.8306149 0.2623796
## 6.135907e-03 0.4087647 0.8306149 0.2623796
## 7.054802e-03 0.4087647 0.8306149 0.2623796
## 8.111308e-03 0.4087647 0.8306149 0.2623796
## 9.326033e-03 0.4087647 0.8306149 0.2623796
## 1.072267e-02 0.4087647 0.8306149 0.2623796
## 1.232847e-02 0.4087647 0.8306149 0.2623796
## 1.417474e-02 0.4087647 0.8306149 0.2623796
## 1.629751e-02 0.4087647 0.8306149 0.2623796
## 1.873817e-02 0.4087647 0.8306149 0.2623796
## 2.154435e-02 0.4087647 0.8306149 0.2623796
## 2.477076e-02 0.4087647 0.8306149 0.2623796
## 2.848036e-02 0.4087647 0.8306149 0.2623796
## 3.274549e-02 0.4087647 0.8306149 0.2623796
## 3.764936e-02 0.4087647 0.8306149 0.2623796
## 4.328761e-02 0.4087647 0.8306149 0.2623796
## 4.977024e-02 0.4087647 0.8306149 0.2623796
## 5.722368e-02 0.4087647 0.8306149 0.2623796
## 6.579332e-02 0.4087647 0.8306149 0.2623796
## 7.564633e-02 0.4087647 0.8306149 0.2623796
## 8.697490e-02 0.4087647 0.8306149 0.2623796
## 1.000000e-01 0.4101100 0.8300493 0.2633982
## 1.149757e-01 0.4122100 0.8291888 0.2648714
## 1.321941e-01 0.4146807 0.8282252 0.2665245
## 1.519911e-01 0.4175441 0.8271748 0.2682947
## 1.747528e-01 0.4208832 0.8260155 0.2702944
## 2.009233e-01 0.4247287 0.8247664 0.2726380
## 2.310130e-01 0.4291891 0.8234064 0.2751051
## 2.656088e-01 0.4343002 0.8219623 0.2782607
## 3.053856e-01 0.4402114 0.8203934 0.2821035
## 3.511192e-01 0.4469660 0.8187293 0.2868156
## 4.037017e-01 0.4547404 0.8169180 0.2924919
## 4.641589e-01 0.4635624 0.8149998 0.2995513
## 5.336699e-01 0.4736295 0.8129076 0.3073410
## 6.135907e-01 0.4849197 0.8106841 0.3164087
## 7.054802e-01 0.4976306 0.8082551 0.3270066
## 8.111308e-01 0.5116477 0.8056794 0.3387838
## 9.326033e-01 0.5271548 0.8028744 0.3523713
## 1.072267e+00 0.5439086 0.7999270 0.3673376
## 1.232847e+00 0.5620726 0.7967435 0.3831731
## 1.417474e+00 0.5812544 0.7934481 0.3993643
## 1.629751e+00 0.6016103 0.7899367 0.4163145
## 1.873817e+00 0.6226061 0.7863741 0.4336726
## 2.154435e+00 0.6444125 0.7826424 0.4512841
## 2.477076e+00 0.6663845 0.7789431 0.4688298
## 2.848036e+00 0.6887329 0.7751406 0.4867655
## 3.274549e+00 0.7107488 0.7714604 0.5046218
## 3.764936e+00 0.7327029 0.7677511 0.5226734
## 4.328761e+00 0.7538753 0.7642466 0.5401368
## 4.977024e+00 0.7746035 0.7607756 0.5571857
## 5.722368e+00 0.7942046 0.7575677 0.5736215
## 6.579332e+00 0.8130758 0.7544419 0.5895202
## 7.564633e+00 0.8306067 0.7516090 0.6043721
## 8.697490e+00 0.8472353 0.7488874 0.6185880
## 1.000000e+01 0.8624424 0.7464620 0.6316461
## 1.149757e+01 0.8766810 0.7441591 0.6439665
## 1.321941e+01 0.8895274 0.7421342 0.6550339
## 1.519911e+01 0.9014240 0.7402323 0.6653651
## 1.747528e+01 0.9120357 0.7385802 0.6745733
## 2.009233e+01 0.9217730 0.7370390 0.6830554
## 2.310130e+01 0.9303772 0.7357126 0.6905374
## 2.656088e+01 0.9382134 0.7344828 0.6973996
## 3.053856e+01 0.9450852 0.7334322 0.7033992
## 3.511192e+01 0.9513059 0.7324630 0.7088757
## 4.037017e+01 0.9567277 0.7316396 0.7136572
## 4.641589e+01 0.9616124 0.7308829 0.7180179
## 5.336699e+01 0.9658494 0.7302431 0.7218050
## 6.135907e+01 0.9696523 0.7296567 0.7252000
## 7.054802e+01 0.9729384 0.7291625 0.7281298
## 8.111308e+01 0.9758793 0.7287107 0.7307494
## 9.326033e+01 0.9784132 0.7283310 0.7330040
## 1.072267e+02 0.9806757 0.7279844 0.7350159
## 1.232847e+02 0.9826206 0.7276937 0.7367439
## 1.417474e+02 0.9843543 0.7274287 0.7382835
## 1.629751e+02 0.9858420 0.7272068 0.7396038
## 1.873817e+02 0.9871663 0.7270047 0.7407788
## 2.154435e+02 0.9883013 0.7268357 0.7417851
## 2.477076e+02 0.9893106 0.7266819 0.7426799
## 2.848036e+02 0.9901747 0.7265537 0.7434456
## 3.274549e+02 0.9909425 0.7264367 0.7441259
## 3.764936e+02 0.9915993 0.7263390 0.7447076
## 4.328761e+02 0.9921827 0.7262503 0.7452242
## 4.977024e+02 0.9926814 0.7261762 0.7456657
## 5.722368e+02 0.9931241 0.7261088 0.7460577
## 6.579332e+02 0.9935024 0.7260527 0.7463925
## 7.564633e+02 0.9938381 0.7260016 0.7466896
## 8.697490e+02 0.9951710 0.7259726 0.7478581
## 1.000000e+03 0.9960664 NaN 0.7486609
##
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.0869749.
# Make predictions on the test set
predictions_ridge <- predict(ridge_model, x_test)
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
# Compute RMSE
rmse <- sqrt(mean((y_test - predictions_ridge)^2))
# Compute R-squared
r2 <- cor(y_test, predictions_ridge)^2
# Other metrics can be computed as needed,
# for example the mean absolute error (MAE)
mae <- mean(abs(y_test - predictions_ridge))
# Compute the mean absolute percentage error (MAPE)
mape <- mean(abs((y_test - predictions_ridge) / y_test)) * 100
# Print the metrics
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.450372007406673"
print(paste("R^2:", r2))
## [1] "R^2: 0.794635059941627"
print(paste("MAE:", mae))
## [1] "MAE: 0.318030687800231"
print(paste("MAPE:", mape))
## [1] "MAPE: 119.309296490607"
# GRID LASSO REGRESSION
# Set up the training control
train_control <- trainControl(method = "cv", number = 10)
# Define the grid of values for alpha and lambda
grid <- expand.grid(
  alpha = 1, # 1 for LASSO regression
  lambda = 10^seq(-3, 3, length = 100)
)
# Fit the LASSO regression
lasso_model <- train(
  x = x_train,
  y = y_train,
  method = "glmnet",
  trControl = train_control,
  tuneGrid = grid,
  preProcess = c("center", "scale"), # center and scale the predictors
  family = "gaussian" # G3 is treated as a continuous response
)
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## (the two warnings above repeat for each cross-validation fold)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
# View the results
print(lasso_model)
## glmnet
##
## 318 samples
## 32 predictor
##
## Pre-processing: centered (15), scaled (15), ignore (17)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 287, 287, 287, 286, 286, 285, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 1.000000e-03 0.4039101 0.8363491 0.2589114
## 1.149757e-03 0.4039120 0.8363486 0.2589122
## 1.321941e-03 0.4039189 0.8363483 0.2589036
## 1.519911e-03 0.4038030 0.8364190 0.2586764
## 1.747528e-03 0.4036571 0.8365075 0.2583648
## 2.009233e-03 0.4034956 0.8366057 0.2580047
## 2.310130e-03 0.4033144 0.8367158 0.2576002
## 2.656088e-03 0.4031102 0.8368393 0.2571343
## 3.053856e-03 0.4028799 0.8369770 0.2566106
## 3.511192e-03 0.4026383 0.8371202 0.2560308
## 4.037017e-03 0.4023637 0.8372808 0.2554078
## 4.641589e-03 0.4020493 0.8374489 0.2547362
## 5.336699e-03 0.4017112 0.8376211 0.2540188
## 6.135907e-03 0.4013584 0.8377899 0.2531938
## 7.054802e-03 0.4009947 0.8379452 0.2521974
## 8.111308e-03 0.4006026 0.8381061 0.2510540
## 9.326033e-03 0.4000582 0.8383860 0.2496426
## 1.072267e-02 0.3995238 0.8386596 0.2480164
## 1.232847e-02 0.3989935 0.8389205 0.2461570
## 1.417474e-02 0.3983787 0.8392213 0.2440379
## 1.629751e-02 0.3976807 0.8395913 0.2417102
## 1.873817e-02 0.3971030 0.8398860 0.2396209
## 2.154435e-02 0.3969759 0.8398667 0.2377555
## 2.477076e-02 0.3972037 0.8396130 0.2359049
## 2.848036e-02 0.3973564 0.8394489 0.2340241
## 3.274549e-02 0.3976470 0.8392132 0.2322139
## 3.764936e-02 0.3980898 0.8388943 0.2305967
## 4.328761e-02 0.3989982 0.8382181 0.2294101
## 4.977024e-02 0.4007034 0.8369931 0.2288051
## 5.722368e-02 0.4031920 0.8354083 0.2294771
## 6.579332e-02 0.4053581 0.8343151 0.2314560
## 7.564633e-02 0.4066307 0.8343128 0.2337839
## 8.697490e-02 0.4081907 0.8346400 0.2367563
## 1.000000e-01 0.4107765 0.8347611 0.2407193
## 1.149757e-01 0.4146404 0.8346243 0.2453629
## 1.321941e-01 0.4199233 0.8344022 0.2510297
## 1.519911e-01 0.4270491 0.8340747 0.2583859
## 1.747528e-01 0.4365009 0.8336091 0.2682448
## 2.009233e-01 0.4486460 0.8331485 0.2808207
## 2.310130e-01 0.4638119 0.8330980 0.2957665
## 2.656088e-01 0.4829528 0.8330977 0.3135768
## 3.053856e-01 0.5069795 0.8330977 0.3352952
## 3.511192e-01 0.5368893 0.8330977 0.3614131
## 4.037017e-01 0.5737629 0.8330977 0.3936318
## 4.641589e-01 0.6188138 0.8330977 0.4320496
## 5.336699e-01 0.6733964 0.8330977 0.4783001
## 6.135907e-01 0.7390177 0.8330977 0.5339210
## 7.054802e-01 0.8173556 0.8330977 0.5992058
## 8.111308e-01 0.9102846 0.8330977 0.6775756
## 9.326033e-01 0.9947374 NaN 0.7483089
## 1.072267e+00 0.9947374 NaN 0.7483089
## 1.232847e+00 0.9947374 NaN 0.7483089
## 1.417474e+00 0.9947374 NaN 0.7483089
## 1.629751e+00 0.9947374 NaN 0.7483089
## 1.873817e+00 0.9947374 NaN 0.7483089
## 2.154435e+00 0.9947374 NaN 0.7483089
## 2.477076e+00 0.9947374 NaN 0.7483089
## 2.848036e+00 0.9947374 NaN 0.7483089
## 3.274549e+00 0.9947374 NaN 0.7483089
## 3.764936e+00 0.9947374 NaN 0.7483089
## 4.328761e+00 0.9947374 NaN 0.7483089
## 4.977024e+00 0.9947374 NaN 0.7483089
## 5.722368e+00 0.9947374 NaN 0.7483089
## 6.579332e+00 0.9947374 NaN 0.7483089
## 7.564633e+00 0.9947374 NaN 0.7483089
## 8.697490e+00 0.9947374 NaN 0.7483089
## 1.000000e+01 0.9947374 NaN 0.7483089
## 1.149757e+01 0.9947374 NaN 0.7483089
## 1.321941e+01 0.9947374 NaN 0.7483089
## 1.519911e+01 0.9947374 NaN 0.7483089
## 1.747528e+01 0.9947374 NaN 0.7483089
## 2.009233e+01 0.9947374 NaN 0.7483089
## 2.310130e+01 0.9947374 NaN 0.7483089
## 2.656088e+01 0.9947374 NaN 0.7483089
## 3.053856e+01 0.9947374 NaN 0.7483089
## 3.511192e+01 0.9947374 NaN 0.7483089
## 4.037017e+01 0.9947374 NaN 0.7483089
## 4.641589e+01 0.9947374 NaN 0.7483089
## 5.336699e+01 0.9947374 NaN 0.7483089
## 6.135907e+01 0.9947374 NaN 0.7483089
## 7.054802e+01 0.9947374 NaN 0.7483089
## 8.111308e+01 0.9947374 NaN 0.7483089
## 9.326033e+01 0.9947374 NaN 0.7483089
## 1.072267e+02 0.9947374 NaN 0.7483089
## 1.232847e+02 0.9947374 NaN 0.7483089
## 1.417474e+02 0.9947374 NaN 0.7483089
## 1.629751e+02 0.9947374 NaN 0.7483089
## 1.873817e+02 0.9947374 NaN 0.7483089
## 2.154435e+02 0.9947374 NaN 0.7483089
## 2.477076e+02 0.9947374 NaN 0.7483089
## 2.848036e+02 0.9947374 NaN 0.7483089
## 3.274549e+02 0.9947374 NaN 0.7483089
## 3.764936e+02 0.9947374 NaN 0.7483089
## 4.328761e+02 0.9947374 NaN 0.7483089
## 4.977024e+02 0.9947374 NaN 0.7483089
## 5.722368e+02 0.9947374 NaN 0.7483089
## 6.579332e+02 0.9947374 NaN 0.7483089
## 7.564633e+02 0.9947374 NaN 0.7483089
## 8.697490e+02 0.9947374 NaN 0.7483089
## 1.000000e+03 0.9947374 NaN 0.7483089
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.02154435.
# Make predictions on the test set
predictions_lasso <- predict(lasso_model, x_test)
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
# Compute performance metrics
rmse <- sqrt(mean((y_test - predictions_lasso)^2))
r2 <- cor(y_test, predictions_lasso)^2
mae <- mean(abs(y_test - predictions_lasso))
mape <- mean(abs((y_test - predictions_lasso) / y_test)) * 100
# Collect the metrics in a data.frame
metricaslasso <- data.frame(
  RMSE = rmse,
  R_Cuadrado = r2,
  MAE = mae,
  MAPE = mape
)
# Show the metrics table
print(metricaslasso)
## RMSE R_Cuadrado MAE MAPE
## 1 0.4345114 0.8087242 0.2888894 103.2331
random_ridge_regression <- function(x_train, y_train, x_test, y_test, feature_fraction = 0.8, seed = NULL) {
  if (!is.null(seed)) {
    set.seed(seed)
  }
  # Select a random subset of features
  selected_features <- sample(colnames(x_train), size = floor(feature_fraction * ncol(x_train)))
  x_train_subset <- x_train[, selected_features]
  x_test_subset <- x_test[, selected_features]
  # Set up the training control
  train_control <- trainControl(method = "cv", number = 10)
  # Define the grid of lambda values
  grid <- expand.grid(
    alpha = 0, # 0 for ridge regression
    lambda = 10^seq(-3, 3, length = 100)
  )
  # Fit the ridge regression
  ridge_model <- train(
    x = x_train_subset,
    y = y_train,
    method = "glmnet",
    trControl = train_control,
    tuneGrid = grid,
    preProcess = c("center", "scale"),
    family = "gaussian"
  )
  # Make predictions on the test set
  predictions_random_ridge <- predict(ridge_model, x_test_subset)
  # Compute performance metrics
  rmse <- sqrt(mean((y_test - predictions_random_ridge)^2))
  r2 <- cor(y_test, predictions_random_ridge)^2
  mae <- mean(abs(y_test - predictions_random_ridge))
  mape <- mean(abs((y_test - predictions_random_ridge) / y_test)) * 100
  return(list(model = ridge_model, rmse = rmse, r2 = r2, mae = mae, mape = mape))
}
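Because the function draws a single random subset of features, its metrics depend on the seed. A sketch (beyond what is run below) of how the seed-to-seed variability could be gauged:
# Test RMSE over several random feature draws
res_list <- lapply(1:5, function(s)
  random_ridge_regression(x_train, y_train, x_test, y_test,
                          feature_fraction = 0.8, seed = s))
sapply(res_list, function(r) r$rmse)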
# Assuming x_train, y_train, x_test, and y_test are already defined
resultados_random_ridge <- random_ridge_regression(x_train, y_train, x_test, y_test, feature_fraction = 0.8, seed = 123)
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## (the two warnings above repeat for each cross-validation fold)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
# View the results
print(resultados_random_ridge)
## $model
## glmnet
##
## 318 samples
## 25 predictor
##
## Pre-processing: centered (13), scaled (13), ignore (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 286, 286, 286, 287, 286, 287, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 1.000000e-03 0.4100798 0.8371722 0.2639148
## 1.149757e-03 0.4100798 0.8371722 0.2639148
## 1.321941e-03 0.4100798 0.8371722 0.2639148
## 1.519911e-03 0.4100798 0.8371722 0.2639148
## 1.747528e-03 0.4100798 0.8371722 0.2639148
## 2.009233e-03 0.4100798 0.8371722 0.2639148
## 2.310130e-03 0.4100798 0.8371722 0.2639148
## 2.656088e-03 0.4100798 0.8371722 0.2639148
## 3.053856e-03 0.4100798 0.8371722 0.2639148
## 3.511192e-03 0.4100798 0.8371722 0.2639148
## 4.037017e-03 0.4100798 0.8371722 0.2639148
## 4.641589e-03 0.4100798 0.8371722 0.2639148
## 5.336699e-03 0.4100798 0.8371722 0.2639148
## 6.135907e-03 0.4100798 0.8371722 0.2639148
## 7.054802e-03 0.4100798 0.8371722 0.2639148
## 8.111308e-03 0.4100798 0.8371722 0.2639148
## 9.326033e-03 0.4100798 0.8371722 0.2639148
## 1.072267e-02 0.4100798 0.8371722 0.2639148
## 1.232847e-02 0.4100798 0.8371722 0.2639148
## 1.417474e-02 0.4100798 0.8371722 0.2639148
## 1.629751e-02 0.4100798 0.8371722 0.2639148
## 1.873817e-02 0.4100798 0.8371722 0.2639148
## 2.154435e-02 0.4100798 0.8371722 0.2639148
## 2.477076e-02 0.4100798 0.8371722 0.2639148
## 2.848036e-02 0.4100798 0.8371722 0.2639148
## 3.274549e-02 0.4100798 0.8371722 0.2639148
## 3.764936e-02 0.4100798 0.8371722 0.2639148
## 4.328761e-02 0.4100798 0.8371722 0.2639148
## 4.977024e-02 0.4100798 0.8371722 0.2639148
## 5.722368e-02 0.4100798 0.8371722 0.2639148
## 6.579332e-02 0.4100798 0.8371722 0.2639148
## 7.564633e-02 0.4100798 0.8371722 0.2639148
## 8.697490e-02 0.4100798 0.8371722 0.2639148
## 1.000000e-01 0.4112798 0.8366091 0.2649545
## 1.149757e-01 0.4131945 0.8357463 0.2664167
## 1.321941e-01 0.4154535 0.8347747 0.2679420
## 1.519911e-01 0.4180807 0.8337060 0.2695906
## 1.747528e-01 0.4211565 0.8325266 0.2713962
## 2.009233e-01 0.4247047 0.8312623 0.2732491
## 2.310130e-01 0.4288507 0.8298816 0.2756692
## 2.656088e-01 0.4336318 0.8284146 0.2783842
## 3.053856e-01 0.4392095 0.8268193 0.2821319
## 3.511192e-01 0.4456193 0.8251410 0.2871149
## 4.037017e-01 0.4530585 0.8233282 0.2932978
## 4.641589e-01 0.4615570 0.8214173 0.3000132
## 5.336699e-01 0.4713292 0.8193500 0.3074760
## 6.135907e-01 0.4823530 0.8171708 0.3161302
## 7.054802e-01 0.4948446 0.8148145 0.3266242
## 8.111308e-01 0.5086870 0.8123417 0.3382892
## 9.326033e-01 0.5240765 0.8096750 0.3516099
## 1.072267e+00 0.5407616 0.8068982 0.3662688
## 1.232847e+00 0.5589158 0.8039290 0.3820512
## 1.417474e+00 0.5781341 0.8008807 0.3983111
## 1.629751e+00 0.5985783 0.7976640 0.4153313
## 1.873817e+00 0.6196994 0.7944226 0.4326611
## 2.154435e+00 0.6416719 0.7910573 0.4502647
## 2.477076e+00 0.6638320 0.7877400 0.4676218
## 2.848036e+00 0.6863951 0.7843578 0.4855899
## 3.274549e+00 0.7086327 0.7810992 0.5037717
## 3.764936e+00 0.7308209 0.7778372 0.5218853
## 4.328761e+00 0.7522211 0.7747642 0.5394131
## 4.977024e+00 0.7731775 0.7717407 0.5567872
## 5.722368e+00 0.7929908 0.7689511 0.5733240
## 6.579332e+00 0.8120668 0.7662489 0.5891804
## 7.564633e+00 0.8297814 0.7638016 0.6040943
## 8.697490e+00 0.8465818 0.7614629 0.6184167
## 1.000000e+01 0.8619390 0.7593783 0.6314888
## 1.149757e+01 0.8763148 0.7574088 0.6438188
## 1.321941e+01 0.8892787 0.7556766 0.6549258
## 1.519911e+01 0.9012807 0.7540554 0.6653110
## 1.747528e+01 0.9119814 0.7526450 0.6745576
## 2.009233e+01 0.9217977 0.7513349 0.6830555
## 2.310130e+01 0.9304680 0.7502052 0.6905450
## 2.656088e+01 0.9383625 0.7491621 0.6973427
## 3.053856e+01 0.9452826 0.7482689 0.7033578
## 3.511192e+01 0.9515459 0.7474481 0.7088582
## 4.037017e+01 0.9570030 0.7467491 0.7137104
## 4.641589e+01 0.9619188 0.7461091 0.7180730
## 5.336699e+01 0.9661815 0.7455663 0.7218504
## 6.135907e+01 0.9700070 0.7450708 0.7252354
## 7.054802e+01 0.9733119 0.7446520 0.7281565
## 8.111308e+01 0.9762694 0.7442705 0.7307675
## 9.326033e+01 0.9788170 0.7439489 0.7330148
## 1.072267e+02 0.9810916 0.7436564 0.7350195
## 1.232847e+02 0.9830467 0.7434103 0.7367415
## 1.417474e+02 0.9847893 0.7431867 0.7382753
## 1.629751e+02 0.9862845 0.7429989 0.7395907
## 1.873817e+02 0.9876155 0.7428285 0.7407610
## 2.154435e+02 0.9887560 0.7426854 0.7417636
## 2.477076e+02 0.9897702 0.7425557 0.7426547
## 2.848036e+02 0.9906385 0.7424470 0.7434174
## 3.274549e+02 0.9914100 0.7423485 0.7440949
## 3.764936e+02 0.9920699 0.7422659 0.7446743
## 4.328761e+02 0.9926560 0.7421911 0.7451887
## 4.977024e+02 0.9931571 0.7421285 0.7456285
## 5.722368e+02 0.9936018 0.7420717 0.7460187
## 6.579332e+02 0.9939819 0.7420242 0.7463522
## 7.564633e+02 0.9943191 0.7419812 0.7466481
## 8.697490e+02 0.9956515 0.7419572 0.7478131
## 1.000000e+03 0.9965576 NaN 0.7486110
##
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.0869749.
##
## $rmse
## [1] 0.4474431
##
## $r2
## [1] 0.7972915
##
## $mae
## [1] 0.3129922
##
## $mape
## [1] 114.7232
# Create a data.frame for each model with its metrics (values transcribed from earlier runs)
metricas_ridge <- data.frame(
Modelo = "ModeloRidge",
RMSE = 0.450372,
R_Cuadrado = 0.7946351,
MAE = 0.3180307,
MAPE = 119.3093
)
metricas_lasso <- data.frame(
Modelo = "ModeloLasso",
RMSE = 0.436021,
R_Cuadrado = 0.8075008,
MAE = 0.2975574,
MAPE = 108.7644
)
metricas_random_ridge <- data.frame(
Modelo = "Random Ridge",
RMSE = 0.4474431,
R_Cuadrado = 0.7972915,
MAE = 0.3129922,
MAPE = 114.7232
)
# Combine the metrics into a single data.frame
metricas_combinadas <- rbind(metricas_ridge, metricas_lasso, metricas_random_ridge)
# Show the combined table
print(metricas_combinadas)
## Modelo RMSE R_Cuadrado MAE MAPE
## 1 ModeloRidge 0.4503720 0.7946351 0.3180307 119.3093
## 2 ModeloLasso 0.4360210 0.8075008 0.2975574 108.7644
## 3 Random Ridge 0.4474431 0.7972915 0.3129922 114.7232
In the model-training section, since the response variable (G3, the final grade) is a continuous numeric variable, this becomes a regression problem rather than a classification one, so three regression models are applied in order to choose the one that best predicts the academic performance of secondary-school students from two schools in Portugal. Analyzing the metrics of the hybrid models, we conclude that the model that best predicts academic performance is the GRID LASSO REGRESSION model, since it has the lowest RMSE (0.436) and the highest R-squared (0.8075), explaining about 80.8% of the variability, unlike the other models. (All metrics are computed on the standardized scale of G3; in particular, the large MAPE values reflect the near-zero denominators discussed earlier.) It should also be noted that the variables with the greatest influence on the final grade are: family relationships (famrel), absences, G1, and G2.
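The claim about the most influential variables can be backed by inspecting which LASSO coefficients remain nonzero at the selected lambda; a minimal sketch:
# Coefficients of the final LASSO model at the chosen lambda;
# variables shrunk exactly to zero were effectively dropped
coef(lasso_model$finalModel, s = lasso_model$bestTune$lambda)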
PLOT OF ACTUAL VS PREDICTED VALUES
# Load plotly if it is not already available
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
  library(plotly)
}
# Make sure dplyr or magrittr is loaded for the pipe operator (%>%)
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
  library(dplyr)
}
datos_ridge <- data.frame(Real = y_test, Prediccion = predictions_ridge)
datos_lasso <- data.frame(Real = y_test, Prediccion = predictions_lasso)
# Helper function to build an actual-vs-predicted scatter plot
crear_grafico <- function(datos, titulo) {
  plot_ly(datos, x = ~Real, y = ~Prediccion, type = 'scatter', mode = 'markers') %>%
    add_trace(x = ~Real, y = ~Real, mode = 'lines', type = 'scatter', name = 'Ideal Line') %>%
    layout(title = titulo,
           xaxis = list(title = 'Actual Values'),
           yaxis = list(title = 'Predictions'))
}
# Build the plots
grafico_ridge <- crear_grafico(datos_ridge, 'Ridge Regression')
grafico_lasso <- crear_grafico(datos_lasso, 'Lasso Regression')
# Display the plots
grafico_ridge
grafico_lasso
Considering the scatter plots, where the values predicted by the Lasso regression line up more closely with the actual values than those predicted by the Ridge regression, we can conclude that, for this specific dataset (academic performance of secondary-school students at two schools in Portugal), Lasso regression appears to be the more effective predictive model. This conclusion rests on the apparently higher precision of the Lasso predictions, evidenced by a tighter distribution of points around the perfect-prediction line in the scatter plot.