# Load the required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(knitr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(ggplot2)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
library(Metrics)
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following object is masked from 'package:Metrics':
##
## auc
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# IMPORT THE DATASET
df <- read.csv("student-mat.csv")
# Statistical summary of the variable of interest
resumen_estadistico <- summary(df$G3)
print(resumen_estadistico)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 8.00 11.00 10.42 14.00 20.00
The dataset initially contains 395 observations across 33 variables, of which 17 are character variables and 16 are numeric. Among the 16 numeric variables, most turn out to be binary or discrete.
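These figures can be verified directly from the data frame; a minimal sketch (read.csv imports the numeric columns as integers):
dim(df)                  # 395 rows, 33 columns
table(sapply(df, class)) # counts of character vs. integer columns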
# CATEGORICAL VARIABLES
categorical_cols <- sapply(df, is.character)
categorical_cols <- names(df[categorical_cols])
categorical_cols
## [1] "school" "sex" "address" "famsize" "Pstatus"
## [6] "Mjob" "Fjob" "reason" "guardian" "schoolsup"
## [11] "famsup" "paid" "activities" "nursery" "higher"
## [16] "internet" "romantic"
# NUMERIC VARIABLES
numerical_cols <- sapply(df, is.numeric)
numerical_cols <- names(df[numerical_cols])
numerical_cols
## [1] "age" "Medu" "Fedu" "traveltime" "studytime"
## [6] "failures" "famrel" "freetime" "goout" "Dalc"
## [11] "Walc" "health" "absences" "G1" "G2"
## [16] "G3"
summary(df[numerical_cols])
## age Medu Fedu traveltime studytime
## Min. :15.0 Min. :0.000 Min. :0.000 Min. :1.000 Min. :1.000
## 1st Qu.:16.0 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
## Median :17.0 Median :3.000 Median :2.000 Median :1.000 Median :2.000
## Mean :16.7 Mean :2.749 Mean :2.522 Mean :1.448 Mean :2.035
## 3rd Qu.:18.0 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :22.0 Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
## failures famrel freetime goout
## Min. :0.0000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :3.000 Median :3.000
## Mean :0.3342 Mean :3.944 Mean :3.235 Mean :3.109
## 3rd Qu.:0.0000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :3.0000 Max. :5.000 Max. :5.000 Max. :5.000
## Dalc Walc health absences
## Min. :1.000 Min. :1.000 Min. :1.000 Min. : 0.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:3.000 1st Qu.: 0.000
## Median :1.000 Median :2.000 Median :4.000 Median : 4.000
## Mean :1.481 Mean :2.291 Mean :3.554 Mean : 5.709
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.: 8.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :75.000
## G1 G2 G3
## Min. : 3.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 8.00 1st Qu.: 9.00 1st Qu.: 8.00
## Median :11.00 Median :11.00 Median :11.00
## Mean :10.91 Mean :10.71 Mean :10.42
## 3rd Qu.:13.00 3rd Qu.:13.00 3rd Qu.:14.00
## Max. :19.00 Max. :19.00 Max. :20.00
# COUNT AND PROPORTION TABLE FOR THE CATEGORICAL VARIABLES
# Initialize a list to store the data frames
list_df <- list()
for (col in categorical_cols) {
  # Build a count table
  conteo <- table(df[[col]])
  # Convert the count table into a data frame
  df_conteo <- as.data.frame(conteo)
  # Rename the columns
  names(df_conteo) <- c("Value", "Count")
  # Compute the proportions
  df_conteo$Proportion <- df_conteo$Count / sum(df_conteo$Count)
  # Add a column with the variable name
  df_conteo$Variable <- col
  # Append the data frame to the list
  list_df[[col]] <- df_conteo
}
# Combine all the data frames into one
final_df <- do.call(rbind, list_df)
# View the final data frame
print(final_df)
## Value Count Proportion Variable
## school.1 GP 349 0.88354430 school
## school.2 MS 46 0.11645570 school
## sex.1 F 208 0.52658228 sex
## sex.2 M 187 0.47341772 sex
## address.1 R 88 0.22278481 address
## address.2 U 307 0.77721519 address
## famsize.1 GT3 281 0.71139241 famsize
## famsize.2 LE3 114 0.28860759 famsize
## Pstatus.1 A 41 0.10379747 Pstatus
## Pstatus.2 T 354 0.89620253 Pstatus
## Mjob.1 at_home 59 0.14936709 Mjob
## Mjob.2 health 34 0.08607595 Mjob
## Mjob.3 other 141 0.35696203 Mjob
## Mjob.4 services 103 0.26075949 Mjob
## Mjob.5 teacher 58 0.14683544 Mjob
## Fjob.1 at_home 20 0.05063291 Fjob
## Fjob.2 health 18 0.04556962 Fjob
## Fjob.3 other 217 0.54936709 Fjob
## Fjob.4 services 111 0.28101266 Fjob
## Fjob.5 teacher 29 0.07341772 Fjob
## reason.1 course 145 0.36708861 reason
## reason.2 home 109 0.27594937 reason
## reason.3 other 36 0.09113924 reason
## reason.4 reputation 105 0.26582278 reason
## guardian.1 father 90 0.22784810 guardian
## guardian.2 mother 273 0.69113924 guardian
## guardian.3 other 32 0.08101266 guardian
## schoolsup.1 no 344 0.87088608 schoolsup
## schoolsup.2 yes 51 0.12911392 schoolsup
## famsup.1 no 153 0.38734177 famsup
## famsup.2 yes 242 0.61265823 famsup
## paid.1 no 214 0.54177215 paid
## paid.2 yes 181 0.45822785 paid
## activities.1 no 194 0.49113924 activities
## activities.2 yes 201 0.50886076 activities
## nursery.1 no 81 0.20506329 nursery
## nursery.2 yes 314 0.79493671 nursery
## higher.1 no 20 0.05063291 higher
## higher.2 yes 375 0.94936709 higher
## internet.1 no 66 0.16708861 internet
## internet.2 yes 329 0.83291139 internet
## romantic.1 no 263 0.66582278 romantic
## romantic.2 yes 132 0.33417722 romantic
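For reference, the same count-and-proportion table can be built more compactly with the tidyverse verbs already loaded above; a sketch (the name final_df_tidy is illustrative):
final_df_tidy <- df %>%
  pivot_longer(all_of(categorical_cols), names_to = "Variable", values_to = "Value") %>%
  count(Variable, Value, name = "Count") %>%
  group_by(Variable) %>%
  mutate(Proportion = Count / sum(Count)) %>%
  ungroup()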
# CORRELATION MATRIX
cor_matrix <- cor(df[numerical_cols], use = "complete.obs")
cor_melted <- melt(cor_matrix)
font_size <- 6
ggplot(data = cor_melted, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = "purple", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = font_size),
axis.text.y = element_text(size = font_size)) +
coord_fixed() +
labs(x = '', y = '') +
geom_text(aes(label = sprintf("%.2f", value)), size = 3, check_overlap = TRUE)
The correlation heatmap shows a high correlation between the variables G1 and G2, and in turn between these and G3; some association is also visible for Fedu and absences. That is, the first- and second-period grades are highly correlated with the student's final grade, and family background and school absences appear related to it as well.
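To make that reading precise, the correlations of every numeric variable with G3 can be listed directly; a minimal sketch:
# Correlation of each numeric variable with the final grade, sorted
cor_g3 <- cor(df[numerical_cols], use = "complete.obs")[, "G3"]
sort(cor_g3, decreasing = TRUE)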
# MISSING DATA
md.pattern(df, plot = TRUE, rotate.names = TRUE)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason guardian
## 395 1 1 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0 0 0
## traveltime studytime failures schoolsup famsup paid activities nursery
## 395 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## higher internet romantic famrel freetime goout Dalc Walc health absences G1
## 395 1 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0 0
## G2 G3
## 395 1 1 0
## 0 0 0
In this analysis we find that there are no missing values, which means the dataset is 100% complete.
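A simpler cross-check that does not require mice; a minimal sketch:
colSums(is.na(df)) # missing values per column (all zero here)
sum(is.na(df))     # total missing values in the data frame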
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
modelo <- lm(G3 ~ ., data = df)
# Compute the VIF values
vif_valores <- vif(modelo)
# Build a data frame with the variable names and their VIFs
vif_df <- data.frame(VIF = vif_valores)
# Print the data frame
kable(vif_df, caption = "VIF values")
| | GVIF | Df | GVIF^(1/(2*Df)) |
|---|---|---|---|
| school | 1.510967 | 1 | 1.229214 |
| sex | 1.486968 | 1 | 1.219413 |
| age | 1.803293 | 1 | 1.342867 |
| address | 1.387985 | 1 | 1.178128 |
| famsize | 1.153276 | 1 | 1.073907 |
| Pstatus | 1.145493 | 1 | 1.070277 |
| Medu | 2.940226 | 1 | 1.714709 |
| Fedu | 2.141010 | 1 | 1.463219 |
| Mjob | 3.430704 | 4 | 1.166602 |
| Fjob | 2.301555 | 4 | 1.109820 |
| reason | 1.553683 | 3 | 1.076202 |
| guardian | 1.736473 | 2 | 1.147934 |
| traveltime | 1.320973 | 1 | 1.149336 |
| studytime | 1.395833 | 1 | 1.181454 |
| failures | 1.563186 | 1 | 1.250274 |
| schoolsup | 1.255074 | 1 | 1.120301 |
| famsup | 1.304026 | 1 | 1.141940 |
| paid | 1.338698 | 1 | 1.157021 |
| activities | 1.158682 | 1 | 1.076421 |
| nursery | 1.151348 | 1 | 1.073009 |
| higher | 1.315791 | 1 | 1.147080 |
| internet | 1.257752 | 1 | 1.121495 |
| romantic | 1.174382 | 1 | 1.083689 |
| famrel | 1.141814 | 1 | 1.068557 |
| freetime | 1.321398 | 1 | 1.149521 |
| goout | 1.496482 | 1 | 1.223308 |
| Dalc | 2.028513 | 1 | 1.424259 |
| Walc | 2.389544 | 1 | 1.545815 |
| health | 1.179266 | 1 | 1.085940 |
| absences | 1.256253 | 1 | 1.120827 |
| G1 | 4.673491 | 1 | 2.161826 |
| G2 | 4.409261 | 1 | 2.099824 |
Carrying out the multicollinearity analysis, and taking as our rule that variables with a VIF greater than 5 would be removed, we conclude that none of the variables is worrying: no VIF exceeds 5, so none is dropped. The highest values belong to G1 and G2, but they do not compromise the analysis.
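Had any predictor crossed the threshold, the adjusted column GVIF^(1/(2*Df)) is the one to filter on; for terms with more than one degree of freedom the usual convention is to compare it against the square root of the cutoff. A sketch using the cutoff of 5 adopted above:
# Flag terms whose adjusted GVIF exceeds sqrt(5)
gvif_adj <- vif_valores[, "GVIF^(1/(2*Df))"]
names(gvif_adj[gvif_adj > sqrt(5)]) # character(0): nothing to drop here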
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
p <- plot_ly(data = df, x = ~G3, type = "histogram",
             marker = list(color = "purple")) %>% # marker is a trace attribute, so it belongs in plot_ly(), not in layout()
  layout(title = "FINAL GRADE",
         xaxis = list(title = ""),
         yaxis = list(title = "Count"))
p
Looking at the target variable, the final grade, and analyzing its frequencies, we see that most students obtained a final grade of around 10 and only a minority reached the top grade of 20. More worryingly, over 20 students obtained a 0 as their final grade, and it is precisely there that the importance of this study lies: identifying which variables drive those failing grades.
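Both tails of the distribution can be counted exactly; a quick sketch:
sum(df$G3 == 0)  # students whose final grade is zero
sum(df$G3 == 20) # students who reached the top grade
table(df$G3)     # full frequency table of the final grade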
# BOXPLOTS: NUMERIC VARIABLES VS G3
for (col in numerical_cols) {
  p1 <- plot_ly(df, y = ~get(col), x = ~G3, type = "box", name = col) %>%
    layout(title = paste("Boxplot of", col, "vs G3"),
           yaxis = list(title = col),
           xaxis = list(title = "G3"))
  # Display the plot in an interactive R environment such as RStudio
  print(p1)
}
p1
The boxplot analysis yields the following results: the distribution of G3 is fairly symmetric. G2 has an outlier at 0, i.e., a value far from where the bulk of the data is concentrated; only a minimal number of students scored 0 in the second period. For the number of absences, several outliers appear above 20 absences, meaning most students did not miss class more than 20 times during the academic year. The absences variable is right-skewed, since the right whisker is much longer than the left one, so much so that the left whisker can barely be distinguished.
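The outlier claims can be checked numerically; a minimal sketch:
sum(df$G2 == 0)       # students with a second-period grade of zero
sum(df$absences > 20) # students with more than 20 absences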
# CATEGORICAL VARIABLES VS G3
color_palette <- rainbow(21)
for (col in categorical_cols) {
  df_percent <- df %>%
    group_by(!!sym(col)) %>%
    count(G3) %>%
    group_by(!!sym(col)) %>%
    mutate(perc = n / sum(n) * 100) %>%
    mutate(G3 = as.factor(G3))
  p <- plot_ly(df_percent, x = ~get(col), y = ~perc, type = 'bar', name = ~G3, marker = list(color = ~G3)) %>%
    layout(yaxis = list(title = 'Percentage (%)', tickformat = ',.0%'),
           xaxis = list(title = col),
           barmode = 'stack',
           title = paste("Percentage of G3 by", col))
  # Display the plot in an interactive R environment such as RStudio
  print(p)
}
p
p
Analyzing the charts above, which plot each categorical variable against G3, we conclude that students with internet access obtained better grades than those without, that students in a romantic relationship tend to do worse academically than single students, that the school with the best average is GP, and that male students tend to obtain slightly better grades than female students.
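These conclusions can be cross-checked with group means of G3; a sketch for two of the variables:
df %>% group_by(internet) %>% summarise(mean_G3 = mean(G3))
df %>% group_by(romantic) %>% summarise(mean_G3 = mean(G3))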
# MODEL TRAINING
library(caTools)
set.seed(123)
split <- sample.split(df$G3, SplitRatio = 0.8)
train_set <- subset(df, split == TRUE)
test_set <- subset(df, split == FALSE)
# Scale the numeric columns
train_set[sapply(train_set, is.numeric)] <- scale(train_set[sapply(train_set, is.numeric)])
test_set[sapply(test_set, is.numeric)] <- scale(test_set[sapply(test_set, is.numeric)])
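Note that the snippet above scales each split with its own mean and standard deviation, which lets test-set information shape the transformation. A leakage-free alternative (a sketch of what could be done instead, not what was run here) learns the parameters on the training split only; the preProcess argument passed to train() further below already works this way internally for the predictors:
# Learn centering/scaling on the training split and apply it to both splits
num_cols <- sapply(train_set, is.numeric)
pp <- preProcess(train_set[num_cols], method = c("center", "scale"))
train_scaled <- predict(pp, train_set[num_cols])
test_scaled <- predict(pp, test_set[num_cols])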
# Separate the target variable from the predictors
y_train <- train_set$G3
x_train <- train_set[, names(train_set) != "G3"]
y_test <- test_set$G3
x_test <- test_set[, names(test_set) != "G3"]
# Set up the training control
train_control <- trainControl(method = "cv", number = 10)
# Define the grid of values for alpha and lambda
grid <- expand.grid(
  alpha = 0, # 0 for ridge regression
  lambda = 10^seq(-3, 3, length = 100)
)
# Fit the ridge regression
ridge_model <- train(
  x = x_train,
  y = y_train,
  method = "glmnet",
  trControl = train_control,
  tuneGrid = grid,
  preProcess = c("center", "scale"), # center and scale the predictors
  family = "gaussian" # G3 is treated as a continuous response
)
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## (the two warnings above repeat for each cross-validation fold)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
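The repeated 'NAs introduced by coercion' warnings arise because the character columns are passed straight to glmnet, which expects a numeric matrix; caret ends up ignoring them, as the 'ignore (17)' note in the output below confirms. One way to keep those predictors would be to one-hot encode them first; a sketch using caret's dummyVars (an assumption about the intended preprocessing, not something run in this report):
# Convert character columns to factors, then one-hot encode them
x_train_f <- data.frame(lapply(x_train, function(v) if (is.character(v)) factor(v) else v))
x_test_f <- data.frame(lapply(x_test, function(v) if (is.character(v)) factor(v) else v))
dv <- dummyVars(~ ., data = x_train_f, fullRank = TRUE)
x_train_num <- predict(dv, newdata = x_train_f)
x_test_num <- predict(dv, newdata = x_test_f)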
# View the results
print(ridge_model)
## glmnet
##
## 318 samples
## 32 predictor
##
## Pre-processing: centered (15), scaled (15), ignore (17)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 285, 287, 287, 286, 286, 286, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 1.000000e-03 0.4087647 0.8306149 0.2623796
## 1.149757e-03 0.4087647 0.8306149 0.2623796
## 1.321941e-03 0.4087647 0.8306149 0.2623796
## 1.519911e-03 0.4087647 0.8306149 0.2623796
## 1.747528e-03 0.4087647 0.8306149 0.2623796
## 2.009233e-03 0.4087647 0.8306149 0.2623796
## 2.310130e-03 0.4087647 0.8306149 0.2623796
## 2.656088e-03 0.4087647 0.8306149 0.2623796
## 3.053856e-03 0.4087647 0.8306149 0.2623796
## 3.511192e-03 0.4087647 0.8306149 0.2623796
## 4.037017e-03 0.4087647 0.8306149 0.2623796
## 4.641589e-03 0.4087647 0.8306149 0.2623796
## 5.336699e-03 0.4087647 0.8306149 0.2623796
## 6.135907e-03 0.4087647 0.8306149 0.2623796
## 7.054802e-03 0.4087647 0.8306149 0.2623796
## 8.111308e-03 0.4087647 0.8306149 0.2623796
## 9.326033e-03 0.4087647 0.8306149 0.2623796
## 1.072267e-02 0.4087647 0.8306149 0.2623796
## 1.232847e-02 0.4087647 0.8306149 0.2623796
## 1.417474e-02 0.4087647 0.8306149 0.2623796
## 1.629751e-02 0.4087647 0.8306149 0.2623796
## 1.873817e-02 0.4087647 0.8306149 0.2623796
## 2.154435e-02 0.4087647 0.8306149 0.2623796
## 2.477076e-02 0.4087647 0.8306149 0.2623796
## 2.848036e-02 0.4087647 0.8306149 0.2623796
## 3.274549e-02 0.4087647 0.8306149 0.2623796
## 3.764936e-02 0.4087647 0.8306149 0.2623796
## 4.328761e-02 0.4087647 0.8306149 0.2623796
## 4.977024e-02 0.4087647 0.8306149 0.2623796
## 5.722368e-02 0.4087647 0.8306149 0.2623796
## 6.579332e-02 0.4087647 0.8306149 0.2623796
## 7.564633e-02 0.4087647 0.8306149 0.2623796
## 8.697490e-02 0.4087647 0.8306149 0.2623796
## 1.000000e-01 0.4101100 0.8300493 0.2633982
## 1.149757e-01 0.4122100 0.8291888 0.2648714
## 1.321941e-01 0.4146807 0.8282252 0.2665245
## 1.519911e-01 0.4175441 0.8271748 0.2682947
## 1.747528e-01 0.4208832 0.8260155 0.2702944
## 2.009233e-01 0.4247287 0.8247664 0.2726380
## 2.310130e-01 0.4291891 0.8234064 0.2751051
## 2.656088e-01 0.4343002 0.8219623 0.2782607
## 3.053856e-01 0.4402114 0.8203934 0.2821035
## 3.511192e-01 0.4469660 0.8187293 0.2868156
## 4.037017e-01 0.4547404 0.8169180 0.2924919
## 4.641589e-01 0.4635624 0.8149998 0.2995513
## 5.336699e-01 0.4736295 0.8129076 0.3073410
## 6.135907e-01 0.4849197 0.8106841 0.3164087
## 7.054802e-01 0.4976306 0.8082551 0.3270066
## 8.111308e-01 0.5116477 0.8056794 0.3387838
## 9.326033e-01 0.5271548 0.8028744 0.3523713
## 1.072267e+00 0.5439086 0.7999270 0.3673376
## 1.232847e+00 0.5620726 0.7967435 0.3831731
## 1.417474e+00 0.5812544 0.7934481 0.3993643
## 1.629751e+00 0.6016103 0.7899367 0.4163145
## 1.873817e+00 0.6226061 0.7863741 0.4336726
## 2.154435e+00 0.6444125 0.7826424 0.4512841
## 2.477076e+00 0.6663845 0.7789431 0.4688298
## 2.848036e+00 0.6887329 0.7751406 0.4867655
## 3.274549e+00 0.7107488 0.7714604 0.5046218
## 3.764936e+00 0.7327029 0.7677511 0.5226734
## 4.328761e+00 0.7538753 0.7642466 0.5401368
## 4.977024e+00 0.7746035 0.7607756 0.5571857
## 5.722368e+00 0.7942046 0.7575677 0.5736215
## 6.579332e+00 0.8130758 0.7544419 0.5895202
## 7.564633e+00 0.8306067 0.7516090 0.6043721
## 8.697490e+00 0.8472353 0.7488874 0.6185880
## 1.000000e+01 0.8624424 0.7464620 0.6316461
## 1.149757e+01 0.8766810 0.7441591 0.6439665
## 1.321941e+01 0.8895274 0.7421342 0.6550339
## 1.519911e+01 0.9014240 0.7402323 0.6653651
## 1.747528e+01 0.9120357 0.7385802 0.6745733
## 2.009233e+01 0.9217730 0.7370390 0.6830554
## 2.310130e+01 0.9303772 0.7357126 0.6905374
## 2.656088e+01 0.9382134 0.7344828 0.6973996
## 3.053856e+01 0.9450852 0.7334322 0.7033992
## 3.511192e+01 0.9513059 0.7324630 0.7088757
## 4.037017e+01 0.9567277 0.7316396 0.7136572
## 4.641589e+01 0.9616124 0.7308829 0.7180179
## 5.336699e+01 0.9658494 0.7302431 0.7218050
## 6.135907e+01 0.9696523 0.7296567 0.7252000
## 7.054802e+01 0.9729384 0.7291625 0.7281298
## 8.111308e+01 0.9758793 0.7287107 0.7307494
## 9.326033e+01 0.9784132 0.7283310 0.7330040
## 1.072267e+02 0.9806757 0.7279844 0.7350159
## 1.232847e+02 0.9826206 0.7276937 0.7367439
## 1.417474e+02 0.9843543 0.7274287 0.7382835
## 1.629751e+02 0.9858420 0.7272068 0.7396038
## 1.873817e+02 0.9871663 0.7270047 0.7407788
## 2.154435e+02 0.9883013 0.7268357 0.7417851
## 2.477076e+02 0.9893106 0.7266819 0.7426799
## 2.848036e+02 0.9901747 0.7265537 0.7434456
## 3.274549e+02 0.9909425 0.7264367 0.7441259
## 3.764936e+02 0.9915993 0.7263390 0.7447076
## 4.328761e+02 0.9921827 0.7262503 0.7452242
## 4.977024e+02 0.9926814 0.7261762 0.7456657
## 5.722368e+02 0.9931241 0.7261088 0.7460577
## 6.579332e+02 0.9935024 0.7260527 0.7463925
## 7.564633e+02 0.9938381 0.7260016 0.7466896
## 8.697490e+02 0.9951710 0.7259726 0.7478581
## 1.000000e+03 0.9960664 NaN 0.7486609
##
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.0869749.
# Make predictions on the test set
predictions_ridge <- predict(ridge_model, x_test)
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
# Compute RMSE
rmse <- sqrt(mean((y_test - predictions_ridge)^2))
# Compute R-squared
r2 <- cor(y_test, predictions_ridge)^2
# Other metrics can be computed as needed,
# for example the mean absolute error (MAE)
mae <- mean(abs(y_test - predictions_ridge))
# Compute the mean absolute percentage error (MAPE)
mape <- mean(abs((y_test - predictions_ridge) / y_test)) * 100
# Print the metrics
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.450372007406673"
print(paste("R^2:", r2))
## [1] "R^2: 0.794635059941627"
print(paste("MAE:", mae))
## [1] "MAE: 0.318030687800231"
print(paste("MAPE:", mape))
## [1] "MAPE: 119.309296490607"
# GRID LASSO REGRESSION
# Set up the training control
train_control <- trainControl(method = "cv", number = 10)
# Define the grid of values for alpha and lambda
grid <- expand.grid(
  alpha = 1, # 1 for LASSO regression
  lambda = 10^seq(-3, 3, length = 100)
)
# Fit the LASSO regression
lasso_model <- train(
  x = x_train,
  y = y_train,
  method = "glmnet",
  trControl = train_control,
  tuneGrid = grid,
  preProcess = c("center", "scale"), # center and scale the predictors
  family = "gaussian" # G3 is treated as a continuous response
)
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## (the two warnings above repeat for each cross-validation fold)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
# View the results
print(lasso_model)
## glmnet
##
## 318 samples
## 32 predictor
##
## Pre-processing: centered (15), scaled (15), ignore (17)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 287, 287, 287, 286, 286, 285, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 1.000000e-03 0.4039101 0.8363491 0.2589114
## 1.149757e-03 0.4039120 0.8363486 0.2589122
## 1.321941e-03 0.4039189 0.8363483 0.2589036
## 1.519911e-03 0.4038030 0.8364190 0.2586764
## 1.747528e-03 0.4036571 0.8365075 0.2583648
## 2.009233e-03 0.4034956 0.8366057 0.2580047
## 2.310130e-03 0.4033144 0.8367158 0.2576002
## 2.656088e-03 0.4031102 0.8368393 0.2571343
## 3.053856e-03 0.4028799 0.8369770 0.2566106
## 3.511192e-03 0.4026383 0.8371202 0.2560308
## 4.037017e-03 0.4023637 0.8372808 0.2554078
## 4.641589e-03 0.4020493 0.8374489 0.2547362
## 5.336699e-03 0.4017112 0.8376211 0.2540188
## 6.135907e-03 0.4013584 0.8377899 0.2531938
## 7.054802e-03 0.4009947 0.8379452 0.2521974
## 8.111308e-03 0.4006026 0.8381061 0.2510540
## 9.326033e-03 0.4000582 0.8383860 0.2496426
## 1.072267e-02 0.3995238 0.8386596 0.2480164
## 1.232847e-02 0.3989935 0.8389205 0.2461570
## 1.417474e-02 0.3983787 0.8392213 0.2440379
## 1.629751e-02 0.3976807 0.8395913 0.2417102
## 1.873817e-02 0.3971030 0.8398860 0.2396209
## 2.154435e-02 0.3969759 0.8398667 0.2377555
## 2.477076e-02 0.3972037 0.8396130 0.2359049
## 2.848036e-02 0.3973564 0.8394489 0.2340241
## 3.274549e-02 0.3976470 0.8392132 0.2322139
## 3.764936e-02 0.3980898 0.8388943 0.2305967
## 4.328761e-02 0.3989982 0.8382181 0.2294101
## 4.977024e-02 0.4007034 0.8369931 0.2288051
## 5.722368e-02 0.4031920 0.8354083 0.2294771
## 6.579332e-02 0.4053581 0.8343151 0.2314560
## 7.564633e-02 0.4066307 0.8343128 0.2337839
## 8.697490e-02 0.4081907 0.8346400 0.2367563
## 1.000000e-01 0.4107765 0.8347611 0.2407193
## 1.149757e-01 0.4146404 0.8346243 0.2453629
## 1.321941e-01 0.4199233 0.8344022 0.2510297
## 1.519911e-01 0.4270491 0.8340747 0.2583859
## 1.747528e-01 0.4365009 0.8336091 0.2682448
## 2.009233e-01 0.4486460 0.8331485 0.2808207
## 2.310130e-01 0.4638119 0.8330980 0.2957665
## 2.656088e-01 0.4829528 0.8330977 0.3135768
## 3.053856e-01 0.5069795 0.8330977 0.3352952
## 3.511192e-01 0.5368893 0.8330977 0.3614131
## 4.037017e-01 0.5737629 0.8330977 0.3936318
## 4.641589e-01 0.6188138 0.8330977 0.4320496
## 5.336699e-01 0.6733964 0.8330977 0.4783001
## 6.135907e-01 0.7390177 0.8330977 0.5339210
## 7.054802e-01 0.8173556 0.8330977 0.5992058
## 8.111308e-01 0.9102846 0.8330977 0.6775756
## 9.326033e-01 0.9947374 NaN 0.7483089
## 1.072267e+00 0.9947374 NaN 0.7483089
## 1.232847e+00 0.9947374 NaN 0.7483089
## 1.417474e+00 0.9947374 NaN 0.7483089
## 1.629751e+00 0.9947374 NaN 0.7483089
## 1.873817e+00 0.9947374 NaN 0.7483089
## 2.154435e+00 0.9947374 NaN 0.7483089
## 2.477076e+00 0.9947374 NaN 0.7483089
## 2.848036e+00 0.9947374 NaN 0.7483089
## 3.274549e+00 0.9947374 NaN 0.7483089
## 3.764936e+00 0.9947374 NaN 0.7483089
## 4.328761e+00 0.9947374 NaN 0.7483089
## 4.977024e+00 0.9947374 NaN 0.7483089
## 5.722368e+00 0.9947374 NaN 0.7483089
## 6.579332e+00 0.9947374 NaN 0.7483089
## 7.564633e+00 0.9947374 NaN 0.7483089
## 8.697490e+00 0.9947374 NaN 0.7483089
## 1.000000e+01 0.9947374 NaN 0.7483089
## 1.149757e+01 0.9947374 NaN 0.7483089
## 1.321941e+01 0.9947374 NaN 0.7483089
## 1.519911e+01 0.9947374 NaN 0.7483089
## 1.747528e+01 0.9947374 NaN 0.7483089
## 2.009233e+01 0.9947374 NaN 0.7483089
## 2.310130e+01 0.9947374 NaN 0.7483089
## 2.656088e+01 0.9947374 NaN 0.7483089
## 3.053856e+01 0.9947374 NaN 0.7483089
## 3.511192e+01 0.9947374 NaN 0.7483089
## 4.037017e+01 0.9947374 NaN 0.7483089
## 4.641589e+01 0.9947374 NaN 0.7483089
## 5.336699e+01 0.9947374 NaN 0.7483089
## 6.135907e+01 0.9947374 NaN 0.7483089
## 7.054802e+01 0.9947374 NaN 0.7483089
## 8.111308e+01 0.9947374 NaN 0.7483089
## 9.326033e+01 0.9947374 NaN 0.7483089
## 1.072267e+02 0.9947374 NaN 0.7483089
## 1.232847e+02 0.9947374 NaN 0.7483089
## 1.417474e+02 0.9947374 NaN 0.7483089
## 1.629751e+02 0.9947374 NaN 0.7483089
## 1.873817e+02 0.9947374 NaN 0.7483089
## 2.154435e+02 0.9947374 NaN 0.7483089
## 2.477076e+02 0.9947374 NaN 0.7483089
## 2.848036e+02 0.9947374 NaN 0.7483089
## 3.274549e+02 0.9947374 NaN 0.7483089
## 3.764936e+02 0.9947374 NaN 0.7483089
## 4.328761e+02 0.9947374 NaN 0.7483089
## 4.977024e+02 0.9947374 NaN 0.7483089
## 5.722368e+02 0.9947374 NaN 0.7483089
## 6.579332e+02 0.9947374 NaN 0.7483089
## 7.564633e+02 0.9947374 NaN 0.7483089
## 8.697490e+02 0.9947374 NaN 0.7483089
## 1.000000e+03 0.9947374 NaN 0.7483089
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.02154435.
# Make predictions on the test set
predictions_lasso <- predict(lasso_model, x_test)
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
# Compute performance metrics
rmse <- sqrt(mean((y_test - predictions_lasso)^2))
r2 <- cor(y_test, predictions_lasso)^2
mae <- mean(abs(y_test - predictions_lasso))
mape <- mean(abs((y_test - predictions_lasso) / y_test)) * 100
# Collect the metrics in a data.frame
metricaslasso <- data.frame(
  RMSE = rmse,
  R_Cuadrado = r2,
  MAE = mae,
  MAPE = mape
)
# Show the metrics table
print(metricaslasso)
## RMSE R_Cuadrado MAE MAPE
## 1 0.4345114 0.8087242 0.2888894 103.2331
random_ridge_regression <- function(x_train, y_train, x_test, y_test, feature_fraction = 0.8, seed = NULL) {
  if (!is.null(seed)) {
    set.seed(seed)
  }
  # Select a random subset of features
  selected_features <- sample(colnames(x_train), size = floor(feature_fraction * ncol(x_train)))
  x_train_subset <- x_train[, selected_features]
  x_test_subset <- x_test[, selected_features]
  # Set up the training control
  train_control <- trainControl(method = "cv", number = 10)
  # Define the grid of lambda values
  grid <- expand.grid(
    alpha = 0, # 0 for ridge regression
    lambda = 10^seq(-3, 3, length = 100)
  )
  # Fit the ridge regression
  ridge_model <- train(
    x = x_train_subset,
    y = y_train,
    method = "glmnet",
    trControl = train_control,
    tuneGrid = grid,
    preProcess = c("center", "scale"),
    family = "gaussian"
  )
  # Make predictions on the test set
  predictions_random_ridge <- predict(ridge_model, x_test_subset)
  # Compute performance metrics
  rmse <- sqrt(mean((y_test - predictions_random_ridge)^2))
  r2 <- cor(y_test, predictions_random_ridge)^2
  mae <- mean(abs(y_test - predictions_random_ridge))
  mape <- mean(abs((y_test - predictions_random_ridge) / y_test)) * 100
  return(list(model = ridge_model, rmse = rmse, r2 = r2, mae = mae, mape = mape))
}
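Because the function draws a single random subset of features, its metrics depend on the seed. A sketch (beyond what is run below) of how the seed-to-seed variability could be gauged:
# Test RMSE over several random feature draws
res_list <- lapply(1:5, function(s)
  random_ridge_regression(x_train, y_train, x_test, y_test,
                          feature_fraction = 0.8, seed = s))
sapply(res_list, function(r) r$rmse)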
# Assuming x_train, y_train, x_test, and y_test are already defined
resultados_random_ridge <- random_ridge_regression(x_train, y_train, x_test, y_test, feature_fraction = 0.8, seed = 123)
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## (the two warnings above repeat for each cross-validation fold)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
# View the results
print(resultados_random_ridge)
## $model
## glmnet
##
## 318 samples
## 25 predictor
##
## Pre-processing: centered (13), scaled (13), ignore (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 286, 286, 286, 287, 286, 287, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 1.000000e-03 0.4100798 0.8371722 0.2639148
## 1.149757e-03 0.4100798 0.8371722 0.2639148
## 1.321941e-03 0.4100798 0.8371722 0.2639148
## 1.519911e-03 0.4100798 0.8371722 0.2639148
## 1.747528e-03 0.4100798 0.8371722 0.2639148
## 2.009233e-03 0.4100798 0.8371722 0.2639148
## 2.310130e-03 0.4100798 0.8371722 0.2639148
## 2.656088e-03 0.4100798 0.8371722 0.2639148
## 3.053856e-03 0.4100798 0.8371722 0.2639148
## 3.511192e-03 0.4100798 0.8371722 0.2639148
## 4.037017e-03 0.4100798 0.8371722 0.2639148
## 4.641589e-03 0.4100798 0.8371722 0.2639148
## 5.336699e-03 0.4100798 0.8371722 0.2639148
## 6.135907e-03 0.4100798 0.8371722 0.2639148
## 7.054802e-03 0.4100798 0.8371722 0.2639148
## 8.111308e-03 0.4100798 0.8371722 0.2639148
## 9.326033e-03 0.4100798 0.8371722 0.2639148
## 1.072267e-02 0.4100798 0.8371722 0.2639148
## 1.232847e-02 0.4100798 0.8371722 0.2639148
## 1.417474e-02 0.4100798 0.8371722 0.2639148
## 1.629751e-02 0.4100798 0.8371722 0.2639148
## 1.873817e-02 0.4100798 0.8371722 0.2639148
## 2.154435e-02 0.4100798 0.8371722 0.2639148
## 2.477076e-02 0.4100798 0.8371722 0.2639148
## 2.848036e-02 0.4100798 0.8371722 0.2639148
## 3.274549e-02 0.4100798 0.8371722 0.2639148
## 3.764936e-02 0.4100798 0.8371722 0.2639148
## 4.328761e-02 0.4100798 0.8371722 0.2639148
## 4.977024e-02 0.4100798 0.8371722 0.2639148
## 5.722368e-02 0.4100798 0.8371722 0.2639148
## 6.579332e-02 0.4100798 0.8371722 0.2639148
## 7.564633e-02 0.4100798 0.8371722 0.2639148
## 8.697490e-02 0.4100798 0.8371722 0.2639148
## 1.000000e-01 0.4112798 0.8366091 0.2649545
## 1.149757e-01 0.4131945 0.8357463 0.2664167
## 1.321941e-01 0.4154535 0.8347747 0.2679420
## 1.519911e-01 0.4180807 0.8337060 0.2695906
## 1.747528e-01 0.4211565 0.8325266 0.2713962
## 2.009233e-01 0.4247047 0.8312623 0.2732491
## 2.310130e-01 0.4288507 0.8298816 0.2756692
## 2.656088e-01 0.4336318 0.8284146 0.2783842
## 3.053856e-01 0.4392095 0.8268193 0.2821319
## 3.511192e-01 0.4456193 0.8251410 0.2871149
## 4.037017e-01 0.4530585 0.8233282 0.2932978
## 4.641589e-01 0.4615570 0.8214173 0.3000132
## 5.336699e-01 0.4713292 0.8193500 0.3074760
## 6.135907e-01 0.4823530 0.8171708 0.3161302
## 7.054802e-01 0.4948446 0.8148145 0.3266242
## 8.111308e-01 0.5086870 0.8123417 0.3382892
## 9.326033e-01 0.5240765 0.8096750 0.3516099
## 1.072267e+00 0.5407616 0.8068982 0.3662688
## 1.232847e+00 0.5589158 0.8039290 0.3820512
## 1.417474e+00 0.5781341 0.8008807 0.3983111
## 1.629751e+00 0.5985783 0.7976640 0.4153313
## 1.873817e+00 0.6196994 0.7944226 0.4326611
## 2.154435e+00 0.6416719 0.7910573 0.4502647
## 2.477076e+00 0.6638320 0.7877400 0.4676218
## 2.848036e+00 0.6863951 0.7843578 0.4855899
## 3.274549e+00 0.7086327 0.7810992 0.5037717
## 3.764936e+00 0.7308209 0.7778372 0.5218853
## 4.328761e+00 0.7522211 0.7747642 0.5394131
## 4.977024e+00 0.7731775 0.7717407 0.5567872
## 5.722368e+00 0.7929908 0.7689511 0.5733240
## 6.579332e+00 0.8120668 0.7662489 0.5891804
## 7.564633e+00 0.8297814 0.7638016 0.6040943
## 8.697490e+00 0.8465818 0.7614629 0.6184167
## 1.000000e+01 0.8619390 0.7593783 0.6314888
## 1.149757e+01 0.8763148 0.7574088 0.6438188
## 1.321941e+01 0.8892787 0.7556766 0.6549258
## 1.519911e+01 0.9012807 0.7540554 0.6653110
## 1.747528e+01 0.9119814 0.7526450 0.6745576
## 2.009233e+01 0.9217977 0.7513349 0.6830555
## 2.310130e+01 0.9304680 0.7502052 0.6905450
## 2.656088e+01 0.9383625 0.7491621 0.6973427
## 3.053856e+01 0.9452826 0.7482689 0.7033578
## 3.511192e+01 0.9515459 0.7474481 0.7088582
## 4.037017e+01 0.9570030 0.7467491 0.7137104
## 4.641589e+01 0.9619188 0.7461091 0.7180730
## 5.336699e+01 0.9661815 0.7455663 0.7218504
## 6.135907e+01 0.9700070 0.7450708 0.7252354
## 7.054802e+01 0.9733119 0.7446520 0.7281565
## 8.111308e+01 0.9762694 0.7442705 0.7307675
## 9.326033e+01 0.9788170 0.7439489 0.7330148
## 1.072267e+02 0.9810916 0.7436564 0.7350195
## 1.232847e+02 0.9830467 0.7434103 0.7367415
## 1.417474e+02 0.9847893 0.7431867 0.7382753
## 1.629751e+02 0.9862845 0.7429989 0.7395907
## 1.873817e+02 0.9876155 0.7428285 0.7407610
## 2.154435e+02 0.9887560 0.7426854 0.7417636
## 2.477076e+02 0.9897702 0.7425557 0.7426547
## 2.848036e+02 0.9906385 0.7424470 0.7434174
## 3.274549e+02 0.9914100 0.7423485 0.7440949
## 3.764936e+02 0.9920699 0.7422659 0.7446743
## 4.328761e+02 0.9926560 0.7421911 0.7451887
## 4.977024e+02 0.9931571 0.7421285 0.7456285
## 5.722368e+02 0.9936018 0.7420717 0.7460187
## 6.579332e+02 0.9939819 0.7420242 0.7463522
## 7.564633e+02 0.9943191 0.7419812 0.7466481
## 8.697490e+02 0.9956515 0.7419572 0.7478131
## 1.000000e+03 0.9965576 NaN 0.7486110
##
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.0869749.
##
## $rmse
## [1] 0.4474431
##
## $r2
## [1] 0.7972915
##
## $mae
## [1] 0.3129922
##
## $mape
## [1] 114.7232
# Create a data.frame for each model with its metrics (values transcribed from earlier runs)
metricas_ridge <- data.frame(
Modelo = "ModeloRidge",
RMSE = 0.450372,
R_Cuadrado = 0.7946351,
MAE = 0.3180307,
MAPE = 119.3093
)
metricas_lasso <- data.frame(
Modelo = "ModeloLasso",
RMSE = 0.436021,
R_Cuadrado = 0.8075008,
MAE = 0.2975574,
MAPE = 108.7644
)
metricas_random_ridge <- data.frame(
Modelo = "Random Ridge",
RMSE = 0.4474431,
R_Cuadrado = 0.7972915,
MAE = 0.3129922,
MAPE = 114.7232
)
# Combine the metrics into a single data.frame
metricas_combinadas <- rbind(metricas_ridge, metricas_lasso, metricas_random_ridge)
# Show the combined table
print(metricas_combinadas)
## Modelo RMSE R_Cuadrado MAE MAPE
## 1 ModeloRidge 0.4503720 0.7946351 0.3180307 119.3093
## 2 ModeloLasso 0.4360210 0.8075008 0.2975574 108.7644
## 3 Random Ridge 0.4474431 0.7972915 0.3129922 114.7232
In the model-training section, since the response variable (G3, the final grade) is a continuous numeric variable, this becomes a regression problem rather than a classification one, so three regression models are applied in order to choose the one that best predicts the academic performance of secondary-school students from two schools in Portugal. Analyzing the metrics of the hybrid models, we conclude that the model that best predicts academic performance is the GRID LASSO REGRESSION model, since it has the lowest RMSE (0.436) and the highest R-squared (0.8075), explaining about 80.8% of the variability, unlike the other models. (All metrics are computed on the standardized scale of G3; in particular, the large MAPE values reflect the near-zero denominators discussed earlier.) It should also be noted that the variables with the greatest influence on the final grade are: family relationships (famrel), absences, G1, and G2.
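The claim about the most influential variables can be backed by inspecting which LASSO coefficients remain nonzero at the selected lambda; a minimal sketch:
# Coefficients of the final LASSO model at the chosen lambda;
# variables shrunk exactly to zero were effectively dropped
coef(lasso_model$finalModel, s = lasso_model$bestTune$lambda)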
PLOT OF ACTUAL VS PREDICTED VALUES
# Load plotly if it is not already available
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
  library(plotly)
}
# Make sure dplyr or magrittr is loaded for the pipe operator (%>%)
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
  library(dplyr)
}
datos_ridge <- data.frame(Real = y_test, Prediccion = predictions_ridge)
datos_lasso <- data.frame(Real = y_test, Prediccion = predictions_lasso)
# Helper function to build an actual-vs-predicted scatter plot
crear_grafico <- function(datos, titulo) {
  plot_ly(datos, x = ~Real, y = ~Prediccion, type = 'scatter', mode = 'markers') %>%
    add_trace(x = ~Real, y = ~Real, mode = 'lines', type = 'scatter', name = 'Ideal Line') %>%
    layout(title = titulo,
           xaxis = list(title = 'Actual Values'),
           yaxis = list(title = 'Predictions'))
}
# Build the plots
grafico_ridge <- crear_grafico(datos_ridge, 'Ridge Regression')
grafico_lasso <- crear_grafico(datos_lasso, 'Lasso Regression')
# Display the plots
grafico_ridge
grafico_lasso
Considering the scatter plots, where the values predicted by the Lasso regression line up more closely with the actual values than those predicted by the Ridge regression, we can conclude that, for this specific dataset (academic performance of secondary-school students at two schools in Portugal), Lasso regression appears to be the more effective predictive model. This conclusion rests on the apparently higher precision of the Lasso predictions, evidenced by a tighter distribution of points around the perfect-prediction line in the scatter plot.