# Step 1: Simulate a small data set
# Fix the RNG seed so every draw below is reproducible.
set.seed(123)
n <- 200

# True predictors: two standard-normal vectors of length n
x1 <- rnorm(n, mean = 0, sd = 1)
x2 <- rnorm(n, mean = 0, sd = 1)

# Response: linear signal in x1 and x2 plus standard-normal error
y <- 3 * x1 + 2 * x2 + rnorm(n, mean = 0, sd = 1)

# Irrelevant predictors (pure noise): drawn after y, so y does not depend on
# them
noise1 <- rnorm(n, mean = 0, sd = 1)
noise2 <- rnorm(n, mean = 0, sd = 1)
noise3 <- rnorm(n, mean = 0, sd = 1)
noise4 <- rnorm(n, mean = 0, sd = 1)

# Collect the response and all six candidate predictors in one data frame,
# then print per-column summary statistics.
data <- data.frame(
  y = y,
  x1 = x1,
  x2 = x2,
  noise1 = noise1,
  noise2 = noise2,
  noise3 = noise3,
  noise4 = noise4
)
summary(object = data)
##        y                  x1                 x2               noise1         
##  Min.   :-7.42464   Min.   :-2.30917   Min.   :-2.46590   Min.   :-2.601700  
##  1st Qu.:-2.41935   1st Qu.:-0.62576   1st Qu.:-0.59077   1st Qu.:-0.693147  
##  Median : 0.03947   Median :-0.05874   Median : 0.02283   Median : 0.002188  
##  Mean   : 0.09031   Mean   :-0.00857   Mean   : 0.04212   Mean   :-0.021885  
##  3rd Qu.: 2.04531   3rd Qu.: 0.56840   3rd Qu.: 0.71482   3rd Qu.: 0.643251  
##  Max.   :13.25650   Max.   : 3.24104   Max.   : 2.57146   Max.   : 2.691714  
##      noise2             noise3             noise4         
##  Min.   :-2.50792   Min.   :-2.54934   Min.   :-2.313736  
##  1st Qu.:-0.66866   1st Qu.:-0.66293   1st Qu.:-0.748365  
##  Median : 0.02479   Median : 0.02223   Median :-0.005609  
##  Mean   : 0.03720   Mean   : 0.05038   Mean   :-0.031375  
##  3rd Qu.: 0.66781   3rd Qu.: 0.59483   3rd Qu.: 0.641604  
##  Max.   : 2.68486   Max.   : 3.18404   Max.   : 3.390371
# Step 2: Fit a model using only the relevant variables
# Ordinary least squares of y on x1 and x2, the two predictors that were used
# to generate y; the noise columns are left out.
model_good <- lm(formula = y ~ x1 + x2, data = data)
summary(object = model_good)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.86923 -0.57711  0.04882  0.66331  2.41531 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.03363    0.06851   0.491    0.624    
## x1           2.96759    0.07278  40.776   <2e-16 ***
## x2           1.94934    0.06892  28.283   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9679 on 197 degrees of freedom
## Multiple R-squared:  0.9242, Adjusted R-squared:  0.9234 
## F-statistic:  1200 on 2 and 197 DF,  p-value: < 2.2e-16
# Step 3: Fit a model using every variable (noise included)
# The `.` on the right-hand side expands to all columns of `data` other than
# the response, i.e. x1, x2 and noise1..noise4.
model_bad <- lm(formula = y ~ ., data = data)
summary(object = model_bad)
## 
## Call:
## lm(formula = y ~ ., data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8430 -0.5863  0.0751  0.6334  2.3761 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.03484    0.06933   0.503    0.616    
## x1           2.96866    0.07382  40.217   <2e-16 ***
## x2           1.94762    0.07065  27.568   <2e-16 ***
## noise1       0.03008    0.06801   0.442    0.659    
## noise2      -0.02275    0.06846  -0.332    0.740    
## noise3       0.02789    0.06943   0.402    0.688    
## noise4       0.03277    0.07388   0.444    0.658    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.976 on 193 degrees of freedom
## Multiple R-squared:  0.9245, Adjusted R-squared:  0.9221 
## F-statistic: 393.7 on 6 and 193 DF,  p-value: < 2.2e-16
# Step 4: Visual comparison
# Called without `newdata`, predict() returns each model's fitted values for
# the rows it was trained on.
pred_good <- predict(model_good)
pred_bad <- predict(model_bad)

# Overlay of both models. The y-axis range spans both prediction vectors so
# that the red points added afterwards with points() cannot fall outside the
# plotting region.
plot(y, pred_good, col = "blue", pch = 19,
     xlab = "True values", ylab = "Predictions",
     ylim = range(pred_good, pred_bad))
points(y, pred_bad, col = "red", pch = 19)
legend("topleft", legend = c("Relevant variables", "All variables"),
       col = c("blue", "red"), pch = 19)

# Predictions of the model fitted on the relevant variables only
plot(y, pred_good, col = "blue", pch = 19,
     xlab = "True values", ylab = "Predictions")
legend("topleft", legend = "Relevant variables",
       col = "blue", pch = 19)

# Predictions of the model fitted on all variables; the legend label and
# colour match the "All variables" entry of the overlay plot.
plot(y, pred_bad, col = "red", pch = 19,
     xlab = "True values", ylab = "Predictions")
legend("topleft", legend = "All variables",
       col = "red", pch = 19)

# Find the best model using AIC (variable selection)
# No `scope` is supplied, so the search can only drop terms from the full
# model; the backward direction is spelled out to make that explicit.
step(object = model_bad, direction = "backward")
## Start:  AIC=-2.86
## y ~ x1 + x2 + noise1 + noise2 + noise3 + noise4
## 
##          Df Sum of Sq     RSS    AIC
## - noise2  1      0.11  183.94  -4.75
## - noise3  1      0.15  183.98  -4.69
## - noise1  1      0.19  184.02  -4.66
## - noise4  1      0.19  184.02  -4.66
## <none>                 183.83  -2.86
## - x2      1    723.88  907.71 314.52
## - x1      1   1540.54 1724.37 442.86
## 
## Step:  AIC=-4.75
## y ~ x1 + x2 + noise1 + noise3 + noise4
## 
##          Df Sum of Sq     RSS    AIC
## - noise4  1      0.15  184.09  -6.58
## - noise3  1      0.18  184.12  -6.55
## - noise1  1      0.21  184.14  -6.52
## <none>                 183.94  -4.75
## - x2      1    727.67  911.60 313.38
## - x1      1   1543.54 1727.48 441.22
## 
## Step:  AIC=-6.58
## y ~ x1 + x2 + noise1 + noise3
## 
##          Df Sum of Sq     RSS    AIC
## - noise1  1      0.21  184.30  -8.35
## - noise3  1      0.25  184.34  -8.31
## <none>                 184.09  -6.58
## - x2      1    741.30  925.38 314.38
## - x1      1   1550.47 1734.56 440.04
## 
## Step:  AIC=-8.35
## y ~ x1 + x2 + noise3
## 
##          Df Sum of Sq     RSS    AIC
## - noise3  1      0.26  184.56 -10.06
## <none>                 184.30  -8.35
## - x2      1    748.14  932.44 313.90
## - x1      1   1552.01 1736.32 438.24
## 
## Step:  AIC=-10.06
## y ~ x1 + x2
## 
##        Df Sum of Sq     RSS    AIC
## <none>               184.56 -10.06
## - x2    1    749.45  934.02 312.24
## - x1    1   1557.75 1742.32 436.93
## 
## Call:
## lm(formula = y ~ x1 + x2, data = data)
## 
## Coefficients:
## (Intercept)           x1           x2  
##     0.03363      2.96759      1.94934