# Step 1: Simulate a small regression data set
# The seed is fixed so that every run draws exactly the same numbers.
set.seed(123)
n <- 200

# Informative predictors: the response below is built from these two only
x1 <- rnorm(n)
x2 <- rnorm(n)

# Response: linear signal in x1 and x2 plus standard normal error
y <- (3 * x1) + (2 * x2) + rnorm(n)

# Irrelevant predictors: independent draws that never enter the formula for y
noise1 <- rnorm(n)
noise2 <- rnorm(n)
noise3 <- rnorm(n)
noise4 <- rnorm(n)

# Collect the response and all six candidate predictors in one data frame
data <- data.frame(
  y = y,
  x1 = x1,
  x2 = x2,
  noise1 = noise1,
  noise2 = noise2,
  noise3 = noise3,
  noise4 = noise4
)
summary(data)
## y x1 x2 noise1
## Min. :-7.42464 Min. :-2.30917 Min. :-2.46590 Min. :-2.601700
## 1st Qu.:-2.41935 1st Qu.:-0.62576 1st Qu.:-0.59077 1st Qu.:-0.693147
## Median : 0.03947 Median :-0.05874 Median : 0.02283 Median : 0.002188
## Mean : 0.09031 Mean :-0.00857 Mean : 0.04212 Mean :-0.021885
## 3rd Qu.: 2.04531 3rd Qu.: 0.56840 3rd Qu.: 0.71482 3rd Qu.: 0.643251
## Max. :13.25650 Max. : 3.24104 Max. : 2.57146 Max. : 2.691714
## noise2 noise3 noise4
## Min. :-2.50792 Min. :-2.54934 Min. :-2.313736
## 1st Qu.:-0.66866 1st Qu.:-0.66293 1st Qu.:-0.748365
## Median : 0.02479 Median : 0.02223 Median :-0.005609
## Mean : 0.03720 Mean : 0.05038 Mean :-0.031375
## 3rd Qu.: 0.66781 3rd Qu.: 0.59483 3rd Qu.: 0.641604
## Max. : 2.68486 Max. : 3.18404 Max. : 3.390371
# Step 2: Fit a linear model that uses only the two relevant predictors
model_good <- lm(formula = y ~ x1 + x2, data = data)
summary(model_good)
##
## Call:
## lm(formula = y ~ x1 + x2, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.86923 -0.57711 0.04882 0.66331 2.41531
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03363 0.06851 0.491 0.624
## x1 2.96759 0.07278 40.776 <2e-16 ***
## x2 1.94934 0.06892 28.283 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9679 on 197 degrees of freedom
## Multiple R-squared: 0.9242, Adjusted R-squared: 0.9234
## F-statistic: 1200 on 2 and 197 DF, p-value: < 2.2e-16
# Step 3: Fit a linear model on every column of `data` (noise included);
# the `.` on the right-hand side stands for all columns other than y
model_bad <- lm(formula = y ~ ., data = data)
summary(model_bad)
##
## Call:
## lm(formula = y ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8430 -0.5863 0.0751 0.6334 2.3761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03484 0.06933 0.503 0.616
## x1 2.96866 0.07382 40.217 <2e-16 ***
## x2 1.94762 0.07065 27.568 <2e-16 ***
## noise1 0.03008 0.06801 0.442 0.659
## noise2 -0.02275 0.06846 -0.332 0.740
## noise3 0.02789 0.06943 0.402 0.688
## noise4 0.03277 0.07388 0.444 0.658
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.976 on 193 degrees of freedom
## Multiple R-squared: 0.9245, Adjusted R-squared: 0.9221
## F-statistic: 393.7 on 6 and 193 DF, p-value: < 2.2e-16
# Step 4: Visual comparison of each model's in-sample predictions against
# the observed response y
pred_good <- predict(model_good)
pred_bad <- predict(model_bad)

# Plot 1: both models overlaid (blue = relevant variables, red = all).
# The y-axis is sized from both prediction vectors so that no red point is
# silently clipped when it falls outside the range of the blue ones.
pred_range <- range(pred_good, pred_bad)
plot(y, pred_good, col = "blue", pch = 19, ylim = pred_range,
     xlab = "True values", ylab = "Predictions")
points(y, pred_bad, col = "red", pch = 19)
legend("topleft", legend = c("Relevant variables", "All variables"),
       col = c("blue", "red"), pch = 19)

# Plot 2: model fitted on the relevant variables only
plot(y, pred_good, col = "blue", pch = 19,
     xlab = "True values", ylab = "Predictions")
legend("topleft", legend = "Relevant variables",
       col = "blue", pch = 19)

# Plot 3: model fitted on all variables. The legend label matches the one
# used for the red series in plot 1 (it previously read "Relevant variables").
plot(y, pred_bad, col = "red", pch = 19,
     xlab = "True values", ylab = "Predictions")
legend("topleft", legend = "All variables",
       col = "red", pch = 19)

# Find the best model using AIC (variable selection), starting the search
# from the model that contains every variable
step(object = model_bad)
## Start: AIC=-2.86
## y ~ x1 + x2 + noise1 + noise2 + noise3 + noise4
##
## Df Sum of Sq RSS AIC
## - noise2 1 0.11 183.94 -4.75
## - noise3 1 0.15 183.98 -4.69
## - noise1 1 0.19 184.02 -4.66
## - noise4 1 0.19 184.02 -4.66
## <none> 183.83 -2.86
## - x2 1 723.88 907.71 314.52
## - x1 1 1540.54 1724.37 442.86
##
## Step: AIC=-4.75
## y ~ x1 + x2 + noise1 + noise3 + noise4
##
## Df Sum of Sq RSS AIC
## - noise4 1 0.15 184.09 -6.58
## - noise3 1 0.18 184.12 -6.55
## - noise1 1 0.21 184.14 -6.52
## <none> 183.94 -4.75
## - x2 1 727.67 911.60 313.38
## - x1 1 1543.54 1727.48 441.22
##
## Step: AIC=-6.58
## y ~ x1 + x2 + noise1 + noise3
##
## Df Sum of Sq RSS AIC
## - noise1 1 0.21 184.30 -8.35
## - noise3 1 0.25 184.34 -8.31
## <none> 184.09 -6.58
## - x2 1 741.30 925.38 314.38
## - x1 1 1550.47 1734.56 440.04
##
## Step: AIC=-8.35
## y ~ x1 + x2 + noise3
##
## Df Sum of Sq RSS AIC
## - noise3 1 0.26 184.56 -10.06
## <none> 184.30 -8.35
## - x2 1 748.14 932.44 313.90
## - x1 1 1552.01 1736.32 438.24
##
## Step: AIC=-10.06
## y ~ x1 + x2
##
## Df Sum of Sq RSS AIC
## <none> 184.56 -10.06
## - x2 1 749.45 934.02 312.24
## - x1 1 1557.75 1742.32 436.93
##
## Call:
## lm(formula = y ~ x1 + x2, data = data)
##
## Coefficients:
## (Intercept) x1 x2
## 0.03363 2.96759 1.94934