This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
##      speed           dist
##  Min.   : 4.0   Min.   :  2.00
##  1st Qu.:12.0   1st Qu.: 26.00
##  Median :15.0   Median : 36.00
##  Mean   :15.4   Mean   : 42.98
##  3rd Qu.:19.0   3rd Qu.: 56.00
##  Max.   :25.0   Max.   :120.00
Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector ϵ of length n = 100.
set.seed(123)
n <- 100
X <- rnorm(n)
epsilon <- rnorm(n)
Generate a response vector Y of length n = 100 according to the model Y = β0 + β1·X + β2·X^2 + β3·X^3 + ϵ, where β0, β1, β2, and β3 are constants of your choice.
beta0 <- 3
beta1 <- 2
beta2 <- -1
beta3 <- 0.5
Y <- beta0 + beta1 * X + beta2 * X^2 + beta3 * X^3 + epsilon
Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors X, X^2, ..., X^10. What is the best model obtained according to Cp, BIC, and adjusted R^2? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both X and Y.
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.3
df = data.frame(y = Y, x = X)
fit = regsubsets(y ~ poly(x, 10, raw = T), data = df, nvmax = 10)
sub_summary = summary(fit)
bm_Cp <- which.min(sub_summary$cp)
bm_BIC <- which.min(sub_summary$bic)
bm_adjR2 <- which.max(sub_summary$adjr2)
cat("Best model based on Cp: ", bm_Cp, "\n")
## Best model based on Cp: 3
cat("Best model based on BIC: ", bm_BIC, "\n")
## Best model based on BIC: 3
cat("Best model based on Adjusted R^2: ", bm_adjR2, "\n")
## Best model based on Adjusted R^2: 6
bm <- bm_Cp
coef(fit, bm)
## (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2
## 2.9703939 1.9204462 -1.0915431
## poly(x, 10, raw = T)3
## 0.5204363
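Since the data are simulated, these estimates can be checked directly against the truth; as a quick sanity check (not part of the exercise output above), the following lines up the fitted coefficients with the values used to generate Y:

# Compare the selected model's estimates with the true simulation coefficients
rbind(true = c(beta0, beta1, beta2, beta3),
      estimated = coef(fit, bm))

The estimates (2.97, 1.92, -1.09, 0.52) sit close to the true values (3, 2, -1, 0.5).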
plot(sub_summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l")
points(bm_Cp, sub_summary$cp[bm_Cp], pch = 4, col = "red", lwd = 7)
plot(sub_summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l")
points(bm_BIC, sub_summary$bic[bm_BIC], pch = 4, col = "red", lwd = 7)
plot(sub_summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20, type = "l")
points(bm_adjR2, sub_summary$adjr2[bm_adjR2], pch = 4, col = "red", lwd = 7)
Repeat (c), using forward stepwise selection and also using backward stepwise selection. How does your answer compare to the results in (c)?
fwd_mod = regsubsets(Y ~ poly(X, 10, raw = T), data = df, nvmax = 10, method = "forward")
summ_fwd = summary(fwd_mod)
bwd_mod = regsubsets(Y ~ poly(X, 10, raw = T), data = df, nvmax = 10, method = "backward")
summ_bwd = summary(bwd_mod)
which.min(summ_fwd$cp)
## [1] 3
which.min(summ_bwd$cp)
## [1] 6
which.min(summ_fwd$bic)
## [1] 3
which.min(summ_bwd$bic)
## [1] 6
which.max(summ_fwd$adjr2)
## [1] 4
which.max(summ_bwd$adjr2)
## [1] 6
par(mfrow = c(3, 2))
plot(summ_fwd$cp, xlab = "Subset Size", ylab = "Forward Cp", pch = 20, type = "l")
points(which.min(summ_fwd$cp), summ_fwd$cp[which.min(summ_fwd$cp)], pch = 4, col = "red", lwd = 7)
plot(summ_bwd$cp, xlab = "Subset Size", ylab = "Backward Cp", pch = 20, type = "l")
points(which.min(summ_bwd$cp), summ_bwd$cp[which.min(summ_bwd$cp)], pch = 4, col = "red", lwd = 7)
plot(summ_fwd$bic, xlab = "Subset Size", ylab = "Forward BIC", pch = 20, type = "l")
points(which.min(summ_fwd$bic), summ_fwd$bic[which.min(summ_fwd$bic)], pch = 4, col = "red", lwd = 7)
plot(summ_bwd$bic, xlab = "Subset Size", ylab = "Backward BIC", pch = 20, type = "l")
points(which.min(summ_bwd$bic), summ_bwd$bic[which.min(summ_bwd$bic)], pch = 4, col = "red", lwd = 7)
plot(summ_fwd$adjr2, xlab = "Subset Size", ylab = "Forward Adjusted R²", pch = 20, type = "l")
points(which.max(summ_fwd$adjr2), summ_fwd$adjr2[which.max(summ_fwd$adjr2)], pch = 4, col = "red", lwd = 7)
plot(summ_bwd$adjr2, xlab = "Subset Size", ylab = "Backward Adjusted R²", pch = 20, type = "l")
points(which.max(summ_bwd$adjr2), summ_bwd$adjr2[which.max(summ_bwd$adjr2)], pch = 4, col = "red", lwd = 7)
coefficients(fwd_mod, id = 3)
## (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2
## 2.9703939 1.9204462 -1.0915431
## poly(X, 10, raw = T)3
## 0.5204363
coefficients(bwd_mod, id = 6)
## (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)4
## 2.90032830 2.11707619 -1.53472963
## poly(X, 10, raw = T)5 poly(X, 10, raw = T)6 poly(X, 10, raw = T)7
## 0.30756427 0.57013129 -0.04513887
## poly(X, 10, raw = T)8
## -0.06125841
coefficients(fwd_mod, id = 4)
## (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2
## 3.040475559 1.928367169 -1.253869916
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)6
## 0.523397035 0.009248115
Forward stepwise agrees with best subset selection: Cp and BIC again point to the 3-variable model containing X, X^2, and X^3, with coefficients essentially identical to those reported in (c), and its adjusted-R^2 choice merely adds an X^6 term whose coefficient (0.009) is negligible. Backward stepwise behaves differently, settling on 6-variable models built from X and the higher-degree terms X^4 through X^8; those extra coefficients add little beyond the true cubic structure. The 3-predictor model remains the most defensible summary of the data.
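One way to see the disagreement concretely is to compare the selection matrices, which mark the terms included at each subset size; this is a small sketch reusing the summary objects computed above:

# Which polynomial terms enter at each model size, by method
sub_summary$outmat   # best subset
summ_fwd$outmat      # forward stepwise
summ_bwd$outmat      # backward stepwise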
Now fit a lasso model to the simulated data, again using X, X^2, ..., X^10 as predictors. Use cross-validation to select the optimal value of λ. Create plots of the cross-validation error as a function of λ. Report the resulting coefficient estimates, and discuss the results obtained.
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
xmat = model.matrix(Y ~ poly(X, 10, raw = T), data = df)[, -1]
lasso_mod = cv.glmnet(xmat, Y, alpha = 1)
lambda_best = lasso_mod$lambda.min
lambda_best
## [1] 0.052965
plot(lasso_mod)
best_mod = glmnet(xmat, Y, alpha = 1)
predict(best_mod, s = lambda_best, type = "coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 2.9284572
## poly(X, 10, raw = T)1 1.8857588
## poly(X, 10, raw = T)2 -1.0324006
## poly(X, 10, raw = T)3 0.5048784
## poly(X, 10, raw = T)4 .
## poly(X, 10, raw = T)5 .
## poly(X, 10, raw = T)6 .
## poly(X, 10, raw = T)7 .
## poly(X, 10, raw = T)8 .
## poly(X, 10, raw = T)9 .
## poly(X, 10, raw = T)10 .
The lasso recovers essentially the true model: it shrinks the higher-order terms (X^4 through X^10) exactly to zero while retaining X, X^2, and X^3 with estimates close to the true coefficients (2, -1, 0.5) and an intercept near 3. This yields a simpler, more interpretable fit with a good balance between model complexity and predictive performance.
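To visualize how the lasso arrives at this model, the coefficient paths can be plotted against log(λ); this sketch reuses the glmnet fit and the CV-selected λ from above:

# Coefficient paths: each curve is one polynomial term; the dashed line
# marks the cross-validation choice of lambda
plot(best_mod, xvar = "lambda", label = TRUE)
abline(v = log(lambda_best), lty = 2)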
Now generate a response vector Y according to the model Y = β0 + β7·X^7 + ϵ, and perform best subset selection and the lasso. Discuss the results obtained.
beta7 = 7
beta0 = 3
Y = beta0 + beta7 * X^7 + epsilon
df = data.frame(y = Y, x = X)
full_mod = regsubsets(Y ~ poly(X, 10, raw = T), data = df, nvmax = 10)
mod.summary = summary(full_mod)
which.min(mod.summary$cp)
## [1] 1
which.min(mod.summary$bic)
## [1] 1
which.max(mod.summary$adjr2)
## [1] 6
# Coefficients for the best model
coefficients(full_mod, id = 1)
## (Intercept) poly(X, 10, raw = T)7
## 2.893251 6.999704
coefficients(full_mod, id = 6)
## (Intercept) poly(X, 10, raw = T)2 poly(X, 10, raw = T)4
## 2.79358642 2.30456710 -4.10732161
## poly(X, 10, raw = T)6 poly(X, 10, raw = T)7 poly(X, 10, raw = T)8
## 2.27693175 7.00151236 -0.50477246
## poly(X, 10, raw = T)10
## 0.03899314
xmat <- model.matrix(y ~ poly(x, 10, raw = TRUE), data = df)[, -1]
library(glmnet)
lasso_mod <- cv.glmnet(xmat, Y, alpha = 1)
lambda_best <- lasso_mod$lambda.min
lambda_best
## [1] 10.70277
best_mod <- glmnet(xmat, Y, alpha = 1)
coef(best_mod, s = lambda_best)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.444221
## poly(x, 10, raw = TRUE)1 .
## poly(x, 10, raw = TRUE)2 .
## poly(x, 10, raw = TRUE)3 .
## poly(x, 10, raw = TRUE)4 .
## poly(x, 10, raw = TRUE)5 .
## poly(x, 10, raw = TRUE)6 .
## poly(x, 10, raw = TRUE)7 6.795659
## poly(x, 10, raw = TRUE)8 .
## poly(x, 10, raw = TRUE)9 .
## poly(x, 10, raw = TRUE)10 .
For best subset selection, Cp and BIC both choose the single-predictor model containing only X^7, with estimates (intercept 2.89, slope 7.00) very close to the true values β0 = 3 and β7 = 7; adjusted R^2 again favors a larger 6-variable model whose extra terms are not part of the true model. The lasso reaches the same conclusion: it retains X^7 alone, with a coefficient of approximately 6.80, and shrinks every other term exactly to zero. The relatively large CV-selected λ (≈ 10.7) accounts for the slight downward shrinkage of the X^7 estimate.
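A more conservative alternative is glmnet's lambda.1se rule, the largest λ whose CV error is within one standard error of the minimum; it would be expected to give the same or an even sparser set of terms here:

# Coefficients under the one-standard-error rule (typically sparser)
coef(lasso_mod, s = "lambda.1se")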
Split the data set into a training set and a test set.
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.2
set.seed(11)
sum(is.na(College))
## [1] 0
# Split the data (50% training, 50% testing)
train_size = floor(nrow(College) / 2)
train = sample(1:nrow(College), train_size)
test = -train
Col_train = College[train, ]
Col_test = College[test, ]
Fit a linear model using least squares on the training set, and report the test error obtained.
lm_fit = lm(Enroll ~ Private + Outstate + Room.Board + Books, data = Col_train)
lm_pred = predict(lm_fit, Col_test)
te_lm = mean((Col_test[, "Enroll"] - lm_pred)^2)
te_lm
## [1] 590967
The test error for the linear model (OLS) is 590,967.
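To put that number in context, a simple baseline helps: the test MSE of predicting the training-set mean enrollment for every college. Any of the fitted models should improve on this:

# Intercept-only baseline: predict the mean training enrollment for every test college
mean((Col_test$Enroll - mean(Col_train$Enroll))^2)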
Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
library(glmnet)
train_mat = model.matrix(Enroll ~ Private + Outstate + Room.Board + Books, data = Col_train)
test_mat = model.matrix(Enroll ~ Private + Outstate + Room.Board + Books, data = Col_test)
grid = 10 ^ seq(4, -2, length = 100)
ridge_mod = cv.glmnet(train_mat, Col_train[, "Enroll"], alpha = 0, lambda = grid, thresh = 1e-12)
lambda.best_ridge = ridge_mod$lambda.min
ridge_pred = predict(ridge_mod, newx = test_mat, s = lambda.best_ridge)
te_ridge = mean((Col_test[, "Enroll"] - ridge_pred)^2)
te_ridge
## [1] 591418.2
The test error for ridge regression (591,418.2) is slightly higher than the test error for the linear model (590,967), indicating that the linear model performed slightly better on this dataset.
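Since ridge regression keeps every predictor and only shrinks the coefficients, it can be instructive to extract them at the CV-selected λ and compare them with the OLS estimates; a one-line sketch:

# Ridge coefficients at lambda.min (all predictors retained, shrunken toward zero)
predict(ridge_mod, s = lambda.best_ridge, type = "coefficients")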
Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
lasso_mod = cv.glmnet(train_mat, Col_train[, "Enroll"], alpha = 1, lambda = grid, thresh = 1e-12)
lambda.best_lasso = lasso_mod$lambda.min
lasso_pred = predict(lasso_mod, newx = test_mat, s = lambda.best_lasso)
te_lasso = mean((Col_test[, "Enroll"] - lasso_pred)^2)
te_lasso
## [1] 591488.3
coef_lasso = predict(lasso_mod, s = lambda.best_lasso, type = "coefficients")
coef_lasso
## 6 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 8.783334e+02
## (Intercept) .
## PrivateYes -1.337981e+03
## Outstate 3.910309e-02
## Room.Board 5.272033e-02
## Books 4.403111e-01
The test error for the lasso regression is 591,488.3, essentially the same as ridge and slightly higher than the linear model (590,967). Contrary to what the sparse printout might suggest, no predictor is actually shrunk to zero here: the "." row is the redundant intercept column that model.matrix adds, and all four predictors (PrivateYes, Outstate, Room.Board, Books) retain non-zero coefficients, so the lasso performs no real feature selection on this problem.
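The count of non-zero estimates asked for in the question can also be read off programmatically from the sparse coefficient matrix:

# Number of non-zero coefficient estimates, excluding the intercept row
sum(coef_lasso != 0) - 1

Here this gives 4: all of PrivateYes, Outstate, Room.Board, and Books are retained.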
Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
library(pls)
## Warning: package 'pls' was built under R version 4.4.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
pcr_fit = pcr(Enroll ~ Private + Outstate + Room.Board + Books, data = Col_train, scale = T, validation = "CV")
validationplot(pcr_fit, val.type = "MSEP")
# Choose M by cross-validation: the number of components minimizing the CV PRESS statistic
optimal_components = which.min(pcr_fit$validation$PRESS)
print(optimal_components)
## [1] 4
# Predict with the best number of components (optimal_components) on the test set
pcr_pred = predict(pcr_fit, Col_test, ncomp = optimal_components)
# Calculate test error (Mean Squared Error)
te_pcr = mean((Col_test[, "Enroll"] - pcr_pred)^2)
te_pcr
## [1] 590967
The PCR test error is 590,967, identical to the least squares result. This is expected: cross-validation selects M = 4, i.e. all four principal components, and PCR using every component reproduces the OLS fit exactly.
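As an aside, newer versions of the pls package also provide selectNcomp() for a more formal choice of M; a sketch using the one-sigma heuristic (which may pick fewer components than the minimum-PRESS rule used above):

# Smallest M whose CV error is within one standard error of the minimum
selectNcomp(pcr_fit, method = "onesigma", plot = TRUE)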
Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
library(pls)
pls_fit = plsr(Enroll ~ Private + Outstate + Room.Board + Books, data = Col_train, scale = TRUE, validation = "CV")
validationplot(pls_fit, val.type = "MSEP")
optimal_M = which.min(pls_fit$validation$PRESS)
pls_pred = predict(pls_fit, Col_test, ncomp = optimal_M)
te_pls = mean((Col_test[, "Enroll"] - pls_pred)^2)
te_pls
## [1] 590967
optimal_M
## [1] 4
The validation plot shows the cross-validated MSEP decreasing as components are added, with the minimum at M = 4. As with PCR, a PLS fit using all four components reproduces the least squares solution, which is why the test error again equals 590,967.
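It also helps to express this error on the scale of the response: the square root of the test MSE is the typical size of a prediction error in students.

# Test RMSE: typical prediction error in number of enrolled students
sqrt(te_pls)

This comes to roughly 769 students per college.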
Comment on the results obtained. How accurately can we predict college enrollment? Is there much difference among the test errors resulting from these five approaches?
test_errors <- c(te_lm, te_ridge, te_lasso, te_pcr, te_pls)
models <- c("OLS", "Ridge", "Lasso", "PCR", "PLS")
# Note: the y-axis is truncated to make the small differences visible;
# xpd = FALSE clips the bars to the plotting region.
barplot(test_errors,
        names.arg = models,
        col = "lightblue",
        main = "Comparison of Test Errors by Model",
        ylab = "Test Error (MSE)",
        ylim = c(590000, 592000), xpd = FALSE)
The bar chart (drawn with a truncated y-axis so the differences are visible at all) shows that the five test errors are nearly identical, differing by less than 0.1%. OLS, PCR, and PLS coincide at 590,967, with ridge and lasso marginally higher, so with only these four predictors regularization and dimension reduction offer no real advantage over ordinary least squares for predicting enrollment.
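A complementary way to answer "how accurately" is the test R^2, the fraction of test-set variance in Enroll that each model explains; a short sketch using the errors computed above:

# Test R^2 for each approach: 1 - MSE / variance of the test response
test_var <- mean((Col_test$Enroll - mean(Col_test$Enroll))^2)
round(1 - c(OLS = te_lm, Ridge = te_ridge, Lasso = te_lasso,
            PCR = te_pcr, PLS = te_pls) / test_var, 4)

Values near 1 would indicate near-perfect prediction; near-identical values across methods would confirm that the five approaches are essentially interchangeable here.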