library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(leaps)
## Warning: package 'leaps' was built under R version 4.3.3
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
8. In this exercise, we will generate simulated data, and will then use this data to perform best subset selection. (a) Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector ϵ of length n = 100.
set.seed(1)
x <- rnorm(100)
epsilon <- rnorm(100)
beta0 <- 4
beta1 <- 2
beta2 <- 1
beta3 <- -0.5
Y <- beta0 + (beta1 * x) + (beta2 * x^2) + (beta3 * x^3) + epsilon
data.full <- data.frame(y = Y, x = x)
regfit.full <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10)
reg.summary <- summary(regfit.full)
par(mfrow = c(2, 2))
plot(reg.summary$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red", cex = 2, pch = 20)
mtext("Plots of C_p, BIC, and adjusted R^2 for best subset selection", side = 3, line = -2, outer = TRUE)
##We find that with C_p and adjusted r^2 we would pick the 4-variable model, but with BIC we would pick the 3-variable model.The best model has 4 variables.
coef(regfit.full, which.max(reg.summary$adjr2))
## (Intercept) x I(x^2) I(x^3) I(x^5)
## 4.07200775 2.38745596 0.84575641 -0.94202574 0.08072292
regfit.fwd <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10, method = "forward")
reg.summary.fwd <- summary(regfit.fwd)
par(mfrow = c(2, 2))
plot(reg.summary.fwd$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary.fwd$cp), reg.summary.fwd$cp[which.min(reg.summary.fwd$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary.fwd$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary.fwd$bic), reg.summary.fwd$bic[which.min(reg.summary.fwd$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary.fwd$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary.fwd$adjr2), reg.summary.fwd$adjr2[which.max(reg.summary.fwd$adjr2)], col = "red", cex = 2, pch = 20)
mtext("Plots of C_p, BIC and adjusted R^2 for forward stepwise selection", side = 3, line = -2, outer = TRUE)
##We obtain the same results with forward stepwise selection. The model with 4 variables is the best fit.
regfit.bwd <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10, method = "backward")
reg.summary.bwd <- summary(regfit.bwd)
par(mfrow = c(2, 2))
plot(reg.summary.bwd$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary.bwd$cp), reg.summary.bwd$cp[which.min(reg.summary.bwd$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary.bwd$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary.bwd$bic), reg.summary.bwd$bic[which.min(reg.summary.bwd$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary.bwd$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary.bwd$adjr2), reg.summary.bwd$adjr2[which.max(reg.summary.bwd$adjr2)], col = "red", cex = 2, pch = 20)
mtext("Plots of C_p, BIC and adjusted R^2 for backward stepwise selection", side = 3, line = -2, outer = TRUE)
xmat <- model.matrix(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full)[, -1]
cv.lasso <- cv.glmnet(xmat, Y, alpha = 1)
plot(cv.lasso)
bestlam <- cv.lasso$lambda.min
bestlam
## [1] 0.02263167
fit.lasso <- glmnet(xmat, Y, alpha = 1)
predict(fit.lasso, s = bestlam, type = "coefficients")[1:11, ]
## (Intercept) x I(x^2) I(x^3) I(x^4)
## 4.1487840948 1.8829647714 0.7143433309 -0.4586384985 0.0000000000
## I(x^5) I(x^6) I(x^7) I(x^8) I(x^9)
## 0.0000000000 0.0042021942 0.0000000000 0.0006359117 0.0000000000
## I(x^10)
## 0.0000000000
beta7 <- 7
y <- beta0 + beta7 * x^7 + epsilon
data.full <- data.frame(y = y, x = x)
regfit.full <- regsubsets(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full, nvmax = 10)
reg.summary <- summary(regfit.full)
par(mfrow = c(2, 2))
plot(reg.summary$cp, xlab = "Number of variables", ylab = "C_p", type = "l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)
plot(reg.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red", cex = 2, pch = 20)
xmat <- model.matrix(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = data.full)[, -1]
cv.lasso <- cv.glmnet(xmat, y, alpha = 1)
bestlam <- cv.lasso$lambda.min
bestlam
## [1] 12.36884
fit.lasso <- glmnet(xmat, y, alpha = 1)
predict(fit.lasso, s = bestlam, type = "coefficients")[1:11, ]
## (Intercept) x I(x^2) I(x^3) I(x^4) I(x^5)
## 4.820215 0.000000 0.000000 0.000000 0.000000 0.000000
## I(x^6) I(x^7) I(x^8) I(x^9) I(x^10)
## 0.000000 6.796694 0.000000 0.000000 0.000000
9. In this exercise, we will predict the number of applications received using the other variables in the College data set. (a) Split the data set into a training set and a test set.
set.seed(10)
train_set <- sample(1:nrow(College), nrow(College)/2)
test_set <- (-train_set)
train <- College[train_set, ]
test <- College[test_set, ]
lm <- lm(Apps ~ ., data = train)
pred_lm <- predict(lm, test)
mean((pred_lm - test$Apps)^2)
## [1] 1020100
train_matrix <- model.matrix(Apps ~ ., data = train)
test_matrix <- model.matrix(Apps ~ ., data = test)
grid <- 10 ^ seq(4, -2, length = 100)
fit_ridge <- glmnet(train_matrix, train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
cv_ridge <- cv.glmnet(train_matrix, train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
bestlam <- cv_ridge$lambda.min
bestlam
## [1] 0.01
pred_ridge <- predict(fit_ridge, s = bestlam, newx = test_matrix)
mean((pred_ridge - test$Apps)^2)
## [1] 1020090
fit_lasso <- glmnet(train_matrix, train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
cv_lasso <- cv.glmnet(train_matrix, train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
bestlam_lasso <- cv_lasso$lambda.min
bestlam_lasso
## [1] 0.01
pred_lasso <- predict(fit_lasso, s = bestlam_lasso, newx = test_matrix)
mean((pred_lasso - test$Apps)^2)
## [1] 1020097
library(pls)
## Warning: package 'pls' was built under R version 4.3.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
fit_pcr <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(fit_pcr, val.type = "MSEP")
pred_pcr <- predict(fit_pcr, test, ncomp = 10)
mean((pred_pcr - test$Apps)^2)
## [1] 1422699
fit_pls <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(fit_pls, val.type = "MSEP")
pred_pls <- predict(fit_pls, test, ncomp = 10)
mean((pred_pls - test$Apps)^2)
## [1] 1029442
test.avg <- mean(test$Apps)
lm.r2 <- 1 - mean((pred_lm - test$Apps)^2) / mean((test.avg - test$Apps)^2)
ridge.r2 <- 1 - mean((pred_ridge - test$Apps)^2) / mean((test.avg - test$Apps)^2)
lasso.r2 <- 1 - mean((pred_lasso - test$Apps)^2) / mean((test.avg - test$Apps)^2)
pcr.r2 <- 1 - mean((pred_pcr - test$Apps)^2) / mean((test.avg - test$Apps)^2)
pls.r2 <- 1 - mean((pred_pls - test$Apps)^2) / mean((test.avg - test$Apps)^2)
print(lm.r2)
## [1] 0.9076134
print(ridge.r2)
## [1] 0.9076143
print(lasso.r2)
## [1] 0.9076137
print(pcr.r2)
## [1] 0.8711516
print(pls.r2)
## [1] 0.9067674
All models except for PCR predict college applications with over 90% accuracy. The difference is small between the other four model approaches.