#2
#a. #iii. The lasso is less flexible than least squares, so it gives improved prediction accuracy when its increase in bias is less than its decrease in variance.
#b. #iii. Ridge regression is likewise less flexible than least squares, so it gives improved prediction accuracy when its increase in bias is less than its decrease in variance.
#c. #ii. Unlike the other two methods, non-linear methods are more flexible than least squares, so they give improved prediction accuracy when their increase in variance is less than their decrease in bias.
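# To make the bias-variance reasoning above concrete, here is a small illustrative
# simulation (my own sketch, not part of the assignment; the data-generating model,
# lambda, and sample sizes are arbitrary assumptions). It refits least squares and
# ridge on repeated draws from the same linear model and compares bias and variance
# of the prediction at a fixed test point.
set.seed(42)
n <- 50; p <- 10; lambda <- 10; reps <- 200
beta <- rnorm(p)                          # true coefficients
x0 <- rnorm(p)                            # fixed test point
truth <- sum(x0 * beta)
pred.ols <- pred.ridge <- numeric(reps)
for (r in 1:reps) {
  X <- matrix(rnorm(n * p), n, p)         # fresh training sample each repetition
  y <- drop(X %*% beta) + rnorm(n, sd = 3)
  b.ols <- solve(crossprod(X), crossprod(X, y))                        # least squares fit
  b.ridge <- solve(crossprod(X) + lambda * diag(p), crossprod(X, y))   # ridge fit
  pred.ols[r] <- sum(x0 * b.ols)
  pred.ridge[r] <- sum(x0 * b.ridge)
}
# Ridge shrinks the coefficients: its predictions are more biased but less variable
# than those of least squares, which is exactly the trade-off behind (a) and (b).
c(bias.ols = mean(pred.ols) - truth, bias.ridge = mean(pred.ridge) - truth)
c(var.ols = var(pred.ols), var.ridge = var(pred.ridge))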

#9
#a
library(ISLR)
data(College)
set.seed(11)
train <- sample(1:dim(College)[1], dim(College)[1] / 2)
test <- -train
College.train <- College[train, ]
College.test <- College[test, ]

#b
fit.lm <- lm(Apps ~ ., data = College.train)
pred.lm <- predict(fit.lm, College.test)
mean((pred.lm - College.test$Apps)^2)  # test error = 1020100

#c
library(glmnet)
train.mat <- model.matrix(Apps ~ ., data = College.train)
test.mat <- model.matrix(Apps ~ ., data = College.test)
grid <- 10^seq(4, -2, length = 100)
fit.ridge <- glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
cv.ridge <- cv.glmnet(train.mat, College.train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
bestlam.ridge <- cv.ridge$lambda.min
bestlam.ridge
pred.ridge <- predict(fit.ridge, s = bestlam.ridge, newx = test.mat)
mean((pred.ridge - College.test$Apps)^2)  # test error = 1020090

#d
fit.lasso <- glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
cv.lasso <- cv.glmnet(train.mat, College.train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
bestlam.lasso <- cv.lasso$lambda.min
bestlam.lasso
pred.lasso <- predict(fit.lasso, s = bestlam.lasso, newx = test.mat)
mean((pred.lasso - College.test$Apps)^2)  # test error = 1008637

#e
library(pls)
fit.pcr <- pcr(Apps ~ ., data = College.train, scale = TRUE, validation = "CV")
validationplot(fit.pcr, val.type = "MSEP")
pred.pcr <- predict(fit.pcr, College.test, ncomp = 10)
mean((pred.pcr - College.test$Apps)^2)  # test error = 1422699

#f
fit.pls <- plsr(Apps ~ ., data = College.train, scale = TRUE, validation = "CV")
validationplot(fit.pls, val.type = "MSEP")
pred.pls <- predict(fit.pls, College.test, ncomp = 10)
mean((pred.pls - College.test$Apps)^2)  # test error = 1029442

#g
test.avg <- mean(College.test$Apps)
lm.r2 <- 1 - mean((pred.lm - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
ridge.r2 <- 1 - mean((pred.ridge - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
lasso.r2 <- 1 - mean((pred.lasso - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
pcr.r2 <- 1 - mean((pred.pcr - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
pls.r2 <- 1 - mean((pred.pls - College.test$Apps)^2) / mean((test.avg - College.test$Apps)^2)
# All models except PCR predict the number of college applications with high accuracy.
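# As a small optional sketch (not required by the exercise), the five test R^2 values
# from part (g) can be compared at a glance with a barplot:
barplot(c(lm = lm.r2, ridge = ridge.r2, lasso = lasso.r2, pcr = pcr.r2, pls = pls.r2),
        ylab = "Test R-squared", main = "College application models")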

#11
library(leaps)
library(MASS)
set.seed(1)

#a
attach(Boston)
# regsubsets() has no predict() method, so define one that rebuilds the model
# matrix for new data and multiplies it by the coefficients of the id-th model.
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}

# 10-fold cross-validation for best subset selection
k <- 10
p <- ncol(Boston) - 1
folds <- sample(rep(1:k, length = nrow(Boston)))
cv.errors <- matrix(NA, k, p)
for (i in 1:k) {
  best.fit <- regsubsets(crim ~ ., data = Boston[folds != i, ], nvmax = p)
  for (j in 1:p) {
    pred <- predict(best.fit, Boston[folds == i, ], id = j)
    cv.errors[i, j] <- mean((Boston$crim[folds == i] - pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b", xlab = "Number of variables", ylab = "CV error")
which.min(mean.cv.errors)
mean.cv.errors[which.min(mean.cv.errors)]

# The lasso
x <- model.matrix(crim ~ . - 1, data = Boston)
y <- Boston$crim
cv.lasso <- cv.glmnet(x, y, type.measure = "mse", alpha = 1)
coef(cv.lasso)
sqrt(cv.lasso$cvm[cv.lasso$lambda == cv.lasso$lambda.1se])

# Ridge regression
cv.ridge <- cv.glmnet(x, y, type.measure = "mse", alpha = 0)
plot(cv.ridge)
coef(cv.ridge)
sqrt(cv.ridge$cvm[cv.ridge$lambda == cv.ridge$lambda.1se])

# PCR
pcr.crime <- pcr(crim ~ ., data = Boston, scale = TRUE, validation = "CV")
summary(pcr.crime)

#b
# Of the approaches above, best subset selection attains the lowest cross-validation error, so that is the model I would propose.
#c
# No. Cross-validation selects a model with fewer than the full set of 13 predictors.
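# To see which predictors the selected subset actually uses (a quick follow-up sketch,
# not asked for by the exercise; the names best.size and full.fit are introduced here
# for illustration), refit best subset on the full data and extract the coefficients
# at the model size chosen by cross-validation:
best.size <- which.min(mean.cv.errors)
full.fit <- regsubsets(crim ~ ., data = Boston, nvmax = p)
coef(full.fit, id = best.size)  # lists the predictors retained at the chosen size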