Question 6:

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

Part A: Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
library(boot)
set.seed(1)
degree <- 10
cv.errs <- rep(NA, degree)
for (i in 1:degree) {
  fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.errs[i] <- cv.glm(Wage, fit)$delta[1]
}
plot(1:degree, cv.errs, xlab = 'Degree', ylab = 'The Test MSE', type = 'l')
deg.min <- which.min(cv.errs)
points(deg.min, cv.errs[deg.min], col = 'cyan', cex = 2, pch = 19)

plot(wage ~ age, data = Wage, col = "lightpink")
age.range <- range(Wage$age)
age.grid <- seq(from = age.range[1], to = age.range[2])
fit <- lm(wage ~ poly(age, 3), data = Wage)
preds <- predict(fit, newdata = list(age = age.grid))
lines(age.grid, preds, col = "gray48", lwd = 2)

Part B: Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the fit obtained.

cv.errs <- rep(NA, degree)
for (i in 2:degree) {
  Wage$age.cut <- cut(Wage$age, i)
  fit <- glm(wage ~ age.cut, data = Wage)
  cv.errs[i] <- cv.glm(Wage, fit)$delta[1]
}
plot(2:degree, cv.errs[-1], xlab = 'Cuts', ylab = 'Test MSE', type = 'l')
deg.min <- which.min(cv.errs)
points(deg.min, cv.errs[deg.min], col = 'darkolivegreen4', cex = 2, pch = 19)

plot(wage ~ age, data = Wage, col = "paleturquoise3")
fit <- glm(wage ~ cut(age, 8), data = Wage)
preds <- predict(fit, list(age = age.grid))
lines(age.grid, preds, col = "gray48", lwd = 2)

res <- cut(c(1,5,2,3,8), 2)
res
## [1] (0.993,4.5] (4.5,8.01]  (0.993,4.5] (0.993,4.5] (4.5,8.01] 
## Levels: (0.993,4.5] (4.5,8.01]
length(res)
## [1] 5
class(res[1])
## [1] "factor"

Answer:

Question 10:

This question relates to the College data set.

Part A: Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

library(ISLR)
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.4
library(gam)
## Warning: package 'gam' was built under R version 4.0.5
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.0.4
## Loaded gam 1.20
library(gbm)
## Warning: package 'gbm' was built under R version 4.0.5
## Loaded gbm 2.1.8
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.4
## Loading required package: Matrix
## Loaded glmnet 4.1-1
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(12345)
train <- sample(nrow(College) * 0.7)
train_set <- College[train, ]
test_set <- College[-train, ]
forward_subset <- regsubsets(Outstate ~ ., data = train_set, nvmax = ncol(College)-1, method = "forward")
model_summary <- summary(forward_subset)

plot_metric <- function(metric, yaxis_label, reverse = FALSE) {
  plot(metric, xlab = "Number of Variables", ylab = yaxis_label, xaxt = "n", type = "l")
  axis(side = 1, at = 1:length(metric))
  
  if (reverse) {
    metric_1se <- max(metric) - (sd(metric) / sqrt(length(metric)))
    min_subset <- which(metric > metric_1se)
  } else {
    metric_1se <- min(metric) + (sd(metric) / sqrt(length(metric)))
    min_subset <- which(metric < metric_1se)
  }
  
  abline(h = metric_1se, col = "red", lty = 2)
  abline(v = min_subset[1], col = "green", lty = 2)
}

par(mfrow=c(1, 3))
  
plot_metric(model_summary$cp, "Cp")
plot_metric(model_summary$bic, "BIC")
plot_metric(model_summary$adjr2, "Adjusted R2", reverse = TRUE) # higher values are better

help("College")
## starting httpd help server ... done

List of Predictors:

coef(forward_subset, 6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3769.0587788  2748.6944010     0.8999634    38.5143460    44.4889713 
##        Expend     Grad.Rate 
##     0.2543900    31.2043096

Part B: Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

Part C:Evaluate the model obtained on the test set, and explain the results obtained.

Part D: For which variables, if any, is there evidence of a non-linear relationship with the response?