1. In this exercise, you will further analyze the Wage data set considered throughout this chapter.
  1. Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
require(ISLR); require(caret); require(tidyverse); require(broom); require(ggthemes)
require(knitr); require(kableExtra)
theme_set(theme_tufte(base_size = 14) + theme(legend.position = 'top'))
data('Wage')
set.seed(101)

inTrain <- sample(nrow(Wage), nrow(Wage)*0.6)

train <- Wage[inTrain,]
val <- Wage[-inTrain,]

errors <- numeric(20)
models <- list()
for (i in 1:20) {
    # Fit polynomials of degree 1 through 20 and record validation-set RMSE
    models[[i]] <- lm(wage ~ poly(age, i), data = train)
    pred <- predict(models[[i]], newdata = val)
    errors[i] <- sqrt(mean((val$wage - pred)^2))
}

tibble(Errors = errors) %>%
    mutate(Polynomial = row_number()) %>%
    ggplot(aes(Polynomial, Errors)) +
    geom_col(aes(fill = Polynomial == which.min(errors))) +
    coord_cartesian(ylim = c(min(errors) - sd(errors),
                             min(errors) + 3*sd(errors))) +
    scale_x_continuous(breaks = 1:length(errors)) +
    guides(fill = 'none')
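Strictly speaking, the exercise asks for cross-validation, whereas the loop above uses a single 60/40 validation split. A k-fold alternative is sketched below using cv.glm() from the boot package (an assumption: boot is not loaded above).

library(boot)
set.seed(101)
cv_errors <- numeric(20)
for (i in 1:20) {
    glm_fit <- glm(wage ~ poly(age, i), data = Wage)
    # delta[1] is the raw 10-fold cross-validation estimate of test MSE
    cv_errors[i] <- cv.glm(Wage, glm_fit, K = 10)$delta[1]
}
which.min(cv_errors)  # degree with the lowest cross-validated error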

do.call(anova, models) %>%
    tidy() %>%
    mutate(Model = paste('Model', 1:20),
           Percent = sumsq / lag(rss) * 100) %>%
    select(Model, sumsq, Percent, rss, p.value) %>%
    rename(Error = rss, Explained = sumsq, p = p.value) %>%
    kable(digits = 2, format = 'html') %>%
    kable_styling(bootstrap_options = c('striped', 'hover')) %>%
    scroll_box(height = '500px')
| Model | Explained | Percent | Error | p |
|:---------|----------:|--------:|--------:|-----:|
| Model 1  |        NA |      NA | 2883771 |   NA |
| Model 2  | 137713.65 |    4.78 | 2746057 | 0.00 |
| Model 3  |   6086.71 |    0.22 | 2739971 | 0.05 |
| Model 4  |   3232.92 |    0.12 | 2736738 | 0.15 |
| Model 5  |   1440.32 |    0.05 | 2735297 | 0.33 |
| Model 6  |   2345.57 |    0.09 | 2732952 | 0.22 |
| Model 7  |    200.11 |    0.01 | 2732752 | 0.72 |
| Model 8  |      0.16 |    0.00 | 2732752 | 0.99 |
| Model 9  |   1635.54 |    0.06 | 2731116 | 0.30 |
| Model 10 |    488.43 |    0.02 | 2730628 | 0.57 |
| Model 11 |    110.62 |    0.00 | 2730517 | 0.79 |
| Model 12 |     83.55 |    0.00 | 2730433 | 0.82 |
| Model 13 |    702.51 |    0.03 | 2729731 | 0.50 |
| Model 14 |    551.39 |    0.02 | 2729180 | 0.55 |
| Model 15 |   2927.10 |    0.11 | 2726252 | 0.17 |
| Model 16 |   2223.24 |    0.08 | 2724029 | 0.23 |
| Model 17 |    213.26 |    0.01 | 2723816 | 0.71 |
| Model 18 |    801.30 |    0.03 | 2723015 | 0.47 |
| Model 19 |      0.05 |    0.00 | 2723015 | 1.00 |
| Model 20 |    220.08 |    0.01 | 2722794 | 0.70 |

(Explained is the sequential sum of squares added by each degree, Percent is its share of the previous model's residual sum of squares, and Error is the residual sum of squares.) The hypothesis tests indicate that the quadratic term is highly significant, the cubic term is borderline (p = 0.05), and no higher-degree term yields a significant improvement, so the ANOVA favours a degree of 2 or 3.
model <- lm(wage ~ poly(age, which.min(errors)), data = train)

val %>%
    mutate(predictions = predict(model, val)) %>%
    ggplot(aes(age, wage, col = 'Observed')) +
    geom_point() +
    geom_line(aes(age, predictions, col = 'Predicted'), size = 1.5) +
    scale_color_manual(name = 'Value Type',
                       values = c(Observed = '#56B4E9', Predicted = '#E69F00')) +
    labs(x = 'Age', y = 'Wage',
         title = paste0('Predictions from the degree-', which.min(errors),
                        ' polynomial'))

  2. Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
errors <- numeric(11)
for (i in 2:12) {
    # cut(age, i) is re-evaluated on the validation data inside predict(), so
    # this relies on train and val spanning the same age range (see the caveat
    # after the final plot)
    step_fit <- lm(wage ~ cut(age, i), data = train)
    pred <- predict(step_fit, newdata = val)
    errors[i - 1] <- sqrt(mean((val$wage - pred)^2))
}

tibble(Errors = errors) %>%
    mutate(Steps = row_number() + 1) %>%
    ggplot(aes(Steps, Errors)) +
    geom_col(aes(fill = Steps == which.min(errors) + 1)) +
    coord_cartesian(ylim = c(min(errors) - sd(errors),
                             min(errors) + 3*sd(errors))) +
    scale_x_continuous(breaks = 2:(length(errors) + 1)) +
    guides(fill = 'none')

step_fit <- lm(wage ~ cut(age, which.min(errors) + 1), data = train)

val %>%
    mutate(predictions = predict(step_fit, val)) %>%
    ggplot(aes(age, wage, col = 'Observed')) +
    geom_point() +
    geom_line(aes(age, predictions, col = 'Predicted'), size = 1.5) +
    scale_color_manual(name = 'Value Type',
                       values = c(Observed = '#56B4E9', Predicted = '#E69F00')) +
    labs(x = 'Age', y = 'Wage')
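One caveat on the step-function code: because cut(age, i) inside the formula is re-evaluated on whatever data predict() receives, the approach above silently relies on train and val spanning the same age range. A more defensive sketch (age_bin is a hypothetical helper column) fixes the breaks from the full data first:

# Precompute equal-width breaks over the full age range so training and
# validation observations fall into identical bins
breaks <- seq(min(Wage$age), max(Wage$age), length.out = which.min(errors) + 2)
train$age_bin <- cut(train$age, breaks, include.lowest = TRUE)
val$age_bin <- cut(val$age, breaks, include.lowest = TRUE)
step_fit_fixed <- lm(wage ~ age_bin, data = train)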

10. This question relates to the College data set.

  1. Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

library(ISLR)
library(leaps)
train <- sample(1:nrow(College), nrow(College)/2)
test <- -train
fit <- regsubsets(Outstate ~ ., data = College, subset = train, method = 'forward')
fit.summary <- summary(fit)
fit.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train, 
##     method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
##          PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 ) " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 ) "*"        " "  " "    " "    " "       " "       "*"        
## 8  ( 1 ) "*"        " "  "*"    " "    " "       " "       "*"        
##          P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 ) " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 ) " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 6  ( 1 ) " "         "*"        " "   " "      "*" " "      " "      
## 7  ( 1 ) " "         "*"        " "   " "      "*" " "      " "      
## 8  ( 1 ) " "         "*"        " "   " "      "*" " "      " "      
##          perc.alumni Expend Grad.Rate
## 1  ( 1 ) " "         "*"    " "      
## 2  ( 1 ) " "         "*"    " "      
## 3  ( 1 ) " "         "*"    " "      
## 4  ( 1 ) "*"         "*"    " "      
## 5  ( 1 ) "*"         "*"    "*"      
## 6  ( 1 ) "*"         "*"    "*"      
## 7  ( 1 ) "*"         "*"    "*"      
## 8  ( 1 ) "*"         "*"    "*"
coef(fit, id = 6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3287.5299971  2512.8945110     1.0770517    26.8926378    60.8413480 
##        Expend     Grad.Rate 
##     0.2261472    26.1662097
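Choosing a six-variable model is easier to justify with a test error for each subset size. Since leaps::regsubsets() has no predict method, here is a sketch using model.matrix(); it also defines the val.errors vector referenced in part 3 below.

# Test MSE for each subset size from the forward-selection object
test_mat <- model.matrix(Outstate ~ ., data = College[test, ])
val.errors <- rep(NA, 8)
for (i in 1:8) {
    coefi <- coef(fit, id = i)
    # Multiply the matching columns of the test design matrix by the coefficients
    pred <- test_mat[, names(coefi)] %*% coefi
    val.errors[i] <- mean((College$Outstate[test] - pred)^2)
}
which.min(val.errors)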
  2. Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
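The fitting code for this part is missing; a minimal sketch with the gam package (assumptions: smoothing splines with df = 4 on the five quantitative predictors selected above, plus the Private dummy):

library(gam)
# GAM with smoothing splines on the predictors chosen by forward selection
gam_fit <- gam(Outstate ~ Private + s(Room.Board, 4) + s(PhD, 4) +
                   s(perc.alumni, 4) + s(Expend, 4) + s(Grad.Rate, 4),
               data = College[train, ])
par(mfrow = c(2, 3))
plot(gam_fit, se = TRUE, col = 'blue')  # one smooth (or linear) term per panel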

  3. Evaluate the model obtained on the test set, and explain the results obtained.

preds <- predict(gam_fit, newdata = College[test, ])
error <- mean((College[test, ]$Outstate - preds)^2)
val.errors[6] - error  # positive if the GAM beats the six-variable linear fit

The GAM does better than the six-variable linear model: its test MSE is lower, so allowing non-linear effects in these predictors improves out-of-sample performance.

  4. For which variables, if any, is there evidence of a non-linear relationship with the response?

The model suggests a strong non-linear relationship between Outstate and Expend, and a moderate non-linear relationship between Outstate and Grad.Rate.
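Such evidence is typically read off the "Anova for Nonparametric Effects" table that summary() prints for a gam object: a small p-value for a smooth term indicates that its non-linear component matters (sketch, assuming the gam_fit object from above).

summary(gam_fit)  # see the 'Anova for Nonparametric Effects' p-values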