library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ISLR)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(boot)
## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:lattice':
## 
##     melanoma
library(gam)
## Loading required package: splines
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loaded gam 1.22-2
library(leaps)

Question 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
set.seed(1)
# 15-fold CV error for polynomial degrees 1 through 15
cv.error <- rep(NA, 15)
for (i in 1:15) {
  fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.error[i] <- cv.glm(Wage, fit, K = 15)$delta[1]
}

plot(1:15, cv.error,
     xlab = "Degree",
     ylab = "CV MSE")

min.degree <- which.min(cv.error)

min.degree
## [1] 7

It appears that a degree-7 polynomial gave the lowest cross-validated MSE.
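
The CV curve flattens quickly, though; printing the errors (a quick check, not part of the original output) shows how little the fit improves beyond the first few degrees:

# CV errors by degree; the gains past degree 3 or 4 are small
round(cv.error, 1)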

fit1 <- lm(wage ~ age, data = Wage)
fit2 <- lm(wage ~ poly(age, 2), data = Wage)
fit3 <- lm(wage ~ poly(age, 3), data = Wage)
fit4 <- lm(wage ~ poly(age, 4), data = Wage)
fit5 <- lm(wage ~ poly(age, 5), data = Wage)
fit6 <- lm(wage ~ poly(age, 6), data = Wage)
fit7 <- lm(wage ~ poly(age, 7), data = Wage)

anova(fit1, fit2, fit3, fit4, fit5, fit6, fit7)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Model 7: wage ~ poly(age, 7)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.6926 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8956  0.001673 ** 
## 4   2995 4771604  1      6070   3.8125  0.050966 .  
## 5   2994 4770322  1      1283   0.8055  0.369516    
## 6   2993 4766389  1      3932   2.4697  0.116165    
## 7   2992 4763834  1      2555   1.6049  0.205311    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Based on the ANOVA, I am going to use a poly(age, 4) model: the degree-4 term has a p-value of .0509, just above .05, while the higher-order terms are clearly non-significant. I am leaving out the degree-7 model chosen by cross-validation, as it would complicate the model for only a nominal improvement in MSE.
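
As a cross-check (not in the original output), poly() generates orthogonal polynomials, so the same sequential p-values can be read straight off the coefficient table of the largest model: the squared t-statistics reproduce the ANOVA F-statistics above.

# Coefficient t-tests of the degree-7 fit; with orthogonal polynomials,
# t^2 for each term equals the corresponding sequential ANOVA F-value
coef(summary(fit7))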

plot(wage ~ age, data = Wage)
agelimits <- range(Wage$age)
agegrid <- seq(from = agelimits[1], to = agelimits[2])
# Overlay the degree-4 fit on the raw data
polyfit <- lm(wage ~ poly(age, 4), data = Wage)
polypredict <- predict(polyfit, newdata = list(age = agegrid))
lines(agegrid, polypredict, col = 'blue', lwd = 2)
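
A common ISLR-style addition, not in the original write-up, is to append approximate ±2 standard-error bands to the same plotting chunk:

# Approximate 95% confidence band around the degree-4 fit
polypredict.se <- predict(polyfit, newdata = list(age = agegrid), se.fit = TRUE)
lines(agegrid, polypredict.se$fit + 2 * polypredict.se$se.fit, col = 'blue', lty = 2)
lines(agegrid, polypredict.se$fit - 2 * polypredict.se$se.fit, col = 'blue', lty = 2)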

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
set.seed(1)

# 15-fold CV error for 2 through 15 cuts of age
cut.error <- rep(NA, 15)
for (i in 2:15) {
  Wage$age.cut <- cut(Wage$age, i)
  fit <- glm(wage ~ age.cut, data = Wage)
  cut.error[i] <- cv.glm(Wage, fit, K = 15)$delta[1]
}

plot(2:15, cut.error[-1],
     xlab = "Number of cuts",
     ylab = "CV MSE")

min.cuts <- which.min(cut.error)

min.cuts
## [1] 8

It appears that 8 cuts give the lowest cross-validated MSE.
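
To see what those intervals actually are (a quick inspection, not in the original output), the bins and their counts can be tabulated:

# The 8 equal-width age intervals and the number of observations in each
table(cut(Wage$age, 8))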

plot(wage ~ age, data = Wage)

# Note: cut() inside the formula recomputes its breaks from the prediction
# data, so this works only because agegrid spans the same range as Wage$age
step.fit <- glm(wage ~ cut(age, 8), data = Wage)
step.predict <- predict(step.fit, data.frame(age = agegrid))

lines(agegrid, step.predict, col = 'blue', lwd = 2)

Question 10

This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
split <- 0.8

trainIndex <- createDataPartition(College$Outstate, p = split, list = FALSE)

Collegetrain <- College[trainIndex, ]
Collegetest <- College[-trainIndex, ]

# Forward stepwise selection; College has 17 predictors (after dummy coding
# Private), so subset sizes top out at 17 even with nvmax = 18
fwd.step <- regsubsets(Outstate ~ ., data = Collegetrain, method = "forward", nvmax = 18)

sumfwd.step <- summary(fwd.step)

sumfwd.step
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = Collegetrain, method = "forward", 
##     nvmax = 18)
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 9  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 10  ( 1 ) "*"        " "  "*"    " "    " "       " "       " "        
## 11  ( 1 ) "*"        " "  "*"    " "    " "       " "       "*"        
## 12  ( 1 ) "*"        "*"  "*"    " "    " "       " "       "*"        
## 13  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 14  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 15  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 6  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      " " "*"      "*"      
## 9  ( 1 )  " "         "*"        " "   "*"      "*" "*"      "*"      
## 10  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 11  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 12  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 13  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 14  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 15  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 16  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  " "         "*"    "*"      
## 5  ( 1 )  " "         "*"    "*"      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"
par(mfrow = c(1, 3))
plot(sumfwd.step$adjr2, xlab = "Number of variables", ylab = "Adjusted R-squared", type = "b")
plot(sumfwd.step$cp, xlab = "Number of variables", ylab = "Cp", type = "b")
plot(sumfwd.step$bic, xlab = "Number of variables", ylab = "BIC", type = "b")

It appears that 6 is a reasonable number of variables to choose; the criteria show little improvement beyond that point.
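
To corroborate the visual read (a quick check, not in the original output), the subset size that optimizes each criterion can be printed directly:

# Subset sizes that maximize adjusted R-squared and minimize Cp / BIC
which.max(sumfwd.step$adjr2)
which.min(sumfwd.step$cp)
which.min(sumfwd.step$bic)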

coeffs <- coef(fwd.step, id = 6)
names(coeffs)
## [1] "(Intercept)" "PrivateYes"  "Room.Board"  "Terminal"    "perc.alumni"
## [6] "Expend"      "Grad.Rate"
(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
gam.fit <- gam(Outstate ~ Private + s(Room.Board) + s(Terminal) +
                 s(perc.alumni) + s(Expend) + s(Grad.Rate),
               data = Collegetrain)

par(mfrow = c(2,3))
plot(gam.fit, se = TRUE, col = 'blue')

(c) Evaluate the model obtained on the test set, and explain the results obtained.
# Test-set MSE
err <- mean((Collegetest$Outstate - predict(gam.fit, Collegetest))^2)
err
## [1] 3641989

# Test-set R-squared
tss <- mean((Collegetest$Outstate - mean(Collegetest$Outstate))^2)

rs <- 1 - err / tss

rs
## [1] 0.7794837

We have a test R-squared of about 0.78, meaning that roughly 78% of the variation in out-of-state tuition is explained by our GAM.
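
For a more interpretable scale (a quick addition, not in the original output), the test RMSE expresses the typical prediction error in dollars; sqrt(3641989) is roughly $1,908 of tuition.

# Test RMSE in dollars
sqrt(err)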

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?
summary(gam.fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(Terminal) + 
##     s(perc.alumni) + s(Expend) + s(Grad.Rate), data = Collegetrain)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7387.37 -1080.65    30.96  1245.60  7782.51 
## 
## (Dispersion Parameter for gaussian family taken to be 3414453)
## 
##     Null Deviance: 10015674527 on 622 degrees of freedom
## Residual Deviance: 2052088069 on 601.0006 degrees of freedom
## AIC: 11163.72 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                 Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private          1 2848901411 2848901411  834.37 < 2.2e-16 ***
## s(Room.Board)    1 1863074984 1863074984  545.64 < 2.2e-16 ***
## s(Terminal)      1  691934312  691934312  202.65 < 2.2e-16 ***
## s(perc.alumni)   1  390893703  390893703  114.48 < 2.2e-16 ***
## s(Expend)        1  915495774  915495774  268.12 < 2.2e-16 ***
## s(Grad.Rate)     1  138830787  138830787   40.66 3.614e-10 ***
## Residuals      601 2052088069    3414453                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df  Npar F   Pr(F)    
## (Intercept)                               
## Private                                   
## s(Room.Board)        3  2.2819 0.07812 .  
## s(Terminal)          3  2.6420 0.04854 *  
## s(perc.alumni)       3  1.5776 0.19363    
## s(Expend)            3 30.4245 < 2e-16 ***
## s(Grad.Rate)         3  2.5902 0.05201 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Based on the ANOVA for nonparametric effects, Terminal (p = 0.049) and Expend (p < 2e-16) show evidence of a non-linear relationship with Outstate at the \(\alpha\) = .05 level, with Expend by far the strongest. Room.Board (p = 0.078) and Grad.Rate (p = 0.052) show weaker, marginal evidence of non-linearity.
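
One way to double-check the strongest case, Expend (a sketch, not part of the original analysis), is to refit the GAM with Expend entering linearly and compare the two nested models with an F-test, as in the ISLR Chapter 7 lab:

# Same GAM as gam.fit, but with a linear term for Expend
gam.lin <- gam(Outstate ~ Private + s(Room.Board) + s(Terminal) +
                 s(perc.alumni) + Expend + s(Grad.Rate),
               data = Collegetrain)

# A small p-value here favors the smooth (non-linear) term in Expend
anova(gam.lin, gam.fit, test = "F")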