Load Libraries and attach data sets

library(ISLR2)
library(caret)
library(gam)
library(leaps)
library(pander)
library(boot)
library(tidyverse)
attach(Wage)
attach(College)





Problem 6.

In this exercise, you will further analyze the Wage data set considered throughout this chapter.


(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

Using polynomial regression with 10-fold cross-validation, the degree with the lowest CV error was 7; however, the plot below shows that a degree of 4 has a very similar error while giving a much less complex model, so 4 is my choice for the optimal degree based on cross-validation.
Using the ANOVA approach, the cubic term is clearly significant (p ≈ 0.0017), while the quartic term is right on the cusp (p ≈ 0.051). In this case I would still reject the null hypothesis for the degree-4 term and choose 4 as the optimal degree under ANOVA as well, which agrees with the cross-validation result.


Fit Polynomial Regression

set.seed(22)

d <- 10               # maximum polynomial degree to consider
cv.err <- rep(NA, d)

# 10-fold CV error for polynomial fits of degree 1 through d
for (i in 1:d){
  fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.err[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}

d.min <- which.min(cv.err)
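

Optional Check of the Chosen Degree

As a quick sanity check on choosing degree 4 over the CV minimum, the CV errors can be inspected directly and compared against a rough "within one standard deviation of the minimum" threshold (the same style of threshold used for the selection plots in Problem 10 below). This is only an illustrative sketch using the cv.err vector computed above, not part of the required answer.

cv.err                      # CV error for degrees 1 through 10
cv.err[4] - cv.err[d.min]   # how close degree 4 is to the minimum

# Smallest degree whose CV error is within one SD of the minimum (rough heuristic)
which(cv.err <= min(cv.err) + sd(cv.err))[1]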


Plot Optimal Degree from Polynomial Regression

plot(cv.err, xlab = 'Degree', ylab = 'MSE', type = 'l', col = 'lightblue')
# Mark the chosen degree (4) and the degree with the minimum CV error
points(c(4, d.min), c(cv.err[4], cv.err[d.min]), col = c('darkgreen', 'darkblue'), bg = c('darkgreen', 'darkblue'), pch = 21, cex = 1)
text(c(4, d.min), c(cv.err[4], cv.err[d.min]), pos = 3, labels = c('Optimal Degree', 'Minimum CV Error'), col = c('darkgreen', 'darkblue'))


ANOVA Test

set.seed(22)

fit1 <- lm(wage ~ poly(age, 1), data = Wage)
fit2 <- lm(wage ~ poly(age, 2), data = Wage)
fit3 <- lm(wage ~ poly(age, 3), data = Wage)
fit4 <- lm(wage ~ poly(age, 4), data = Wage)
fit5 <- lm(wage ~ poly(age, 5), data = Wage)
fit6 <- lm(wage ~ poly(age, 6), data = Wage)


Print ANOVA Table

pander(anova(fit1, fit2, fit3, fit4, fit5, fit6))
Analysis of Variance Table

 Res.Df       RSS   Df   Sum of Sq        F     Pr(>F)
 ------   -------   --   ---------   ------   --------
   2998   5022216   NA          NA       NA         NA
   2997   4793430    1      228786    143.7   2.29e-32
   2996   4777674    1       15756    9.894   0.001675
   2995   4771604    1        6070    3.812    0.05099
   2994   4770322    1        1283   0.8054     0.3696
   2993   4766389    1        3932    2.469     0.1162
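
Because poly() generates orthogonal polynomials, the sequential F statistics above can also be recovered from the coefficient t-tests of the largest model: the square of each polynomial term's t statistic equals the corresponding F statistic in the ANOVA table. A quick optional check:

# Squared t-statistics from the largest fit; the rows for the polynomial terms
# reproduce the sequential F-statistics shown above (the intercept row does not)
coef(summary(fit6))[, 't value']^2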


Plot of Fit to Data

plot(wage ~ age, data = Wage, col = 'darkgrey')
age.range <- range(Wage$age)
age.grid <- seq(from = age.range[1], to = age.range[2])
preds <- predict(fit4, newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
lines(age.grid, preds$fit, col = 'darkblue', lwd = 2)
matlines(age.grid, se.bands, lwd = 1, col = 'lightblue', lty = 3)


(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

Using cross-validation, the optimal number of cuts is 8.


Fit Step Function

set.seed(22)

# Reset cv.err so values left over from part (a) are not reused by which.min()
cv.err <- rep(NA, d)

# LOOCV error (cv.glm default) for step functions with 2 through d cuts
for (i in 2:d) {
  Wage$age.cut <- cut(Wage$age, i)
  fit <- glm(wage ~ age.cut, data = Wage)
  cv.err[i] <- cv.glm(Wage, fit)$delta[1]
}

d.min <- which.min(cv.err)

#Optimal number of cuts: 
d.min
## [1] 8
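
For reference, the age intervals implied by the chosen number of cuts can be inspected directly (purely illustrative):

# Bin boundaries and counts for the chosen number of cuts
table(cut(Wage$age, d.min))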


Plot of Fit to Data

plot(wage ~ age, data = Wage, col = 'darkgrey')
fit <- glm(wage ~ cut(age, d.min), data = Wage)
preds <- predict(fit, list(age = age.grid))
lines(age.grid, preds, col = 'darkblue', lwd = 2)


Problem 10.

This question relates to the College data set.


(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

Looking at the plots from the forward stepwise selection (below), the optimal number of predictors is arguably between 5 and 6. After evaluating the curves (and the quick check printed after the plots), I selected 6 as the number of predictors. I will note, however, that running the model with the top 5 predictors (excluding Grad.Rate) results in only a slightly lower \(R^{2}\), which might arguably be worth the reduction in model complexity. I elected to stay with 6, as that is what the plots indicated to be best.


Split Data into Train & Test

set.seed(22)

# Randomly select 75% of the rows for the training set
in_train <- sample(nrow(College), size = round(0.75 * nrow(College)))
train <- College[in_train, ]
test <- College[-in_train, ]


Perform Stepwise selection on the training set

set.seed(22)

# Forward stepwise selection (default nvmax = 8, so subsets of up to 8 predictors)
fwd_step <- regsubsets(Outstate ~ ., data = train, method = 'forward')

fwd_step_summary <- summary(fwd_step)
fwd_step_summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = train, method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
##          PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 ) " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
##          P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 ) " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 ) " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 ) " "         "*"        " "   " "      "*" " "      " "      
## 6  ( 1 ) " "         "*"        " "   " "      "*" " "      " "      
## 7  ( 1 ) " "         "*"        " "   "*"      "*" " "      " "      
## 8  ( 1 ) " "         "*"        " "   "*"      "*" "*"      " "      
##          perc.alumni Expend Grad.Rate
## 1  ( 1 ) " "         "*"    " "      
## 2  ( 1 ) " "         "*"    " "      
## 3  ( 1 ) " "         "*"    " "      
## 4  ( 1 ) "*"         "*"    " "      
## 5  ( 1 ) "*"         "*"    " "      
## 6  ( 1 ) "*"         "*"    "*"      
## 7  ( 1 ) "*"         "*"    "*"      
## 8  ( 1 ) "*"         "*"    "*"


Create 3 plots to determine best model

par(mfrow=c(1, 3))

#Plot 1
plot(fwd_step_summary$cp, xlab = '# of Predictors', ylab = 'Cp', type = 'l')
min_cp <-  min(fwd_step_summary$cp)
std_cp <-  sd(fwd_step_summary$cp)
abline(h = min_cp + std_cp, col = 'red', lty = 2)

#Plot 2
plot(fwd_step_summary$bic,xlab = '# of Predictors', ylab = 'BIC', type = 'l')
min_bic <-  min(fwd_step_summary$bic)
std_bic <-  sd(fwd_step_summary$bic)
abline(h = min_bic + std_bic, col = 'red', lty=2)

#Plot 3
plot(fwd_step_summary$adjr2, xlab = '# of Predictors', ylab = 'Adjusted R2', type = 'l')
max_adjr2 <-  max(fwd_step_summary$adjr2)
std_adjr2 <-  sd(fwd_step_summary$adjr2)
abline(h=max_adjr2 - std_adjr2, col = 'red', lty = 2)
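

Check Model Sizes Favored by Each Criterion

To complement the visual read of the curves, the model sizes favored by each criterion can be printed directly. The dashed lines in the plots mark min(Cp) + 1 SD, min(BIC) + 1 SD, and max(adjusted \(R^{2}\)) - 1 SD; the smallest model within those thresholds is the one I treat as "arguably as good" in the discussion above. This is a small optional check using the objects defined above.

# Sizes at the optimum of each criterion
which.min(fwd_step_summary$cp)
which.min(fwd_step_summary$bic)
which.max(fwd_step_summary$adjr2)

# Smallest model size within one SD of the best value for each criterion
min(which(fwd_step_summary$cp <= min_cp + std_cp))
min(which(fwd_step_summary$bic <= min_bic + std_bic))
min(which(fwd_step_summary$adjr2 >= max_adjr2 - std_adjr2))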


(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

Plotting the results with smoothing splines using 4 degrees of freedom shows that the relationships between most of the predictors and the response are not linear; however, the relationships for perc.alumni and Room.Board appear somewhat closer to linear than the others.


List Optimal (Top 6) predictors

# Names of the 6 selected predictors (dropping the intercept)
predictors <- names(coef(fwd_step, 6))[2:7]
predictors
## [1] "PrivateYes"  "Room.Board"  "PhD"         "perc.alumni" "Expend"     
## [6] "Grad.Rate"

Train and plot a GAM

# Fit a GAM with a smoothing spline (4 df) on each quantitative predictor
# selected above; Private enters as a linear (dummy) term
gam_fit <-  gam(Outstate ~ Private +
                  s(Room.Board, 4) + 
                  s(PhD, 4) +
                  s(perc.alumni, 4) +
                  s(Expend, 4) + 
                  s(Grad.Rate, 4),
                  data = train)

gam_fit_summary <- (summary(gam_fit))

par(mfrow=c(2, 3))
plot(gam_fit, se = TRUE, col = 'darkblue')

(c) Evaluate the model obtained on the test set, and explain the results obtained.

The model has a test-set \(R^{2}\) of approximately 0.76, meaning that roughly 76% of the variance in Outstate on the test set is explained by the six predictors Private, Room.Board, PhD, perc.alumni, Expend, and Grad.Rate.


Evaluate Model using R2

gam_pred <-  predict(gam_fit, test)
gam_err <-  mean((test$Outstate - gam_pred)^2)
gam_tss <-  mean((test$Outstate - mean(test$Outstate))^2)
gam_r2 <-  1 - gam_err / gam_tss

gam_r2
## [1] 0.7588494
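
To put this on the scale of the response, the test RMSE (in dollars of out-of-state tuition) follows directly from the test MSE computed above; a minimal optional check:

# Test RMSE in dollars
sqrt(gam_err)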

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

The Anova for Nonparametric Effects (in the summary below) shows very strong evidence (p < 2e-16) of a non-linear relationship between Expend and the response. It also shows evidence at the 0.05 level of non-linear relationships with Room.Board, PhD, and Grad.Rate, while perc.alumni shows no evidence of non-linearity (p ≈ 0.29). Consistent with this, refitting the GAM with a plain linear term for perc.alumni yields a very slight increase in test \(R^{2}\) (0.7598 versus 0.7588).


Print a Summary of the Model

gam_fit_summary
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 4) + s(PhD, 
##     4) + s(perc.alumni, 4) + s(Expend, 4) + s(Grad.Rate, 4), 
##     data = train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -6855.38 -1064.53    16.12  1242.89  7131.43 
## 
## (Dispersion Parameter for gaussian family taken to be 3279494)
## 
##     Null Deviance: 9261203461 on 581 degrees of freedom
## Residual Deviance: 1836516009 on 559.9998 degrees of freedom
## AIC: 10407.08 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 2167242895 2167242895 660.847 < 2.2e-16 ***
## s(Room.Board, 4)    1 1783298982 1783298982 543.773 < 2.2e-16 ***
## s(PhD, 4)           1  660457576  660457576 201.390 < 2.2e-16 ***
## s(perc.alumni, 4)   1  382401587  382401587 116.604 < 2.2e-16 ***
## s(Expend, 4)        1  852532186  852532186 259.959 < 2.2e-16 ***
## s(Grad.Rate, 4)     1  119779828  119779828  36.524 2.754e-09 ***
## Residuals         560 1836516009    3279494                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df Npar F   Pr(F)    
## (Intercept)                                 
## Private                                     
## s(Room.Board, 4)        3  3.121 0.02564 *  
## s(PhD, 4)               3  2.947 0.03235 *  
## s(perc.alumni, 4)       3  1.256 0.28865    
## s(Expend, 4)            3 39.022 < 2e-16 ***
## s(Grad.Rate, 4)         3  3.080 0.02708 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Train without smoothing perc.alumni

gam_fit2 <-  gam(Outstate ~ Private +
                  s(Room.Board, 4) + 
                  s(PhD, 4) +
                  perc.alumni +
                  s(Expend, 4) + 
                  s(Grad.Rate, 4),
                  data = train)

gam2_pred <-  predict(gam_fit2, test)
gam2_err <-  mean((test$Outstate - gam2_pred)^2)
gam2_tss <-  mean((test$Outstate - mean(test$Outstate))^2)
gam2_r2 <-  1 - gam2_err / gam2_tss

gam2_r2
## [1] 0.7598218
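

Compare the Two GAMs with an F Test

As a further check, the two fits can be compared directly with an approximate F test, since gam_fit2 is nested in gam_fit (the only difference is the smooth versus linear term for perc.alumni). A large p-value would again indicate no evidence that the smooth term for perc.alumni is needed. This is an optional sketch using anova() from the gam package.

anova(gam_fit2, gam_fit, test = 'F')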