Problem 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

library(ISLR2)
library(boot)
attach(Wage)

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

After looking at the graph for the optimal degree using cross-validation, we determined a degree of 4 was optimal because there does not seem to be much change after 4 by adding more.

set.seed (1)
cv_error <- rep(0,10)
for (i in 1:10){
glm.fit <- glm(wage ~ poly(age,i),data=Wage)
cv_error[i]<- cv.glm(Wage,glm.fit,K=10)$delta[1]
}
cv_error
##  [1] 1676.826 1600.763 1598.399 1595.651 1594.977 1596.061 1594.298 1598.134
##  [9] 1593.913 1595.950
plot(1:10, cv_error, xlab = 'Degree', ylab = 'CV Error', type = 'l')

Using ANOVA for hypothesis testing, we found that the optimal degree was 3 because anything past 3 is insignificant.

fit_1 <- lm(wage ~ age, data=Wage)
fit_2 <- lm(wage ~ poly(age, 2), data=Wage) 
fit_3 <- lm(wage ~ poly(age, 3), data=Wage) 
fit_4 <- lm(wage ~ poly(age, 4), data=Wage) 
fit_5 <- lm(wage ~ poly(age, 5), data=Wage) 
anova(fit_1, fit_2, fit_3, fit_4, fit_5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(wage~age, data=Wage, col="grey")
agelims <-  range(Wage$age)
age_grid <- seq(from=agelims[1], to=agelims[2])
lm_fit <- lm(wage~poly(age,3), data = Wage)
preds <- predict(lm_fit, newdata = list(age=age_grid))
lines(age_grid, preds, col="darkblue", lwd=2)

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

The optimal number of cuts using cross-validation was 8 because it had the lowest MSE.

set.seed(2)
cv_errors2 <- rep(NA, 10)
for(i in 2:10){
  Wage$age.cut <- cut(Wage$age,i)
  glm.fit <- glm(wage ~ age.cut, data=Wage)
  cv_errors2[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]
}
plot(2:10, cv_errors2[-1], type="b", xlab="Number of cuts", ylab="Test MSE")

fit_step <- glm(wage ~ cut(age, 8), data=Wage)
preds <- predict(fit_step, data.frame(age = age_grid))
plot(age, wage, col="gray")
lines(age_grid, preds, col="darkblue", lwd=2)

detach(Wage)

Problem 10

This question relates to the College data set.

library(ISLR2)
library(leaps)
attach(College)

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

set.seed(1)
inTrain <- sample(1:nrow(College), 0.8*nrow(College))
college_train <- College[inTrain,]
college_test <- College[-inTrain,]
regfit_fwd <- regsubsets(Outstate ~ ., data = college_train, nvmax=17, method = 'forward')
fit_summary <- summary(regfit_fwd)
summary(regfit_fwd)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = college_train, nvmax = 17, 
##     method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  "*"    " "    " "       " "       " "        
## 9  ( 1 )  "*"        "*"  "*"    " "    " "       " "       " "        
## 10  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       " "        
## 11  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 12  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 13  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 14  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 15  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 6  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      "*" " "      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      "*" " "      " "      
## 9  ( 1 )  " "         "*"        " "   "*"      "*" " "      " "      
## 10  ( 1 ) " "         "*"        " "   "*"      "*" " "      " "      
## 11  ( 1 ) " "         "*"        " "   "*"      "*" " "      " "      
## 12  ( 1 ) " "         "*"        " "   "*"      "*" "*"      " "      
## 13  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 14  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 15  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 16  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    " "      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"
plot(fit_summary$cp ,xlab="Number of Variables ", ylab="MSE",type="b")

coef(regfit_fwd, 6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3605.6259459  2741.6004311     0.9838692    36.0303348    56.8285993 
##        Expend     Grad.Rate 
##     0.2094267    27.8946995

Using a forward stepwise selection we found that 6 variables, although not the lowest MSE, gives the most simple function to use. Those variables are PrivateYes, Room.Board, PhD, perc.alumni, Expend, and Grad.Rate.

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

After fitting a GAM on the training data, we can see that

library(gam)
gam1 <- gam(Outstate ~ Private + s(Room.Board, 5)+ s(PhD, 5)+ s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate , 5), data=college_train)
par(mfrow = c(3,3))
plot(gam1, se = TRUE, col= 'blue')

(c) Evaluate the model obtained on the test set, and explain the results obtained.

The MSE for this model is 2975384.

preds <- predict(gam1, college_test)
mean((college_test$Outstate - preds)^2)
## [1] 2975384

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

The p-value for PhD, perc.alumni, and Grad.Rate show that a linear function is adequate for this term. However, there is very clear evidence that a non-linear term is required for Expend and also for Room.Board.

summary(gam1)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(PhD, 
##     5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), 
##     data = college_train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7721.22 -1113.81    46.01  1267.04  7434.35 
## 
## (Dispersion Parameter for gaussian family taken to be 3551523)
## 
##     Null Deviance: 10354544612 on 620 degrees of freedom
## Residual Deviance: 2109604021 on 593.9998 degrees of freedom
## AIC: 11157.19 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 2673192211 2673192211 752.689 < 2.2e-16 ***
## s(Room.Board, 5)    1 2018307787 2018307787 568.294 < 2.2e-16 ***
## s(PhD, 5)           1  667452395  667452395 187.934 < 2.2e-16 ***
## s(perc.alumni, 5)   1  480883824  480883824 135.402 < 2.2e-16 ***
## s(Expend, 5)        1  832807942  832807942 234.493 < 2.2e-16 ***
## s(Grad.Rate, 5)     1  111213610  111213610  31.314 3.354e-08 ***
## Residuals         594 2109604021    3551523                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F  Pr(F)    
## (Intercept)                                 
## Private                                     
## s(Room.Board, 5)        4  2.6328 0.0334 *  
## s(PhD, 5)               4  1.5805 0.1779    
## s(perc.alumni, 5)       4  1.2180 0.3020    
## s(Expend, 5)            4 28.3327 <2e-16 ***
## s(Grad.Rate, 5)         4  1.8478 0.1181    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1