library(ISLR2)
library (boot)
library(leaps)

## Warning: package 'leaps' was built under R version 4.2.3

library(gam)

## Warning: package 'gam' was built under R version 4.2.3

## Loading required package: splines

## Loading required package: foreach

## Warning: package 'foreach' was built under R version 4.2.3

## Loaded gam 1.22-2

Exercise 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

# Put the Wage columns on the search path; later chunks refer to `age` and
# `wage` without a data-frame prefix.
attach(Wage)

# Select the polynomial degree by 5-fold cross-validation.
# For each degree 1..5, fit wage ~ poly(age, degree) and keep the first
# element of cv.glm()'s `delta` (the raw cross-validation estimate of
# prediction error).
set.seed(5)
cv.error.5 <- numeric(5)
for (i in seq_along(cv.error.5)) {
  glm.fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.out <- cv.glm(Wage, glm.fit, K = 5)
  cv.error.5[i] <- cv.out$delta[1]
}

cv.error.5

## [1] 1675.403 1600.834 1597.054 1594.266 1597.060

# Degree with the smallest cross-validated error.
which.min(cv.error.5)

## [1] 4

# CV error as a function of polynomial degree.
plot(cv.error.5, type = "l", xlab = "Degree", ylab = "Test MSE")

# Using ANOVA
# Nested polynomial fits of degree 1 through 5. anova() compares each model
# with the previous one via sequential F-tests, so every row tests whether
# adding the next power of age reduces the residual sum of squares.
fit.1 <- lm(wage ~ age, data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)

anova(fit.1, fit.2, fit.3, fit.4, fit.5)

## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Plot the degree-4 polynomial fit with +/- 2 standard-error bands.
# Columns are taken from Wage explicitly so this chunk does not depend on
# attach(Wage) being in effect.
agelims <- range(Wage$age)

# Grid of ages from the smallest to the largest observed age, in steps of 1.
age.grid <- seq(from = agelims[1], to = agelims[2])

# `se.fit` is spelled out instead of relying on partial matching of `se`.
preds <- predict(fit.4, newdata = data.frame(age = age.grid), se.fit = TRUE)

# Column 1: upper band, column 2: lower band (fit +/- 2 standard errors).
se.bands <- cbind(
  preds$fit + 2 * preds$se.fit,
  preds$fit - 2 * preds$se.fit
)

# Axis labels are set explicitly so they still read "age" and "wage".
plot(Wage$age, Wage$wage, xlim = agelims, cex = 0.5, col = "darkgrey",
     xlab = "age", ylab = "wage")
title("Degree-4 Polynomial")
lines(age.grid, preds$fit, lwd = 2, col = "darkblue")
matlines(age.grid, se.bands, lwd = 1, col = "lightblue", lty = 3)

The optimal degree chosen by cross-validation is 4.

The p-value comparing the linear Model 1 to the quadratic Model 2 is essentially zero, indicating that a linear fit is not sufficient. Similarly, the p-value comparing the quadratic Model 2 to the cubic Model 3 is very low (0.0017), so the quadratic fit is also insufficient. The p-value comparing the cubic and degree-4 polynomials, Model 3 and Model 4, is approximately 5%, while the degree-5 polynomial Model 5 seems unnecessary because its p-value is 0.37. Hence, either a cubic or a quartic polynomial appears to provide a reasonable fit to the data, but lower- or higher-order models are not justified. This is consistent with the degree of 4 selected by cross-validation.

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

# Choose the number of step-function intervals by 10-fold cross-validation.
# cv.errors.cut[i] holds the CV error for i intervals; slot 1 stays NA
# because the loop starts at 2 intervals.
cv.errors.cut <- rep(NA_real_, 10)

for (i in 2:10) {
  # Work on a copy so the shared Wage data frame does not pick up a stray
  # age.cut column, and read age from the data frame rather than from the
  # attached search path.
  wage.steps <- Wage
  wage.steps$age.cut <- cut(wage.steps$age, i)
  glm.fit <- glm(wage ~ age.cut, data = wage.steps)
  # delta[1] is the raw cross-validation estimate of prediction error.
  cv.errors.cut[i] <- cv.glm(wage.steps, glm.fit, K = 10)$delta[1]
}

cv.errors.cut

##  [1]       NA 1734.406 1682.668 1635.540 1630.482 1623.813 1613.104 1601.741
##  [9] 1611.393 1607.700

# Number of intervals with the smallest CV error. The index into
# cv.errors.cut equals the number of intervals, and which.min() skips the
# NA in slot 1.
best.cuts <- which.min(cv.errors.cut)
best.cuts

## [1] 8

# Refit the step function on all of the data using the CV-selected number
# of intervals rather than a hard-coded 8.
fit.cut <- glm(wage ~ cut(age, best.cuts), data = Wage)

# cut() is re-applied to the grid at prediction time.
# NOTE(review): this reproduces the training intervals only because
# age.grid is built from range(age) -- confirm the grid reaches the maximum
# observed age.
# `se.fit` is spelled out instead of relying on partial matching of `se`.
preds <- predict(fit.cut, newdata = data.frame(age = age.grid), se.fit = TRUE)

# Axis labels are set explicitly so they still read "age" and "wage" now
# that the columns are taken from Wage rather than the attached search path.
plot(Wage$age, Wage$wage, xlim = agelims, cex = 0.5, col = "darkgrey",
     xlab = "age", ylab = "wage")
lines(age.grid, preds$fit, lwd = 2, col = "red")

Exercise 10

This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

# Inspect the structure (column names and types) of the College data set.
str(College)

## 'data.frame':    777 obs. of  18 variables:
##  $ Private    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Apps       : num  1660 2186 1428 417 193 ...
##  $ Accept     : num  1232 1924 1097 349 146 ...
##  $ Enroll     : num  721 512 336 137 55 158 103 489 227 172 ...
##  $ Top10perc  : num  23 16 22 60 16 38 17 37 30 21 ...
##  $ Top25perc  : num  52 29 50 89 44 62 45 68 63 44 ...
##  $ F.Undergrad: num  2885 2683 1036 510 249 ...
##  $ P.Undergrad: num  537 1227 99 63 869 ...
##  $ Outstate   : num  7440 12280 11250 12960 7560 ...
##  $ Room.Board : num  3300 6450 3750 5450 4120 ...
##  $ Books      : num  450 750 400 450 800 500 500 450 300 660 ...
##  $ Personal   : num  2200 1500 1165 875 1500 ...
##  $ PhD        : num  70 29 53 92 76 67 90 89 79 40 ...
##  $ Terminal   : num  78 30 66 97 72 73 93 100 84 41 ...
##  $ S.F.Ratio  : num  18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
##  $ perc.alumni: num  12 16 30 37 2 11 26 37 23 15 ...
##  $ Expend     : num  7041 10527 8735 19016 10922 ...
##  $ Grad.Rate  : num  60 56 54 59 15 55 63 73 80 52 ...

# NOTE(review): every model call in the visible analysis passes
# data = College explicitly, so this attach() looks unnecessary; it is kept
# in case code outside this view relies on it.
attach(College)

# Reproducible split: half of the rows (rounded down) go to the training
# set. Integer division makes the training-set size explicit instead of
# passing a fractional size to sample() (College has 777 rows).
set.seed(10)
n.obs <- nrow(College)
train <- sample(seq_len(n.obs), n.obs %/% 2)
# Negative indices: College[test, ] selects every row not in `train`.
test <- -train

# Forward stepwise selection on the training rows, considering models with
# up to 17 predictors.
regfit.fwd <- regsubsets(Outstate ~ ., data = College, subset = train,
                         nvmax = 17, method = "forward")

summary(regfit.fwd)

## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train, 
##     nvmax = 17, method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  " "    " "    " "       "*"       " "        
## 9  ( 1 )  "*"        " "  " "    " "    " "       "*"       " "        
## 10  ( 1 ) "*"        " "  "*"    " "    " "       "*"       " "        
## 11  ( 1 ) "*"        "*"  "*"    " "    " "       "*"       " "        
## 12  ( 1 ) "*"        "*"  "*"    " "    " "       "*"       "*"        
## 13  ( 1 ) "*"        "*"  "*"    " "    "*"       "*"       "*"        
## 14  ( 1 ) "*"        "*"  "*"    " "    "*"       "*"       "*"        
## 15  ( 1 ) "*"        "*"  "*"    " "    "*"       "*"       "*"        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 6  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 9  ( 1 )  " "         "*"        " "   "*"      " " "*"      "*"      
## 10  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 11  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 12  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 13  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 14  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 15  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 16  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  " "         "*"    "*"      
## 5  ( 1 )  " "         "*"    "*"      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"

# Coefficients of the six-variable model on the forward-stepwise path.
coef(regfit.fwd, id = 6)

##   (Intercept)    PrivateYes    Room.Board      Terminal   perc.alumni 
## -4020.7703897  2600.1120874     0.8969917    40.7248748    36.9282476 
##        Expend     Grad.Rate 
##     0.2205798    37.4196026

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

# GAM on the training rows. Private (a two-level factor) enters as is; each
# of the five numeric predictors is wrapped in s(., 4).
gam.m1 <- gam(
  Outstate ~ Private + s(Room.Board, 4) + s(Terminal, 4) +
    s(perc.alumni, 4) + s(Expend, 4) + s(Grad.Rate, 4),
  data = College,
  subset = train
)

# One panel per model term in a 2 x 3 layout, with standard-error bands.
par(mfrow = c(2, 3))
plot(gam.m1, se = TRUE, col = "#1c9099")

The fitted effects of Expend and Grad.Rate on Outstate appear to be non-linear.

(c) Evaluate the model obtained on the test set, and explain the results obtained.

# Predict out-of-state tuition for the held-out rows, then report the
# test-set mean squared error.
preds <- predict(gam.m1, newdata = College[test, ])

test.outstate <- College[test, "Outstate"]
mean((test.outstate - preds)^2)

## [1] 3350219

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

# Print the fitted GAM's summary, including the ANOVA tables for parametric
# and nonparametric effects.
summary(gam.m1)

## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 4) + s(Terminal, 
##     4) + s(perc.alumni, 4) + s(Expend, 4) + s(Grad.Rate, 4), 
##     data = College, subset = train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7530.37 -1115.04    25.66  1204.25  7560.01 
## 
## (Dispersion Parameter for gaussian family taken to be 3673466)
## 
##     Null Deviance: 6657447006 on 387 degrees of freedom
## Residual Deviance: 1344488967 on 366.0001 degrees of freedom
## AIC: 6989.707 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 1702177913 1702177913 463.371 < 2.2e-16 ***
## s(Room.Board, 4)    1 1177488269 1177488269 320.539 < 2.2e-16 ***
## s(Terminal, 4)      1  331253283  331253283  90.175 < 2.2e-16 ***
## s(perc.alumni, 4)   1  220188183  220188183  59.940 9.661e-14 ***
## s(Expend, 4)        1  697327563  697327563 189.828 < 2.2e-16 ***
## s(Grad.Rate, 4)     1  120566187  120566187  32.821 2.112e-08 ***
## Residuals         366 1344488967    3673466                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F   Pr(F)    
## (Intercept)                                  
## Private                                      
## s(Room.Board, 4)        3  2.3806 0.06930 .  
## s(Terminal, 4)          3  1.5315 0.20594    
## s(perc.alumni, 4)       3  0.4054 0.74916    
## s(Expend, 4)            3 30.5874 < 2e-16 ***
## s(Grad.Rate, 4)         3  2.2220 0.08524 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

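There is very clear evidence that a non-linear term is required for Expend (p < 2e-16 in the ANOVA for nonparametric effects). The evidence for Grad.Rate (p = 0.085) and Room.Board (p = 0.069) is only marginal, and there is no evidence of non-linearity for Terminal or perc.alumni.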

Assignment #6 - Chapter 7

Tien Vo

2023-04-15