Exercise 6

  • (a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

First we fit polynomial regression models of degree 1 through 10 to the Wage data and use cross-validation (here the leave-one-out default of cv.glm) to select the optimal degree.

library(ISLR)
library(boot)
attach(Wage)

set.seed(1)
degree <- 10
cv.errs <- rep(NA, degree)    # storage for the estimated test errors

for (i in 1:degree) {                            # loop over polynomial degrees
  fit <- glm(wage ~ poly(age, i), data = Wage)   # fit a degree-i polynomial regression
  cv.errs[i] <- cv.glm(Wage, fit)$delta[1]       # leave-one-out CV estimate of the test MSE
}

# plot the estimated test MSE against polynomial degree
plot(1:degree, cv.errs, xlab = 'Degree', ylab = 'Test MSE', type = 'l')
deg.min <- which.min(cv.errs)
points(deg.min, cv.errs[deg.min], col = 'red', cex = 2, pch = 16)   # mark the minimum CV error

Cross-validation selects a degree-9 polynomial, which attains the minimum estimated test MSE, but the error for a degree-4 polynomial is nearly as small.
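The selected degree and the error estimates behind the plot can be printed directly:

deg.min              # degree with the lowest estimated test MSE
round(cv.errs, 2)    # CV error estimate for each degree

Since several low-degree fits come close to the minimum, we compare polynomials of increasing degree with ANOVA.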

fit_1 <- lm(wage ~ age, data=Wage)
fit_2 <- lm(wage ~ poly(age, 2), data=Wage) 
fit_3 <- lm(wage ~ poly(age, 3), data=Wage) 
fit_4 <- lm(wage ~ poly(age, 4), data=Wage) 
fit_5 <- lm(wage ~ poly(age, 5), data=Wage) 
fit_6 <- lm(wage ~ poly(age, 6), data=Wage)
fit_7 <- lm(wage ~ poly(age, 7), data=Wage)
fit_8 <- lm(wage ~ poly(age, 8), data=Wage)

anova(fit_1, fit_2, fit_3, fit_4, fit_5, fit_6, fit_7, fit_8)   # sequential F-tests between nested fits

The ANOVA comparison shows that terms beyond the quartic add no significant explanatory power: the p-values for the degree-5 and higher fits are all large. So a degree-4 polynomial is enough to fit the data well.
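Because poly() generates orthogonal polynomials, the same p-values can be read off the coefficient t-tests of the largest fit: the squared t-statistics from the degree-8 model equal the F-statistics in the ANOVA table above.

round(coef(summary(fit_8)), 4)   # squared t-values match the ANOVA F-statistics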

We now plot the degree-4 polynomial fit, along with bands at plus and minus two standard errors.

age_lim <- range(age)
age_grid <- seq(from = age_lim[1], to = age_lim[2])
preds <- predict(fit_4, newdata = list(age = age_grid), se = TRUE)   # predict wage over a grid of ages
se_bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)   # approximate 95% bands

plot(age, wage, xlim=age_lim, cex=.5, col="darkgrey")
lines(age_grid, preds$fit, lwd=2, col="blue")
matlines(age_grid, se_bands, lwd=1, col="blue", lty=3)

  • (b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

We’ll fit a step function to the wage data and use 10-fold cross-validation to choose the number of cuts.

require(boot)
set.seed(2)

cv.errors <- rep(NA, 10)    # cv.errors[i] holds the CV error for i intervals

for (i in 2:10) {                               # cut() needs at least 2 intervals
  Wage$age.cut <- cut(Wage$age, i)              # split age into i equal-width intervals
  glm.fit <- glm(wage ~ age.cut, data = Wage)   # fit the step function
  cv.errors[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]   # 10-fold CV error
}
plot(2:10, cv.errors[-1], type="b", xlab="Number of cuts", ylab="Test MSE")
points(which.min(cv.errors), cv.errors[which.min(cv.errors)], col="red", pch=15, cex=2)

Cross-validation estimates that the test error is minimized with 8 intervals.
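This can be confirmed directly:

which.min(cv.errors)    # number of intervals with the lowest estimated test MSE

We now fit the step function with 8 cuts on the full data set and plot the fit.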

fit_step <- glm(wage ~ cut(age, 8), data=Wage)
preds <- predict(fit_step, data.frame(age = age_grid))
plot(age, wage, col="darkgray")
lines(age_grid, preds, col="darkgreen", lwd=2)

Therefore, the optimal number of cuts is 8.
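The coefficients of the final model give the estimated mean wage in each age interval, expressed as offsets from the first (youngest) interval:

coef(fit_step)   # intercept = mean wage in the first interval; the rest are offsets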

Exercise 10

  • (a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

We split the College data in half and run forward stepwise selection on the training half.

library(leaps)
train <- sample(1:nrow(College), nrow(College)/2)   # random half for training
test <- -train                                      # the remaining half for testing
fit <- regsubsets(Outstate ~ ., data = College, subset = train, method = 'forward')
coef(fit, id = 6)   # coefficients of the six-variable model
##   (Intercept)    PrivateYes    Room.Board      Terminal   perc.alumni 
## -3933.0756018  2391.3013093     1.1163216    32.0693193    49.1741555 
##        Expend     Grad.Rate 
##     0.2071043    32.3687749
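Stopping at six variables is a judgment call. One way to check it is to compute the test-set MSE for each model size. regsubsets objects have no predict() method, so we use a small helper in the style of the ISLR lab (a sketch; the exact errors depend on the random split):

predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])   # recover the model formula
  mat <- model.matrix(form, newdata)     # design matrix for the new data
  coefi <- coef(object, id = id)         # coefficients of the id-variable model
  mat[, names(coefi)] %*% coefi
}

val.errors <- rep(NA, 8)
for (i in 1:8) {
  pred <- predict(fit, College[test, ], id = i)
  val.errors[i] <- mean((College[test, ]$Outstate - pred)^2)   # test MSE for each model size
}
val.errors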
  • (b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

We fit a GAM on the training set with a smoothing spline (5 degrees of freedom) for each quantitative predictor.

library(gam)
gam.mod <- gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) +
                 s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
               data = College, subset = train)
par(mfrow = c(2, 3))    # one panel per term
plot(gam.mod, se = TRUE)

Based on the shape of the fitted curves, Expend and Grad.Rate appear to have strongly non-linear relationships with Outstate.

  • (c) Evaluate the model obtained on the test set, and explain the results obtained.

preds <- predict(gam.mod, College[test, ])
RSS <- sum((College[test, ]$Outstate - preds)^2)   # residual sum of squares, as in equation (3.16)
TSS <- sum((College[test, ]$Outstate - mean(College[test, ]$Outstate))^2)   # total sum of squares
R2 <- 1 - RSS / TSS
R2
## [1] 0.7755414

The \(R^{2}\) statistic on the test set is about 0.78.
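For reference, we can compare against an ordinary least squares fit on the same six predictors (a quick sketch; the exact value depends on the random split):

lm.mod <- lm(Outstate ~ Private + Room.Board + Terminal + perc.alumni + Expend + Grad.Rate,
             data = College, subset = train)
lm.preds <- predict(lm.mod, College[test, ])
1 - sum((College[test, ]$Outstate - lm.preds)^2) / TSS   # test R-squared for the linear fit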

  • (d) For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gam.mod)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 
##     5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), 
##     data = College, subset = train)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -6277.5 -1084.1   123.7  1223.2  7139.7 
## 
## (Dispersion Parameter for gaussian family taken to be 3370637)
## 
##     Null Deviance: 6286829154 on 387 degrees of freedom
## Residual Deviance: 1216799198 on 360.9997 degrees of freedom
## AIC: 6960.989 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 1597519236 1597519236 473.952 < 2.2e-16 ***
## s(Room.Board, 5)    1 1370398372 1370398372 406.570 < 2.2e-16 ***
## s(Terminal, 5)      1  261867894  261867894  77.691 < 2.2e-16 ***
## s(perc.alumni, 5)   1  254976797  254976797  75.647 < 2.2e-16 ***
## s(Expend, 5)        1  542142991  542142991 160.843 < 2.2e-16 ***
## s(Grad.Rate, 5)     1   73167730   73167730  21.707 4.473e-06 ***
## Residuals         361 1216799198    3370637                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F   Pr(F)    
## (Intercept)                                  
## Private                                      
## s(Room.Board, 5)        4  2.1533 0.07381 .  
## s(Terminal, 5)          4  1.2345 0.29581    
## s(perc.alumni, 5)       4  0.7622 0.55043    
## s(Expend, 5)            4 22.5609 < 2e-16 ***
## s(Grad.Rate, 5)         4  1.6358 0.16463    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA for nonparametric effects shows strong evidence of a non-linear relationship between Expend and Outstate. Room.Board shows only weak evidence (p ≈ 0.074), and there is no evidence of non-linearity for the remaining predictors.
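To back this up, we can compare the full model against a GAM in which only Expend enters non-linearly; anova() on gam objects performs an approximate F-test (a quick sketch):

gam.lin <- gam(Outstate ~ Private + Room.Board + Terminal + perc.alumni +
                 s(Expend, 5) + Grad.Rate,
               data = College, subset = train)
anova(gam.lin, gam.mod, test = "F")   # do the remaining smooth terms improve the fit?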