Assignment 6, Chapter 7, Q6 & 10

Question 6

library(ISLR)
attach(Wage)

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data

library(boot)
set.seed(1)
cv.error <- rep(0,10)
for(i in 1:10){
  glm.fit <- glm(wage~poly(age,i), data=Wage)
  cv.error[i] <- cv.glm(Wage,glm.fit,K=10)$delta[1]
}
cv.error

##  [1] 1676.826 1600.763 1598.399 1595.651 1594.977 1596.061 1594.298 1598.134
##  [9] 1593.913 1595.950

which.min(cv.error)

## [1] 9

cv.error[9]

## [1] 1593.913

The lowest d MSE was the 9th degree, with a result of 1593.913; however, the change of MSE after d=4 is negligible (1595.651), so that will be used going forward. Utilizing the ANOVA below, we see that after the 4th polynomial, it stops being significant, which is in agreement with that above.

fit.1 <- lm(wage~age, data=Wage)
fit.2 <- lm(wage~poly (age, 2), data=Wage)
fit.3 <- lm(wage~poly (age, 3), data=Wage)
fit.4 <- lm(wage~poly (age, 4), data=Wage)
fit.5 <- lm(wage~poly (age, 5), data=Wage)
anova ( fit.1, fit.2, fit.3, fit.4, fit.5)

## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

plot(wage ~ age, data=Wage, col="darkgrey")
agelims <- range(age)
age.grid <- seq(from=agelims[1], to=agelims[2])
fit <- lm(wage~poly(age,4), data=Wage)
preds <- predict(fit,newdata = list(age=age.grid),se=TRUE)
se.bands <- cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
title("Degree-4 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)

(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the fit obtained

cv.errors <- rep(NA, 10)

for(i in 2:10){
  Wage$age.cut <- cut(Wage$age,i)
  glm.fit <- glm(wage ~ age.cut, data=Wage)
  cv.errors[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]
}

cv.errors

##  [1]       NA 1735.201 1682.307 1636.967 1631.602 1625.438 1613.727 1600.229
##  [9] 1612.512 1607.027

which.min(cv.errors)

## [1] 8

This shows that we should be using 8 cuts

table(cut(age,8))

## 
## (17.9,25.8] (25.8,33.5] (33.5,41.2]   (41.2,49]   (49,56.8] (56.8,64.5] 
##         231         519         671         728         503         276 
## (64.5,72.2] (72.2,80.1] 
##          54          18

fit=lm(wage~cut(age,8),data=Wage)
coef(summary(fit))

##                        Estimate Std. Error   t value      Pr(>|t|)
## (Intercept)            76.28175   2.629812 29.006542 3.110596e-163
## cut(age, 8)(25.8,33.5] 25.83329   3.161343  8.171618  4.440913e-16
## cut(age, 8)(33.5,41.2] 40.22568   3.049065 13.192791  1.136044e-38
## cut(age, 8)(41.2,49]   43.50112   3.018341 14.412262  1.406253e-45
## cut(age, 8)(49,56.8]   40.13583   3.176792 12.634076  1.098741e-35
## cut(age, 8)(56.8,64.5] 44.10243   3.564299 12.373380  2.481643e-34
## cut(age, 8)(64.5,72.2] 28.94825   6.041576  4.791505  1.736008e-06
## cut(age, 8)(72.2,80.1] 15.22418   9.781110  1.556488  1.196978e-01

preds <- predict(fit, list(age=age.grid))
plot(wage~age, data=Wage, col='darkgrey')
lines(age.grid, preds, col='blue', lwd=2)

detach(Wage)

Question 10

attach(College)
library(leaps)

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors

dim(College)

## [1] 777  18

regfit.fwd <-  regsubsets(Outstate~.,College, nvmax=18, method="forward")
summary(regfit.fwd)

## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., College, nvmax = 18, method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 9  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 10  ( 1 ) "*"        " "  "*"    " "    " "       " "       " "        
## 11  ( 1 ) "*"        " "  "*"    " "    " "       " "       "*"        
## 12  ( 1 ) "*"        "*"  "*"    " "    " "       " "       "*"        
## 13  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 14  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 15  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 6  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      "*" " "      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      "*" "*"      " "      
## 9  ( 1 )  " "         "*"        " "   "*"      "*" "*"      "*"      
## 10  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 11  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 12  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 13  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 14  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 15  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 16  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    " "      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"

set.seed(1)
train <- sample(c(TRUE, FALSE), nrow(College), rep=TRUE)
head(train)

## [1]  TRUE FALSE  TRUE  TRUE FALSE  TRUE

test <- (!train)
head(test)

## [1] FALSE  TRUE FALSE FALSE  TRUE FALSE

regfit.full <- regsubsets(Outstate~.,data=College[train,], nvmax=17)
test.mat <- model.matrix(Outstate~.,data=College[test,])

val.errors <- rep(NA,17)
for(i in 1:17){
  coefi <- coef(regfit.full, id=i)
  pred <- test.mat[,names(coefi)]%*%coefi
  val.errors[i] <- mean((College$Outstate[test]-pred)^2)
}
val.errors

##  [1] 8743411 6271243 5188723 4548428 4269584 4087184 4067437 4119695 4031878
## [10] 3809441 3833116 3827900 3846028 3836026 3822375 3826807 3828915

#how many variables is best for the model?
which.min(val.errors)

## [1] 10

coef(regfit.full,10)

##   (Intercept)    PrivateYes          Apps        Accept     Top10perc 
## -3189.1762784  2340.3784123    -0.2845174     0.7052637    29.7207779 
##   F.Undergrad    Room.Board      Terminal   perc.alumni        Expend 
##    -0.1492326     0.9832653    27.5107535    51.2155999     0.1966401 
##     Grad.Rate 
##    22.4494743

The best model uses 10 variables: Private, Apps, Accept, Top10perc, F.Undergrad, Room.Board, Terminal, perc.alumni, Expend, and Grad.Rate.

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings

library(gam)

## Loading required package: splines

## Loading required package: foreach

## Loaded gam 1.22-1

gam1 <- gam(Outstate~Private+s(Apps,4)+s(Accept,4)+s(Top10perc, 4)+s(F.Undergrad,4)+s(Room.Board,4)+s(Terminal,4)+s(perc.alumni,4)+s(Expend,4)+s(Grad.Rate,4), data=College[train,])
par(mfrow=c(1,3))
plot(gam1, se=TRUE,col="blue")

(c) Evaluate the model obtained on the test set, and explain the results obtained

preds <- predict(gam1, newdata = College[test,])
mean((College[test,]$Outstate-preds)^2)

## [1] 3699419

The MSE of this model on the test data is 3699419,

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

From the plots in Q10b, it appears that the following variables are non-linear: top10perc, Terminal, Expend, and Grad.Rate