library(ISLR)
attach(Wage)

Problem 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

Cross validation has the best fit with the lowest error at K-fold equal to 10 and to the 5th power.

library(boot)
set.seed(1)
cv.e = rep(NA,6)
for (i in 1:6){
 glm.fit=glm(wage~poly(age ,i),data=Wage) 
 cv.e[i]=cv.glm(Wage,glm.fit,K=10)$delta[2]
}
cv.e
## [1] 1676.681 1600.607 1598.089 1595.381 1594.716 1595.676
which.min(cv.e)
## [1] 5

Polynomial Regression Plot

plot(1:6, cv.e, xlab="Degree", ylab="CV Error", type="l", pch=20, lwd=2, ylim=c(1590, 1650))

min.point = min(cv.e)
sd.points = sd(cv.e)

ANOVA Comparison

set.seed(1)
fit1=lm(wage~poly(age,1),data=Wage)
fit2=lm(wage~poly(age,2),data=Wage)
fit3=lm(wage~poly(age,3),data=Wage)
fit4=lm(wage~poly(age,4),data=Wage)
fit5=lm(wage~poly(age,5),data=Wage)
fit6=lm(wage~poly(age,6),data=Wage)
anova(fit1, fit2, fit3, fit4, fit5, fit6)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.6636 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8936  0.001675 ** 
## 4   2995 4771604  1      6070   3.8117  0.050989 .  
## 5   2994 4770322  1      1283   0.8054  0.369565    
## 6   2993 4766389  1      3932   2.4692  0.116201    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(wage ~ age, data = Wage, col = "grey")
age.range = range(Wage$age)
age.grid = seq(from = age.range[1], to = age.range[2])
fit = lm(wage ~ poly(age, 3), data = Wage)
preds = predict(fit, newdata = list(age = age.grid))
lines(age.grid, preds, col = "blue", lwd = 2)

(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the fit obtained. Using 12 intervals, the optimal number of cuts is 11.

set.seed(1)
cv.e.12 = rep(NA,12)

for (i in 2:12) {
  Wage$age.cut = cut(Wage$age, i)
  glm.fit = glm(wage ~ age.cut, data = Wage)
  cv.e.12[i] = cv.glm(Wage, glm.fit, K = 12)$delta[1]
}

cv.e.12
##  [1]       NA 1733.987 1683.033 1636.750 1630.302 1624.056 1610.984 1602.523
##  [9] 1610.884 1608.476 1599.299 1603.111
which.min(cv.e.12)
## [1] 11
plot(cv.e.12, xlab = "Cuts", ylab = "Test MSE", type = "l")

d.min <- which.min(cv.e.12)

Problem 10

library(ISLR)
library(leaps)
library(MASS)
attach(College)

This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors,perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

A satisfactory model uses 6 predictors wich are : PrivateYes , Room.Board, Terminal, perc.alumni, Expend, Grad.Rate.

set.seed(3)
train.c = sample(length(Outstate), length(Outstate)*.70)
test.c = (-train.c)
college.test = College[test.c,]
college.train = College[train.c,]
fit.c = regsubsets(Outstate ~ ., data = college.train, method = 'forward')
fit.summ = summary(fit.c)
fit.summ
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = college.train, method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
##          PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 ) " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 ) "*"        " "  " "    " "    " "       " "       "*"        
## 8  ( 1 ) "*"        " "  "*"    " "    " "       " "       "*"        
##          P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 ) " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 ) " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 6  ( 1 ) " "         "*"        " "   " "      " " "*"      " "      
## 7  ( 1 ) " "         "*"        " "   " "      " " "*"      " "      
## 8  ( 1 ) " "         "*"        " "   " "      " " "*"      " "      
##          perc.alumni Expend Grad.Rate
## 1  ( 1 ) " "         "*"    " "      
## 2  ( 1 ) " "         "*"    " "      
## 3  ( 1 ) " "         "*"    " "      
## 4  ( 1 ) "*"         "*"    " "      
## 5  ( 1 ) "*"         "*"    "*"      
## 6  ( 1 ) "*"         "*"    "*"      
## 7  ( 1 ) "*"         "*"    "*"      
## 8  ( 1 ) "*"         "*"    "*"
par(mfrow = c(1, 3))
plot(fit.summ$cp, xlab = "variables", ylab = "CP", type = "l")
min.cp <- min(fit.summ$cp)
std.cp <- sd(fit.summ$cp)
abline(h = min.cp + 0.2 * std.cp, col = "red", lty = 2)
abline(h = min.cp - 0.2 * std.cp, col = "red", lty = 2)
plot(fit.summ$bic, xlab = "Variables", ylab = "BIC", type='l')
min.bic <- min(fit.summ$bic)
std.bic <- sd(fit.summ$bic)
abline(h = min.bic + 0.2 * std.bic, col = "red", lty = 2)
abline(h = min.bic - 0.2 * std.bic, col = "red", lty = 2)
plot(fit.summ$adjr2, xlab = "Variables", ylab = "Adj R2", type = "l", ylim = c(0.4, 0.84))
max.adjr2 <- max(fit.summ$adjr2)
std.adjr2 <- sd(fit.summ$adjr2)
abline(h = max.adjr2 + 0.2 * std.adjr2, col = "red", lty = 2)
abline(h = max.adjr2 - 0.2 * std.adjr2, col = "red", lty = 2)

coef(fit.c,id = 6)
##   (Intercept)    PrivateYes    Room.Board      Terminal   perc.alumni 
## -3706.5683569  2590.9970262     0.8335834    37.0548647    46.5572252 
##        Expend     Grad.Rate 
##     0.2661565    33.0613845

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

From the graphs we can see that terminal and expend do not have a linear relationship.

library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.16.1
gam.fit = gam(Outstate ~ Private + s(Room.Board) + s(Terminal) + s(perc.alumni) + s(Expend) + s(Grad.Rate), data = College, subset = train.c)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
summary(gam.fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(Terminal) + 
##     s(perc.alumni) + s(Expend) + s(Grad.Rate), data = College, 
##     subset = train.c)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7032.08 -1064.72    24.97  1178.71  7615.90 
## 
## (Dispersion Parameter for gaussian family taken to be 3403569)
## 
##     Null Deviance: 8774298365 on 542 degrees of freedom
## Residual Deviance: 1773258847 on 520.9998 degrees of freedom
## AIC: 9731.411 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                 Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private          1 2429353156 2429353156 713.766 < 2.2e-16 ***
## s(Room.Board)    1 1866418714 1866418714 548.371 < 2.2e-16 ***
## s(Terminal)      1  579306806  579306806 170.206 < 2.2e-16 ***
## s(perc.alumni)   1  420113851  420113851 123.433 < 2.2e-16 ***
## s(Expend)        1  771527591  771527591 226.682 < 2.2e-16 ***
## s(Grad.Rate)     1  135915060  135915060  39.933 5.646e-10 ***
## Residuals      521 1773258847    3403569                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df  Npar F   Pr(F)    
## (Intercept)                               
## Private                                   
## s(Room.Board)        3  2.9638 0.03171 *  
## s(Terminal)          3  1.1810 0.31634    
## s(perc.alumni)       3  1.6164 0.18456    
## s(Expend)            3 28.7779 < 2e-16 ***
## s(Grad.Rate)         3  1.9129 0.12646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2,3))
plot(gam.fit, se=TRUE, col = "red")

(c) Evaluate the model obtained on the test set, and explain the results obtained. The MSE is 3690000 and the R Squared is .77 using GAMs 6 predictors

set.seed(1)
preds = predict(gam.fit, newdata=College[test.c, ])
MSE = mean((college.test$Outstate - preds)^2)
MSE
## [1] 3690000
R = sum((College[test.c, ]$Outstate - preds)^2) 
T = sum((College[test.c, ]$Outstate - mean(College[test.c, ]$Outstate)) ^ 2)
1 - (R/T)  
## [1] 0.7711211

(d) For which variables, if any, is there evidence of a non-linear relationship with the response? Using a p-value of .05 we can see that expend and room and board have a non-linear relationship with Outstate.

summary(gam.fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(Terminal) + 
##     s(perc.alumni) + s(Expend) + s(Grad.Rate), data = College, 
##     subset = train.c)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7032.08 -1064.72    24.97  1178.71  7615.90 
## 
## (Dispersion Parameter for gaussian family taken to be 3403569)
## 
##     Null Deviance: 8774298365 on 542 degrees of freedom
## Residual Deviance: 1773258847 on 520.9998 degrees of freedom
## AIC: 9731.411 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                 Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private          1 2429353156 2429353156 713.766 < 2.2e-16 ***
## s(Room.Board)    1 1866418714 1866418714 548.371 < 2.2e-16 ***
## s(Terminal)      1  579306806  579306806 170.206 < 2.2e-16 ***
## s(perc.alumni)   1  420113851  420113851 123.433 < 2.2e-16 ***
## s(Expend)        1  771527591  771527591 226.682 < 2.2e-16 ***
## s(Grad.Rate)     1  135915060  135915060  39.933 5.646e-10 ***
## Residuals      521 1773258847    3403569                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df  Npar F   Pr(F)    
## (Intercept)                               
## Private                                   
## s(Room.Board)        3  2.9638 0.03171 *  
## s(Terminal)          3  1.1810 0.31634    
## s(perc.alumni)       3  1.6164 0.18456    
## s(Expend)            3 28.7779 < 2e-16 ***
## s(Grad.Rate)         3  1.9129 0.12646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1