PROBLEM 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

Load the ISLR and boot packages and the Wage data set.

require (ISLR)
## Loading required package: ISLR
require (boot)
## Loading required package: boot
data (Wage)
set.seed(1)

Using cross-validation to select the optimal degree d for the polynomial regression of wage on age.

# 10-fold CV error for polynomial degrees 1 through 10
error_cross <- rep(0, 10)
for (i in 1:10) {
  fit.glm <- glm(wage ~ poly(age, i), data = Wage)
  error_cross[i] <- cv.glm(Wage, fit.glm, K = 10)$delta[1]
}

Plotting the cross-validation errors

error_cross
##  [1] 1675.837 1601.012 1598.801 1594.217 1594.625 1594.888 1595.500
##  [8] 1595.436 1596.335 1595.835
plot(error_cross,type = "b")

From the plot, d = 4 looks like a good choice.
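As a quick programmatic check of the visual choice (a supplementary snippet, not part of the original code), the degree with the smallest CV error can be located and marked on the plot; from the printed errors above this lands at d = 4.

d.min <- which.min(error_cross)   # degree with the smallest 10-fold CV error
d.min
points(d.min, error_cross[d.min], col = "red", pch = 19)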

ANOVA to compare the nested polynomial fits as a check on the cross-validation result. I fit degrees 1 through 10 so the comparison covers the same range as the cross-validation above.

fit_1 <- lm(wage~age, data=Wage)
fit_2 <- lm(wage~poly(age,2), data=Wage)
fit_3 <- lm(wage~poly(age,3), data=Wage)
fit_4 <- lm(wage~poly(age,4), data=Wage)
fit_5 <- lm(wage~poly(age,5), data=Wage)
fit_6 <- lm(wage~poly(age,6), data=Wage)
fit_7 <- lm(wage~poly(age,7), data=Wage)
fit_8 <- lm(wage~poly(age,8), data=Wage)
fit_9 <- lm(wage~poly(age,9), data=Wage)
fit_10 <- lm(wage~poly(age,10), data=Wage)
anova(fit_1,fit_2,fit_3,fit_4,fit_5,fit_6,fit_7,fit_8,fit_9,fit_10)
## Analysis of Variance Table
## 
## Model  1: wage ~ age
## Model  2: wage ~ poly(age, 2)
## Model  3: wage ~ poly(age, 3)
## Model  4: wage ~ poly(age, 4)
## Model  5: wage ~ poly(age, 5)
## Model  6: wage ~ poly(age, 6)
## Model  7: wage ~ poly(age, 7)
## Model  8: wage ~ poly(age, 8)
## Model  9: wage ~ poly(age, 9)
## Model 10: wage ~ poly(age, 10)
##    Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1    2998 5022216                                    
## 2    2997 4793430  1    228786 143.7638 < 2.2e-16 ***
## 3    2996 4777674  1     15756   9.9005  0.001669 ** 
## 4    2995 4771604  1      6070   3.8143  0.050909 .  
## 5    2994 4770322  1      1283   0.8059  0.369398    
## 6    2993 4766389  1      3932   2.4709  0.116074    
## 7    2992 4763834  1      2555   1.6057  0.205199    
## 8    2991 4763707  1       127   0.0796  0.777865    
## 9    2990 4756703  1      7004   4.4014  0.035994 *  
## 10   2989 4756701  1         3   0.0017  0.967529    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

d = 3 and d = 4 look best; I chose d = 4, consistent with the cross-validation result.
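Because poly() produces orthogonal polynomials, the same F-statistics can be recovered from a single fit of the largest model: the square of each polynomial term's t-statistic in fit_10 should match the corresponding F-value in the table above (this mirrors the check in the chapter's lab; shown here as a supplementary step, output not reproduced).

coef(summary(fit_10))   # squared t-statistics of the poly terms should reproduce the F column above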

agel <- range(Wage$age)
grid <- seq(agel[1], agel[2])
pred <- predict(fit_4, newdata = list(age = grid), se = TRUE)

Making a plot of the resulting polynomial fit

se.bands <- pred$fit + cbind(2*pred$se.fit, -2*pred$se.fit)
par(mfrow=c(1,1), mar=c(4.5,4.5,1,1), oma=c(0,0,4,0))
plot(Wage$age, Wage$wage, xlim=agel, cex=0.5, col="grey")
title("Polynomial Fit,d=4", outer=TRUE)
lines(grid, pred$fit, lwd=2, col="blue")
matlines(grid, se.bands, lwd=1, col="blue", lty=3)

Using cross-validation to choose the number of cuts for a step function of age

# 10-fold CV error for step functions with 2 to 10 cuts
# (cut() needs at least two intervals, so the loop starts at i = 2)
error_cv <- rep(0, 9)
for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)
  glm.fit <- glm(wage ~ age.cut, data = Wage)
  error_cv[i - 1] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}

Plotting the cross-validation errors against the number of cuts

error_cv
## [1] 1733.880 1683.337 1635.042 1633.382 1622.835 1610.996 1605.042 1609.491
## [9] 1606.329
plot(2:10, error_cv, type="b")

Eight cuts look ideal.
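The choice can be confirmed programmatically (a small supplementary check): the loop started at two cuts, so the number of cuts with the lowest CV error is the index of the minimum plus one.

which.min(error_cv) + 1   # returns 8, matching the plot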

fit_08<- glm(wage~cut(age,8), data=Wage)
pred2 <- predict(fit_08, newdata=list(age=grid), se=TRUE)

Plotting the fit

se.bands2 <- pred2$fit + cbind(2*pred2$se.fit, -2*pred2$se.fit)
plot(Wage$age, Wage$wage, xlim=agel, cex=0.5, col="darkgrey")
title("Fit_08")
lines(grid, pred2$fit, lwd=2, col="blue")
matlines(grid, se.bands2, lwd=1, col="blue", lty=3)
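For reference, the bin boundaries and the number of observations falling in each of the eight intervals can be inspected directly (a quick supplementary check, output not shown):

table(cut(Wage$age, 8))   # interval labels and counts for the 8-bin step function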

PROBLEM 10

This question relates to the College data set.

require(leaps)
## Loading required package: leaps
data("College")
set.seed(1)

Splitting the data into train and test sets

trainid<- sample(1:nrow(College), nrow(College)/2)
data.train <- College[trainid,]
data.test <- College[-trainid,]

A predict() method for regsubsets objects, used below to compute test-set predictions for each model size

predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])   # recover the formula from the regsubsets() call
  mat <- model.matrix(form, newdata)     # design matrix for the new data
  coefi <- coef(object, id = id)         # coefficients of the model with 'id' predictors
  xvars <- names(coefi)                  # variables included in that model
  mat[, xvars] %*% coefi                 # predictions
}

Forward selection (but see the note after the selection output below)

frwd.fit<- regsubsets(Outstate~., data=data.train, nvmax=ncol(College)-1)
(summary.frwd<- summary(frwd.fit))
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = data.train, nvmax = ncol(College) - 
##     1)
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: exhaustive
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 9  ( 1 )  "*"        "*"  "*"    " "    " "       " "       "*"        
## 10  ( 1 ) "*"        "*"  "*"    " "    " "       " "       "*"        
## 11  ( 1 ) "*"        "*"  "*"    " "    " "       " "       "*"        
## 12  ( 1 ) "*"        "*"  "*"    " "    " "       " "       "*"        
## 13  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 14  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 15  ( 1 ) "*"        "*"  "*"    " "    "*"       "*"       "*"        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 6  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      " " "*"      "*"      
## 9  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 10  ( 1 ) " "         "*"        " "   " "      " " "*"      "*"      
## 11  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 12  ( 1 ) "*"         "*"        " "   "*"      " " "*"      "*"      
## 13  ( 1 ) "*"         "*"        " "   "*"      " " "*"      "*"      
## 14  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 15  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 16  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    " "      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"
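Note that the call above omits method = "forward", so regsubsets() defaulted to exhaustive best-subset selection (hence "Selection Algorithm: exhaustive" in the output). A true forward stepwise fit would be specified as below (a sketch with a hypothetical object name); the results in the rest of this section come from the exhaustive search.

fit.fwd <- regsubsets(Outstate ~ ., data = data.train,
                      nvmax = ncol(College) - 1, method = "forward")   # forward stepwise variant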
# Test MSE for each model size
err_frwd <- rep(NA, ncol(College) - 1)
for (i in 1:(ncol(College) - 1)) {
  frwd.pred <- predict(frwd.fit, data.test, id = i)
  err_frwd[i] <- mean((data.test$Outstate - frwd.pred)^2)
}

Plotting test MSE, adjusted R^2, Cp, and BIC to identify a satisfactory model that uses only a subset of the predictors

par(mfrow=c(2,2))
plot(err_frwd, type="b", main="Test MSE", xlab="Number of Predictors")
min.mse <- which.min(err_frwd)  
points(min.mse, err_frwd[min.mse], col="red", pch=9, lwd=5)
plot(summary.frwd$adjr2, type="b", main="Adjusted R^2", xlab="Number of Predictors")
max.adjr2 <- which.max(summary.frwd$adjr2)  
points(max.adjr2, summary.frwd$adjr2[max.adjr2], col="red", pch=9, lwd=5)
plot(summary.frwd$cp, type="b", main="cp", xlab="Number of Predictors")
min.cp <- which.min(summary.frwd$cp)  
points(min.cp, summary.frwd$cp[min.cp], col="red", pch=9, lwd=5)
plot(summary.frwd$bic, type="b", main="bic", xlab="Number of Predictors")
min.bic <- which.min(summary.frwd$bic)  
points(min.bic, summary.frwd$bic[min.bic], col="red", pch=9, lwd=5)

The criteria level off after about six predictors, so a six-variable model is a reasonable choice.

coef(frwd.fit,6)
##   (Intercept)    PrivateYes    Room.Board      Terminal   perc.alumni 
## -4241.4402916  2790.4303173     0.9629335    37.8412517    60.6406044 
##        Expend     Grad.Rate 
##     0.2149396    30.3831268

The six variables are Private, Room.Board, Terminal, perc.alumni, Expend, and Grad.Rate.
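As a cross-check (a supplementary sketch, not in the original output), the six-variable model can be refit directly with lm() and its test MSE recomputed; since row 6 of the selection output picks exactly these variables, the result should agree with err_frwd[6].

fit.lm6 <- lm(Outstate ~ Private + Room.Board + Terminal + perc.alumni + Expend + Grad.Rate,
              data = data.train)
mean((data.test$Outstate - predict(fit.lm6, data.test))^2)   # expected to match err_frwd[6]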

GAM: fitting a generalized additive model on the training data with the six selected predictors, using smoothing splines for the quantitative variables

require(gam)
## Loading required package: gam
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.16
fit.gam <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(Terminal, df = 2) +
                 s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2),
               data = data.train)
par(mfrow = c(2, 3))   # six panels, one per term
plot(fit.gam, se = TRUE, col = "blue")

pred3 <- predict(fit.gam, data.test)
(mse.error <- mean((data.test$Outstate - pred3)^2))
## [1] 3721642
err_frwd[6]
## [1] 4357411

The GAM's test MSE (about 3.72 million) is lower than the six-variable linear model's (about 4.36 million), so the non-linear fit does better on the test set.
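A test-set R^2 gives another view of the fit quality (a supplementary calculation, not in the original output):

tss <- mean((data.test$Outstate - mean(data.test$Outstate))^2)   # test-set total variance
1 - mse.error / tss                                              # test R^2 of the GAM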

summary(fit.gam)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(Terminal, 
##     df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, 
##     df = 2), data = data.train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -5010.54 -1143.18    46.42  1230.10  7702.80 
## 
## (Dispersion Parameter for gaussian family taken to be 3310745)
## 
##     Null Deviance: 6221998532 on 387 degrees of freedom
## Residual Deviance: 1234907641 on 372.9999 degrees of freedom
## AIC: 6942.72 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                  1 1795888516 1795888516 542.442 < 2.2e-16 ***
## s(Room.Board, df = 2)    1 1243271116 1243271116 375.526 < 2.2e-16 ***
## s(Terminal, df = 2)      1  388884475  388884475 117.461 < 2.2e-16 ***
## s(perc.alumni, df = 2)   1  308209205  308209205  93.094 < 2.2e-16 ***
## s(Expend, df = 5)        1  456342500  456342500 137.837 < 2.2e-16 ***
## s(Grad.Rate, df = 2)     1   63658700   63658700  19.228 1.511e-05 ***
## Residuals              373 1234907641    3310745                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df  Npar F     Pr(F)    
## (Intercept)                                         
## Private                                             
## s(Room.Board, df = 2)        1  3.6382   0.05723 .  
## s(Terminal, df = 2)          1  1.4197   0.23421    
## s(perc.alumni, df = 2)       1  1.4844   0.22386    
## s(Expend, df = 5)            4 16.9959 8.171e-13 ***
## s(Grad.Rate, df = 2)         1  4.1068   0.04343 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

There is significant evidence of a non-linear effect for ‘Expend’, weaker evidence for ‘Room.Board’ and ‘Grad.Rate’, and none for ‘perc.alumni’ or ‘Terminal’.
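A natural follow-up (a sketch under the same setup, not part of the original analysis) is to replace the smooths that show no evidence of non-linearity with plain linear terms and compare the two GAMs, as in the chapter's lab:

fit.gam2 <- gam(Outstate ~ Private + s(Room.Board, df = 2) + Terminal +
                  perc.alumni + s(Expend, df = 5) + s(Grad.Rate, df = 2),
                data = data.train)
anova(fit.gam2, fit.gam, test = "F")   # does the extra flexibility for Terminal and perc.alumni help?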