Problem 6 - Part A

library(ISLR)  # for the Wage and College data sets
library(boot)  # for cv.glm()
wage <- Wage
set.seed(1)
wageCV.err <- rep(0, 10)
# 10-fold CV error for polynomial fits of degree 1 through 10
for (x in 1:10) {
  wageGLM <- glm(wage~poly(age,x), data=wage)
  wageCV.err[x] <- cv.glm(wage, wageGLM, K=10)$delta[1]
}
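As a quick programmatic check of the plot reading (a small addition, not part of the original code), the degree with the outright lowest CV error can be printed. Note that the outright minimizer may be a higher degree than 4; the parsimony argument below still applies as long as the error differences beyond degree 4 are small.

# Degree with the lowest 10-fold CV error
which.min(wageCV.err)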

The cross-validation plot below suggests that degree 4 is optimal. The CV error does not drop meaningfully for any degree beyond 4, so degree 4 will be selected provided the ANOVA comparison in the next step agrees.

plot(wageCV.err, type="b", xlab="Degree", ylab="CV Error", lwd=2, ylim=c(1590,1700), col="blue")

The p-values in the ANOVA table suggest that the 3rd- and 4th-degree models are significant (although the 4th degree is borderline, since its p-value is about 0.05) and that each degree beyond 4 is not.

wageFit.1 <- lm(wage~age, data=wage)
wageFit.2 <- lm(wage~poly(age,2), data=wage)
wageFit.3 <- lm(wage~poly(age,3), data=wage)
wageFit.4 <- lm(wage~poly(age,4), data=wage)
wageFit.5 <- lm(wage~poly(age,5), data=wage)
wageFit.6 <- lm(wage~poly(age,6), data=wage)
wageFit.7 <- lm(wage~poly(age,7), data=wage)
wageFit.8 <- lm(wage~poly(age,8), data=wage)
wageFit.9 <- lm(wage~poly(age,9), data=wage)
wageFit.10 <- lm(wage~poly(age,10), data=wage)
anova(wageFit.1, wageFit.2, wageFit.3, wageFit.4, wageFit.5, wageFit.6, wageFit.7, wageFit.8, wageFit.9, wageFit.10)
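As a side check (added here, following the ISLR lab): poly() generates orthogonal polynomials, so the squared t-statistics in the coefficient table of the degree-5 fit equal the sequential ANOVA F-statistics for the first five comparisons, and the coefficient table tells the same story.

# Coefficient t-tests for the degree-5 fit; squared t-values match the ANOVA F-values
coef(summary(wageFit.5))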

In this chunk, I am plotting the degree-4 polynomial fit over the data.

plot(wage~age, data=wage, col="grey", xlab="Age", ylab="Wage")
# Evaluate the degree-4 fit on a grid spanning the observed ages
ageLims <- range(wage$age)
ageGrid <- seq(ageLims[1], ageLims[2])
wageLM <- lm(wage~poly(age,4), data=wage)
wagePred <- predict(wageLM, newdata=data.frame(age=ageGrid))
lines(ageGrid, wagePred, lwd=2, col="blue")
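The fit can also be drawn with approximate 95% pointwise confidence bands, a small extension of the code above using the standard errors returned by predict():

# Approximate 95% pointwise confidence bands around the degree-4 fit
wagePredSE <- predict(wageLM, newdata=data.frame(age=ageGrid), se.fit=TRUE)
wageBands <- cbind(wagePredSE$fit + 2*wagePredSE$se.fit,
                   wagePredSE$fit - 2*wagePredSE$se.fit)
matlines(ageGrid, wageBands, lwd=1, lty=3, col="blue")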

Problem 6 - Part B

set.seed(1)
wageCV.err.2 <- rep(0, 9)
# 10-fold CV error for step functions with 2 through 10 age intervals
for (i in 2:10) {
  wage$age.cut <- cut(wage$age, i)
  wageGLM.2 <- glm(wage~age.cut, data=wage)
  wageCV.err.2[i-1] <- cv.glm(wage, wageGLM.2, K=10)$delta[1]
}
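As in Part A, the optimum can be read off programmatically (an added check); the +1 compensates for the loop starting at 2 intervals.

# Number of intervals with the lowest 10-fold CV error
which.min(wageCV.err.2) + 1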

The cross-validation plot below suggests that cutting age into 8 intervals is optimal, since that is where the cross-validation error is lowest.

plot(2:10, wageCV.err.2, type="b", col="blue", xlab="Cuts", ylab="CV error")

In this chunk, I am plotting the step-function fit with 8 intervals.

plot(wage~age, data=wage, col="grey", xlab="Age", ylab="Wage")
# Evaluate the 8-interval step function on a grid spanning the observed ages
ageLims.2 <- range(wage$age)
ageGrid.2 <- seq(ageLims.2[1], ageLims.2[2])
wageLM.2 <- lm(wage~cut(age,8), data=wage)
wagePred.2 <- predict(wageLM.2, newdata=data.frame(age=ageGrid.2))
lines(ageGrid.2, wagePred.2, lwd=2, col="blue")
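For reference (an added line, not in the original), the eight age intervals and their observation counts can be listed:

# The eight age intervals used by the step function, with counts
table(cut(wage$age, 8))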

Problem 10 - Part A

library(leaps)  # for regsubsets()
library(gam)    # for gam() and s()
college <- College
set.seed(1)
split <- sample(1:nrow(college), nrow(college)/2)
collegeTrain <- college[split,]
collegeTest <- college[-split,]
collegeFWD <- regsubsets(Outstate~., data=collegeTrain, nvmax=17, method="forward")
collegeFWD.summary <- summary(collegeFWD)

The plots below suggest that a subset containing 6 predictors is optimal. The BIC plot identifies where BIC is lowest, and neither adjusted R-squared nor Cp improves dramatically for subsets with more than 6 predictors.

par(mfrow=c(2,2))

plot(collegeFWD.summary$adjr2, type="b", xlab="Predictors", ylab="Adjusted R^2", main="Adjusted R^2")
adjrMax <- which.max(collegeFWD.summary$adjr2)
points(adjrMax, collegeFWD.summary$adjr2[adjrMax], col="red", lwd=4)

plot(collegeFWD.summary$bic, type="b", xlab="Predictors", ylab="BIC", main="BIC")
bicMin <- which.min(collegeFWD.summary$bic)
points(bicMin, collegeFWD.summary$bic[bicMin], col="red", lwd=4)

plot(collegeFWD.summary$cp, type="b", xlab="Predictors", ylab="Cp", main="Cp")
cpMin <- which.min(collegeFWD.summary$cp)
points(cpMin, collegeFWD.summary$cp[cpMin], col="red", lwd=4)
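The single best subset size under each criterion can also be printed (an added summary; these outright optima need not all equal 6, which is why the plateau argument above drives the final choice).

# Subset size chosen outright by each criterion
c(adjR2=adjrMax, Cp=cpMin, BIC=bicMin)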

In this chunk, I am listing the coefficients of the 6-predictor model.

coef(collegeFWD, 6)
##   (Intercept)    PrivateYes    Room.Board      Terminal   perc.alumni 
## -4726.8810613  2717.7019276     1.1032433    36.9990286    59.0863753 
##        Expend     Grad.Rate 
##     0.1930814    33.8303314

Problem 10 - Part B

# GAM with a parametric term for Private (categorical) and smoothing
# splines (df = 3) for the five quantitative predictors
collegeGAM <- gam(Outstate ~ Private + s(Room.Board, df=3) + s(Terminal, df=3) +
                    s(perc.alumni, df=3) + s(Expend, df=3) + s(Grad.Rate, df=3),
                  data=collegeTrain)

The plots below for the 6 predictors in the GAM indicate that Terminal, perc.alumni, Room.Board, and Grad.Rate have roughly linear relationships with Outstate. The plot for Expend, by contrast, suggests that its relationship with Outstate is not linear.

par(mfrow=c(2,3))
plot(collegeGAM, se=TRUE, col="blue")

Problem 10 - Part C

The test MSE for the model on the test set is 3378416. In addition, the R-squared value is 0.7639667, which suggests that about 76% of the variance in Outstate is explained by the model.

collegeGAM.pred <- predict(collegeGAM, collegeTest)
# Test MSE on the held-out half
(collegeGAM.mse <- mean((collegeTest$Outstate - collegeGAM.pred)^2))
## [1] 3378416
# Test R-squared: 1 - MSE/TSS
collegeGAM.tss <- mean((collegeTest$Outstate - mean(collegeTest$Outstate))^2)
collegeGAM.r2 <- 1 - collegeGAM.mse/collegeGAM.tss
collegeGAM.r2
## [1] 0.7639667
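For context (an added comparison, not part of the original answer), the GAM can be benchmarked against an ordinary least squares fit on the same six predictors; a higher test MSE for the linear fit would indicate that the smooth terms help.

# Baseline: plain linear model on the same six predictors
collegeLM <- lm(Outstate ~ Private + Room.Board + Terminal + perc.alumni +
                  Expend + Grad.Rate, data=collegeTrain)
collegeLM.pred <- predict(collegeLM, collegeTest)
mean((collegeTest$Outstate - collegeLM.pred)^2)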

Problem 10 - Part D

The "Anova for Nonparametric Effects" table below suggests that Expend has a nonlinear relationship with Outstate, given its very small p-value. The large p-values for the other smoothed predictors suggest their relationships are linear (Private does not appear in this table because, being categorical, it enters the model parametrically).

summary(collegeGAM)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 3) + s(Terminal, 
##     df = 3) + s(perc.alumni, df = 3) + s(Expend, df = 3) + s(Grad.Rate, 
##     df = 3), data = collegeTrain)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -6851.9 -1144.9   -99.9  1275.7  7845.3 
## 
## (Dispersion Parameter for gaussian family taken to be 3811388)
## 
##     Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1414024650 on 370.9999 degrees of freedom
## AIC: 6999.272 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                  1 1769107166 1769107166 464.163 < 2.2e-16 ***
## s(Room.Board, df = 3)    1 1625808440 1625808440 426.566 < 2.2e-16 ***
## s(Terminal, df = 3)      1  305564773  305564773  80.171 < 2.2e-16 ***
## s(perc.alumni, df = 3)   1  358311620  358311620  94.011 < 2.2e-16 ***
## s(Expend, df = 3)        1  569459943  569459943 149.410 < 2.2e-16 ***
## s(Grad.Rate, df = 3)     1   91109927   91109927  23.905  1.51e-06 ***
## Residuals              371 1414024650    3811388                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df Npar F     Pr(F)    
## (Intercept)                                        
## Private                                            
## s(Room.Board, df = 3)        2  1.598    0.2037    
## s(Terminal, df = 3)          2  1.137    0.3220    
## s(perc.alumni, df = 3)       2  0.278    0.7575    
## s(Expend, df = 3)            2 34.287 2.176e-14 ***
## s(Grad.Rate, df = 3)         2  0.909    0.4039    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
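The nonlinearity of Expend can also be tested as an explicit model comparison (a sketch added here, mirroring the ISLR lab): refit the GAM with Expend entered linearly and compare the two fits with an F-test. The resulting p-value should echo the nonparametric-effects table above.

# GAM with Expend entered linearly, compared against the smooth version
collegeGAM.lin <- gam(Outstate ~ Private + s(Room.Board, df=3) + s(Terminal, df=3) +
                        s(perc.alumni, df=3) + Expend + s(Grad.Rate, df=3),
                      data=collegeTrain)
anova(collegeGAM.lin, collegeGAM, test="F")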