Assignment 6

##Question 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data. Using K fold cross validation, the polynomial degree I would select is a 3rd degree. The 9th degree had the lowest error, however this model would be very complex and the 3rd degree model has an error of only 1.244 higher than the 9th degree model. Not much is gained by increasing the model complexity from 3rd degree to 9th degree. The ANOVA results also show to select a 3rd degree model.

library(ISLR)

## Warning: package 'ISLR' was built under R version 3.6.3

library(boot)
attach(Wage)
set.seed(1)
cv.error=rep(0,10)
for (i in 1:10){ 
 glm.fit=glm(wage~poly(age ,i),data=Wage) 
 cv.error[i]=cv.glm(Wage,glm.fit,K=10)$delta[1]
}
cv.error

##  [1] 1676.826 1600.763 1598.399 1595.651 1594.977 1596.061 1594.298
##  [8] 1598.134 1593.913 1595.950

plot(1:10, cv.error)

which.min(cv.error)

## [1] 9

fit.1 = lm(wage~poly(age, 1), data=Wage)
fit.2 = lm(wage~poly(age, 2), data=Wage)
fit.3 = lm(wage~poly(age, 3), data=Wage)
fit.4 = lm(wage~poly(age, 4), data=Wage)
fit.5 = lm(wage~poly(age, 5), data=Wage)
fit.6 = lm(wage~poly(age, 6), data=Wage)
fit.7 = lm(wage~poly(age, 7), data=Wage)
fit.8 = lm(wage~poly(age, 8), data=Wage)
fit.9 = lm(wage~poly(age, 9), data=Wage)
fit.10 = lm(wage~poly(age, 10), data=Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5, fit.6, fit.7, fit.8, fit.9, fit.10)

## Analysis of Variance Table
## 
## Model  1: wage ~ poly(age, 1)
## Model  2: wage ~ poly(age, 2)
## Model  3: wage ~ poly(age, 3)
## Model  4: wage ~ poly(age, 4)
## Model  5: wage ~ poly(age, 5)
## Model  6: wage ~ poly(age, 6)
## Model  7: wage ~ poly(age, 7)
## Model  8: wage ~ poly(age, 8)
## Model  9: wage ~ poly(age, 9)
## Model 10: wage ~ poly(age, 10)
##    Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1    2998 5022216                                    
## 2    2997 4793430  1    228786 143.7638 < 2.2e-16 ***
## 3    2996 4777674  1     15756   9.9005  0.001669 ** 
## 4    2995 4771604  1      6070   3.8143  0.050909 .  
## 5    2994 4770322  1      1283   0.8059  0.369398    
## 6    2993 4766389  1      3932   2.4709  0.116074    
## 7    2992 4763834  1      2555   1.6057  0.205199    
## 8    2991 4763707  1       127   0.0796  0.777865    
## 9    2990 4756703  1      7004   4.4014  0.035994 *  
## 10   2989 4756701  1         3   0.0017  0.967529    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit.3,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
plot(age,wage,xlim=agelims,cex=.5,col='darkgrey')
title('Degree-3 Polynomial')
lines(age.grid,preds$fit,lwd=2,col='darkblue',lty=3)
matlines(age.grid,se.bands,lwd=1,col='lightblue',lty=3)

(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the fit obtained.

The optimal number of cuts according to cross validation is 8

cv.error = rep(0,10)
for (i in 2:10){ 
  table(cut(age,i))
 fit=glm(wage~cut(age,i),data=Wage) 
 cv.error[i]=cv.glm(Wage,glm.fit,K=10)$delta[1]
}
plot(2:10, cv.error[-1])

cv.error

##  [1]    0.000 1601.798 1596.693 1596.498 1594.889 1595.814 1596.361
##  [8] 1594.670 1595.482 1597.764

fit<-glm(wage~cut(age,8),data=Wage)
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
plot(age,wage,xlim=agelims,cex=.5,col='darkgrey')
lines(age.grid,preds$fit,lwd=2,col='red',lty=3)
matlines(age.grid,se.bands,lwd=2,col='lightblue',lty=3)

##Question 10

This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors. After splitting and running cross validation, the 16 predictor model has the lowest test MSE. After also checking the Cp, BIC and Adj R2 I decided that prediction would be the better route and utilize a near full model with 16 predictors.

library(leaps)

## Warning: package 'leaps' was built under R version 3.6.3

attach(College)
set.seed(1)
train=sample(c(TRUE ,FALSE), nrow(College),rep=TRUE)
test=-train
regfit.fwd=regsubsets(Outstate~., data=College[train,], nvmax=17, method ="forward")
test.mat=model.matrix(Outstate~.,data=College[test ,])
val.errors =rep(NA ,17)
for(i in 1:17){
 coefi=coef(regfit.fwd ,id=i)
 pred=test.mat[,names(coefi)]%*%coefi
 val.errors[i]=mean((College$Outstate[test]-pred)^2)
}
val.errors

##  [1] 8906888 6403184 5156790 4553988 4264031 4100910 4061381 4059015
##  [9] 4042114 3972686 3876028 3853605 3823763 3831087 3824131 3818758
## [17] 3819792

plot(val.errors)

which.min(val.errors)

## [1] 16

reg.summary = summary(regfit.fwd)
par(mfrow=c(1,3))
plot(reg.summary$adjr2, xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
which.max(reg.summary$adjr2)

## [1] 13

max.adjr2 = which.max(reg.summary$adjr2)
points(max.adjr2,reg.summary$adjr2[max.adjr2], col="red", cex=2, pch =20)
plot(reg.summary$cp,xlab="Number of Variables ",ylab="Cp",type='l')
which.min(reg.summary$cp)

## [1] 11

min.cp = which.min(reg.summary$cp)
points(min.cp,reg.summary$cp[min.cp], col ="red", cex=2, pch =20)
which.min(reg.summary$bic)

## [1] 6

min.bic = which.min(reg.summary$bic)
plot(reg.summary$bic ,xlab="Number of Variables ", ylab="BIC", type='l')
points (min.bic,reg.summary$bic[min.bic],col="red",cex=2,pch =20)

reg.final = regsubsets(Outstate ~ ., data = College, method = "forward", nvmax = 17)
names(coef(reg.final,16))

##  [1] "(Intercept)" "PrivateYes"  "Apps"        "Accept"      "Enroll"     
##  [6] "Top10perc"   "Top25perc"   "F.Undergrad" "Room.Board"  "Books"      
## [11] "Personal"    "PhD"         "Terminal"    "S.F.Ratio"   "perc.alumni"
## [16] "Expend"      "Grad.Rate"

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

library(gam)

## Loading required package: splines

## Loading required package: foreach

## Warning: package 'foreach' was built under R version 3.6.3

## Loaded gam 1.20

gam.fit = gam(Outstate~ s(Apps,4)+s(Accept,4)+s(Enroll,4)+s(Top10perc,4)+s(Top25perc,4)+s(F.Undergrad,4)+s(Room.Board,4)+s(Books,4)+s(Personal,4)+s(PhD,4)+s(Terminal,4)+s(S.F.Ratio,4)+s(perc.alumni,4)+s(Expend,4)+s(Grad.Rate,4),data=College[train,])
plot(gam.fit, se = TRUE, col = "#1c9099")

(c) Evaluate the model obtained on the test set, and explain theresults obtained. The model has a test MSE of 3819792

gam.pred = predict(gam.fit, data=College[test,])
val.error = mean((College$Outstate[test]-pred)^2)
val.error

## [1] 3819792

(d) For which variables, if any, is there evidence of a non-linear relationship with the response? The variables with a low p-value and thus indicating that there non-linear relationships to the response variable are Accept, F.Undergrad, S.F.Ratio, Expend, and Grad.Rate

summary(gam.fit)

## 
## Call: gam(formula = Outstate ~ s(Apps, 4) + s(Accept, 4) + s(Enroll, 
##     4) + s(Top10perc, 4) + s(Top25perc, 4) + s(F.Undergrad, 4) + 
##     s(Room.Board, 4) + s(Books, 4) + s(Personal, 4) + s(PhD, 
##     4) + s(Terminal, 4) + s(S.F.Ratio, 4) + s(perc.alumni, 4) + 
##     s(Expend, 4) + s(Grad.Rate, 4), data = College[train, ])
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -5555.9  -992.1   123.9  1187.8  4704.2 
## 
## (Dispersion Parameter for gaussian family taken to be 3254379)
## 
##     Null Deviance: 6262049699 on 392 degrees of freedom
## Residual Deviance: 1080451222 on 331.9992 degrees of freedom
## AIC: 7066.233 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## s(Apps, 4)          1   75315178   75315178  23.1427 2.286e-06 ***
## s(Accept, 4)        1  450800405  450800405 138.5212 < 2.2e-16 ***
## s(Enroll, 4)        1  468537482  468537482 143.9714 < 2.2e-16 ***
## s(Top10perc, 4)     1 1591917099 1591917099 489.1616 < 2.2e-16 ***
## s(Top25perc, 4)     1    4184512    4184512   1.2858  0.257640    
## s(F.Undergrad, 4)   1  201648683  201648683  61.9623 5.006e-14 ***
## s(Room.Board, 4)    1  640508400  640508400 196.8143 < 2.2e-16 ***
## s(Books, 4)         1      72307      72307   0.0222  0.881598    
## s(Personal, 4)      1    8872262    8872262   2.7263  0.099656 .  
## s(PhD, 4)           1   27191047   27191047   8.3552  0.004099 ** 
## s(Terminal, 4)      1    8478335    8478335   2.6052  0.107463    
## s(S.F.Ratio, 4)     1   75693729   75693729  23.2590 2.160e-06 ***
## s(perc.alumni, 4)   1  144977766  144977766  44.5485 1.042e-10 ***
## s(Expend, 4)        1  330114280  330114280 101.4370 < 2.2e-16 ***
## s(Grad.Rate, 4)     1   52286334   52286334  16.0665 7.556e-05 ***
## Residuals         332 1080451222    3254379                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## s(Apps, 4)              3  1.2961  0.275672    
## s(Accept, 4)            3 15.1600 2.865e-09 ***
## s(Enroll, 4)            3  2.2811  0.079114 .  
## s(Top10perc, 4)         3  2.5358  0.056721 .  
## s(Top25perc, 4)         3  1.2361  0.296528    
## s(F.Undergrad, 4)       3 16.8219 3.392e-10 ***
## s(Room.Board, 4)        3  1.1520  0.328179    
## s(Books, 4)             3  2.3580  0.071576 .  
## s(Personal, 4)          3  2.0371  0.108498    
## s(PhD, 4)               3  0.5935  0.619635    
## s(Terminal, 4)          3  2.0686  0.104184    
## s(S.F.Ratio, 4)         3  8.0677 3.337e-05 ***
## s(perc.alumni, 4)       3  0.7215  0.539727    
## s(Expend, 4)            3 12.6773 7.264e-08 ***
## s(Grad.Rate, 4)         3  4.5216  0.004002 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Assignment 6

David Rodriguez

7/18/2020