##Question 6
In this exercise, you will further analyze the Wage data set considered throughout this chapter.
(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data. Using K fold cross validation, the polynomial degree I would select is a 3rd degree. The 9th degree had the lowest error, however this model would be very complex and the 3rd degree model has an error of only 1.244 higher than the 9th degree model. Not much is gained by increasing the model complexity from 3rd degree to 9th degree. The ANOVA results also show to select a 3rd degree model.
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(boot)
attach(Wage)
set.seed(1)
cv.error=rep(0,10)
for (i in 1:10){
glm.fit=glm(wage~poly(age ,i),data=Wage)
cv.error[i]=cv.glm(Wage,glm.fit,K=10)$delta[1]
}
cv.error
## [1] 1676.826 1600.763 1598.399 1595.651 1594.977 1596.061 1594.298
## [8] 1598.134 1593.913 1595.950
plot(1:10, cv.error)
which.min(cv.error)
## [1] 9
fit.1 = lm(wage~poly(age, 1), data=Wage)
fit.2 = lm(wage~poly(age, 2), data=Wage)
fit.3 = lm(wage~poly(age, 3), data=Wage)
fit.4 = lm(wage~poly(age, 4), data=Wage)
fit.5 = lm(wage~poly(age, 5), data=Wage)
fit.6 = lm(wage~poly(age, 6), data=Wage)
fit.7 = lm(wage~poly(age, 7), data=Wage)
fit.8 = lm(wage~poly(age, 8), data=Wage)
fit.9 = lm(wage~poly(age, 9), data=Wage)
fit.10 = lm(wage~poly(age, 10), data=Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5, fit.6, fit.7, fit.8, fit.9, fit.10)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Model 7: wage ~ poly(age, 7)
## Model 8: wage ~ poly(age, 8)
## Model 9: wage ~ poly(age, 9)
## Model 10: wage ~ poly(age, 10)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.7638 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.9005 0.001669 **
## 4 2995 4771604 1 6070 3.8143 0.050909 .
## 5 2994 4770322 1 1283 0.8059 0.369398
## 6 2993 4766389 1 3932 2.4709 0.116074
## 7 2992 4763834 1 2555 1.6057 0.205199
## 8 2991 4763707 1 127 0.0796 0.777865
## 9 2990 4756703 1 7004 4.4014 0.035994 *
## 10 2989 4756701 1 3 0.0017 0.967529
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit.3,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
plot(age,wage,xlim=agelims,cex=.5,col='darkgrey')
title('Degree-3 Polynomial')
lines(age.grid,preds$fit,lwd=2,col='darkblue',lty=3)
matlines(age.grid,se.bands,lwd=1,col='lightblue',lty=3)
(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the fit obtained.
The optimal number of cuts according to cross validation is 8
cv.error = rep(0,10)
for (i in 2:10){
table(cut(age,i))
fit=glm(wage~cut(age,i),data=Wage)
cv.error[i]=cv.glm(Wage,glm.fit,K=10)$delta[1]
}
plot(2:10, cv.error[-1])
cv.error
## [1] 0.000 1601.798 1596.693 1596.498 1594.889 1595.814 1596.361
## [8] 1594.670 1595.482 1597.764
fit<-glm(wage~cut(age,8),data=Wage)
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
plot(age,wage,xlim=agelims,cex=.5,col='darkgrey')
lines(age.grid,preds$fit,lwd=2,col='red',lty=3)
matlines(age.grid,se.bands,lwd=2,col='lightblue',lty=3)
##Question 10
This question relates to the College data set.
(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors. After splitting and running cross validation, the 16 predictor model has the lowest test MSE. After also checking the Cp, BIC and Adj R2 I decided that prediction would be the better route and utilize a near full model with 16 predictors.
library(leaps)
## Warning: package 'leaps' was built under R version 3.6.3
attach(College)
set.seed(1)
train=sample(c(TRUE ,FALSE), nrow(College),rep=TRUE)
test=-train
regfit.fwd=regsubsets(Outstate~., data=College[train,], nvmax=17, method ="forward")
test.mat=model.matrix(Outstate~.,data=College[test ,])
val.errors =rep(NA ,17)
for(i in 1:17){
coefi=coef(regfit.fwd ,id=i)
pred=test.mat[,names(coefi)]%*%coefi
val.errors[i]=mean((College$Outstate[test]-pred)^2)
}
val.errors
## [1] 8906888 6403184 5156790 4553988 4264031 4100910 4061381 4059015
## [9] 4042114 3972686 3876028 3853605 3823763 3831087 3824131 3818758
## [17] 3819792
plot(val.errors)
which.min(val.errors)
## [1] 16
reg.summary = summary(regfit.fwd)
par(mfrow=c(1,3))
plot(reg.summary$adjr2, xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
which.max(reg.summary$adjr2)
## [1] 13
max.adjr2 = which.max(reg.summary$adjr2)
points(max.adjr2,reg.summary$adjr2[max.adjr2], col="red", cex=2, pch =20)
plot(reg.summary$cp,xlab="Number of Variables ",ylab="Cp",type='l')
which.min(reg.summary$cp)
## [1] 11
min.cp = which.min(reg.summary$cp)
points(min.cp,reg.summary$cp[min.cp], col ="red", cex=2, pch =20)
which.min(reg.summary$bic)
## [1] 6
min.bic = which.min(reg.summary$bic)
plot(reg.summary$bic ,xlab="Number of Variables ", ylab="BIC", type='l')
points (min.bic,reg.summary$bic[min.bic],col="red",cex=2,pch =20)
reg.final = regsubsets(Outstate ~ ., data = College, method = "forward", nvmax = 17)
names(coef(reg.final,16))
## [1] "(Intercept)" "PrivateYes" "Apps" "Accept" "Enroll"
## [6] "Top10perc" "Top25perc" "F.Undergrad" "Room.Board" "Books"
## [11] "Personal" "PhD" "Terminal" "S.F.Ratio" "perc.alumni"
## [16] "Expend" "Grad.Rate"
(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.6.3
## Loaded gam 1.20
gam.fit = gam(Outstate~ s(Apps,4)+s(Accept,4)+s(Enroll,4)+s(Top10perc,4)+s(Top25perc,4)+s(F.Undergrad,4)+s(Room.Board,4)+s(Books,4)+s(Personal,4)+s(PhD,4)+s(Terminal,4)+s(S.F.Ratio,4)+s(perc.alumni,4)+s(Expend,4)+s(Grad.Rate,4),data=College[train,])
plot(gam.fit, se = TRUE, col = "#1c9099")
(c) Evaluate the model obtained on the test set, and explain theresults obtained. The model has a test MSE of 3819792
gam.pred = predict(gam.fit, data=College[test,])
val.error = mean((College$Outstate[test]-pred)^2)
val.error
## [1] 3819792
(d) For which variables, if any, is there evidence of a non-linear relationship with the response? The variables with a low p-value and thus indicating that there non-linear relationships to the response variable are Accept, F.Undergrad, S.F.Ratio, Expend, and Grad.Rate
summary(gam.fit)
##
## Call: gam(formula = Outstate ~ s(Apps, 4) + s(Accept, 4) + s(Enroll,
## 4) + s(Top10perc, 4) + s(Top25perc, 4) + s(F.Undergrad, 4) +
## s(Room.Board, 4) + s(Books, 4) + s(Personal, 4) + s(PhD,
## 4) + s(Terminal, 4) + s(S.F.Ratio, 4) + s(perc.alumni, 4) +
## s(Expend, 4) + s(Grad.Rate, 4), data = College[train, ])
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5555.9 -992.1 123.9 1187.8 4704.2
##
## (Dispersion Parameter for gaussian family taken to be 3254379)
##
## Null Deviance: 6262049699 on 392 degrees of freedom
## Residual Deviance: 1080451222 on 331.9992 degrees of freedom
## AIC: 7066.233
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(Apps, 4) 1 75315178 75315178 23.1427 2.286e-06 ***
## s(Accept, 4) 1 450800405 450800405 138.5212 < 2.2e-16 ***
## s(Enroll, 4) 1 468537482 468537482 143.9714 < 2.2e-16 ***
## s(Top10perc, 4) 1 1591917099 1591917099 489.1616 < 2.2e-16 ***
## s(Top25perc, 4) 1 4184512 4184512 1.2858 0.257640
## s(F.Undergrad, 4) 1 201648683 201648683 61.9623 5.006e-14 ***
## s(Room.Board, 4) 1 640508400 640508400 196.8143 < 2.2e-16 ***
## s(Books, 4) 1 72307 72307 0.0222 0.881598
## s(Personal, 4) 1 8872262 8872262 2.7263 0.099656 .
## s(PhD, 4) 1 27191047 27191047 8.3552 0.004099 **
## s(Terminal, 4) 1 8478335 8478335 2.6052 0.107463
## s(S.F.Ratio, 4) 1 75693729 75693729 23.2590 2.160e-06 ***
## s(perc.alumni, 4) 1 144977766 144977766 44.5485 1.042e-10 ***
## s(Expend, 4) 1 330114280 330114280 101.4370 < 2.2e-16 ***
## s(Grad.Rate, 4) 1 52286334 52286334 16.0665 7.556e-05 ***
## Residuals 332 1080451222 3254379
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(Apps, 4) 3 1.2961 0.275672
## s(Accept, 4) 3 15.1600 2.865e-09 ***
## s(Enroll, 4) 3 2.2811 0.079114 .
## s(Top10perc, 4) 3 2.5358 0.056721 .
## s(Top25perc, 4) 3 1.2361 0.296528
## s(F.Undergrad, 4) 3 16.8219 3.392e-10 ***
## s(Room.Board, 4) 3 1.1520 0.328179
## s(Books, 4) 3 2.3580 0.071576 .
## s(Personal, 4) 3 2.0371 0.108498
## s(PhD, 4) 3 0.5935 0.619635
## s(Terminal, 4) 3 2.0686 0.104184
## s(S.F.Ratio, 4) 3 8.0677 3.337e-05 ***
## s(perc.alumni, 4) 3 0.7215 0.539727
## s(Expend, 4) 3 12.6773 7.264e-08 ***
## s(Grad.Rate, 4) 3 4.5216 0.004002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1