library(tidyverse)
library(openintro)
library(ISLR)
library(boot)
library(leaps)
library(gam)In this exercise, you will further analyze the Wage data set considered throughout this chapter.
(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
#set.seed(1)
## crossval = rep(0, 10)
## for (i in 1:10) {
## fit = glm(wage ~ poly(age, i), data = Wage)
## crossval[i] = cv.glm(Wage, fit)$delta[1]
##}
##plot(1:degree, crossval, xlab = 'degree', ylab = 'Test MSE', type = 'l')
##deg.min = which.min(crossval)
##points(deg.min, crossval[deg.min], col = 'red', cex = 2, pch = 19)note when knitting the code chunk would not load, so I commented out
This tells us that the minimum of test MSE is at the 9th degree, and the 4th degree is deemed small enough.
plot(wage ~ age, data = Wage, col = "grey")
agerange = range(Wage$age)
agegrid = seq(from = agerange[1], to = agerange[2])
fitlm = lm(wage ~ poly(age, 3), data = Wage)
predic = predict(fitlm, newdata = list(age = agegrid))
lines(agegrid, predic, col = "Blue", lwd = 2)(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the fit obtained.
##cval = rep(NA, degree)
##for (i in 2:degree) {
## Wage$age.cut = cut(Wage$age, i)
## fitcv = glm(wage ~ age.cut, data = Wage)
## cval[i] = cv.glm(Wage, fit)$delta[1]
##}
##plot(2:degree, cval[-1], xlab = 'Cuts', ylab = 'Test MSE', type = 'l')
##deg.min = which.min(cval)
##points(deg.min, cval[deg.min], col = 'red', cex = 2, pch = 19)note when knitting the code chunk would not load, so I commented out
The optimal number of cuts is 8 based on the minimum test MSE.
plot(wage ~ age, data = Wage, col = "grey")
fit = glm(wage ~ cut(age, 8), data = Wage)
preds = predict(fit, list(age = agegrid))
lines(agegrid, preds, col = "blue", lwd = 2)This question relates to the College data set.
(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
train = sample(1: nrow(College), nrow(College)/2)
test = -train
fit = regsubsets(Outstate ~ ., data = College, subset = train, method = 'forward')
fit.summary = summary(fit)
fit.summary## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " "*" " " " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " "*" " "
## 6 ( 1 ) " " "*" " " " " " " "*" " "
## 7 ( 1 ) " " "*" " " " " " " "*" " "
## 8 ( 1 ) " " "*" " " " " " " "*" " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
coef(fit, id = 6)## (Intercept) PrivateYes Room.Board Terminal perc.alumni
## -4037.0865719 2680.2790846 0.7993214 41.8254047 62.2156457
## Expend Grad.Rate
## 0.2612828 28.4894543
(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
gamm = gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), data = College, subset = train)
par(mfrow = c(2,3))
plot(gamm, se = TRUE, col = 'blue')It appears that as room and board costs as well as alumni percent increase, so does out of state tuition. It also appears that expend and graduation rates are not linear with regards to out of state tuition
(c) Evaluate the model obtained on the test set, and explain the results obtained.
prediction = predict(gamm,newdata = College)
mse = mean((College$Outstate - prediction)^2)
mse## [1] 3517639
preds = predict(gamm, College[test, ])
RSS = sum((College[test, ]$Outstate - preds)^2)
TSS = sum((College[test, ]$Outstate - mean(College[test, ]$Outstate)) ^ 2)
1 - (RSS / TSS)## [1] 0.7602172
The MSE is 3408341 and the r^2 0.7722337, meaning it is a relatively good model based off these values.
(d) For which variables, if any, is there evidence of a non-linear relationship with the response?
summary(gamm)##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal,
## 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
## data = College, subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5122.1 -1128.3 102.8 1152.5 7034.1
##
## (Dispersion Parameter for gaussian family taken to be 3412094)
##
## Null Deviance: 6291005229 on 387 degrees of freedom
## Residual Deviance: 1231765592 on 360.9999 degrees of freedom
## AIC: 6965.732
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1795364372 1795364372 526.18 < 2.2e-16 ***
## s(Room.Board, 5) 1 1190871921 1190871921 349.01 < 2.2e-16 ***
## s(Terminal, 5) 1 542984820 542984820 159.14 < 2.2e-16 ***
## s(perc.alumni, 5) 1 380739464 380739464 111.59 < 2.2e-16 ***
## s(Expend, 5) 1 525878215 525878215 154.12 < 2.2e-16 ***
## s(Grad.Rate, 5) 1 68650189 68650189 20.12 9.79e-06 ***
## Residuals 361 1231765592 3412094
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 5) 4 1.3024 0.26869
## s(Terminal, 5) 4 1.7149 0.14604
## s(perc.alumni, 5) 4 0.4537 0.76971
## s(Expend, 5) 4 16.5943 1.705e-12 ***
## s(Grad.Rate, 5) 4 3.0563 0.01694 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can see that Expend appears to have a strong non-linear relationship with out of state tuition. Room and board also appears to have a relatively non-linear relationship with out of state tuition. …