In this exercise, you will further analyze the Wage data set considered throughout this chapter. (a)Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
#1. Choosing optimal degree d
set.seed(1)
cv.error <- rep(0,10)
for (i in 1:10){
glm.fit <- glm(wage~poly(age, i, raw=T), data=Wage)
cv.error[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]
}
plot(1:10, cv.error)
d=5 is sufficient.
#2.Using ANOVA
fit.1 <- lm(wage ∼ age , data = Wage)
fit.2 <- lm(wage ∼ poly(age , 2), data = Wage)
fit.3 <- lm(wage ∼ poly(age , 3), data = Wage)
fit.4 <- lm(wage ∼ poly(age , 4), data = Wage)
fit.5 <- lm(wage ∼ poly(age , 5), data = Wage)
anova (fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the results, we see that the linear and quadratic fit are insufficient as the p-value comparing the Model 1 throough model 3 is very small. Hence either the model with cubic and fourth degree polynomial is sufficient.
#degree 5
poly.fit <- lm(wage~poly(age, 5, raw=T), data=Wage)
agelims <- range(age)
age.grid <- seq(from=agelims[1], to=agelims[2])
preds <- predict(poly.fit, newdata=list(age=age.grid), se=TRUE)
se.bands <- cbind(preds$fit + 2*preds$se.fit, preds$fit-2*preds$se.fit)
par(mfrow=c(1,2), mar=c(4.5, 4.5, 1, 1), oma=c(0,0,4,0))
plot(age,wage, xlim=agelims, col="darkgrey")
title("Degree-5 Polynomial", outer=T)
lines(age.grid,preds$fit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)
#degree 4
poly.fit4 <- lm(wage~poly(age, 4), data=Wage)
agelims <- range(age)
age.grid <- seq(from=agelims[1], to=agelims[2])
preds <- predict(poly.fit, newdata=list(age=age.grid), se=TRUE)
se.bands <- cbind(preds$fit + 2*preds$se.fit, preds$fit-2*preds$se.fit)
par(mfrow=c(1,2), mar=c(4.5, 4.5, 1, 1), oma=c(0,0,4,0))
plot(age,wage, xlim=agelims, col="darkgrey")
title("Degree-4 Polynomial", outer=T)
lines(age.grid,preds$fit, lwd=2, col="red")
matlines(age.grid, se.bands, lwd=1, col="red", lty=3)
set.seed(1)
cv.error.1 <- rep(NA,10)
for (i in 2:10) {
Wage$age.cut <- cut(age, i)
fits <- glm(wage ~ age.cut, data = Wage)
cv.error.1[i] <- cv.glm(Wage, fits, K = 10)$delta[1]
}
cv.error.1
## [1] NA 1734.489 1684.271 1635.552 1632.080 1623.415 1614.996 1601.318
## [9] 1613.954 1606.331
plot(cv.error.1)
cuts=8
step.fit <- lm(wage~cut(age,8), data=Wage)
summary(step.fit)
##
## Call:
## lm(formula = wage ~ cut(age, 8), data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -99.697 -24.552 -5.307 15.417 198.560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.282 2.630 29.007 < 2e-16 ***
## cut(age, 8)(25.8,33.5] 25.833 3.161 8.172 4.44e-16 ***
## cut(age, 8)(33.5,41.2] 40.226 3.049 13.193 < 2e-16 ***
## cut(age, 8)(41.2,49] 43.501 3.018 14.412 < 2e-16 ***
## cut(age, 8)(49,56.8] 40.136 3.177 12.634 < 2e-16 ***
## cut(age, 8)(56.8,64.5] 44.102 3.564 12.373 < 2e-16 ***
## cut(age, 8)(64.5,72.2] 28.948 6.042 4.792 1.74e-06 ***
## cut(age, 8)(72.2,80.1] 15.224 9.781 1.556 0.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.97 on 2992 degrees of freedom
## Multiple R-squared: 0.08467, Adjusted R-squared: 0.08253
## F-statistic: 39.54 on 7 and 2992 DF, p-value: < 2.2e-16
agelim<-range(age)
age.grid <- seq(from=agelim[1], to=agelim[2])
age.pred <- predict(step.fit, newdata=list(age=age.grid), se=T)
se.bands <- cbind(age.pred$fit + 2*age.pred$se.fit, age.pred$fit-2*age.pred$se.fit)
par(mfrow=c(1,2), mar=c(4.5, 4.5, 1, 1), oma=c(0,0,4,0))
plot(Wage$age, wage, xlim=agelim, cex=0.5, col="darkgrey")
lines(age.grid, age.pred$fit, lwd=2, col="red")
matlines(age.grid, se.bands, lwd=1, col="yellow", lty=3)
10.This question relates to the College data set. (a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
attach(College)
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
library(leaps)
## Warning: package 'leaps' was built under R version 4.1.3
set.seed(4)
split <- sample.split(College, SplitRatio = 0.7)
Train<- subset(College, split==TRUE)
Test<- subset(College, split==FALSE)
outstae.fwd <- regsubsets(Outstate~.,data=Train, method = "forward", nvmax = 17)
reg.summary <- summary(outstae.fwd)
## CONDITIONS
par (mfrow = c(2, 2))
plot (reg.summary$rss , xlab = " Number of Variables ",
ylab = " RSS ", type = "l")
plot (reg.summary$adjr2 , xlab = " Number of Variables ",
ylab = " Adjusted RSq ", type = "l")
which.max (reg.summary$adjr2)
## [1] 13
points (13, reg.summary$adjr2[13], col = " red ", cex = 2,
pch = 20)
plot(reg.summary$cp, xlab = " Number of Variables ",
ylab = "Cp", type = "l")
which.min (reg.summary$cp)
## [1] 13
points (13, reg.summary$cp[13], col = " red ", cex = 2,
pch = 20)
which.min (reg.summary$bic)
## [1] 12
plot(reg.summary$bic , xlab = " Number of Variables ",
ylab = " BIC ", type = "l")
coef(outstae.fwd, 12)
## (Intercept) PrivateYes Apps Accept Top10perc F.Undergrad
## -498.8406116 1998.8934761 -0.3535329 0.8428989 27.6730748 -0.1843719
## Room.Board Personal PhD S.F.Ratio perc.alumni Expend
## 0.9662785 -0.2089996 22.8819556 -69.4121802 44.5025373 0.1935047
## Grad.Rate
## 18.8065512
I would choose a twelve-variable model as it has the smallest bic value.
library(gam)
## Warning: package 'gam' was built under R version 4.1.3
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.1.3
## Loaded gam 1.22-2
gam1 <- lm(Outstate~ns(Top10perc,4)+Private+ns(Apps,4)+ns(Accept,4)+ns(F.Undergrad, 4)+ns(Room.Board,4)+ns(Personal,4)+ns(PhD,4)+ns(S.F.Ratio,4)+ns(perc.alumni,4)+ns(Expend,4)+ns(Grad.Rate,4), data=Train)
par(mfrow=c(1,3))
gam.m1 <- gam(Outstate~s(Top10perc,4)+Private+s(Apps,4)+s(Accept,4)+s(F.Undergrad, 4)+s(Room.Board,4)+s(Personal,4)+s(PhD,4)+s(S.F.Ratio,4)+s(perc.alumni,4)+s(Expend,4)+s(Grad.Rate,4), data=Train)
plot(gam.m1, se=T, col="red")
preds <- predict(gam.m1, newdata = Test)
test.MSE <- mean((preds-Test$Outstate)^2)
test.MSE
## [1] 3137820
T (d) For which variables, if any, is there evidence of a non-linear relationship with the response?
summary(gam.m1)
##
## Call: gam(formula = Outstate ~ s(Top10perc, 4) + Private + s(Apps,
## 4) + s(Accept, 4) + s(F.Undergrad, 4) + s(Room.Board, 4) +
## s(Personal, 4) + s(PhD, 4) + s(S.F.Ratio, 4) + s(perc.alumni,
## 4) + s(Expend, 4) + s(Grad.Rate, 4), data = Train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5691.06 -1058.50 85.85 1135.80 7566.55
##
## (Dispersion Parameter for gaussian family taken to be 3220605)
##
## Null Deviance: 8612051022 on 518 degrees of freedom
## Residual Deviance: 1523344492 on 472.9995 degrees of freedom
## AIC: 9295.947
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(Top10perc, 4) 1 2175704900 2175704900 675.5579 < 2.2e-16 ***
## Private 1 1749570776 1749570776 543.2429 < 2.2e-16 ***
## s(Apps, 4) 1 126603875 126603875 39.3106 8.155e-10 ***
## s(Accept, 4) 1 444627 444627 0.1381 0.71039
## s(F.Undergrad, 4) 1 329233234 329233234 102.2271 < 2.2e-16 ***
## s(Room.Board, 4) 1 593311562 593311562 184.2236 < 2.2e-16 ***
## s(Personal, 4) 1 18725265 18725265 5.8142 0.01628 *
## s(PhD, 4) 1 60108701 60108701 18.6638 1.901e-05 ***
## s(S.F.Ratio, 4) 1 137150511 137150511 42.5853 1.745e-10 ***
## s(perc.alumni, 4) 1 127348161 127348161 39.5417 7.311e-10 ***
## s(Expend, 4) 1 440731420 440731420 136.8474 < 2.2e-16 ***
## s(Grad.Rate, 4) 1 32300890 32300890 10.0294 0.00164 **
## Residuals 473 1523344492 3220605
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(Top10perc, 4) 3 0.9085 0.436766
## Private
## s(Apps, 4) 3 2.4895 0.059700 .
## s(Accept, 4) 3 8.6563 1.331e-05 ***
## s(F.Undergrad, 4) 3 4.2898 0.005305 **
## s(Room.Board, 4) 3 0.6059 0.611402
## s(Personal, 4) 3 3.6482 0.012674 *
## s(PhD, 4) 3 1.1009 0.348385
## s(S.F.Ratio, 4) 3 3.5481 0.014508 *
## s(perc.alumni, 4) 3 0.9702 0.406547
## s(Expend, 4) 3 20.4074 1.881e-12 ***
## s(Grad.Rate, 4) 3 1.7415 0.157646
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1