Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
library(ISLR)## Warning: package 'ISLR' was built under R version 4.2.3
library(boot)(a) Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
# 10-fold cross-validation of the polynomial regressions wage ~ poly(age, d)
# for d = 1..10. deltas[d] stores the second component of cv.glm()$delta,
# i.e. the adjusted cross-validation estimate of prediction error.
# cv.glm() assigns observations to folds at random, so the seed is fixed to
# make the CV curve (and the degree chosen from it) reproducible.
set.seed(1)
max_degree <- 10
deltas <- numeric(max_degree) # preallocate instead of growing from NULL
for (i in seq_len(max_degree)) {
  fit <- glm(wage ~ poly(age, i), data = Wage)
  deltas[i] <- cv.glm(Wage, fit, K = 10)$delta[2]
}
plot(seq_len(max_degree), deltas, xlab = "Degree", ylab = "CV MSE", type = "b")
# Dashed reference line: smallest CV error plus 0.2 standard deviations of the
# CV errors across degrees.
abline(h = min(deltas) + 0.2 * sd(deltas), col = "red", lty = 2)
points(3, deltas[3], col = "#BC3C29FF", cex = 2, pch = 20)"The minimum of test MSE is at degree 9. But test MSE of degree 4 is small enough. The comparison by ANOVA suggest degree 4 is enough."
# Nested polynomial regressions of wage on age, degrees 1 through 6.
# Each model is kept as its own object so the sequence can be passed to
# anova() for sequential F-tests.
fit.1 <- lm(wage ~ poly(age, 1), data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)
fit.6 <- lm(wage ~ poly(age, 6), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5, fit.6)## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.6636 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8936 0.001675 **
## 4 2995 4771604 1 6070 3.8117 0.050989 .
## 5 2994 4770322 1 1283 0.8054 0.369565
## 6 2993 4766389 1 3932 2.4692 0.116201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
"So the terms of degree d > 3 are not significant at the 5% level, and d = 3 is the preferred degree. We can make a plot of the resulting polynomial fit to the data."
# Prepare the cubic-fit plot: an integer age grid from the youngest to the
# oldest observation, predictions from fit.3 on that grid, and a scatter of
# the raw data for the fitted curve to be drawn over.
# NOTE(review): attach() is kept because later statements refer to the bare
# column names `age` and `wage`; prefer Wage$age / with() in new code.
attach(Wage)
x <- seq(from = min(age), to = max(age))
pred <- predict(fit.3, newdata = list(age = x))
plot(age, wage, col = "darkgrey")
lines(x, pred, col = "#BC3C29FF", lwd = 2)(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
# 10-fold cross-validation of step-function fits with 2..10 age intervals.
# deltas[k] holds the adjusted CV error (second component of cv.glm()$delta)
# for k intervals; deltas[1] stays NA because the loop starts at 2.
# cv.glm() assigns folds at random, so the seed is fixed for reproducibility.
set.seed(1)
max_cuts <- 10
deltas <- rep(NA_real_, max_cuts) # preallocate instead of growing from NULL
for (i in 2:max_cuts) {
  # Bin the data frame's own age column rather than an attach()ed copy, so the
  # factor always matches the rows of the data passed to glm()/cv.glm().
  Wage$cut.age <- cut(Wage$age, i)
  fit <- glm(wage ~ cut.age, data = Wage)
  deltas[i] <- cv.glm(Wage, fit, K = 10)$delta[2]
}
plot(2:max_cuts, deltas[-1], xlab = "Cuts", ylab = "CV MSE", type = "b")
# Dashed reference line: smallest CV error plus 0.2 standard deviations of the
# CV errors across the candidate numbers of cuts.
abline(h = min(deltas[-1]) + 0.2 * sd(deltas[-1]), col = "red", lty = 2)
points(8, deltas[8], col = "#BC3C29FF", cex = 2, pch = 20)"The 8 cuts seems optimal. We can make a plot of the fit obtained."
attach(Wage)## The following objects are masked from Wage (pos = 3):
##
## age, education, health, health_ins, jobclass, logwage, maritl,
## race, region, wage, year
# Step-function fit with 8 age intervals, predicted on the age grid `x`
# built earlier in the script, plus a scatter of the raw data for the fitted
# steps to be drawn over.
# NOTE(review): cut(age, 8) inside the formula is re-evaluated on the
# prediction grid; the bins presumably line up only because `x` spans the
# same min/max as age -- confirm before reusing with a different grid.
Wage$cut.age <- cut(age, 8)
fit <- glm(wage ~ cut(age, 8), data = Wage)
pred <- predict(fit, newdata = list(age = x))
plot(age, wage, col = "darkgrey")
lines(x, pred, col = "#BC3C29FF", lwd = 2)attach(Wage)## The following objects are masked from Wage (pos = 3):
##
## age, cut.age, education, health, health_ins, jobclass, logwage,
## maritl, race, region, wage, year
## The following objects are masked from Wage (pos = 4):
##
## age, education, health, health_ins, jobclass, logwage, maritl,
## race, region, wage, year
# Rebuild the inputs for the cubic polynomial plot: integer age grid,
# predictions from fit.3 on that grid, and the scatter of the raw data.
x <- seq(from = min(age), to = max(age))
pred <- predict(fit.3, newdata = list(age = x))
plot(age, wage, col = "darkgrey")
lines(x, pred, col = "#BC3C29FF", lwd = 2)This question relates to the College data set.
library(leaps)## Warning: package 'leaps' was built under R version 4.2.3
library(MASS)
library(gam)## Warning: package 'gam' was built under R version 4.2.3
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-2
(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
# Random split of College into a training half and a test half, followed by
# forward stepwise selection of predictors of Outstate on the training rows.
# The seed is fixed so that the split, and therefore the selected variables,
# can be reproduced.
set.seed(1)
n_obs <- nrow(College)
# floor() makes the training size an explicit integer when n_obs is odd.
train <- sample(seq_len(n_obs), floor(n_obs / 2))
test <- -train # negative indices: every row not drawn for training
fit <- regsubsets(Outstate ~ ., data = College, subset = train, method = "forward")
fit.summary <- summary(fit)
fit.summary## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " "*" " "
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " "*" " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) " " "*" " "
## 5 ( 1 ) " " "*" "*"
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
coef(fit, id = 6)## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3737.7955819 3274.0102175 0.8129304 42.5216195 38.2229018
## Expend Grad.Rate
## 0.2310828 32.7807402
(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
"Based on the shape of the fitted curves, Expend and Grad.Rate have strongly non-linear relationships with Outstate."
# GAM for Outstate fitted on the training rows. Private enters linearly; each
# numeric predictor gets a smoothing spline with 5 degrees of freedom.
# The predictors are the six chosen by forward selection as reported by
# coef(fit, id = 6): Private, Room.Board, PhD, perc.alumni, Expend and
# Grad.Rate. The formula previously used Terminal, which was not selected.
gam.mod <- gam(
  Outstate ~ Private + s(Room.Board, 5) + s(PhD, 5) + s(perc.alumni, 5) +
    s(Expend, 5) + s(Grad.Rate, 5),
  data = College, subset = train
)
# 2 x 3 panel layout: one panel per model term in the plot that follows.
par(mfrow = c(2, 3))
plot(gam.mod, se = TRUE)(c) Evaluate the model obtained on the test set, and explain the results obtained.
# Test-set predictions from the GAM and the two sums of squares needed for
# the test R-squared computed from them afterwards.
preds <- predict(gam.mod, newdata = College[test, ])
# Residual sum of squares on the test rows, based on equation (3.16).
RSS <- with(College[test, ], sum((Outstate - preds)^2))
# Total sum of squares of the test responses around their own mean.
TSS <- with(College[test, ], sum((Outstate - mean(Outstate))^2))
1 - (RSS / TSS)## [1] 0.7716504
"The R squared statistic on the test set is 0.7717."
(d) For which variables, if any, is there evidence of a non-linear relationship with the response?
summary(gam.mod)##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal,
## 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
## data = College, subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6984.27 -1124.28 22.56 1167.87 4563.05
##
## (Dispersion Parameter for gaussian family taken to be 3190985)
##
## Null Deviance: 6121409884 on 387 degrees of freedom
## Residual Deviance: 1151945610 on 361 degrees of freedom
## AIC: 6939.737
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1821727192 1821727192 570.898 < 2.2e-16 ***
## s(Room.Board, 5) 1 1166297468 1166297468 365.498 < 2.2e-16 ***
## s(Terminal, 5) 1 415359777 415359777 130.167 < 2.2e-16 ***
## s(perc.alumni, 5) 1 208936669 208936669 65.477 9.063e-15 ***
## s(Expend, 5) 1 620265596 620265596 194.381 < 2.2e-16 ***
## s(Grad.Rate, 5) 1 101245204 101245204 31.729 3.572e-08 ***
## Residuals 361 1151945610 3190985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 5) 4 2.7108 0.02998 *
## s(Terminal, 5) 4 0.3536 0.84150
## s(perc.alumni, 5) 4 1.9114 0.10794
## s(Expend, 5) 4 13.7969 1.76e-10 ***
## s(Grad.Rate, 5) 4 1.3412 0.25416
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
"The ANOVA for nonparametric effects shows that Expend has a strong non-linear relationship with Outstate (p < 0.001). Room.Board shows moderate evidence of non-linearity (p ≈ 0.03); the remaining smooth terms show no significant non-linear effect."
…