library(ISLR2)
library(boot)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
library(leaps)
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-2
a. Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
attach(Wage)
set.seed(99)
err <- rep(0,5)
for (i in 1:5){
glm.fit <- glm(wage ~ poly(age,i),data=Wage)
err[i]<- cv.glm(Wage,glm.fit,K=10)$delta[1]
}
plot(err, type="b", xlab="Degree", ylab="Test MSE")
points(which.min(err), err[4], col="red", pch=20, cex=2)
Through cross-validation, the degree with min error is shown to be the 4th degree. To compare, we now run the ANOVA test:
fit.1 <- lm(wage ~ age , data = Wage)
fit.2 <- lm(wage ~ poly(age , 2), data = Wage)
fit.3 <- lm(wage ~ poly(age , 3), data = Wage)
fit.4 <- lm(wage ~ poly(age , 4), data = Wage)
fit.5 <- lm(wage ~ poly(age , 5), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-values of the different degrees indicate that a quadratic or cubic degrees provide a good fit for the data.
fit <- lm(wage ~ poly(age , 3), data = Wage)
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims [2])
preds <- predict(fit , newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit ,preds$fit - 2 * preds$se.fit)
plot(age , wage , xlim = agelims , cex = .5, col = "darkgrey")
lines(age.grid, preds$fit , lwd = 2, col = "blue")
matlines(age.grid , se.bands, lwd = 1, col = "blue", lty = 3)
b. Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
set.seed(99)
err <- rep(NA, 10)
for(i in 2:10){
Wage$age.cut <- cut(Wage$age,i)
glm.fit <- glm(wage ~ age.cut, data=Wage)
err[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]}
plot(2:10, err[-1], type="b")
points(which.min(err), err[which.min(err)], col="red", pch=20, cex=2)
The number of cuts indicated by cross-validation is 8, which we use to fit the regression:
step <- glm(wage ~ cut(age, 8), data=Wage)
preds <- predict(step, data.frame(age = age.grid))
plot(age, wage, col="black")
lines(age.grid, preds, col="red")
detach(Wage)
This question relates to the College data set.
attach(College)set.seed(99)
train <- createDataPartition(College$Outstate, p = 0.7, list = FALSE)
training <- College[train,]
testing <- College[-train,]
fit_fwd <- regsubsets(Outstate ~ ., data= training, nvmax=17, method="forward")
summary <- summary(fit_fwd)
par(mfrow = c(1, 3))
plot(summary$cp, type = "l")
min_cp <- min(summary$cp)
sd_cp <- sd(summary$cp)
plot(summary$bic, type = "l")
min_bic <- min(summary$bic)
sd_bic <- sd(summary$bic)
plot(summary$adjr2, type = "l", ylim = c(0.45,0.8))
max_adj = max(summary$adjr2)
sd_adj = sd(summary$adjr2)
which.min(summary$cp)
## [1] 13
which.min(summary$bic)
## [1] 12
which.max(summary$adjr2)
## [1] 13
The best subsets include around 13 and 12 predictors, however in the BIC plot we can observe that a local minimum is achieved around the model with 6 predictors. For CP and R2, beyond 6 predictors there is not a substantial improvement for the models, therefore we can choose a model with 6 predictors, with coefficients:
coef(fit_fwd, id=6)
## (Intercept) PrivateYes Room.Board Terminal perc.alumni
## -4184.6616355 2683.7986714 1.0021114 42.7394905 50.2026915
## Expend Grad.Rate
## 0.2017187 26.6202852
b. Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
gam_train <- gam(Outstate ~ Private+s(Expend,df= 3)+s(Room.Board,df=3)+s(Terminal,df=3)+s(perc.alumni,df=3)+s(Grad.Rate,df=3), data=training)
par(mfrow = c(2, 3))
plot(gam_train, se = T, col = "red")
There appears to be a somewhat linear behavior for Room.Board and perc.alumni.
c. Evaluate the model obtained on the test set, and explain the results obtained.
pred <- predict(gam_train, testing)
err <- mean((testing$Outstate - pred)^2)
err
## [1] 3377651
tss <- mean((testing$Outstate - mean(testing$Outstate))^2)
rss <- 1 - err / tss
rss
## [1] 0.7871247
For this model there is an associated R squared value of 0.79.
d. For which variables, if any, is there evidence of a non-linear relationship with the response?
summary(gam_train)
##
## Call: gam(formula = Outstate ~ Private + s(Expend, df = 3) + s(Room.Board,
## df = 3) + s(Terminal, df = 3) + s(perc.alumni, df = 3) +
## s(Grad.Rate, df = 3), data = training)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7262 -1111 95 1350 5089
##
## (Dispersion Parameter for gaussian family taken to be 3589533)
##
## Null Deviance: 8892099442 on 545 degrees of freedom
## Residual Deviance: 1898862849 on 529 degrees of freedom
## AIC: 9809.279
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2192049040 2192049040 610.678 < 2.2e-16 ***
## s(Expend, df = 3) 1 2827261955 2827261955 787.641 < 2.2e-16 ***
## s(Room.Board, df = 3) 1 475244621 475244621 132.397 < 2.2e-16 ***
## s(Terminal, df = 3) 1 110352804 110352804 30.743 4.657e-08 ***
## s(perc.alumni, df = 3) 1 148971129 148971129 41.502 2.651e-10 ***
## s(Grad.Rate, df = 3) 1 87736351 87736351 24.442 1.030e-06 ***
## Residuals 529 1898862849 3589533
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Expend, df = 3) 2 43.409 < 2e-16 ***
## s(Room.Board, df = 3) 2 3.567 0.02892 *
## s(Terminal, df = 3) 2 2.069 0.12737
## s(perc.alumni, df = 3) 2 1.258 0.28512
## s(Grad.Rate, df = 3) 2 2.533 0.08034 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ANOVA’s nonparametric p-value indicates that there is a non-linear relationship between Outstate and Expend (2e-16) and Outstate and Room Board (0.0289) in a lesser degree.
detach(College)