\(\textbf{Question 6:}\) In this exercise, you will further analyze the Wage data set considered throughout this chapter.
library(ISLR)
library(boot)
set.seed(5)
# LOOCV error for polynomial fits of degree 1 through 5
cv_err <- rep(0, 5)
for (i in 1:5) {
  fit <- glm(wage ~ poly(age, i), data = Wage)
  cv_err[i] <- cv.glm(Wage, fit)$delta[1]
}
cv_err
## [1] 1676.235 1600.529 1595.960 1594.596 1594.879
# plotting the CV errors and marking the optimal degree
plot(1:5, cv_err, xlab = 'Degree', ylab = 'MSE', type = 'l')
points(which.min(cv_err), cv_err[which.min(cv_err)], col = 'blue', pch = 20, cex = 2)
lm_1 <- lm(wage ~ age, data = Wage)
lm_2 <- lm(wage ~ poly(age, 2), data = Wage)
lm_3 <- lm(wage ~ poly(age, 3), data = Wage)
lm_4 <- lm(wage ~ poly(age, 4), data = Wage)
lm_5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(lm_1, lm_2, lm_3, lm_4, lm_5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
\(\textbf{Response:}\) As shown in the plot above, cross-validation selects degree 4, although the CV errors for degrees 3 through 5 are nearly identical. The ANOVA comparisons tell a consistent story: the quadratic and cubic terms are highly significant (p < 2.2e-16 and p = 0.001679), the quartic term is borderline (p = 0.051046), and the quintic term is not significant (p = 0.369682). Both approaches therefore point to a degree-3 or degree-4 polynomial.
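To visualize what was selected, the sketch below plots the degree-4 fit over the raw data with approximate 95% confidence bands; `age_grid`, `poly_fit`, and `poly_pred` are names introduced here for illustration (the degree-3 fit favored by ANOVA looks very similar).
# sketch: degree-4 polynomial fit selected by CV, with rough 95% bands
age_grid <- seq(from = min(Wage$age), to = max(Wage$age))
poly_fit <- lm(wage ~ poly(age, 4), data = Wage)
poly_pred <- predict(poly_fit, data.frame(age = age_grid), se = TRUE)
plot(wage ~ age, data = Wage, col = "grey")
lines(age_grid, poly_pred$fit, col = "blue", lwd = 2)
matlines(age_grid, cbind(poly_pred$fit + 2 * poly_pred$se.fit,
                         poly_pred$fit - 2 * poly_pred$se.fit),
         col = "blue", lty = 2)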
set.seed(5)
# 10-fold CV error for step functions with 2 to 10 cuts
cv_err_2 <- rep(NA, 10)
for (i in 2:10) {
  Wage$age_cut <- cut(Wage$age, i)
  tmp_fit <- glm(wage ~ age_cut, data = Wage)
  cv_err_2[i] <- cv.glm(Wage, tmp_fit, K = 10)$delta[1]
}
cv_err_2
## [1] NA 1733.469 1681.712 1636.621 1630.320 1623.048 1610.884 1600.176
## [9] 1612.748 1603.480
# choosing the optimal number of cuts
plot(2:10, cv_err_2[-1], xlab = 'Number of cuts', ylab = 'MSE', type = 'l')
points(which.min(cv_err_2), cv_err_2[which.min(cv_err_2)], col = 'blue', pch = 20, cex = 2)
\(\textbf{Response:}\) As depicted, the optimal number of cuts is 8. A plot of the resulting fit is shown below.
# making a plot of the fit
age_bounds <- range(Wage$age)
age_bounds_seq <- seq(from = age_bounds[1], to = age_bounds[2])
cut_fit <- lm(wage ~ cut(age, which.min(cv_err_2)), data = Wage)
predictions <- predict(cut_fit, data.frame(age = age_bounds_seq), se = TRUE)
plot(wage ~ age, data = Wage, col = "grey")
lines(age_bounds_seq, predictions$fit, col = "blue", lwd = 1)
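Because `predict()` was called with `se = TRUE` above, approximate 95% confidence bands can be overlaid on this fit with one extra call, as sketched below.
# overlaying rough 95% confidence bands on the step-function fit
matlines(age_bounds_seq,
         cbind(predictions$fit + 2 * predictions$se.fit,
               predictions$fit - 2 * predictions$se.fit),
         col = "blue", lty = 2)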
\(\textbf{Question 10:}\) This question relates to the College data set.
library(leaps)
# splitting the data into a training set (75%) and a test set (25%)
set.seed(1)
sample_data <- sample(1:nrow(College), nrow(College) / 4)
train_data <- College[-sample_data,]
test_data <- College[sample_data,]
# applying forward stepwise selection
fss_fit <- regsubsets(Outstate ~ ., data = train_data, nvmax=17, method = 'forward')
fss_fit_summary <- summary(fss_fit)
fss_fit_summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = train_data, nvmax = 17,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " "*" " " " " " " " "
## 9 ( 1 ) "*" " " "*" " " " " " " "*"
## 10 ( 1 ) "*" "*" "*" " " " " " " "*"
## 11 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 12 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " "
## 9 ( 1 ) " " "*" " " "*" "*" " " " "
## 10 ( 1 ) " " "*" " " "*" "*" " " " "
## 11 ( 1 ) " " "*" " " "*" "*" " " " "
## 12 ( 1 ) " " "*" " " "*" "*" "*" " "
## 13 ( 1 ) " " "*" " " "*" "*" "*" " "
## 14 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 15 ( 1 ) "*" "*" " " "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
par(mfrow=c(1, 3))
# Cp
plot(fss_fit_summary$cp, xlab="Variables", ylab="Cp", type='l')
abline(h = min(fss_fit_summary$cp) + (sd(fss_fit_summary$cp) / sqrt(length(fss_fit_summary$cp))), col="blue", lty=2)
# BIC
plot(fss_fit_summary$bic, xlab="Variables", ylab="BIC", type='l')
abline(h = min(fss_fit_summary$bic) + (sd(fss_fit_summary$bic) / sqrt(length(fss_fit_summary$bic))), col="blue", lty=2)
# Adjusted R2
plot(fss_fit_summary$adjr2, xlab="Variables", ylab="Adjusted R2", type='l')
abline(h = max(fss_fit_summary$adjr2) - (sd(fss_fit_summary$adjr2) / sqrt(length(fss_fit_summary$adjr2))), col="blue", lty=2)
\(\textbf{Response:}\) In the plots above, the blue dashed line marks one standard error from the best value of each metric. Because the 5-variable subset was borderline on each criterion, I selected the 6-variable subset as the simplest model comfortably within one standard error; its coefficients are given below.
coef(fss_fit, 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3581.4787977 2775.6393977 0.9777427 35.1291352 40.8915058
## Expend Grad.Rate
## 0.2565909 28.1092855
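The subset-size choice can also be made programmatically. The sketch below reuses the same ad hoc standard-error estimate as the plots (`se_of` is a helper introduced here) and reports the smallest size within one standard error of the best value for each metric.
# smallest subset size within 1 SE of the best value of each metric
se_of <- function(x) sd(x) / sqrt(length(x))
min(which(fss_fit_summary$cp <= min(fss_fit_summary$cp) + se_of(fss_fit_summary$cp)))
min(which(fss_fit_summary$bic <= min(fss_fit_summary$bic) + se_of(fss_fit_summary$bic)))
min(which(fss_fit_summary$adjr2 >= max(fss_fit_summary$adjr2) - se_of(fss_fit_summary$adjr2)))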
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
gam_fit <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) +
                 s(perc.alumni, 2) + s(Expend, 2) + s(Grad.Rate, 2),
               data = train_data)
par(mfrow = c(2,3))
plot(gam_fit, se = TRUE, col = 'blue')
\(\textbf{Response:}\) Of these predictors, Expend stands out as clearly nonlinear. For the remaining predictors, the response tends to increase as the predictor does (holding the other predictors fixed and bearing the confidence bands in mind).
predictions_gam <- predict(gam_fit, test_data)
print("RMSE:")
## [1] "RMSE:"
sqrt(mean((test_data$Outstate - predictions_gam)^2))
## [1] 2058.699
print("R^2:")
## [1] "R^2:"
1 - (sum((test_data$Outstate - predictions_gam)^2) / sum((test_data$Outstate - mean(test_data$Outstate))^2))
## [1] 0.770429
\(\textbf{Response:}\) As shown above, evaluating the fit with the six selected predictors on the test set gives an RMSE of 2058.699 and an R^2 of 0.770429.
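As a point of comparison (not part of the original output), the sketch below evaluates an ordinary least squares fit with the same six predictors on the same test set; `lm_fit` and `predictions_lm` are names introduced here. A noticeably higher linear RMSE would indicate the smooth terms are earning their keep.
# baseline: OLS with the same six predictors, evaluated on the test set
lm_fit <- lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend + Grad.Rate,
             data = train_data)
predictions_lm <- predict(lm_fit, test_data)
sqrt(mean((test_data$Outstate - predictions_lm)^2))  # test RMSE of the linear fit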
summary(gam_fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 2) + s(PhD,
## 2) + s(perc.alumni, 2) + s(Expend, 2) + s(Grad.Rate, 2),
## data = train_data)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7407.55 -1216.76 46.97 1225.72 5139.92
##
## (Dispersion Parameter for gaussian family taken to be 3423150)
##
## Null Deviance: 8975856426 on 582 degrees of freedom
## Residual Deviance: 1954619072 on 571.0001 degrees of freedom
## AIC: 10440.22
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2569915155 2569915155 750.746 < 2.2e-16 ***
## s(Room.Board, 2) 1 1946035728 1946035728 568.493 < 2.2e-16 ***
## s(PhD, 2) 1 820663818 820663818 239.739 < 2.2e-16 ***
## s(perc.alumni, 2) 1 362041296 362041296 105.763 < 2.2e-16 ***
## s(Expend, 2) 1 595593815 595593815 173.990 < 2.2e-16 ***
## s(Grad.Rate, 2) 1 85667934 85667934 25.026 7.541e-07 ***
## Residuals 571 1954619072 3423150
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 2) 1 2.190 0.13944
## s(PhD, 2) 1 4.125 0.04272 *
## s(perc.alumni, 2) 1 1.738 0.18786
## s(Expend, 2) 1 50.341 3.843e-12 ***
## s(Grad.Rate, 2) 1 1.777 0.18306
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
\(\textbf{Response:}\) The summary output above indicates a strong nonlinear relationship between Outstate and Expend (the nonparametric effect has a p-value of 3.843e-12), supporting the observation made from the plots in part (b).
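One way to double-check this conclusion (a sketch, not run in the original) is a direct F-test between the fitted GAM and an otherwise identical GAM in which Expend enters linearly; `gam_linear_expend` is a name introduced here.
# same model as gam_fit, but with a linear term for Expend
gam_linear_expend <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) +
                           s(perc.alumni, 2) + Expend + s(Grad.Rate, 2),
                         data = train_data)
anova(gam_linear_expend, gam_fit, test = "F")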