6. In this exercise, you will further analyze the Wage data set considered throughout this chapter.
Answer: Ten-fold cross-validation shows that d = 4 is the optimal polynomial degree, since the estimated test MSE is minimized there. We then use ANOVA to test the null hypothesis that a simpler model M1 is sufficient to explain the data against the alternative hypothesis that a more complex model M2 is required. Examining the p-values, a cubic or quartic polynomial appears to provide a reasonable fit: the quartic term is borderline significant (p ≈ 0.051) and the quintic term is not.
library(ISLR2)
library(boot)   # provides cv.glm() for k-fold cross-validation
attach(Wage)
set.seed(15)
cv.error <- rep(0, 5)
# 10-fold CV error for polynomial fits of degree 1 through 5
for (i in 1:5) {
  glm.fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.error[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
cv.error
## [1] 1676.192 1599.786 1596.256 1593.755 1595.106
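The minimizing degree can also be read off programmatically rather than by eye:
which.min(cv.error)   # 4, matching the answer above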
plot(cv.error, type = "b", xlab = "Degree", ylab = "Test MSE")
# highlight the degree with the lowest CV error
points(which.min(cv.error), min(cv.error), col = "red", pch = 20, cex = 2)
# nested polynomial fits of degree 1 through 5 for the ANOVA comparison
fit_1 <- lm(wage ~ age, data = Wage)
fit_2 <- lm(wage ~ poly(age, 2), data = Wage)
fit_3 <- lm(wage ~ poly(age, 3), data = Wage)
fit_4 <- lm(wage ~ poly(age, 4), data = Wage)
fit_5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit_1, fit_2, fit_3, fit_4, fit_5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
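Since poly() generates orthogonal polynomials, the same conclusions can be read directly off the coefficient table of the largest fit: the square of each t-statistic equals the corresponding F-statistic in the ANOVA table above (e.g. the degree-2 t-statistic squared is roughly 143.6). A quick check, output omitted:
coef(summary(fit_5))   # squared t-statistics reproduce the F column above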
age_lim <- range(age)
age_grid <- seq(from = age_lim[1], to = age_lim[2])
preds <- predict(fit_4, newdata = list(age = age_grid), se = TRUE)
# approximate 95% confidence bands: fit +/- 2 standard errors
se_bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
plot(age, wage, xlim = age_lim, cex = .5, col = "darkgrey")
lines(age_grid, preds$fit, lwd = 2, col = "blue")
matlines(age_grid, se_bands, lwd = 1, col = "blue", lty = 3)
Answer: The cross-validation plot below shows that the test MSE is minimized at 8 cuts. We therefore fit a step function with 8 cuts to the full data set and plot the fit.
par(bg = 'grey')
HW6_2 <- rep(NA, 10)
# 10-fold CV error for step functions with 2 through 10 cuts
for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)
  fit <- glm(wage ~ age.cut, data = Wage)
  HW6_2[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
plot(2:10, HW6_2[-1], xlab = "Cuts", ylab = "Test MSE", type = "l")
d.min <- which.min(HW6_2)   # number of cuts minimizing the CV error (8 here)
points(d.min, HW6_2[d.min], col = "yellow", cex = 2, pch = 15)
par(bg = 'white')
plot(wage ~ age, data = Wage, col = "grey50")
HW6_3 <- range(Wage$age)
age.grid <- seq(from = HW6_3[1], to = HW6_3[2])
fit <- glm(wage ~ cut(age, 8), data = Wage)
preds <- predict(fit, data.frame(age = age.grid))
lines(age.grid, preds, col = "tan4", lwd = 2)
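As a quick inspection (not shown in the original output), tabulating the cut factor reveals where the eight interval boundaries fall and how many observations land in each bin:
table(cut(Wage$age, 8))   # interval boundaries and bin counts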
10. This question relates to the College data set.
Answer: The plots below show, for Cp, BIC, and adjusted R2, that size 6 is the smallest subset whose score falls within 0.2 standard deviations of the optimum.
library(leaps)
set.seed(1)
attach(College)
# split the observations 50/50 into training and test sets
train <- sample(length(Outstate), length(Outstate) / 2)
test <- -train
College.train <- College[train, ]
College.test <- College[test, ]
# forward stepwise selection on the training set
fit <- regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")
fit.summary <- summary(fit)
par(mfrow = c(1, 3),bg = "slategrey")
plot(fit.summary$cp, xlab = "Number of variables", ylab = "Cp", type = "l")
min.cp <- min(fit.summary$cp)
std.cp <- sd(fit.summary$cp)
abline(h = min.cp + 0.2 * std.cp, col = "steelblue1", lty = 2)
abline(h = min.cp - 0.2 * std.cp, col = "steelblue1", lty = 2)
plot(fit.summary$bic, xlab = "Number of variables", ylab = "BIC", type='l')
min.bic <- min(fit.summary$bic)
std.bic <- sd(fit.summary$bic)
abline(h = min.bic + 0.2 * std.bic, col = "steelblue1", lty = 2)
abline(h = min.bic - 0.2 * std.bic, col = "steelblue1", lty = 2)
plot(fit.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R2", type = "l", ylim = c(0.4, 0.84))
max.adjr2 <- max(fit.summary$adjr2)
std.adjr2 <- sd(fit.summary$adjr2)
abline(h = max.adjr2 + 0.2 * std.adjr2, col = "steelblue1", lty = 2)
abline(h = max.adjr2 - 0.2 * std.adjr2, col = "steelblue1", lty = 2)
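The rule the plots illustrate can also be stated programmatically; the sketch below flags the subset sizes whose Cp, BIC, and adjusted R2 all fall within 0.2 standard deviations of their optima (per the plots above, the smallest such size is 6):
# smallest size scoring within 0.2 sd of the optimum on all three criteria
ok <- fit.summary$cp <= min(fit.summary$cp) + 0.2 * sd(fit.summary$cp) &
  fit.summary$bic <= min(fit.summary$bic) + 0.2 * sd(fit.summary$bic) &
  fit.summary$adjr2 >= max(fit.summary$adjr2) - 0.2 * sd(fit.summary$adjr2)
min(which(ok))   # 6, per the plots above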
# refit forward selection on the full data and extract the 6-variable model
fit <- regsubsets(Outstate ~ ., data = College, method = "forward")
coeffs <- coef(fit, id = 6)
names(coeffs)
## [1] "(Intercept)" "PrivateYes" "Room.Board" "PhD" "perc.alumni"
## [6] "Expend" "Grad.Rate"
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20.2
# GAM on the six selected predictors: smoothing splines for the quantitative
# variables, with extra flexibility (df = 5) for Expend
fit <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) +
  s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2),
  data = College.train)
par(mfrow = c(2, 3), bg = "grey")
plot(fit, se = TRUE, col = "ghostwhite")
preds <- predict(fit, College.test)
error <- mean((College.test$Outstate - preds)^2)   # test MSE of the GAM
error
## [1] 3349290
ts <- mean((College.test$Outstate - mean(College.test$Outstate))^2)   # baseline MSE
rs <- 1 - error / ts   # test-set R^2
rs
## [1] 0.7660016
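As a sanity check (not part of the original output), one can compare the GAM against an ordinary least squares fit on the same six predictors and the same split; if the smooth terms capture real nonlinearity, the linear model's test MSE should be noticeably higher:
# hedged comparison sketch: OLS on the same predictors and train/test split
lm.fit <- lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend + Grad.Rate,
  data = College.train)
lm.preds <- predict(lm.fit, College.test)
mean((College.test$Outstate - lm.preds)^2)   # compare with the GAM error above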
Answer: The ANOVA for nonparametric effects below shows strong evidence of a nonlinear relationship between “Outstate” and “Expend” (p < 0.001). At the 0.05 level, none of the other smooth terms, including “PhD” and “Grad.Rate”, shows significant evidence of nonlinearity.
summary(fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD,
## df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate,
## df = 2), data = College.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7402.89 -1114.45 -12.67 1282.69 7470.60
##
## (Dispersion Parameter for gaussian family taken to be 3711182)
##
## Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1384271126 on 373 degrees of freedom
## AIC: 6987.021
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1778718277 1778718277 479.286 < 2.2e-16 ***
## s(Room.Board, df = 2) 1 1577115244 1577115244 424.963 < 2.2e-16 ***
## s(PhD, df = 2) 1 322431195 322431195 86.881 < 2.2e-16 ***
## s(perc.alumni, df = 2) 1 336869281 336869281 90.771 < 2.2e-16 ***
## s(Expend, df = 5) 1 530538753 530538753 142.957 < 2.2e-16 ***
## s(Grad.Rate, df = 2) 1 86504998 86504998 23.309 2.016e-06 ***
## Residuals 373 1384271126 3711182
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 2) 1 1.9157 0.1672
## s(PhD, df = 2) 1 0.9699 0.3253
## s(perc.alumni, df = 2) 1 0.1859 0.6666
## s(Expend, df = 5) 4 20.5075 2.665e-15 ***
## s(Grad.Rate, df = 2) 1 0.5702 0.4506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1