library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.1.3
library(boot)
set.seed(1)
degree <- 10
cv.errs <- rep(NA, degree)
for (i in 1:degree) {
fit <- glm(wage ~ poly(age, i), data = Wage)
cv.errs[i] <- cv.glm(Wage, fit)$delta[1]
}
plot(1:degree, cv.errs, xlab = 'Degree', ylab = 'Test MSE', type = 'l')
deg.min <- which.min(cv.errs)
points(deg.min, cv.errs[deg.min], col = 'green', cex = 2, pch = 19)
As shown the minumun of the test MSE is at the 9th degree. However, the
test MSE of the 4th degree would be considered small enough.
Additionaly,the comparison by ANOVA considers the 4th enough of a
validation.
plot(wage ~ age, data = Wage, col = "darkgrey")
age.range <- range(Wage$age)
age.grid <- seq(from = age.range[1], to = age.range[2])
fit <- lm(wage ~ poly(age, 3), data = Wage)
preds <- predict(fit, newdata = list(age = age.grid))
lines(age.grid, preds, col = "green", lwd = 2)
cv.errs <- rep(NA, degree)
for (i in 2:degree) {
Wage$age.cut <- cut(Wage$age, i)
fit <- glm(wage ~ age.cut, data = Wage)
cv.errs[i] <- cv.glm(Wage, fit)$delta[1]
}
plot(2:degree, cv.errs[-1], xlab = 'Cuts', ylab = 'Test MSE', type = 'l')
deg.min <- which.min(cv.errs)
points(deg.min, cv.errs[deg.min], col = 'red', cex = 2, pch = 19)
plot(wage ~ age, data = Wage, col = "darkgrey")
fit <- glm(wage ~ cut(age, 8), data = Wage)
preds <- predict(fit, list(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)
library(ISLR2)
library(leaps)
## Warning: package 'leaps' was built under R version 4.1.3
train <- sample(1: nrow(College), nrow(College)/2)
test <- -train
fit <- regsubsets(Outstate ~ ., data = College, subset = train, method = 'forward')
fit.summary <- summary(fit)
fit.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " "*" " "
## 8 ( 1 ) "*" " " " " " " " " "*" " "
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " "*" " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " " " "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) "*" " " " "
## 3 ( 1 ) "*" "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
coef(fit, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3815.6574509 2880.3858979 0.9861841 43.6735045 40.4602197
## Expend Grad.Rate
## 0.1770944 30.8363935
library(gam)
## Warning: package 'gam' was built under R version 4.1.3
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.1.3
## Loaded gam 1.22-2
gam.mod <- gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), data = College, subset = train)
par(mfrow = c(2,3))
plot(gam.mod, se = TRUE)
The graphs shown above, display that Grad.Rate and Expend have a
non-linear relationship with outstate.
preds <- predict(gam.mod, College[test, ])
RSS <- sum((College[test, ]$Outstate - preds)^2) # based on equation (3.16)
TSS <- sum((College[test, ]$Outstate - mean(College[test, ]$Outstate)) ^ 2)
1 - (RSS / TSS)
## [1] 0.7649038
The r-squared value is .755
summary(gam.mod)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal,
## 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
## data = College, subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7289.5 -1004.3 18.3 1123.6 4218.8
##
## (Dispersion Parameter for gaussian family taken to be 3138798)
##
## Null Deviance: 6139053909 on 387 degrees of freedom
## Residual Deviance: 1133105994 on 361 degrees of freedom
## AIC: 6933.339
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1658551575 1658551575 528.404 < 2.2e-16 ***
## s(Room.Board, 5) 1 1093958629 1093958629 348.528 < 2.2e-16 ***
## s(Terminal, 5) 1 239592419 239592419 76.332 < 2.2e-16 ***
## s(perc.alumni, 5) 1 189302589 189302589 60.310 8.461e-14 ***
## s(Expend, 5) 1 671008681 671008681 213.779 < 2.2e-16 ***
## s(Grad.Rate, 5) 1 87504239 87504239 27.878 2.236e-07 ***
## Residuals 361 1133105994 3138798
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 5) 4 3.6201 0.006576 **
## s(Terminal, 5) 4 2.3018 0.058243 .
## s(perc.alumni, 5) 4 0.8690 0.482600
## s(Expend, 5) 4 28.0768 < 2.2e-16 ***
## s(Grad.Rate, 5) 4 2.7848 0.026556 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Summary above expands on the results in part b, which displays Grad.Rate and PhD as having a midlevel non-linear relationship with Outstate. This is incontrast to Expend which has a strong non-linear relationship with Outstate.