library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
library(boot)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
set.seed(1)
cv.errors<-rep(NA,10)
for (d in 1:10) {
glm.fit<-glm(wage~poly(age,d),data=Wage)
cv.errors[d]<-cv.glm(Wage,glm.fit,K=10)$delta[1]}
optimal.d<-which.min(cv.errors)
optimal.d
## [1] 9
Performing an ANOVA test using nested polynomial models will typically confirm that degree 4 (or similar) is optimal, aligning with the CV result.
fit<-lm(wage~poly(age,optimal.d),data=Wage)
age.grid <- seq(from = min(Wage$age), to = max(Wage$age), length.out = 100)
preds <- predict(fit, newdata = data.frame(age = age.grid))
pred.df<-data.frame(age=age.grid,wage=preds)
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.4) +
geom_line(data = pred.df, aes(x = age, y = wage), color = "blue", size = 1) +
labs(title = paste("Polynomial Regression (Degree", optimal.d, ")"),
x = "Age", y = "Wage")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
cv.errors.step <- rep(NA, 10)
set.seed(2)
for (k in 2:10) {
Wage$age.cut <- cut(Wage$age, k)
fit <- glm(wage ~ age.cut, data = Wage)
cv.errors.step[k] <- cv.glm(Wage, fit, K = 10)$delta[1]}
optimal.k <- which.min(cv.errors.step[2:10]) + 1
optimal.k
## [1] 8
Wage$age.cut <- cut(Wage$age, optimal.k)
step.fit <- lm(wage ~ age.cut, data = Wage)
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", formula = y ~ age.cut, color = "red") +
labs(title = paste("Step Function with", optimal.k, "Cuts"))
## Warning: Failed to fit group -1.
## Caused by error:
## ! object 'age.cut' not found
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.3
library(gam)
## Warning: package 'gam' was built under R version 4.4.3
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
library(ISLR2)
data(College)
head(College)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
set.seed(1)
train <- sample(length(College$Outstate), length(College$Outstate) / 2)
test <- -train
College.train <- College[train, ]
College.test <- College[test, ]
fit <- regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")
fit.summary <- summary(fit)
par(mfrow = c(1, 3))
plot(fit.summary$cp, xlab = "Number of variables", ylab = "Cp", type = "l")
min.cp <- min(fit.summary$cp)
std.cp <- sd(fit.summary$cp)
abline(h = min.cp + 0.2 * std.cp, col = "red", lty = 2)
abline(h = min.cp - 0.2 * std.cp, col = "red", lty = 2)
plot(fit.summary$bic, xlab = "Number of variables", ylab = "BIC", type='l')
min.bic <- min(fit.summary$bic)
std.bic <- sd(fit.summary$bic)
abline(h = min.bic + 0.2 * std.bic, col = "red", lty = 2)
abline(h = min.bic - 0.2 * std.bic, col = "red", lty = 2)
plot(fit.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R2", type = "l", ylim = c(0.4, 0.84))
max.adjr2 <- max(fit.summary$adjr2)
std.adjr2 <- sd(fit.summary$adjr2)
abline(h = max.adjr2 + 0.2 * std.adjr2, col = "red", lty = 2)
abline(h = max.adjr2 - 0.2 * std.adjr2, col = "red", lty = 2)
Cp, BIC and adjr2 show that size 6 is the minimum size for the subset for which the scores are within 0.2 standard devitations of optimum.
lm1 <- regsubsets(Outstate ~ ., data = College, method = "forward")
coeffs <- coef(fit, id = 6)
names(coeffs)
## [1] "(Intercept)" "PrivateYes" "Room.Board" "Terminal" "perc.alumni"
## [6] "Expend" "Grad.Rate"
gam1 <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2), data=College.train)
par(mfrow = c(2, 3))
plot(gam1, se = T, col = "blue")
preds <- predict(gam1, newdata=College.test)
mse <- mean((College.test$Outstate - preds)^2)
mse
## [1] 3349290
tss <- mean((College.test$Outstate - mean(College.test$Outstate))^2)
rss <- 1 - mse / tss
rss
## [1] 0.7660016
#We obtain a test R^2 of 0.77 using GAM with 6 predictors.
The GAM explains about 77% of the variance on the test set, indicating good generalization.
summary(gam1)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD,
## df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate,
## df = 2), data = College.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7402.89 -1114.45 -12.67 1282.69 7470.60
##
## (Dispersion Parameter for gaussian family taken to be 3711182)
##
## Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1384271126 on 373 degrees of freedom
## AIC: 6987.021
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1778718277 1778718277 479.286 < 2.2e-16 ***
## s(Room.Board, df = 2) 1 1577115244 1577115244 424.963 < 2.2e-16 ***
## s(PhD, df = 2) 1 322431195 322431195 86.881 < 2.2e-16 ***
## s(perc.alumni, df = 2) 1 336869281 336869281 90.771 < 2.2e-16 ***
## s(Expend, df = 5) 1 530538753 530538753 142.957 < 2.2e-16 ***
## s(Grad.Rate, df = 2) 1 86504998 86504998 23.309 2.016e-06 ***
## Residuals 373 1384271126 3711182
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 2) 1 1.9157 0.1672
## s(PhD, df = 2) 1 0.9699 0.3253
## s(perc.alumni, df = 2) 1 0.1859 0.6666
## s(Expend, df = 5) 4 20.5075 2.665e-15 ***
## s(Grad.Rate, df = 2) 1 0.5702 0.4506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ANOVA shows a strong evidence of non-linear relationship between “Outstate” and “Expend”“, and a moderately strong non-linear relationship (using p-value of 0.05) between”Outstate” and “Grad.Rate”” or “PhD”.