library(ISLR)
library(boot)
library(ggplot2)
library(splines)
set.seed(1)
# LOOCV error for polynomial fits of degree 1 through 5
cv.errors <- rep(NA, 5)
for (d in 1:5) {
  glm.fit <- glm(wage ~ poly(age, d), data = Wage)
  cv.errors[d] <- cv.glm(Wage, glm.fit)$delta[1]
}
optimal.degree <- which.min(cv.errors)
print(optimal.degree)
## [1] 4
plot(1:5, cv.errors, type="b", xlab="Degree", ylab="CV Error")
points(optimal.degree, cv.errors[optimal.degree], col="red", pch=19)
fit.1 <- lm(wage ~ poly(age, 1), data=Wage)
fit.2 <- lm(wage ~ poly(age, 2), data=Wage)
fit.3 <- lm(wage ~ poly(age, 3), data=Wage)
fit.4 <- lm(wage ~ poly(age, 4), data=Wage)
fit.5 <- lm(wage ~ poly(age, 5), data=Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
age.grid <- seq(min(Wage$age), max(Wage$age), length = 100)
preds <- predict(glm(wage ~ poly(age, optimal.degree), data = Wage), newdata = list(age = age.grid))
plot(Wage$age, Wage$wage, col="gray", xlab="Age", ylab="Wage")
lines(age.grid, preds, col="blue", lwd=2)
title(paste("Polynomial Degree:", optimal.degree))
Cross-validation selected degree 4 as the polynomial with the smallest estimated prediction error. The ANOVA comparison tells a consistent story: the quadratic and cubic terms are clearly significant, the quartic term is borderline (p ≈ 0.051), and the quintic term adds nothing. The fitted curve confirms that a nonlinear model is appropriate and that higher-degree polynomials bring diminishing returns.
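As an optional visual check (a sketch added here, not part of the original output), we can overlay the degree-3 and degree-4 fits to see how much the borderline quartic term actually changes the curve:

# Overlay the degree-3 (dashed red) and degree-4 (solid blue) fits
preds.3 <- predict(fit.3, newdata = list(age = age.grid))
preds.4 <- predict(fit.4, newdata = list(age = age.grid))
plot(Wage$age, Wage$wage, col = "gray", xlab = "Age", ylab = "Wage")
lines(age.grid, preds.3, col = "red", lwd = 2, lty = 2)
lines(age.grid, preds.4, col = "blue", lwd = 2)
legend("topright", c("Degree 3", "Degree 4"), col = c("red", "blue"),
       lty = c(2, 1), lwd = 2)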
set.seed(1)
# LOOCV error for step functions with 2 through 6 age bins
cv.errors <- rep(NA, 5)
for (k in 2:6) {
  Wage$age.cut <- cut(Wage$age, k)
  glm.fit <- glm(wage ~ age.cut, data = Wage)
  cv.errors[k - 1] <- cv.glm(Wage, glm.fit)$delta[1]
}
optimal.cuts <- which.min(cv.errors) + 1
print(optimal.cuts)
## [1] 6
plot(2:6, cv.errors, type="b", xlab="Number of Cuts", ylab="CV Error")
points(optimal.cuts, cv.errors[optimal.cuts - 1], col="red", pch=19)
Wage$age.cut <- cut(Wage$age, optimal.cuts)
fit <- lm(wage ~ age.cut, data=Wage)
# Fitted values equal the mean wage within each age bin
preds <- predict(fit)
# Plot actual vs. predicted
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(color = "gray", alpha = 0.5) +
  geom_step(aes(x = age, y = preds), color = "blue", linewidth = 1.2) +
  labs(title = paste("Step Function Fit with", optimal.cuts, "Cuts"),
       x = "Age", y = "Wage")
Cross-validation selected six cuts. The step-function fit shows wages rising with age through mid-career, then slowly declining as workers approach retirement age.
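For a quick numeric view of the same pattern (a sketch; the original analysis stops at the plot), the step heights are simply the within-bin mean wages:

# Mean wage per age bin; for an intercept-plus-factor model these
# equal the fitted step heights in the plot above
round(tapply(Wage$wage, Wage$age.cut, mean), 1)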
library(ISLR)
library(leaps)
library(dplyr)
library(gam)
set.seed(1)
data(College)
# Number of observations
n <- nrow(College)
# Sample 70% for training
train_index <- sample(1:n, size = floor(0.7 * n))
# Create training and test sets
train.data <- College[train_index, ]
test.data <- College[-train_index, ]
# Forward selection with up to all predictors
fwd.fit <- regsubsets(Outstate ~ ., data = train.data, method = "forward", nvmax = ncol(College) - 1)
fwd.summary <- summary(fwd.fit)
best.size <- which.max(fwd.summary$adjr2)
best.size
## [1] 13
# Plot adjusted R-squared
plot(fwd.summary$adjr2, type = "b", xlab = "Number of Predictors", ylab = "Adjusted R²")
points(best.size, fwd.summary$adjr2[best.size], col = "red", pch = 19)
# Coefficients of best model
best.model.vars <- coef(fwd.fit, best.size)
print(best.model.vars)
## (Intercept) PrivateYes Apps Accept Top10perc
## -1739.5725417 2276.7996721 -0.3358567 0.7814587 28.9687655
## F.Undergrad Room.Board Personal PhD Terminal
## -0.1559550 0.9134134 -0.3484815 11.8113175 24.9233138
## S.F.Ratio perc.alumni Expend Grad.Rate
## -55.0649149 48.6046652 0.1744677 20.9498491
print(names(best.model.vars)[-1])
## [1] "PrivateYes" "Apps" "Accept" "Top10perc" "F.Undergrad"
## [6] "Room.Board" "Personal" "PhD" "Terminal" "S.F.Ratio"
## [11] "perc.alumni" "Expend" "Grad.Rate"
# regsubsets reports the dummy "PrivateYes"; the underlying column is "Private"
selected_vars <- c("Private", "Apps", "Accept", "Top10perc", "F.Undergrad", "Room.Board", "Personal", "PhD", "Terminal", "S.F.Ratio", "perc.alumni", "Expend", "Grad.Rate")
gam.formula <- as.formula(
  paste("Outstate ~ Private +",
        paste(paste0("s(", selected_vars[-1], ")"), collapse = " + "))
)
gam.fit <- gam(gam.formula, data = train.data)
# Plot smoothing functions with confidence intervals
par(mfrow = c(3, 4)) # Panel layout for the term plots
plot(gam.fit, se = TRUE, col = "blue")
The smooth for Accept is nearly linear and positive. Top10perc, Room.Board, S.F.Ratio, perc.alumni, and Expend show positively curved fits, while Apps and F.Undergrad are negatively curved. Personal, PhD, and Terminal have wigglier, flatter curves.
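To examine one of these effects more closely, here is a minimal sketch (my addition, using ns() from the splines package loaded earlier as a stand-in for the GAM smooth, with df = 4 chosen arbitrarily) that refits Outstate on Expend alone, one of the positively curved effects noted above:

# Univariate natural-spline fit of Outstate on Expend
expend.grid <- seq(min(train.data$Expend), max(train.data$Expend), length = 100)
ns.fit <- lm(Outstate ~ ns(Expend, df = 4), data = train.data)
plot(train.data$Expend, train.data$Outstate, col = "gray",
     xlab = "Expend", ylab = "Outstate")
lines(expend.grid, predict(ns.fit, newdata = data.frame(Expend = expend.grid)),
      col = "blue", lwd = 2)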
gam.pred <- predict(gam.fit, newdata = test.data)
gam.mse <- mean((test.data$Outstate - gam.pred)^2)
cat("Test MSE of GAM:", gam.mse, "\n")
## Test MSE of GAM: 3146155
# Predict using linear model
lm.fit <- lm(Outstate ~ Private + Apps + Accept + Top10perc + F.Undergrad +
Room.Board + Personal + PhD + Terminal + S.F.Ratio +
perc.alumni + Expend + Grad.Rate,
data = train.data)
lm.pred <- predict(lm.fit, newdata = test.data)
lm.mse <- mean((test.data$Outstate - lm.pred)^2)
cat("Linear Model Test MSE:", lm.mse, "\n")
## Linear Model Test MSE: 3421127
cat("GAM Test MSE:", gam.mse, "\n")
## GAM Test MSE: 3146155
The GAM's test MSE is about 3.15 million, the average squared difference between predicted and actual out-of-state tuition on the held-out data. This is lower than the linear model's test MSE of about 3.42 million, so the GAM made more accurate predictions on the test set; the nonlinear relationships it captures add predictive power.
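To put the two MSEs on a more interpretable scale (a sketch added here, not in the original analysis), we can convert them to test-set R², the share of held-out variance in Outstate each model explains:

# Test-set R^2 for both models, relative to the test-set mean
tss <- mean((test.data$Outstate - mean(test.data$Outstate))^2)
cat("GAM test R^2:", round(1 - gam.mse / tss, 3), "\n")
cat("LM test R^2:", round(1 - lm.mse / tss, 3), "\n")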
plot(gam.fit, se = TRUE, col = "blue")
summary(gam.fit)
##
## Call: gam(formula = gam.formula, data = train.data)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5910.22 -1088.99 73.01 1113.20 7156.97
##
## (Dispersion Parameter for gaussian family taken to be 3195196)
##
## Null Deviance: 9260683704 on 542 degrees of freedom
## Residual Deviance: 1575229760 on 492.9994 degrees of freedom
## AIC: 9723.111
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2531384624 2531384624 792.2471 < 2.2e-16 ***
## s(Apps) 1 920829741 920829741 288.1920 < 2.2e-16 ***
## s(Accept) 1 111647663 111647663 34.9424 6.346e-09 ***
## s(Top10perc) 1 1058267443 1058267443 331.2058 < 2.2e-16 ***
## s(F.Undergrad) 1 273900647 273900647 85.7226 < 2.2e-16 ***
## s(Room.Board) 1 513712100 513712100 160.7764 < 2.2e-16 ***
## s(Personal) 1 54852808 54852808 17.1673 4.028e-05 ***
## s(PhD) 1 61155825 61155825 19.1399 1.483e-05 ***
## s(Terminal) 1 22024889 22024889 6.8931 0.0089216 **
## s(S.F.Ratio) 1 89787883 89787883 28.1009 1.741e-07 ***
## s(perc.alumni) 1 139078026 139078026 43.5272 1.081e-10 ***
## s(Expend) 1 468470487 468470487 146.6171 < 2.2e-16 ***
## s(Grad.Rate) 1 37708950 37708950 11.8018 0.0006417 ***
## Residuals 493 1575229760 3195196
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Apps) 3 2.1126 0.097725 .
## s(Accept) 3 7.7040 4.868e-05 ***
## s(Top10perc) 3 1.9302 0.123747
## s(F.Undergrad) 3 2.0602 0.104615
## s(Room.Board) 3 1.8472 0.137676
## s(Personal) 3 2.7447 0.042544 *
## s(PhD) 3 1.9813 0.115856
## s(Terminal) 3 1.8216 0.142254
## s(S.F.Ratio) 3 4.8727 0.002386 **
## s(perc.alumni) 3 1.3279 0.264519
## s(Expend) 3 25.8833 1.443e-15 ***
## s(Grad.Rate) 3 1.3980 0.242634
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In the parametric ANOVA, every selected term, including Private, is statistically significant. The nonparametric ANOVA tests specifically for departures from linearity: Accept, Personal, S.F.Ratio, and Expend have p-values below 0.05, giving evidence of nonlinear relationships with out-of-state tuition, while the remaining smooths show no strong evidence of nonlinearity and could reasonably enter the model linearly. Private, a binary factor, is linear by construction.
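A natural follow-up (hypothetical, not required by the exercise) is to keep smooths only for the four terms the nonparametric ANOVA flagged and enter the remaining predictors linearly, then check whether the simpler model predicts comparably well:

# Reduced GAM: smooth only the terms with significant nonlinearity
gam.reduced <- gam(Outstate ~ Private + Apps + s(Accept) + Top10perc +
                     F.Undergrad + Room.Board + s(Personal) + PhD + Terminal +
                     s(S.F.Ratio) + perc.alumni + s(Expend) + Grad.Rate,
                   data = train.data)
reduced.mse <- mean((test.data$Outstate - predict(gam.reduced, newdata = test.data))^2)
cat("Reduced GAM test MSE:", reduced.mse, "\n")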