library(ISLR2)
library(boot)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load Wage data
data(Wage)
set.seed(1)
cv_errors <- rep(NA, 10)
for (d in 1:10) {
glm_fit <- glm(wage ~ poly(age, d), data = Wage)
cv_errors[d] <- cv.glm(Wage, glm_fit, K = 10)$delta[1]
}
optimal_degree <- which.min(cv_errors)
optimal_degree
## [1] 9
fit_1 <- lm(wage ~ poly(age, 1), data = Wage)
fit_2 <- lm(wage ~ poly(age, 2), data = Wage)
fit_3 <- lm(wage ~ poly(age, 3), data = Wage)
fit_4 <- lm(wage ~ poly(age, 4), data = Wage)
fit_5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit_1, fit_2, fit_3, fit_4, fit_5)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", formula = y ~ poly(x, optimal_degree), se = FALSE, color = "blue") +
labs(title = paste("Polynomial Regression (Degree =", optimal_degree, ")"),
x = "Age", y = "Wage")
set.seed(1)
cv_step_errors <- rep(NA, 10)
for (k in 2:10) {
Wage$age.cut <- cut(Wage$age, k)
glm_fit <- glm(wage ~ age.cut, data = Wage)
cv_step_errors[k] <- cv.glm(Wage, glm_fit, K = 10)$delta[1]
}
optimal_cuts <- which.min(cv_step_errors)
optimal_cuts
## [1] 8
Wage$age.cut <- cut(Wage$age, optimal_cuts)
step_fit <- lm(wage ~ age.cut, data = Wage)
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.5) +
stat_summary(fun = mean, geom = "line", aes(group = 1), color = "red", linewidth = 1.5) +
labs(title = paste("Step Function Fit (", optimal_cuts, " cuts)", sep = ""),
x = "Age", y = "Wage")
data(College)
set.seed(1)
train_index <- sample(1:nrow(College), nrow(College)/2)
train <- College[train_index, ]
test <- College[-train_index, ]
library(leaps)
regfit_fwd <- regsubsets(Outstate ~ ., data = train, method = "forward")
summary_fwd <- summary(regfit_fwd)
plot(summary_fwd$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
which.min(summary_fwd$cp)
## [1] 8
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-3
# Suppose selected variables are Private, Room.Board, Terminal, perc.alumni
gam_model <- gam(Outstate ~ s(Room.Board, 4) + s(Terminal, 4) + s(perc.alumni, 4) + Private, data = train)
par(mfrow = c(2,2))
plot(gam_model, se = TRUE, col = "blue")
preds <- predict(gam_model, newdata = test)
test_mse <- mean((test$Outstate - preds)^2)
test_mse
## [1] 4656249
# Look at the GAM plots: if the plots show curvature, the relationship is non-linear.
# Variables like Room.Board, Terminal, and perc.alumni may show non-linear relationships.
data(College)
set.seed(1)
train_index <- sample(1:nrow(College), nrow(College)/2)
train <- College[train_index, ]
test <- College[-train_index, ]
library(leaps)
regfit_fwd <- regsubsets(Outstate ~ ., data = train, method = "forward")
summary_fwd <- summary(regfit_fwd)
plot(summary_fwd$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
which.min(summary_fwd$cp)
## [1] 8
library(gam)
# Suppose selected variables are Private, Room.Board, Terminal, perc.alumni
gam_model <- gam(Outstate ~ s(Room.Board, 4) + s(Terminal, 4) + s(perc.alumni, 4) + Private, data = train)
par(mfrow = c(2,2))
plot(gam_model, se = TRUE, col = "blue")
preds <- predict(gam_model, newdata = test)
test_mse <- mean((test$Outstate - preds)^2)
test_mse
## [1] 4656249
# Based on the GAM plots:
# Room.Board shows a clear upward curve, suggesting a non-linear positive relationship with Outstate.
# Terminal also shows curvature, indicating a non-linear relationship.
# perc.alumni displays strong non-linear behavior, especially at higher values.
# Private (a categorical variable) shows a clear difference in predicted values between Yes and No groups.
# Therefore, evidence of non-linear relationships exists for:
# Room.Board, Terminal, and perc.alumni.