Problem 6 - Wage Data

Load Libraries and Data

library(ISLR2)
library(boot)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load Wage data
data(Wage)

(a) Polynomial Regression with Cross-Validation

set.seed(1)

cv_errors <- rep(NA, 10)

for (d in 1:10) {
  glm_fit <- glm(wage ~ poly(age, d), data = Wage)
  cv_errors[d] <- cv.glm(Wage, glm_fit, K = 10)$delta[1]
}

optimal_degree <- which.min(cv_errors)
optimal_degree
## [1] 9

ANOVA Comparison

fit_1 <- lm(wage ~ poly(age, 1), data = Wage)
fit_2 <- lm(wage ~ poly(age, 2), data = Wage)
fit_3 <- lm(wage ~ poly(age, 3), data = Wage)
fit_4 <- lm(wage ~ poly(age, 4), data = Wage)
fit_5 <- lm(wage ~ poly(age, 5), data = Wage)

anova(fit_1, fit_2, fit_3, fit_4, fit_5)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Polynomial Fit Plot

ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", formula = y ~ poly(x, optimal_degree), se = FALSE, color = "blue") +
  labs(title = paste("Polynomial Regression (Degree =", optimal_degree, ")"),
       x = "Age", y = "Wage")

(b) Step Function Fit with Cross-Validation

set.seed(1)

cv_step_errors <- rep(NA, 10)

for (k in 2:10) {
  Wage$age.cut <- cut(Wage$age, k)
  glm_fit <- glm(wage ~ age.cut, data = Wage)
  cv_step_errors[k] <- cv.glm(Wage, glm_fit, K = 10)$delta[1]
}

optimal_cuts <- which.min(cv_step_errors)
optimal_cuts
## [1] 8

Step Function Fit Plot (Corrected)

Wage$age.cut <- cut(Wage$age, optimal_cuts)

step_fit <- lm(wage ~ age.cut, data = Wage)

ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.5) +
  stat_summary(fun = mean, geom = "line", aes(group = 1), color = "red", linewidth = 1.5) +
  labs(title = paste("Step Function Fit (", optimal_cuts, " cuts)", sep = ""),
       x = "Age", y = "Wage")

Problem 10 - College Data

Load Data

data(College)

(a) Split into Training and Test Sets + Forward Stepwise Selection

set.seed(1)

train_index <- sample(1:nrow(College), nrow(College)/2)
train <- College[train_index, ]
test <- College[-train_index, ]

library(leaps)
regfit_fwd <- regsubsets(Outstate ~ ., data = train, method = "forward")
summary_fwd <- summary(regfit_fwd)

plot(summary_fwd$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")

which.min(summary_fwd$cp)
## [1] 8

(b) Fit a GAM Model

library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-3
# Suppose selected variables are Private, Room.Board, Terminal, perc.alumni
gam_model <- gam(Outstate ~ s(Room.Board, 4) + s(Terminal, 4) + s(perc.alumni, 4) + Private, data = train)

par(mfrow = c(2,2))
plot(gam_model, se = TRUE, col = "blue")

(c) Evaluate Model on Test Set

preds <- predict(gam_model, newdata = test)

test_mse <- mean((test$Outstate - preds)^2)
test_mse
## [1] 4656249

(d) Non-linear Relationships

# Look at the GAM plots: if the plots show curvature, the relationship is non-linear.
# Variables like Room.Board, Terminal, and perc.alumni may show non-linear relationships.

Problem 10 - College Data

Load Data

data(College)

(a) Split into Training and Test Sets + Forward Stepwise Selection

set.seed(1)

train_index <- sample(1:nrow(College), nrow(College)/2)
train <- College[train_index, ]
test <- College[-train_index, ]

library(leaps)
regfit_fwd <- regsubsets(Outstate ~ ., data = train, method = "forward")
summary_fwd <- summary(regfit_fwd)

plot(summary_fwd$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")

which.min(summary_fwd$cp)
## [1] 8

(b) Fit a GAM Model

library(gam)

# Suppose selected variables are Private, Room.Board, Terminal, perc.alumni
gam_model <- gam(Outstate ~ s(Room.Board, 4) + s(Terminal, 4) + s(perc.alumni, 4) + Private, data = train)

par(mfrow = c(2,2))
plot(gam_model, se = TRUE, col = "blue")

(c) Evaluate Model on Test Set

preds <- predict(gam_model, newdata = test)

test_mse <- mean((test$Outstate - preds)^2)
test_mse
## [1] 4656249

(d) Non-linear Relationships

# Based on the GAM plots:
# Room.Board shows a clear upward curve, suggesting a non-linear positive relationship with Outstate.
# Terminal also shows curvature, indicating a non-linear relationship.
# perc.alumni displays strong non-linear behavior, especially at higher values.
# Private (a categorical variable) shows a clear difference in predicted values between Yes and No groups.

# Therefore, evidence of non-linear relationships exists for:
# Room.Board, Terminal, and perc.alumni.