library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(boot)
## Warning: package 'boot' was built under R version 4.4.3
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.2
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(gratia)
## Warning: package 'gratia' was built under R version 4.4.3
attach(Wage)
attach(College)
### In this exercise, you will further analyze the Wage data set considered throughout this chapter.
(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
set.seed(123)
# 10-fold CV error for polynomial degrees 1 through 10
cv_results <- lapply(1:10, function(d) {
  glm_fit <- glm(wage ~ poly(age, d), data = Wage)
  cv_error <- cv.glm(Wage, glm_fit, K = 10)$delta[1]
  data.frame(degree = d, cv_error = cv_error)
})
cv_df <- bind_rows(cv_results)
best_degree <- cv_df |>
  arrange(cv_error) |>
  slice(1)
print(best_degree)
## degree cv_error
## 1 10 1593.154
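Before turning to the ANOVA comparison, a quick sketch (using the cv_df data frame built above) plots the CV error against the polynomial degree:
# Plot the 10-fold CV error for each degree; the dashed line marks the CV-selected degree
ggplot(cv_df, aes(x = degree, y = cv_error)) +
  geom_line(color = "steelblue") +
  geom_point() +
  geom_vline(xintercept = best_degree$degree, linetype = "dashed") +
  scale_x_continuous(breaks = 1:10) +
  labs(title = "10-fold CV Error by Polynomial Degree",
       x = "Degree", y = "CV error") +
  theme_minimal()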
# Nested polynomial fits for hypothesis testing with ANOVA F-tests
fit_1 <- lm(wage ~ poly(age, 1), data = Wage)
fit_2 <- lm(wage ~ poly(age, 2), data = Wage)
fit_3 <- lm(wage ~ poly(age, 3), data = Wage)
fit_4 <- lm(wage ~ poly(age, 4), data = Wage)
fit_5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit_1, fit_2, fit_3, fit_4, fit_5)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Cross-validation selected degree 10 (the lowest CV error). The ANOVA comparisons tell a more parsimonious story: the quadratic and cubic terms are clearly significant, the quartic term is only marginal (p ≈ 0.05), and the fifth-degree term is not, so a degree-3 or degree-4 polynomial would already be adequate. The plot below uses the CV-selected degree.
# Fit the CV-selected polynomial and plot it over a fine grid of ages
best_model <- lm(wage ~ poly(age, best_degree$degree), data = Wage)
age_grid <- data.frame(age = seq(min(Wage$age), max(Wage$age), length.out = 100))
wage_preds <- predict(best_model, newdata = age_grid)
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.5, color = "gray") +
  geom_line(data = data.frame(age = age_grid$age, wage = wage_preds),
            aes(x = age, y = wage), color = "blue", linewidth = 1.2) +
  labs(title = paste("Polynomial Degree", best_degree$degree),
       x = "Age", y = "Wage") +
  theme_minimal()
(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
set.seed(123)
# 10-fold CV error for step functions with 2 to 10 equal-width cuts of age
cv_results <- lapply(2:10, function(k) {
  Wage <- Wage |>
    mutate(age_cut = cut(age, breaks = k))
  glm_fit <- glm(wage ~ age_cut, data = Wage)
  cv_error <- boot::cv.glm(Wage, glm_fit, K = 10)$delta[1]
  data.frame(cuts = k, cv_error = cv_error)
})
cv_df <- bind_rows(cv_results)
best_cut <- cv_df |> arrange(cv_error) |> slice(1)
print(best_cut)
## cuts cv_error
## 1 8 1603.495
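As with the polynomial fit, a quick sketch (using the cv_df and best_cut objects just computed) shows how the chosen number of cuts compares with its neighbours:
# Plot the 10-fold CV error for each number of cuts; the dashed line marks the minimum
ggplot(cv_df, aes(x = cuts, y = cv_error)) +
  geom_line(color = "steelblue") +
  geom_point() +
  geom_vline(xintercept = best_cut$cuts, linetype = "dashed") +
  scale_x_continuous(breaks = 2:10) +
  labs(title = "10-fold CV Error by Number of Cuts",
       x = "Number of cuts", y = "CV error") +
  theme_minimal()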
best_k <- best_cut$cuts
# Refit the step function with the CV-selected number of cuts
Wage <- Wage |>
  mutate(age_cut = cut(age, breaks = best_k))
final_step_fit <- glm(wage ~ age_cut, data = Wage)
# Prediction grid: it spans the same age range, so cut() with the same number of
# equal-width breaks reproduces the factor levels used in the fit
age_grid <- data.frame(age = seq(min(Wage$age), max(Wage$age), length.out = 100))
age_grid$age_cut <- cut(age_grid$age, breaks = best_k)
age_grid$wage_pred <- predict(final_step_fit, newdata = age_grid)
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.4, color = "gray") +
  geom_line(data = age_grid, aes(x = age, y = wage_pred), color = "darkred", linewidth = 1.2) +
  labs(
    title = paste("Step Function Fit: Wage ~ Age with", best_k, "cuts"),
    x = "Age", y = "Wage"
  ) +
  theme_minimal()
### This question relates to the College data set.
(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
set.seed(123)
# 80/20 train-test split of the College data
sample_indices <- sample(1:nrow(College), size = floor(0.8 * nrow(College)))
college_train <- College[sample_indices, ]
college_test <- College[-sample_indices, ]
# Forward stepwise selection with out-of-state tuition as the response
forward_fit <- regsubsets(
  Outstate ~ ., data = college_train, nvmax = ncol(college_train) - 1, method = "forward")
fit_summary <- summary(forward_fit)
fit_summary$adjr2
## [1] 0.4391229 0.5941741 0.6690075 0.7052218 0.7250374 0.7346054 0.7369384
## [8] 0.7399431 0.7455402 0.7477156 0.7511284 0.7519828 0.7524343 0.7526841
## [15] 0.7523417 0.7519748 0.7515665
fit_summary$bic
## [1] -347.2352 -542.7495 -663.9014 -730.4342 -768.2260 -784.7995 -784.8635
## [8] -786.5799 -794.6757 -794.5933 -797.6387 -794.3635 -790.0859 -785.3054
## [15] -779.0406 -772.7171 -766.2935
which.max(fit_summary$adjr2)
## [1] 14
which.min(fit_summary$bic)
## [1] 11
best_size <- which.min(fit_summary$bic)
best_vars <- coef(forward_fit, id = best_size)
print(best_vars)
## (Intercept) PrivateYes Apps Accept Enroll
## -2002.7518586 2604.8342824 -0.2518536 0.8483927 -1.1478290
## Top10perc Room.Board Personal PhD perc.alumni
## 25.2899335 0.8327798 -0.2759877 27.9273632 45.7642632
## Expend Grad.Rate
## 0.2112021 21.8924587
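Since regsubsets() does not provide a predict() method, one way to work further with the BIC-selected specification is to refit it with lm(); the sketch below (the object name lm_forward is introduced here for illustration) uses the eleven predictors listed in the coefficients above:
# Refit the BIC-selected model on the training set
lm_forward <- lm(
  Outstate ~ Private + Apps + Accept + Enroll + Top10perc + Room.Board +
    Personal + PhD + perc.alumni + Expend + Grad.Rate,
  data = college_train
)
summary(lm_forward)$adj.r.squared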
(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
# GAM on the training data: Private enters as a parametric term, while Room.Board,
# Grad.Rate, and PhD (a subset of the forward-selected predictors) get smooth terms
gam_fit <- gam(
  Outstate ~ Private + s(Room.Board) + s(Grad.Rate) + s(PhD), data = college_train
)
summary(gam_fit)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Outstate ~ Private + s(Room.Board) + s(Grad.Rate) + s(PhD)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7721.3 197.0 39.19 <2e-16 ***
## PrivateYes 3678.0 243.1 15.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Room.Board) 6.227 7.436 15.58 <2e-16 ***
## s(Grad.Rate) 4.998 6.148 10.12 <2e-16 ***
## s(PhD) 4.288 5.293 31.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.696 Deviance explained = 70.4%
## GCV = 5.0536e+06 Scale est. = 4.911e+06 n = 621
# Partial-effect plots of the smooth terms, with standard error bands
plot(gam_fit, se = TRUE, col = "blue")
# Equivalent ggplot-based plots from gratia
draw(gam_fit)
The GAM reveals non-linear relationships that a standard linear model would miss: the smooth terms for Room.Board, Grad.Rate, and PhD all have estimated degrees of freedom well above 1 and are highly significant, so out-of-state tuition does not vary with these predictors in a simple straight-line fashion. The parametric Private term shows that, holding the smooths fixed, private colleges charge roughly $3,700 more on average, regardless of the other features.
(c) Evaluate the model obtained on the test set, and explain the results obtained.
gam_preds <- predict(gam_fit, newdata = college_test)
actual <- college_test$Outstate
# Test-set mean squared error and R^2 (R^2 = 1 - SS_resid / SS_total)
mse_test <- mean((actual - gam_preds)^2)
ss_total <- sum((actual - mean(actual))^2)
ss_resid <- sum((actual - gam_preds)^2)
r2_test <- 1 - ss_resid / ss_total
mse_test
## [1] 4796623
r2_test
## [1] 0.7035854
On the training data the adjusted R^2 is about 0.70, so the model explains roughly 70% of the variance in out-of-state tuition. On the test set the R^2 is also about 0.70, and the test MSE of roughly 4.8 million (an RMSE of about $2,200) is close to the training scale estimate of about 4.9 million, so the model generalizes well to new data rather than overfitting.
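For context, a rough baseline (a sketch; lm_baseline is a name introduced here) is the test MSE of a purely linear model with the same four predictors, which can be compared against the GAM's test MSE above:
# Linear model with the same predictors, evaluated on the same test set
lm_baseline <- lm(Outstate ~ Private + Room.Board + Grad.Rate + PhD,
                  data = college_train)
lm_preds <- predict(lm_baseline, newdata = college_test)
mean((college_test$Outstate - lm_preds)^2)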
(d) For which variables, if any, is there evidence of a non-linear relationship with the response?
There is strong evidence of a non-linear relationship between out-of-state tuition and all three variables given smooth terms: Room.Board, Grad.Rate, and PhD each have estimated degrees of freedom well above 1 (roughly 6.2, 5.0, and 4.3) with very small p-values. Reading the smooth plots, tuition rises slowly and then sharply with Room.Board, increases with Grad.Rate before flattening at high graduation rates, and is roughly U-shaped in PhD, with colleges at both very low and very high percentages of PhD faculty tending to charge more.
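One way to back up this reading with a formal comparison (a sketch; lin_gam is a name introduced here) is to fit the same model with the three terms entered linearly and compare information criteria; a noticeably lower AIC for the smooth model would support the non-linearity seen in the edf values and plots:
# Same predictors without smooths, compared to the fitted GAM by AIC
lin_gam <- gam(Outstate ~ Private + Room.Board + Grad.Rate + PhD,
               data = college_train)
AIC(lin_gam, gam_fit)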