library(ISLR2)
library(boot)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

6.

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

data(Wage)
str(Wage)
## 'data.frame':    3000 obs. of  11 variables:
##  $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage      : num  75 70.5 131 154.7 75 ...

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

set.seed(1)

cv_errors = rep(NA, 10)

# 10-fold CV error for polynomial degrees 1 through 10
for (d in 1:10) {
  glm_fit = glm(wage ~ poly(age, d), data = Wage)
  cv_errors[d] = cv.glm(Wage, glm_fit, K = 10)$delta[1]
}

best_degree = which.min(cv_errors)

best_degree
## [1] 9
plot(1:10, cv_errors, type = "b", xlab = "Degree", ylab = "CV Error", main = "Cross-Validation Error by Polynomial Degree")

ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.5) +
  stat_smooth(method = "lm", formula = y ~ poly(x, best_degree), color = "blue") +
  ggtitle(paste("Polynomial Regression Fit (Degree", best_degree, ")"))

fit_1 = lm(wage ~ poly(age, 1), data = Wage)
fit_2 = lm(wage ~ poly(age, 2), data = Wage)
fit_3 = lm(wage ~ poly(age, 3), data = Wage)
fit_4 = lm(wage ~ poly(age, 4), data = Wage)
fit_5 = lm(wage ~ poly(age, 5), data = Wage)
fit_6 = lm(wage ~ poly(age, 6), data = Wage)
fit_7 = lm(wage ~ poly(age, 7), data = Wage)

anova(fit_1, fit_2, fit_3, fit_4, fit_5, fit_6, fit_7)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Model 7: wage ~ poly(age, 7)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.6926 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8956  0.001673 ** 
## 4   2995 4771604  1      6070   3.8125  0.050966 .  
## 5   2994 4770322  1      1283   0.8055  0.369516    
## 6   2993 4766389  1      3932   2.4697  0.116165    
## 7   2992 4763834  1      2555   1.6049  0.205311    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ANOVA shows that significance is lost beyond degree 3: the degree-4 term is only marginally significant (p ≈ 0.051), and degrees 5 through 7 add nothing significant. This conflicts with the degree of 9 chosen by cross-validation; since the higher-degree terms buy so little, a degree-3 or degree-4 polynomial is the more defensible choice.
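
A quick numeric check (a sketch reusing the cv_errors vector computed above) is to look at how far each degree's CV error sits above the minimum; a one-standard-error-style rule would then favor the smallest degree that comes close:

# Relative gap between each degree's CV error and the smallest one;
# degrees with a near-zero gap are effectively tied with the CV winner
round(cv_errors / min(cv_errors) - 1, 4)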

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

set.seed(1)
cv_error_step = rep(NA, 10)

# 10-fold CV error for step functions with 2 to 10 cuts (index 1 stays NA)
for (k in 2:10) {
  Wage$age_cut = cut(Wage$age, k)
  fit = glm(wage ~ age_cut, data = Wage)
  cv_error_step[k] = cv.glm(Wage, fit, K = 10)$delta[1]
}

# Find the best number of cuts; which.min() skips NAs, so its index is the
# number of cuts itself (indexing an NA-stripped copy would be off by one)
best_cuts = which.min(cv_error_step)

best_cuts
## [1] 8
# Fit final model using best number of cuts
Wage$age_cut = cut(Wage$age, best_cuts)
final_step_model = lm(wage ~ age_cut, data = Wage)

# Plot step function fit; the cut must be recomputed inside the formula so
# that geom_smooth() can evaluate it from the x aesthetic
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", formula = y ~ cut(x, best_cuts), color = "red", se = FALSE) +
  ggtitle(paste("Step Function Fit with", best_cuts, "Cuts")) +
  theme_minimal()
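
Alternatively, the fitted step function can be drawn directly from the model's own fitted values (a sketch using the final_step_model object from above):

# Overlay the fitted step values, ordered by age
ord = order(Wage$age)
plot(Wage$age, Wage$wage, col = "grey", pch = 16, cex = 0.5,
     xlab = "Age", ylab = "Wage", main = "Step Function Fit")
lines(Wage$age[ord], predict(final_step_model)[ord], col = "red", lwd = 2)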

7.

This question relates to the College data set.

data("College")

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

index = sample(nrow(College), nrow(College) * 0.7)
train = College[index, ]
test = College[-index, ]

# Caution: step() started from the full model with direction = "forward" has
# nothing left to add, so it stops immediately and returns the full model
# (a corrected forward-selection sketch follows the summary below)
forward_model = lm(Outstate ~ ., data = train)
step_forward = step(forward_model, direction = "forward")
## Start:  AIC=8230.12
## Outstate ~ Private + Apps + Accept + Enroll + Top10perc + Top25perc + 
##     F.Undergrad + P.Undergrad + Room.Board + Books + Personal + 
##     PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate
summary(step_forward)
## 
## Call:
## lm(formula = Outstate ~ Private + Apps + Accept + Enroll + Top10perc + 
##     Top25perc + F.Undergrad + P.Undergrad + Room.Board + Books + 
##     Personal + PhD + Terminal + S.F.Ratio + perc.alumni + Expend + 
##     Grad.Rate, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6366.7 -1198.3    12.5  1214.6  9978.2 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.870e+03  8.702e+02  -2.148 0.032134 *  
## PrivateYes   2.424e+03  2.931e+02   8.269 1.11e-15 ***
## Apps        -3.111e-01  7.835e-02  -3.970 8.19e-05 ***
## Accept       8.162e-01  1.534e-01   5.322 1.53e-07 ***
## Enroll      -8.625e-01  4.380e-01  -1.969 0.049453 *  
## Top10perc    2.706e+01  1.340e+01   2.020 0.043899 *  
## Top25perc   -6.065e+00  1.055e+01  -0.575 0.565575    
## F.Undergrad -1.623e-02  8.031e-02  -0.202 0.839927    
## P.Undergrad -9.688e-03  6.629e-02  -0.146 0.883862    
## Room.Board   8.076e-01  9.974e-02   8.096 3.98e-15 ***
## Books        1.455e-01  5.178e-01   0.281 0.778769    
## Personal    -2.792e-01  1.356e-01  -2.059 0.039943 *  
## PhD          1.738e+01  9.860e+00   1.762 0.078607 .  
## Terminal     1.802e+01  1.067e+01   1.688 0.091914 .  
## S.F.Ratio   -2.682e+01  2.786e+01  -0.963 0.336020    
## perc.alumni  4.713e+01  8.942e+00   5.270 1.99e-07 ***
## Expend       2.026e-01  2.727e-02   7.427 4.51e-13 ***
## Grad.Rate    2.335e+01  6.721e+00   3.474 0.000555 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1924 on 525 degrees of freedom
## Multiple R-squared:  0.7672, Adjusted R-squared:  0.7596 
## F-statistic: 101.8 on 17 and 525 DF,  p-value: < 2.2e-16
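
As the single "Start" line in the trace shows, step() took no steps: forward selection has to begin from the intercept-only model and add predictors one at a time. A minimal sketch of that, assuming the same train split (the selected subset, and hence its summary, may differ from the full-model output above):

# Forward stepwise selection proper: grow from the null model,
# bounded above by the full formula
null_model = lm(Outstate ~ 1, data = train)
full_model = lm(Outstate ~ ., data = train)
step_fwd = step(null_model, scope = formula(full_model),
                direction = "forward", trace = FALSE)

# Predictors retained by forward selection
names(coef(step_fwd))

The ISLR labs reach the same goal with leaps::regsubsets(..., method = "forward"), which reports the best model at each subset size instead of a single AIC-chosen model.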

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

library(gam)
## Loading required package: splines
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.22-5
gam_fit <- gam(Outstate ~ Private + s(Apps) + s(Accept) + s(Enroll) + s(Top10perc) + 
                 s(Room.Board) + s(Personal) + s(perc.alumni) + s(Expend) +
                 s(Grad.Rate), data = train)
summary(gam_fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Apps) + s(Accept) + s(Enroll) + 
##     s(Top10perc) + s(Room.Board) + s(Personal) + s(perc.alumni) + 
##     s(Expend) + s(Grad.Rate), data = train)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -5893.2 -1068.2   102.4  1079.9  7445.4 
## 
## (Dispersion Parameter for gaussian family taken to be 3007394)
## 
##     Null Deviance: 8345919950 on 542 degrees of freedom
## Residual Deviance: 1518732105 on 504.9994 degrees of freedom
## AIC: 9679.278 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                 Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private          1 2608030537 2608030537 867.206 < 2.2e-16 ***
## s(Apps)          1  913942591  913942591 303.899 < 2.2e-16 ***
## s(Accept)        1   96521225   96521225  32.095 2.469e-08 ***
## s(Enroll)        1  107905856  107905856  35.880 3.990e-09 ***
## s(Top10perc)     1  836264579  836264579 278.070 < 2.2e-16 ***
## s(Room.Board)    1  492510405  492510405 163.767 < 2.2e-16 ***
## s(Personal)      1   31190572   31190572  10.371  0.001362 ** 
## s(perc.alumni)   1  269037432  269037432  89.459 < 2.2e-16 ***
## s(Expend)        1  452989070  452989070 150.625 < 2.2e-16 ***
## s(Grad.Rate)     1   49606672   49606672  16.495 5.654e-05 ***
## Residuals      505 1518732105    3007394                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df  Npar F     Pr(F)    
## (Intercept)                                 
## Private                                     
## s(Apps)              3  3.5498 0.0144298 *  
## s(Accept)            3 13.1894  2.65e-08 ***
## s(Enroll)            3  4.6291 0.0033235 ** 
## s(Top10perc)         3  1.2305 0.2979615    
## s(Room.Board)        3  0.8922 0.4449808    
## s(Personal)          3  6.4175 0.0002849 ***
## s(perc.alumni)       3  1.0756 0.3589723    
## s(Expend)            3 28.2545 < 2.2e-16 ***
## s(Grad.Rate)         3  1.2128 0.3044197    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2, 5))  # one panel per term: Private plus the nine smooths
plot(gam_fit, se = TRUE, col = "blue")

Out-of-state tuition is higher at private universities.

As the number of applications received increases, out-of-state tuition decreases.

As the number of applicants accepted increases, out-of-state tuition increases.

As the number of new students enrolled increases, out-of-state tuition decreases.

As the percent of new students from the top 10% of their high-school class increases, out-of-state tuition generally increases, except in roughly the 50-70% range, where the fitted tuition dips back to about the level seen near 30%.

As room and board costs increase, out-of-state tuition increases.

As personal spending increases, out-of-state tuition follows a U-shaped curve over the bulk of the observations, first decreasing and then increasing; at the highest levels of personal spending, the fitted tuition drops off sharply.

As the percent of alumni who donate increases, out-of-state tuition increases.

As instructional expenditure per student increases, out-of-state tuition increases up to a maximum, after which it falls off slightly.

(c) Evaluate the model obtained on the test set, and explain the results obtained.

preds_gam = predict(gam_fit, newdata = test)

mse = mean((test$Outstate - preds_gam)^2)

sqrt(mse)
## [1] 1941.611

The GAM achieved a test RMSE of about 1,941.6, meaning predictions miss the actual out-of-state tuition by roughly $1,900 on a root-mean-square basis. This is reasonably low, at about 11% of the overall range of out-of-state tuition in the test data.
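
To put the RMSE in context, a short sketch computing the test-set R-squared and the tuition spread it is judged against (using the preds_gam and test objects from above):

# Share of test-set variance explained by the GAM
tss = sum((test$Outstate - mean(test$Outstate))^2)
rss = sum((test$Outstate - preds_gam)^2)
1 - rss / tss

# Spread of the response that the RMSE is compared to
diff(range(test$Outstate))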

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

The "Anova for Nonparametric Effects" table in part (b) gives formal evidence: the smooth terms for Expend, Accept, Personal, Enroll, and Apps remain significant after their linear components are removed, so these variables show non-linear relationships with Outstate. This matches their plots, whose curves bend or dip rather than following a straight line. Top10perc, Room.Board, perc.alumni, and Grad.Rate show no significant non-linearity.
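
A more targeted check for a single variable (a sketch, assuming the gam package and train split from above; the other predictors are dropped to keep it short): fit Expend once as a linear term and once as a smoothing spline, then test whether the smooth improves the fit.

# Does a smoothing spline of Expend beat a plain linear term?
gam_linear = gam(Outstate ~ Private + Expend, data = train)
gam_smooth = gam(Outstate ~ Private + s(Expend), data = train)
anova(gam_linear, gam_smooth, test = "F")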