6.

a.

# Load required packages
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
library(boot)
set.seed(15)
# 10-fold CV error for polynomial fits of degree 1 through 5
cv.error <- sapply(1:5, function(d) {
  fit <- glm(wage ~ poly(age, d), data=Wage)
  cv.glm(Wage, fit, K=10)$delta[1]
})
cv.error
## [1] 1676.192 1599.786 1596.256 1593.755 1595.106
best.d <- which.min(cv.error)
cat("Optimal degree by CV is", best.d, "\n")
## Optimal degree by CV is 4
fits <- lapply(1:5, function(d) lm(wage ~ poly(age, d), data=Wage))
anova(fits[[1]], fits[[2]], fits[[3]], fits[[4]], fits[[5]])
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, d)
## Model 2: wage ~ poly(age, d)
## Model 3: wage ~ poly(age, d)
## Model 4: wage ~ poly(age, d)
## Model 5: wage ~ poly(age, d)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit4 <- lm(wage ~ poly(age, 4), data=Wage)

# create a fine age grid
age_grid <- seq(min(Wage$age), max(Wage$age), length=200)
preds <- predict(fit4, newdata=list(age=age_grid), se=TRUE)

with(Wage, plot(age, wage, col="gray", pch=19, cex=0.5,
                xlab="Age", ylab="Wage"))
lines(age_grid, preds$fit, col="blue", lwd=2)
lines(age_grid, preds$fit + 2*preds$se.fit, col="blue", lty=2)
lines(age_grid, preds$fit - 2*preds$se.fit, col="blue", lty=2)

Cross-validation selects degree 4, although the CV errors for degrees 2 through 5 are very close. The ANOVA tells a similar but slightly more conservative story: the quadratic and cubic terms are clearly significant, while the p-value for adding the quartic term (0.051046) sits just above the 0.05 level, so the evidence for going beyond degree 3 is weak. The ANOVA therefore points to degree 3 as sufficient, whereas cross-validation favors degree 4 by a small margin; the two approaches agree that degree 4 adds at most a modest amount of additional fit.
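
A quick plot of the CV errors computed above (a minimal sketch, not part of the original output) makes it easy to see how little the error changes beyond degree 3:

plot(1:5, cv.error, type = "b", xlab = "Polynomial degree",
     ylab = "10-fold CV error")
points(best.d, cv.error[best.d], col = "red", pch = 19)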

b.

set.seed(2)
# 10-fold CV error for step functions with k = 2, ..., 10 age bins
cv.errors <- sapply(2:10, function(k) {
  Wage$age.cut <- cut(Wage$age, k)
  fit <- glm(wage ~ age.cut, data=Wage)
  cv.glm(Wage, fit, K=10)$delta[1]
})
names(cv.errors) <- 2:10
cv.errors
##        2        3        4        5        6        7        8        9 
## 1734.448 1681.623 1635.869 1632.864 1621.643 1613.148 1601.850 1610.852 
##       10 
## 1603.607
best.K <- as.numeric(names(cv.errors)[which.min(cv.errors)])
cat("Best number of cuts by CV:", best.K, "\n")
## Best number of cuts by CV: 8
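
As in part (a), plotting the CV errors against the number of bins (a small sketch using the cv.errors vector above) shows the error levelling off around 8 bins:

plot(2:10, cv.errors, type = "b", xlab = "Number of age bins",
     ylab = "10-fold CV error")
points(best.K, cv.errors[as.character(best.K)], col = "red", pch = 19)
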
Wage$age.cut <- cut(Wage$age, best.K)
fit.best <- lm(wage ~ age.cut, data=Wage)
with(Wage, {
  plot(age, wage,
       col="gray50", pch=19, cex=0.5,
       xlab="Age", ylab="Wage",
       main=paste(best.K, "Bin Step-Function Fit"))
  # equal-width bin edges (cut() extends the data range slightly, so these are approximate)
  edges    <- seq(min(age), max(age), length=best.K+1)
  bin.means <- tapply(wage, age.cut, mean)
  for (i in seq_len(best.K)) {
    segments(edges[i],   bin.means[i],
             edges[i+1], bin.means[i],
             col="firebrick", lwd=2)
  }
})

10.

a.

library(leaps)
## Warning: package 'leaps' was built under R version 4.4.2
set.seed(1)

attach(College)

train <- sample(length(Outstate), length(Outstate) / 2)
test <- -train

College.train <- College[train, ]
College.test <- College[test, ]

# Forward stepwise selection on the training half, allowing up to all 17 predictors
fit <- regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")
fit.summary <- summary(fit)

par(mfrow = c(1, 3))

plot(fit.summary$cp, xlab = "Number of variables", ylab = "Cp", type = "l")

min.cp <- min(fit.summary$cp)
std.cp <- sd(fit.summary$cp)

abline(h = min.cp + 0.2 * std.cp, col = "red", lty = 2)
abline(h = min.cp - 0.2 * std.cp, col = "red", lty = 2)

plot(fit.summary$bic, xlab = "Number of variables", ylab = "BIC", type='l')

min.bic <- min(fit.summary$bic)
std.bic <- sd(fit.summary$bic)

best_bic <- which.min(fit.summary$bic)
cat("BIC is minimized with", best_bic, "predictors\n")
## BIC is minimized with 6 predictors
abline(h = min.bic + 0.2 * std.bic, col = "red", lty = 2)
abline(h = min.bic - 0.2 * std.bic, col = "red", lty = 2)

plot(fit.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R2", type = "l", ylim = c(0.4, 0.84))

max.adjr2 <- max(fit.summary$adjr2)
std.adjr2 <- sd(fit.summary$adjr2)

abline(h = max.adjr2 + 0.2 * std.adjr2, col = "red", lty = 2)
abline(h = max.adjr2 - 0.2 * std.adjr2, col = "red", lty = 2)
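
The dashed red lines mark a band of ±0.2 standard deviations around each optimum; a short sketch (using the summary objects already computed) identifies the most parsimonious model size that falls inside each band:

min(which(fit.summary$cp    <= min.cp    + 0.2 * std.cp))
min(which(fit.summary$bic   <= min.bic   + 0.2 * std.bic))
min(which(fit.summary$adjr2 >= max.adjr2 - 0.2 * std.adjr2))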

coeffs <- coef(fit, id = best_bic)  # coefficients of the 6-variable model chosen by BIC

names(coeffs)
## [1] "(Intercept)" "PrivateYes"  "Room.Board"  "Terminal"    "perc.alumni"
## [6] "Expend"      "Grad.Rate"
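
As a follow-up (a sketch only, not part of the original analysis), the six selected predictors could be refit on the full College data with lm() to inspect their coefficients and standard errors:

fit.full <- lm(Outstate ~ Private + Room.Board + Terminal + perc.alumni +
                 Expend + Grad.Rate, data = College)
summary(fit.full)$coefficients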

b.

library(gam)
## Warning: package 'gam' was built under R version 4.4.3
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.4.2
## Loaded gam 1.22-5
gam1 <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2), data=College.train)
par(mfrow = c(2, 3))
plot(gam1, se = T, col = "blue")

The Private panel shows that private universities have substantially higher out-of-state tuition than public ones. Room.Board has a generally positive relationship with tuition, and this relationship is clearer at higher room-and-board costs. The percentage of faculty with PhDs has very little effect up to roughly 70%, with a slight uptick thereafter; the wide confidence bands at the extremes (below ~20% and above ~80%) indicate that those estimates are less certain. Predicted tuition rises roughly linearly as the alumni-giving rate (perc.alumni) climbs. Expend is the most clearly non-linear term: tuition climbs steeply as per-student expenditure goes from about $5,000 to $30,000 and then levels off. Finally, graduation rate has almost no effect until the very highest rates, where there is a modest positive bump.

c.

# Test-set performance of the GAM fit from part (b)
preds_gam <- predict(gam1, newdata = College.test)
test_mse_gam  <- mean((College.test$Outstate - preds_gam)^2)
test_rmse_gam <- sqrt(test_mse_gam)
tss           <- mean((College.test$Outstate - mean(College.test$Outstate))^2)
test_r2_gam   <- 1 - test_mse_gam / tss

cat("GAM Test MSE  =", round(test_mse_gam, 2),   "\n")
## GAM Test MSE  = 3349290
cat("GAM Test RMSE =", round(test_rmse_gam, 2),  "\n")
## GAM Test RMSE = 1830.11
cat("GAM Test R²   =", round(test_r2_gam,  3),   "\n")
## GAM Test R²   = 0.766

The GAM predicts out-of-state tuition with a test MSE of about 3.35 million, which corresponds to a root mean squared error of roughly $1,830; on average the model is within about $1,830 of the true tuition. In terms of explained variance, it achieves a test R² of 0.766, accounting for about 76.6% of the variability in tuition for schools it has not seen. Errors under $2,000 on tuition that typically runs $5,000 to $30,000, together with over three-quarters of the variance explained, indicate that the smooth terms for Room.Board, Expend, and the other predictors capture the key non-linear relationships without overfitting, and that the GAM generalizes far better than a mean-only baseline.
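
For context (a sketch under the same train/test split, not part of the original output), the GAM can be compared with a plain linear model on the same six predictors; a lower test MSE for the GAM would confirm the value of the smooth terms:

lm.fit      <- lm(Outstate ~ Private + Room.Board + PhD + perc.alumni +
                    Expend + Grad.Rate, data = College.train)
preds_lm    <- predict(lm.fit, newdata = College.test)
test_mse_lm <- mean((College.test$Outstate - preds_lm)^2)
c(GAM = test_mse_gam, OLS = test_mse_lm)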

d.

summary(gam1)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, 
##     df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, 
##     df = 2), data = College.train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7402.89 -1114.45   -12.67  1282.69  7470.60 
## 
## (Dispersion Parameter for gaussian family taken to be 3711182)
## 
##     Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1384271126 on 373 degrees of freedom
## AIC: 6987.021 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                  1 1778718277 1778718277 479.286 < 2.2e-16 ***
## s(Room.Board, df = 2)    1 1577115244 1577115244 424.963 < 2.2e-16 ***
## s(PhD, df = 2)           1  322431195  322431195  86.881 < 2.2e-16 ***
## s(perc.alumni, df = 2)   1  336869281  336869281  90.771 < 2.2e-16 ***
## s(Expend, df = 5)        1  530538753  530538753 142.957 < 2.2e-16 ***
## s(Grad.Rate, df = 2)     1   86504998   86504998  23.309 2.016e-06 ***
## Residuals              373 1384271126    3711182                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df  Npar F     Pr(F)    
## (Intercept)                                         
## Private                                             
## s(Room.Board, df = 2)        1  1.9157    0.1672    
## s(PhD, df = 2)               1  0.9699    0.3253    
## s(perc.alumni, df = 2)       1  0.1859    0.6666    
## s(Expend, df = 5)            4 20.5075 2.665e-15 ***
## s(Grad.Rate, df = 2)         1  0.5702    0.4506    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA for nonparametric effects shows strong evidence of a non-linear relationship between Outstate and Expend (p = 2.665e-15), while none of the other smooth terms show significant evidence of non-linearity.
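
This can be checked directly (a sketch, not run in the original) by comparing gam1 with an otherwise identical model in which Expend enters linearly:

gam.lin <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) +
                 s(perc.alumni, df = 2) + Expend + s(Grad.Rate, df = 2),
               data = College.train)
anova(gam.lin, gam1, test = "F")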