library(ISLR2)
library(boot)
library(ggplot2)
Question 6
Perform polynomial regression to predict wage using age, using cross-validation to select the optimal polynomial degree. Compare the chosen degree with the results of hypothesis testing using ANOVA, and make a plot of the resulting fit.
set.seed(42)
# 5-fold CV estimate of the test MSE for polynomial degrees 1 to 6
res <- sapply(1:6, function(i) {
  fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.glm(Wage, fit, K = 5)$delta[1]
})
which.min(res)
## [1] 6
plot(1:6, res, xlab = "Degree", ylab = "Test MSE", type = "l")
abline(v = which.min(res), col = "red", lty = 2)
fit <- glm(wage ~ poly(age, which.min(res)), data = Wage)
plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
points(1:100, predict(fit, data.frame(age = 1:100)), type = "l", col = "red")
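The uncertainty of this fit can be shown as well; a small sketch adding approximate 95% (plus or minus two standard errors) bands via predict(..., se.fit = TRUE), on the same age grid as the plot above:
# sketch: standard-error bands around the degree-6 fit; note the 1:100 grid
# extrapolates beyond the observed ages, so the bands widen at the extremes
preds <- predict(fit, data.frame(age = 1:100), se.fit = TRUE)
lines(1:100, preds$fit + 2 * preds$se.fit, col = "red", lty = 2)
lines(1:100, preds$fit - 2 * preds$se.fit, col = "red", lty = 2)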
summary(glm(wage ~ poly(age, 6), data = Wage))
##
## Call:
## glm(formula = wage ~ poly(age, 6), data = Wage)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.7036 0.7286 153.316 < 2e-16 ***
## poly(age, 6)1 447.0679 39.9063 11.203 < 2e-16 ***
## poly(age, 6)2 -478.3158 39.9063 -11.986 < 2e-16 ***
## poly(age, 6)3 125.5217 39.9063 3.145 0.00167 **
## poly(age, 6)4 -77.9112 39.9063 -1.952 0.05099 .
## poly(age, 6)5 -35.8129 39.9063 -0.897 0.36956
## poly(age, 6)6 62.7077 39.9063 1.571 0.11620
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1592.512)
##
## Null deviance: 5222086 on 2999 degrees of freedom
## Residual deviance: 4766389 on 2993 degrees of freedom
## AIC: 30642
##
## Number of Fisher Scoring iterations: 2
fit1 <- lm(wage ~ poly(age, 1), data = Wage)
fit2 <- lm(wage ~ poly(age, 2), data = Wage)
fit3 <- lm(wage ~ poly(age, 3), data = Wage)
fit4 <- lm(wage ~ poly(age, 4), data = Wage)
fit5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Cross-validation selects degree 6, although the error curve is nearly flat beyond degree 4. The ANOVA comparisons point the same way: the terms up to degree 3 are highly significant, degree 4 is marginal (p = 0.051), and degree 5 is not significant.
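Note that 5-fold CV depends on the random fold assignment, so the selected degree is not entirely stable; a quick sketch repeating the procedure over a few illustrative seeds:
# sketch: stability of the CV-selected degree across a few arbitrary seeds
sapply(c(1, 7, 42, 123), function(s) {
  set.seed(s)
  cv <- sapply(1:6, function(i) {
    fit <- glm(wage ~ poly(age, i), data = Wage)
    cv.glm(Wage, fit, K = 5)$delta[1]
  })
  which.min(cv)
})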
Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
set.seed(42)
# 5-fold CV estimate of the test MSE for step functions with 2 to 10 cuts
res <- sapply(2:10, function(i) {
  Wage$cats <- cut(Wage$age, i)
  fit <- glm(wage ~ cats, data = Wage)
  cv.glm(Wage, fit, K = 5)$delta[1]
})
names(res) <- 2:10
plot(2:10, res, xlab = "Cuts", ylab = "Test MSE", type = "l")
which.min(res)
## 8
## 7
abline(v = as.numeric(names(which.min(res))), col = "red", lty = 2)
fit <- glm(wage ~ cut(age, 8), data = Wage)
plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
points(18:80, predict(fit, data.frame(age = 18:80)), type = "l", col = "red")
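Since ggplot2 is already loaded, the same step-function fit can also be drawn with it. One caveat worth noting: cut() inside a formula is re-evaluated on new data, so this (like the base plot above) relies on the 18:80 prediction grid spanning the same range as the observed ages. A sketch, with the grid data frame introduced here:
# sketch: the 8-cut step fit drawn with ggplot2 (grid range matches the data)
grid <- data.frame(age = 18:80)
grid$pred <- predict(fit, grid)
ggplot(Wage, aes(age, wage)) +
  geom_point(color = "steelblue", alpha = 0.4, size = 0.6) +
  geom_line(data = grid, aes(age, pred), color = "red")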
Question 10
This question relates to the College data set.
library(leaps)
# helper function to predict from a regsubsets model
predict.regsubsets <- function(object, newdata, id, ...) {
  # rebuild the design matrix from the formula stored in the original call
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  # keep only the coefficients of the id-th model
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}
set.seed(42)
train <- rep(TRUE, nrow(College))
train[sample(1:nrow(College), nrow(College) * 1 / 3)] <- FALSE
fit <- regsubsets(Outstate ~ ., data = College[train, ], nvmax = 17, method = "forward")
plot(summary(fit)$bic, type = "b")
which.min(summary(fit)$bic)
## [1] 11
# or via a validation-set estimate of the test error at each model size
err <- sapply(1:17, function(i) {
  mean((College$Outstate[!train] - predict(fit, College[!train, ], id = i))^2)
})
which.min(err)
## [1] 16
min(summary(fit)$bic)
## [1] -690.9375
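The validation-set estimate above rests on a single split. A k-fold alternative on the training set (a sketch; the seed, the 5-fold choice, and the fit_k/cv_err names are illustrative) reuses the predict.regsubsets() helper:
# sketch: 5-fold CV over model sizes, forward selection refit in each fold
set.seed(42)
folds <- sample(rep(1:5, length.out = sum(train)))
cv_err <- matrix(NA, 5, 17)
for (k in 1:5) {
  fit_k <- regsubsets(Outstate ~ ., data = College[train, ][folds != k, ],
    nvmax = 17, method = "forward")
  for (i in 1:17) {
    pred <- predict(fit_k, College[train, ][folds == k, ], id = i)
    cv_err[k, i] <- mean((College$Outstate[train][folds == k] - pred)^2)
  }
}
which.min(colMeans(cv_err))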
For the sake of simplicity we'll choose the 6-variable model.
coef(fit, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3540.0544008 2736.4231642 0.9061752 33.7848157 47.1998115
## Expend Grad.Rate
## 0.2421685 33.3137332
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
fit <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) + s(perc.alumni, 2) +
s(Expend, 2) + s(Grad.Rate, 2), data = College[train, ])
summary(fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 2) + s(PhD,
## 2) + s(perc.alumni, 2) + s(Expend, 2) + s(Grad.Rate, 2),
## data = College[train, ])
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7112.59 -1188.98 33.13 1238.54 8738.65
##
## (Dispersion Parameter for gaussian family taken to be 3577008)
##
## Null Deviance: 8471793308 on 517 degrees of freedom
## Residual Deviance: 1809966249 on 506.0001 degrees of freedom
## AIC: 9300.518
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2327235738 2327235738 650.610 < 2.2e-16 ***
## s(Room.Board, 2) 1 1741918029 1741918029 486.976 < 2.2e-16 ***
## s(PhD, 2) 1 668408518 668408518 186.863 < 2.2e-16 ***
## s(perc.alumni, 2) 1 387819183 387819183 108.420 < 2.2e-16 ***
## s(Expend, 2) 1 625813340 625813340 174.954 < 2.2e-16 ***
## s(Grad.Rate, 2) 1 111881207 111881207 31.278 3.664e-08 ***
## Residuals 506 1809966249 3577008
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 2) 1 2.224 0.13648
## s(PhD, 2) 1 5.773 0.01664 *
## s(perc.alumni, 2) 1 0.365 0.54581
## s(Expend, 2) 1 61.182 3.042e-14 ***
## s(Grad.Rate, 2) 1 4.126 0.04274 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Inference:
Expend has a strong non-linear relationship with Outstate: tuition rises steeply with instructional spending and then levels off, rather than increasing at a constant rate.
PhD and Grad.Rate also show statistically significant non-linear effects, but more modest ones; their influence on tuition varies across their range, just less dramatically than Expend's.
Room.Board and perc.alumni show no significant non-linear component, so linear terms may be sufficient for those variables, as the comparison below checks.
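One way to check that is to refit the GAM with linear terms for Room.Board and perc.alumni and compare the nested models with an approximate F test; a sketch (fit_lin is a name introduced here):
# sketch: linear terms for Room.Board and perc.alumni, smooths elsewhere
fit_lin <- gam(Outstate ~ Private + Room.Board + s(PhD, 2) + perc.alumni +
  s(Expend, 2) + s(Grad.Rate, 2), data = College[train, ])
anova(fit_lin, fit, test = "F")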
# Set up layout for multiple plots
par(mfrow = c(2, 3)) # 2 rows, 3 columns of plots
# Plot the smooth terms
plot(fit, se = TRUE, col = "blue")
Inference:
The GAM plots reveal non-linear trends, especially for Expend, PhD, and Grad.Rate.
Expend shows the strongest curvature: schools that spend more per student charge considerably more, up to a point, after which the effect levels off.
pred <- predict(fit, College[!train, ])
err_gam <- mean((College$Outstate[!train] - pred)^2)
plot(err, ylim = c(min(err_gam, err), max(err)), type = "b")
abline(h = err_gam, col = "red", lty = 2)
# r-squared
1 - err_gam / mean((College$Outstate[!train] - mean(College$Outstate[!train]))^2)
## [1] 0.7655905
Fit: The GAM fits well, with the non-linear terms capturing much of the complexity in the relationships between the predictors and Outstate.
Evaluation: The model explains a substantial portion of the variation in the test set (R² ≈ 0.766), indicating good generalization to unseen data.
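For context, a plain linear model on the same six predictors gives a baseline test error to compare against; a sketch (fit_lm and err_lm are names introduced here):
# sketch: linear baseline on the same predictors and train/test split
fit_lm <- lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend +
  Grad.Rate, data = College[train, ])
err_lm <- mean((College$Outstate[!train] - predict(fit_lm, College[!train, ]))^2)
c(linear = err_lm, gam = err_gam)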
Significant Variables:
- From the GAM summary above, Private, Room.Board, PhD, perc.alumni, Expend, and Grad.Rate all have significant effects on Outstate, though the significance differs between the parametric (linear) and nonparametric (non-linear) parts of each term.
Non-linear Effects:
- Expend shows by far the strongest non-linear effect, and Grad.Rate has a significant but weaker one; a more flexible smooth for Expend is sketched below.
- Room.Board and perc.alumni have significant linear (parametric) effects but no significant non-linear component, suggesting linear terms would suffice for them.
- PhD is significant in both the parametric and the nonparametric tests, indicating that its smooth term is worth keeping.
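Given the strength of the Expend non-linearity, it may deserve more flexibility than df = 2; a sketch with a more flexible smooth (df = 5 is an arbitrary, untuned choice, and fit_flex is a name introduced here):
# sketch: allow the Expend smooth more flexibility and test the improvement
fit_flex <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) +
  s(perc.alumni, 2) + s(Expend, 5) + s(Grad.Rate, 2), data = College[train, ])
anova(fit, fit_flex, test = "F")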