library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
library(boot)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3

Question 6:

  1. Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
set.seed(42)
res <- sapply(1:6, function(i) {
  fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.glm(Wage, fit, K = 5)$delta[1]
})
which.min(res)
## [1] 6
plot(1:6, res, xlab = "Degree", ylab = "Test MSE", type = "l")
abline(v = which.min(res), col = "red", lty = 2)
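Note that 5-fold CV error estimates are noisy, so the selected degree can shift with the random seed. Repeating the selection over a few seeds, as a rough sketch, gives a sense of that variability:

# Degree chosen by 5-fold CV under three different seeds (illustrative only)
sapply(1:3, function(s) {
  set.seed(s)
  cv <- sapply(1:6, function(i) {
    fit <- glm(wage ~ poly(age, i), data = Wage)
    cv.glm(Wage, fit, K = 5)$delta[1]
  })
  which.min(cv)
})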

fit <- glm(wage ~ poly(age, which.min(res)), data = Wage)
plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
points(1:100, predict(fit, data.frame(age = 1:100)), type = "l", col = "red")
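Approximate 95% pointwise confidence bands can be overlaid using the standard errors from predict(); preds is a helper name introduced here, not part of the original code:

# Add fit +/- 2 standard-error bands around the polynomial curve
preds <- predict(fit, data.frame(age = 1:100), se.fit = TRUE)
lines(1:100, preds$fit + 2 * preds$se.fit, col = "red", lty = 2)
lines(1:100, preds$fit - 2 * preds$se.fit, col = "red", lty = 2)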

summary(glm(wage ~ poly(age, 6), data = Wage))
## 
## Call:
## glm(formula = wage ~ poly(age, 6), data = Wage)
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    111.7036     0.7286 153.316  < 2e-16 ***
## poly(age, 6)1  447.0679    39.9063  11.203  < 2e-16 ***
## poly(age, 6)2 -478.3158    39.9063 -11.986  < 2e-16 ***
## poly(age, 6)3  125.5217    39.9063   3.145  0.00167 ** 
## poly(age, 6)4  -77.9112    39.9063  -1.952  0.05099 .  
## poly(age, 6)5  -35.8129    39.9063  -0.897  0.36956    
## poly(age, 6)6   62.7077    39.9063   1.571  0.11620    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1592.512)
## 
##     Null deviance: 5222086  on 2999  degrees of freedom
## Residual deviance: 4766389  on 2993  degrees of freedom
## AIC: 30642
## 
## Number of Fisher Scoring iterations: 2
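As a quick consistency check, the dispersion parameter reported for this Gaussian GLM is simply the residual deviance divided by its residual degrees of freedom:

# Residual deviance / residual df reproduces the dispersion parameter
4766389 / 2993  # approximately 1592.5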
fit1 <- lm(wage ~ poly(age, 1), data = Wage)
fit2 <- lm(wage ~ poly(age, 2), data = Wage)
fit3 <- lm(wage ~ poly(age, 3), data = Wage)
fit4 <- lm(wage ~ poly(age, 4), data = Wage)
fit5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Cross-validation attains its minimum test MSE at degree 6, whereas the ANOVA hypothesis tests suggest a simpler model is adequate: degrees 1, 2 and 3 are highly significant, degree 4 is marginal (p ≈ 0.051), and degree 5 is not significant. A degree-3 or degree-4 polynomial therefore seems sufficient despite the CV minimum at 6.
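Because poly() constructs orthogonal polynomials, the sequential ANOVA is equivalent to t-tests on the individual coefficients of the largest model in the sequence: each F-statistic is the square of the corresponding t-statistic. A quick check, as a sketch:

# F for the degree-5 comparison equals the squared t-statistic of the
# degree-5 term in fit5
coef(summary(fit5))["poly(age, 5)5", "t value"]^2
anova(fit1, fit2, fit3, fit4, fit5)$F[5]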

  2. Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
set.seed(42)
res <- sapply(2:10, function(i) {
  Wage$cats <- cut(Wage$age, i)
  fit <- glm(wage ~ cats, data = Wage)
  cv.glm(Wage, fit, K = 5)$delta[1]
})
names(res) <- 2:10
plot(2:10, res, xlab = "Cuts", ylab = "Test MSE", type = "l")
which.min(res)
## 8 
## 7
abline(v = as.numeric(names(which.min(res))), col = "red", lty = 2)

fit <- glm(wage ~ cut(age, 8), data = Wage)
plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
points(18:80, predict(fit, data.frame(age = 18:80)), type = "l", col = "red")
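To see where cut() placed the break points and what mean wage is fitted in each interval, one can inspect the interval counts and the coefficients (a quick sketch):

# Interval membership counts and fitted step heights: the intercept is the
# mean wage in the first interval, the other coefficients are offsets from it
table(cut(Wage$age, 8))
coef(fit)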

Question 10

This question relates to the College data set.

  1. Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
library(leaps)

# helper function to predict from a regsubsets model
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}

set.seed(42)
train <- rep(TRUE, nrow(College))
train[sample(1:nrow(College), nrow(College) * 1 / 3)] <- FALSE
fit <- regsubsets(Outstate ~ ., data = College[train, ], nvmax = 17, method = "forward")

plot(summary(fit)$bic, type = "b", xlab = "Number of predictors", ylab = "BIC")

which.min(summary(fit)$bic)
## [1] 11
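BIC is only one criterion; summary.regsubsets also returns Mallows' Cp and adjusted R-squared, which need not agree with it. For comparison, as a sketch:

# Model sizes preferred by Cp and adjusted R-squared
which.min(summary(fit)$cp)
which.max(summary(fit)$adjr2)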
# or via cross-validation
err <- sapply(1:17, function(i) {
  mean((College$Outstate[!train] - predict(fit, College[!train, ], id = i))^2)
})
which.min(err)
## [1] 16
min(summary(fit)$bic)
## [1] -690.9375

BIC is minimized with 11 predictors and the validation-set error with 16, but for the sake of simplicity we'll choose a 6-predictor model.

coef(fit, id = 6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3540.0544008  2736.4231642     0.9061752    33.7848157    47.1998115 
##        Expend     Grad.Rate 
##     0.2421685    33.3137332

  2. Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
library(gam)
## Warning: package 'gam' was built under R version 4.4.3
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
fit <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) + s(perc.alumni, 2) +
  s(Expend, 2) + s(Grad.Rate, 2), data = College[train, ])
summary(fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 2) + s(PhD, 
##     2) + s(perc.alumni, 2) + s(Expend, 2) + s(Grad.Rate, 2), 
##     data = College[train, ])
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7112.59 -1188.98    33.13  1238.54  8738.65 
## 
## (Dispersion Parameter for gaussian family taken to be 3577008)
## 
##     Null Deviance: 8471793308 on 517 degrees of freedom
## Residual Deviance: 1809966249 on 506.0001 degrees of freedom
## AIC: 9300.518 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 2327235738 2327235738 650.610 < 2.2e-16 ***
## s(Room.Board, 2)    1 1741918029 1741918029 486.976 < 2.2e-16 ***
## s(PhD, 2)           1  668408518  668408518 186.863 < 2.2e-16 ***
## s(perc.alumni, 2)   1  387819183  387819183 108.420 < 2.2e-16 ***
## s(Expend, 2)        1  625813340  625813340 174.954 < 2.2e-16 ***
## s(Grad.Rate, 2)     1  111881207  111881207  31.278 3.664e-08 ***
## Residuals         506 1809966249    3577008                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df Npar F     Pr(F)    
## (Intercept)                                   
## Private                                       
## s(Room.Board, 2)        1  2.224   0.13648    
## s(PhD, 2)               1  5.773   0.01664 *  
## s(perc.alumni, 2)       1  0.365   0.54581    
## s(Expend, 2)            1 61.182 3.042e-14 ***
## s(Grad.Rate, 2)         1  4.126   0.04274 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Inference: all six predictors have highly significant parametric (linear) effects, and the nonparametric ANOVA points to a strongly non-linear effect of Expend, with weaker evidence of non-linearity for PhD and Grad.Rate.

# Set up layout for multiple plots
par(mfrow = c(2, 3))  # 2 rows, 3 columns of plots

# Plot the smooth terms
plot(fit, se = TRUE, col = "blue")

Inference: the plots show each fitted smooth term with standard-error bands. Consistent with the ANOVA table, the curve for Expend departs most clearly from a straight line, while the remaining smooths show at most mild curvature.

  3. Evaluate the model obtained on the test set, and explain the results obtained.
pred <- predict(fit, College[!train, ])
err_gam <- mean((College$Outstate[!train] - pred)^2)
plot(err, ylim = c(min(err_gam, err), max(err)), type = "b", xlab = "Number of predictors", ylab = "Test MSE")
abline(h = err_gam, col = "red", lty = 2)

# test-set R-squared
1 - err_gam / mean((College$Outstate[!train] - mean(College$Outstate[!train]))^2)
## [1] 0.7655905

The GAM attains a test R-squared of about 0.77, i.e. it explains roughly 77% of the variance in out-of-state tuition on the held-out data. The red line in the plot above marks the GAM's test MSE for comparison with the forward-stepwise models of each size.
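For additional context, it can help to compare the GAM's test MSE with an ordinary least-squares model that uses the same six predictors; lm_fit and err_lm below are names introduced for this sketch:

# Test MSE of a plain linear model with the same six predictors
lm_fit <- lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend + Grad.Rate,
  data = College[train, ])
err_lm <- mean((College$Outstate[!train] - predict(lm_fit, College[!train, ]))^2)
c(gam = err_gam, linear = err_lm)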

  4. For which variables, if any, is there evidence of a non-linear relationship with the response?
summary(fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 2) + s(PhD, 
##     2) + s(perc.alumni, 2) + s(Expend, 2) + s(Grad.Rate, 2), 
##     data = College[train, ])
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7112.59 -1188.98    33.13  1238.54  8738.65 
## 
## (Dispersion Parameter for gaussian family taken to be 3577008)
## 
##     Null Deviance: 8471793308 on 517 degrees of freedom
## Residual Deviance: 1809966249 on 506.0001 degrees of freedom
## AIC: 9300.518 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 2327235738 2327235738 650.610 < 2.2e-16 ***
## s(Room.Board, 2)    1 1741918029 1741918029 486.976 < 2.2e-16 ***
## s(PhD, 2)           1  668408518  668408518 186.863 < 2.2e-16 ***
## s(perc.alumni, 2)   1  387819183  387819183 108.420 < 2.2e-16 ***
## s(Expend, 2)        1  625813340  625813340 174.954 < 2.2e-16 ***
## s(Grad.Rate, 2)     1  111881207  111881207  31.278 3.664e-08 ***
## Residuals         506 1809966249    3577008                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df Npar F     Pr(F)    
## (Intercept)                                   
## Private                                       
## s(Room.Board, 2)        1  2.224   0.13648    
## s(PhD, 2)               1  5.773   0.01664 *  
## s(perc.alumni, 2)       1  0.365   0.54581    
## s(Expend, 2)            1 61.182 3.042e-14 ***
## s(Grad.Rate, 2)         1  4.126   0.04274 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Significant Variables:

- Private, Room.Board, PhD, perc.alumni, Expend, Grad.Rate all have significant effects on Outstate, but the significance varies between parametric (linear) and nonparametric (non-linear) effects.

Non-linear Effects:

- Expend shows the strongest non-linear effect, and Grad.Rate also has a significant but weaker non-linear relationship.

- Room.Board and perc.alumni have significant linear (parametric) effects but no significant non-linear component, which suggests that linear terms are sufficient for these variables.

- PhD has both a significant linear effect and a modest but significant non-linear component (p ≈ 0.017), so a smooth term appears beneficial.
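A more formal check for Expend is to compare the fitted GAM with one in which Expend enters linearly, all other terms unchanged, via an F-test between the nested models; gam_lin is a name introduced for this sketch:

# GAM with a linear Expend term, compared against the smooth-term fit
gam_lin <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) +
  s(perc.alumni, 2) + Expend + s(Grad.Rate, 2), data = College[train, ])
anova(gam_lin, fit, test = "F")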