library(ISLR2)
library(boot)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
library(leaps)
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-2

6

a. Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

attach(Wage)
set.seed(99)
err <- rep(0,5)
for (i in 1:5){
  glm.fit <- glm(wage ~ poly(age,i),data=Wage)
  err[i]<- cv.glm(Wage,glm.fit,K=10)$delta[1]
}

plot(err, type="b", xlab="Degree", ylab="Test MSE")
points(which.min(err), err[4], col="red", pch=20, cex=2)

Through cross-validation, the degree with min error is shown to be the 4th degree. To compare, we now run the ANOVA test:

fit.1 <- lm(wage ~ age , data = Wage)
fit.2 <- lm(wage ~ poly(age , 2), data = Wage)
fit.3 <- lm(wage ~ poly(age , 3), data = Wage)
fit.4 <- lm(wage ~ poly(age , 4), data = Wage)
fit.5 <- lm(wage ~ poly(age , 5), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-values of the different degrees indicate that a quadratic or cubic degrees provide a good fit for the data.

fit <- lm(wage ~ poly(age , 3), data = Wage)
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims [2])
preds <- predict(fit , newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit ,preds$fit - 2 * preds$se.fit)

plot(age , wage , xlim = agelims , cex = .5, col = "darkgrey")
lines(age.grid, preds$fit , lwd = 2, col = "blue")
matlines(age.grid , se.bands, lwd = 1, col = "blue", lty = 3)

b. Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

set.seed(99)
err <- rep(NA, 10)
for(i in 2:10){
  Wage$age.cut <- cut(Wage$age,i)
  glm.fit <- glm(wage ~ age.cut, data=Wage)
  err[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]}

plot(2:10, err[-1], type="b")
points(which.min(err), err[which.min(err)], col="red", pch=20, cex=2)

The number of cuts indicated by cross-validation is 8, which we use to fit the regression:

step <- glm(wage ~ cut(age, 8), data=Wage)
preds <- predict(step, data.frame(age = age.grid))
plot(age, wage, col="black")
lines(age.grid, preds, col="red")

detach(Wage)
  1. This question relates to the College data set.

    attach(College)
  1. Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward step wise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
set.seed(99)
train <- createDataPartition(College$Outstate, p = 0.7, list = FALSE)
training <- College[train,]
testing <- College[-train,]

fit_fwd <- regsubsets(Outstate ~ ., data= training, nvmax=17, method="forward")
summary <- summary(fit_fwd)

par(mfrow = c(1, 3))

plot(summary$cp, type = "l")
min_cp <- min(summary$cp)
sd_cp <- sd(summary$cp)

plot(summary$bic, type = "l")
min_bic <- min(summary$bic)
sd_bic <- sd(summary$bic)

plot(summary$adjr2, type = "l", ylim = c(0.45,0.8))

max_adj = max(summary$adjr2)
sd_adj = sd(summary$adjr2)
which.min(summary$cp)
## [1] 13
which.min(summary$bic)
## [1] 12
which.max(summary$adjr2)
## [1] 13

The best subsets include around 13 and 12 predictors, however in the BIC plot we can observe that a local minimum is achieved around the model with 6 predictors. For CP and R2, beyond 6 predictors there is not a substantial improvement for the models, therefore we can choose a model with 6 predictors, with coefficients:

coef(fit_fwd, id=6) 
##   (Intercept)    PrivateYes    Room.Board      Terminal   perc.alumni 
## -4184.6616355  2683.7986714     1.0021114    42.7394905    50.2026915 
##        Expend     Grad.Rate 
##     0.2017187    26.6202852

b. Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

gam_train <- gam(Outstate ~ Private+s(Expend,df= 3)+s(Room.Board,df=3)+s(Terminal,df=3)+s(perc.alumni,df=3)+s(Grad.Rate,df=3), data=training)

par(mfrow = c(2, 3))
plot(gam_train, se = T, col = "red")

There appears to be a somewhat linear behavior for Room.Board and perc.alumni.

c. Evaluate the model obtained on the test set, and explain the results obtained.

pred <- predict(gam_train, testing)
err <-  mean((testing$Outstate - pred)^2)
err
## [1] 3377651
tss <- mean((testing$Outstate - mean(testing$Outstate))^2)
rss <-  1 - err / tss
rss
## [1] 0.7871247

For this model there is an associated R squared value of 0.79.

d. For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gam_train)
## 
## Call: gam(formula = Outstate ~ Private + s(Expend, df = 3) + s(Room.Board, 
##     df = 3) + s(Terminal, df = 3) + s(perc.alumni, df = 3) + 
##     s(Grad.Rate, df = 3), data = training)
## Deviance Residuals:
##    Min     1Q Median     3Q    Max 
##  -7262  -1111     95   1350   5089 
## 
## (Dispersion Parameter for gaussian family taken to be 3589533)
## 
##     Null Deviance: 8892099442 on 545 degrees of freedom
## Residual Deviance: 1898862849 on 529 degrees of freedom
## AIC: 9809.279 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                  1 2192049040 2192049040 610.678 < 2.2e-16 ***
## s(Expend, df = 3)        1 2827261955 2827261955 787.641 < 2.2e-16 ***
## s(Room.Board, df = 3)    1  475244621  475244621 132.397 < 2.2e-16 ***
## s(Terminal, df = 3)      1  110352804  110352804  30.743 4.657e-08 ***
## s(perc.alumni, df = 3)   1  148971129  148971129  41.502 2.651e-10 ***
## s(Grad.Rate, df = 3)     1   87736351   87736351  24.442 1.030e-06 ***
## Residuals              529 1898862849    3589533                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df Npar F   Pr(F)    
## (Intercept)                                      
## Private                                          
## s(Expend, df = 3)            2 43.409 < 2e-16 ***
## s(Room.Board, df = 3)        2  3.567 0.02892 *  
## s(Terminal, df = 3)          2  2.069 0.12737    
## s(perc.alumni, df = 3)       2  1.258 0.28512    
## s(Grad.Rate, df = 3)         2  2.533 0.08034 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ANOVA’s nonparametric p-value indicates that there is a non-linear relationship between Outstate and Expend (2e-16) and Outstate and Room Board (0.0289) in a lesser degree.

detach(College)