Assignment 6

Rudy Martinez

7/30/2021

Libraries

library(ISLR)
library(tidyverse)
library(data.table)
library(leaps)
library(glmnet)
library(boot)
library(gam)

Exercises

Exercise 6

In this exercise, you will further analyze the `Wage` data set considered throughout this chapter.

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

# Select the polynomial degree for wage ~ age by 10-fold cross-validation.
# `Wage` is not attach()ed: every model call below receives it explicitly
# through `data = Wage`, so nothing needs its columns on the search path.
set.seed(1)

# all.deltas[d] will hold the CV error of the degree-d polynomial fit.
all.deltas <- rep(NA_real_, 10)

for (d in seq_along(all.deltas)) {
  glm.fit <- glm(wage ~ poly(age, d), data = Wage)
  # cv.glm() returns two estimates in `delta`; the second one (the adjusted
  # cross-validation estimate) is the one kept here.
  all.deltas[d] <- cv.glm(Wage, glm.fit, K = 10)$delta[2]
}

# Degree with the smallest cross-validated error.
min_error <- which.min(all.deltas)
phrase <- " = Lowest Error (degree at which we will be performing the best polynomial regression)"

paste(min_error, phrase)

## [1] "9  = Lowest Error (degree at which we will be performing the best polynomial regression)"

# Smallest CV error and the spread of the CV errors across the ten degrees.
min.point <- min(all.deltas)
sd.points <- sd(all.deltas)

# CV error as a function of the polynomial degree.
plot(
  x = 1:10,
  y = all.deltas,
  type = "l",
  pch = 20,
  lwd = 2,
  xlab = "Degree",
  ylab = "CV error",
  ylim = c(1590, 1700)
)

# Nested polynomial fits of wage on age, degrees 1 through 10. Each model is
# kept as its own object so that anova() labels the rows with the literal
# formula of every fit.
fit.1  <- lm(wage ~ poly(age, 1),  data = Wage)
fit.2  <- lm(wage ~ poly(age, 2),  data = Wage)
fit.3  <- lm(wage ~ poly(age, 3),  data = Wage)
fit.4  <- lm(wage ~ poly(age, 4),  data = Wage)
fit.5  <- lm(wage ~ poly(age, 5),  data = Wage)
fit.6  <- lm(wage ~ poly(age, 6),  data = Wage)
fit.7  <- lm(wage ~ poly(age, 7),  data = Wage)
fit.8  <- lm(wage ~ poly(age, 8),  data = Wage)
fit.9  <- lm(wage ~ poly(age, 9),  data = Wage)
fit.10 <- lm(wage ~ poly(age, 10), data = Wage)

# Sequential F-tests: each row compares a model with the one of the next
# lower degree.
anova(
  fit.1, fit.2, fit.3, fit.4, fit.5,
  fit.6, fit.7, fit.8, fit.9, fit.10
)

## Analysis of Variance Table
## 
## Model  1: wage ~ poly(age, 1)
## Model  2: wage ~ poly(age, 2)
## Model  3: wage ~ poly(age, 3)
## Model  4: wage ~ poly(age, 4)
## Model  5: wage ~ poly(age, 5)
## Model  6: wage ~ poly(age, 6)
## Model  7: wage ~ poly(age, 7)
## Model  8: wage ~ poly(age, 8)
## Model  9: wage ~ poly(age, 9)
## Model 10: wage ~ poly(age, 10)
##    Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1    2998 5022216                                    
## 2    2997 4793430  1    228786 143.7638 < 2.2e-16 ***
## 3    2996 4777674  1     15756   9.9005  0.001669 ** 
## 4    2995 4771604  1      6070   3.8143  0.050909 .  
## 5    2994 4770322  1      1283   0.8059  0.369398    
## 6    2993 4766389  1      3932   2.4709  0.116074    
## 7    2992 4763834  1      2555   1.6057  0.205199    
## 8    2991 4763707  1       127   0.0796  0.777865    
## 9    2990 4756703  1      7004   4.4014  0.035994 *  
## 10   2989 4756701  1         3   0.0017  0.967529    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Scatter plot of the raw data; the fitted curves are drawn on top of it.
plot(wage ~ age, data = Wage, col = "darkgrey")

# Unit-step grid spanning the observed age range, used for prediction.
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])

# Cubic and quartic polynomial fits.
lm.fitd3 <- lm(wage ~ poly(age, 3), data = Wage)
lm.fitd4 <- lm(wage ~ poly(age, 4), data = Wage)

lm.predd3 <- predict(lm.fitd3, data.frame(age = age.grid))
lm.predd4 <- predict(lm.fitd4, data.frame(age = age.grid))
lines(age.grid, lm.predd3, col = "blue", lwd = 2)
lines(age.grid, lm.predd4, col = "red", lwd = 2)

# Identify the two curves; without a legend the colours are unexplained.
legend(
  "topright",
  legend = c("Degree 3", "Degree 4"),
  col = c("blue", "red"),
  lwd = 2,
  bty = "n"
)

Cross-validation selects degree 9, but the anova() comparison does not support such a complex model: the quadratic and cubic terms are clearly significant (p < 0.01), the quartic term is borderline (p ≈ 0.051), and of the higher-order terms only degree 9 reaches marginal significance (p ≈ 0.036). We should therefore prefer a degree 3 or degree 4 polynomial over degree 9.
Degree 9 adds little, if any, insight and only complicates the model.

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

# cv1[k] holds the 10-fold CV error of a step function with k age bins.
# No model is fitted for a single bin, so cv1[1] stays NA.
cv1 <- rep(NA_real_, 10)

for (n_bins in 2:10) {
  # The binned age is stored on Wage itself so that cv.glm() finds it when
  # it refits the model on the data passed to it.
  Wage$age.cut <- cut(Wage$age, n_bins)
  lm.fit <- glm(wage ~ age.cut, data = Wage)
  cv1[n_bins] <- cv.glm(Wage, lm.fit, K = 10)$delta[2]
}

# Number of cuts with the smallest CV error (which.min() skips the NA held
# in cv1[1]).
min_error_step <- which.min(cv1)
# The message describes a step function: this value is a number of cuts,
# not a polynomial degree.
phrase_2 <- " = Lowest Error (number of cuts at which we will be fitting the best step function)"
paste(min_error_step, phrase_2)

## [1] "8  = Lowest Error (number of cuts at which we will be fitting the best step function)"

# CV error against the number of cuts; cv1[-1] drops the unused first slot.
plot(
  x = 2:10,
  y = cv1[-1],
  type = "l",
  pch = 20,
  lwd = 2,
  xlab = "Number of cuts",
  ylab = "CV error"
)

# Step function with the selected number of cuts, fitted on the full data.
fit <- glm(wage ~ cut(age, min_error_step), data = Wage)

# Raw data with the fitted step function overlaid. The predictions are taken
# on age.grid, so cut() is re-applied to the grid values.
plot(wage ~ age, data = Wage, col = "grey")
preds <- predict(fit, list(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)

Exercise 10

This question relates to the `College` data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

# Split College in half and run forward stepwise selection on the training
# half. College is not attach()ed: the row count is taken from the data frame
# itself and every model call receives its data explicitly.
set.seed(1)

# Half of the rows, rounded down, are drawn for training; integer division
# makes the sample size a whole number. The remaining rows are addressed by
# negative indexing.
train <- sample(nrow(College), nrow(College) %/% 2)
test <- -train

College.train <- College[train, ]
College.test <- College[test, ]

# Forward stepwise selection over models with up to 17 variables.
fit <- regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")
fit.summary <- summary(fit)

# Three panels side by side: Cp, BIC and adjusted R^2 against model size.
# In each panel the dashed guides sit 0.2 standard deviations of the plotted
# criterion above and below its best value.
par(mfrow = c(1, 3), bg = "white")

# Panel 1: Cp (best = minimum)
min.cp <- min(fit.summary$cp)
std.cp <- sd(fit.summary$cp)
plot(fit.summary$cp, type = "l", xlab = "Number of variables", ylab = "Cp")
abline(h = min.cp + c(0.2, -0.2) * std.cp, col = "black", lty = 2)

# Panel 2: BIC (best = minimum)
min.bic <- min(fit.summary$bic)
std.bic <- sd(fit.summary$bic)
plot(fit.summary$bic, type = "l", xlab = "Number of variables", ylab = "BIC")
abline(h = min.bic + c(0.2, -0.2) * std.bic, col = "black", lty = 2)

# Panel 3: adjusted R^2 (best = maximum)
max.adjr2 <- max(fit.summary$adjr2)
std.adjr2 <- sd(fit.summary$adjr2)
plot(
  fit.summary$adjr2,
  type = "l",
  xlab = "Number of variables",
  ylab = "Adjusted R2",
  ylim = c(0.4, 0.84)
)
abline(h = max.adjr2 + c(0.2, -0.2) * std.adjr2, col = "black", lty = 2)

# Model size preferred by each criterion. Cp and BIC are minimised, whereas
# adjusted R^2 is maximised, so it needs which.max() rather than which.min().
cp_result <- which.min(fit.summary$cp)
bic_result <- which.min(fit.summary$bic)
adjr2_result <- which.max(fit.summary$adjr2)

# Labels used when printing the results.
cp <- "cp = "
bic <- "bic = "
adjr2 <- "adjr2 = "

paste(cp, cp_result)

## [1] "cp =  14"

paste(bic, bic_result)

## [1] "bic =  6"

paste(adjr2, adjr2_result)

## NOTE: output not regenerated after the which.max() correction. The earlier
## run printed "adjr2 =  1" because it used which.min(), i.e. it reported the
## model size with the lowest adjusted R^2. Re-run this chunk to obtain the
## size with the highest adjusted R^2.

# Coefficients of the model size selected by BIC. Using bic_result instead of
# a literal keeps this in step with the selection above (it was 6 in the
# recorded run).
co <- coef(fit, id = bic_result)
names(co)

## [1] "(Intercept)" "PrivateYes"  "Room.Board"  "Terminal"    "perc.alumni"
## [6] "Expend"      "Grad.Rate"

We select the 6-variable model because it has the lowest BIC.

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

# GAM on the training data: Private enters as a plain term, and each of the
# five numeric predictors enters through a smoothing spline with df = 2.
gam.fit <- gam(
  Outstate ~ Private + s(Room.Board, df = 2) + s(Terminal, df = 2) +
    s(perc.alumni, df = 2) + s(Expend, df = 2) + s(Grad.Rate, df = 2),
  data = College.train
)

# One panel per term on a 2 x 3 grid. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
par(mfrow = c(2, 3), bg = "white")
plot(gam.fit, se = TRUE, col = "red")

(c) Evaluate the model obtained on the test set, and explain the results obtained.

# Predictions of the GAM for the held-out colleges.
gam.pred <- predict(gam.fit, newdata = College.test)

# Test mean squared error.
gam.err <- with(College.test, mean((Outstate - gam.pred)^2))
gam.err

## [1] 3456745

# Mean squared deviation of the test response from its own mean, i.e. the
# "total" term of R^2 on the same per-observation scale as gam.err.
gam.tss <- with(College.test, mean((Outstate - mean(Outstate))^2))

# Despite its name, test.rss holds the test R^2: 1 - MSE / total variation.
test.rss <- 1 - gam.err / gam.tss
test.rss

## [1] 0.7584943

phrase_3 <- "We obtain the following test R^2 when using GAM with 6 predictors = "
paste(phrase_3, test.rss)

## [1] "We obtain the following test R^2 when using GAM with 6 predictors =  0.7584942641655"

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

# Summarise the fitted GAM. The printed output includes one ANOVA table for
# the parametric effects and one for the nonparametric effects of each term.
summary(gam.fit)

## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(Terminal, 
##     df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 2) + s(Grad.Rate, 
##     df = 2), data = College.train)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -6632.2 -1268.4  -125.7  1362.1  8676.0 
## 
## (Dispersion Parameter for gaussian family taken to be 3959960)
## 
##     Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1488945983 on 376.0003 degrees of freedom
## AIC: 7009.303 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                  1 1848374254 1848374254 466.766 < 2.2e-16 ***
## s(Room.Board, df = 2)    1 1732263048 1732263048 437.445 < 2.2e-16 ***
## s(Terminal, df = 2)      1  358063651  358063651  90.421 < 2.2e-16 ***
## s(perc.alumni, df = 2)   1  365964119  365964119  92.416 < 2.2e-16 ***
## s(Expend, df = 2)        1  470210508  470210508 118.741 < 2.2e-16 ***
## s(Grad.Rate, df = 2)     1   89293627   89293627  22.549 2.918e-06 ***
## Residuals              376 1488945983    3959960                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df Npar F     Pr(F)    
## (Intercept)                                        
## Private                                            
## s(Room.Board, df = 2)        1  1.737    0.1883    
## s(Terminal, df = 2)          1  0.718    0.3973    
## s(perc.alumni, df = 2)       1  0.310    0.5780    
## s(Expend, df = 2)            1 50.821 5.218e-12 ***
## s(Grad.Rate, df = 2)         1  0.900    0.3434    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

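The ANOVA for nonparametric effects shows strong evidence of a non-linear relationship between the response and Expend (p ≈ 5e-12). There is no such evidence for Room.Board, Terminal, perc.alumni or Grad.Rate, whose nonparametric terms are not significant.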

Libraries

Exercises

Exercise 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

Exercise 10

This question relates to the College data set.

In this exercise, you will further analyze the `Wage` data set considered throughout this chapter.

This question relates to the `College` data set.