1. In this exercise, you will further analyze the Wage data set considered throughout this chapter.
library(ISLR)
library(boot)
data("Wage")
  1. Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
delta_z <- rep(NA, 10)
for (i in 1:10) {
  LMfit1 <- glm(wage ~ poly(age, i), data = Wage)
  delta_z[i] <- cv.glm(Wage, LMfit1, K = 10)$delta[1]  # 10-fold CV estimate of test MSE
}

delta_z
##  [1] 1675.557 1601.395 1595.659 1595.339 1596.171 1595.740 1594.250 1596.182
##  [9] 1591.607 1593.251
The CV error is smallest at degree 9, but the curve is essentially flat beyond degree 3 or 4, so the extra flexibility buys almost nothing.

plot(1:10, delta_z, xlab = "Degree", ylab = "Test Error", type = "l")
degree.min <- which.min(delta_z)  # degree with the lowest CV error
points(degree.min, delta_z[degree.min], col = "blue", lwd = 2, pch = 20)

fit1 <- lm(wage ~ age, data = Wage)
fit2 <- lm(wage ~ poly(age, 2), data = Wage)
fit3 <- lm(wage ~ poly(age, 3), data = Wage)
fit4 <- lm(wage ~ poly(age, 4), data = Wage)
fit5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA points the same way: the quadratic and cubic terms are highly significant, the quartic is borderline (p ≈ 0.05), and the quintic adds nothing. A cubic fit is therefore a reasonable choice, and is plotted below.

plot(wage ~ age, data = Wage, col = "darkgrey")
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
LMfit3 <- lm(wage ~ poly(age, 3), data = Wage)
preds <- predict(LMfit3, newdata = list(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)
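As an optional extra (not part of the original answer), approximate 95% confidence bands can be overlaid, following the chapter's lab:

preds.se <- predict(LMfit3, newdata = list(age = age.grid), se.fit = TRUE)
matlines(age.grid,
         cbind(preds.se$fit + 2 * preds.se$se.fit,
               preds.se$fit - 2 * preds.se$se.fit),
         col = "red", lty = 3)  # dashed +/- 2 SE bands around the cubic fit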

  1. Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

Cross-validation selects eight cuts as optimal (in this run).

cv_errors <- rep(NA, 10)
for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)  # split age into i intervals
  LMfit4 <- glm(wage ~ age.cut, data = Wage)
  cv_errors[i] <- cv.glm(Wage, LMfit4, K = 10)$delta[1]  # same CV estimate as in (a)
}
plot(2:10, cv_errors[-1], xlab = "Number of cuts", ylab = "Test error", type = "l", pch = 20, lwd = 2)
cuts.min <- which.min(cv_errors)  # number of cuts with the lowest CV error
points(cuts.min, cv_errors[cuts.min], col = "blue", lwd = 2, pch = 20)

LMfit5 <- glm(wage ~ cut(age, 8), data = Wage)
plot(wage ~ age, data = Wage, col = "darkgrey")
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds2 <- predict(LMfit5, newdata = list(age = age.grid))
lines(age.grid, preds2, col = "red", lwd = 2)
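One caveat (an observation added here, not in the original): cut() inside a formula re-derives its break points from whatever data predict() receives, so this appears to work above only because age.grid spans the same range as Wage$age. The intervals actually used can be inspected directly:

table(cut(Wage$age, 8))  # observation counts per interval for the 8-cut step function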

  1. This question relates to the College data set.
data("College")
library(leaps)
  1. Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
train <- sample(1:nrow(College), nrow(College) / 2)  # random half for training; no seed is set, so exact numbers vary between runs
test <- -train
College.train <- College[train, ]
College.test <- College[test, ]
LMfit6 <- regsubsets(Outstate ~ ., data = College.train, method = "forward")
fit.summary <- summary(LMfit6)
par(mfrow = c(2, 2))
plot(fit.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "b")
points(which.min(fit.summary$cp), fit.summary$cp[which.min(fit.summary$cp)], col = "red", cex = 2, pch = 20)
plot(fit.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "b")
points(which.min(fit.summary$bic), fit.summary$bic[which.min(fit.summary$bic)], col = "red", cex = 2, pch = 20)

A six-variable model offers a good trade-off between fit and model size, so its coefficients are extracted:

coef(LMfit6, id = 6)
##   (Intercept)    PrivateYes    Room.Board      Terminal   perc.alumni 
## -4429.2119297  2718.5469105     0.9975076    34.0583542    37.5178405 
##        Expend     Grad.Rate 
##     0.2043631    46.0407806
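regsubsets() has no predict() method, so test-set errors for each model size cannot be computed directly. A small helper in the style of the ISLR lab (an addition, not part of the original answer) makes this possible and gives a validation-based check on the chosen size:

predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])   # recover the model formula
  mat <- model.matrix(form, newdata)     # design matrix for the new data
  coefi <- coef(object, id = id)         # coefficients of the id-variable model
  mat[, names(coefi)] %*% coefi
}

val.errors <- rep(NA, 8)
for (i in 1:8) {
  pred <- predict(LMfit6, College.test, id = i)
  val.errors[i] <- mean((College.test$Outstate - pred)^2)
}
which.min(val.errors)  # model size with the lowest validation error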
  1. Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.16.1
GAMmod <- gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) +
                s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
              data = College.train)
par(mfrow = c(2, 3))
plot(GAMmod, se = TRUE, col = "blue")

Holding the other predictors fixed, tuition rises fairly steadily with Room.Board, perc.alumni, and Grad.Rate, and private schools charge markedly more. The fitted curve for Expend is clearly non-linear: tuition climbs steeply with instructional expenditure at first and then flattens out at high expenditure levels.

  1. Evaluate the model obtained on the test set, and explain the results obtained.

The test R² is about 0.78 (0.7785 below): the GAM explains roughly 78% of the variance in out-of-state tuition on held-out data, with a test MSE of about 3.56 million.

predgam <- predict(GAMmod, College.test)
gam.err <- mean((College.test$Outstate - predgam)^2)  # test MSE
gam.err
## [1] 3555382
gam.tss <- mean((College.test$Outstate - mean(College.test$Outstate))^2)  # test-set variance of the response
gam.tss
## [1] 16053612
gam.r2 <- 1 - gam.err / gam.tss  # test R-squared
gam.r2
## [1] 0.7785307
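For context (this comparison is an addition, not part of the original answer), the same six predictors can be fit by ordinary least squares and scored on the same test set; a lower test MSE for the GAM would indicate that the smooth terms improve predictive accuracy:

LMcomp <- lm(Outstate ~ Private + Room.Board + Terminal + perc.alumni +
               Expend + Grad.Rate, data = College.train)
lm.err <- mean((College.test$Outstate - predict(LMcomp, College.test))^2)
lm.err  # compare with gam.err above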
  1. For which variables, if any, is there evidence of a non-linear relationship with the response?

The "Anova for Nonparametric Effects" table (the parametric table only tests the linear components) shows strong evidence of a non-linear relationship between Outstate and Expend (p < 1e-12), modest evidence for Terminal and Grad.Rate (p ≈ 0.01 and 0.03), marginal evidence for Room.Board (p ≈ 0.10), and none for perc.alumni.

summary(GAMmod)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 
##     5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), 
##     data = College.train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -6825.03 -1092.17   -50.86  1153.66  7805.79 
## 
## (Dispersion Parameter for gaussian family taken to be 3588585)
## 
##     Null Deviance: 6309716682 on 387 degrees of freedom
## Residual Deviance: 1295477607 on 360.9995 degrees of freedom
## AIC: 6985.3 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 1632490381 1632490381 454.912 < 2.2e-16 ***
## s(Room.Board, 5)    1 1207287961 1207287961 336.425 < 2.2e-16 ***
## s(Terminal, 5)      1  294916787  294916787  82.182 < 2.2e-16 ***
## s(perc.alumni, 5)   1  220220352  220220352  61.367 5.346e-14 ***
## s(Expend, 5)        1  481145035  481145035 134.077 < 2.2e-16 ***
## s(Grad.Rate, 5)     1  176809984  176809984  49.270 1.111e-11 ***
## Residuals         361 1295477607    3588585                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## Private                                        
## s(Room.Board, 5)        4  1.9899   0.09550 .  
## s(Terminal, 5)          4  3.2012   0.01330 *  
## s(perc.alumni, 5)       4  0.9403   0.44059    
## s(Expend, 5)            4 17.1480 6.877e-13 ***
## s(Grad.Rate, 5)         4  2.8091   0.02551 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
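A natural follow-up (an addition, not in the original answer) is to refit the GAM with linear terms for the variables that show little evidence of non-linearity and test whether their smooths are jointly needed, as in the chapter's lab:

GAMlin <- gam(Outstate ~ Private + Room.Board + perc.alumni + s(Terminal, 5) +
                s(Expend, 5) + s(Grad.Rate, 5), data = College.train)
anova(GAMlin, GAMmod, test = "F")  # do smooths for Room.Board and perc.alumni help?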