Chapter 7: 6, 10

6.

a)

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.5     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(boot)
library(ISLR)

data("Wage")
plot(Wage$age, Wage$wage, xlab="age", ylab="wage")

set.seed(2021)
nums<- rep(0,10)

for (i in 1:10) {
    glm.fit <- glm(wage ~ poly(age, i), data = Wage)
    nums[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}

plot(1:10, nums, 
     xlab= "poly degree",
     ylab= "error",
     pch =19,
     type= "b")

b<- glm(wage ~ poly(age, 4), data = Wage)
plot(Wage$age, Wage$wage)

a_range <- range(Wage$age)

a_pred <- seq(from = a_range[1], to = a_range[2], length.out = 100)
w_pred <- predict(b, newdata = list(age = a_pred))
lines(a_pred, w_pred, col = "green")

Anything beyond a 3rd or 4th degree polynomial isn’t suitable for the data based on the ANOVA.

b0 <- lm(wage ~ 1, data = Wage)
b1 <- lm(wage ~ poly(age, 1), data = Wage)
b2 <- lm(wage ~ poly(age, 2), data = Wage)
b3 <- lm(wage ~ poly(age, 3), data = Wage)
b4 <- lm(wage ~ poly(age, 4), data = Wage)
b5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(b0, b1, b2, b3, b4, b5)
## Analysis of Variance Table
## 
## Model 1: wage ~ 1
## Model 2: wage ~ poly(age, 1)
## Model 3: wage ~ poly(age, 2)
## Model 4: wage ~ poly(age, 3)
## Model 5: wage ~ poly(age, 4)
## Model 6: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2999 5222086                                    
## 2   2998 5022216  1    199870 125.4443 < 2.2e-16 ***
## 3   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 4   2996 4777674  1     15756   9.8888  0.001679 ** 
## 5   2995 4771604  1      6070   3.8098  0.051046 .  
## 6   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

b)

cvs <- rep(0, 10)
for (i in 2:10) {
    Wage$age.cut <- cut(Wage$age, i)
    fit <- glm(wage ~ age.cut, data = Wage)
    cvs[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
plot(2:10, cvs[-1], xlab = "cuts", ylab = "error", type = "l")

plot(wage ~ age, data = Wage, col = "black")
age_range <- range(Wage$age)
age.grid <- seq(from = age_range[1], to = age_range[2])
d <- glm(wage ~ cut(age, 8), data = Wage)
preds <- predict(d, data.frame(age = age.grid))
lines(age.grid, preds, col = "green", lwd = 2)

10.

a) Vars = 6 seems to be an appropriate size.

data("College")
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(gam)
## Loading required package: splines
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.20
library(leaps)
set.seed(2021)
index <- sample(seq_len(nrow(College)), size = nrow(College)*0.7)
train<- College[index,]
test<- College[-index,]

d_forward <- regsubsets(Outstate ~ ., data = train, method = "forward")
d_forward.summary <- summary(d_forward)

plot(d_forward.summary$cp, xlab = "# of vars", ylab = "cp", type = "l")

plot(d_forward.summary$bic, xlab = "# of vars", ylab = "BIC", type='l')

plot(d_forward.summary$adjr2, xlab = "# of vars", ylab = "adjusted R2", type = "l", ylim = c(0.4, 0.84))

e<- regsubsets(Outstate ~ ., data = College, method = "forward")
co<- coef(e, id = 6)
names(co)
## [1] "(Intercept)" "PrivateYes"  "Room.Board"  "PhD"         "perc.alumni"
## [6] "Expend"      "Grad.Rate"

b)

f<- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2), data=train)

par(mfrow = c(3,2))
plot(f, col = "green")

c) R^2 for this model is 0.80.

pred <- predict(f, test)
er <- mean((test$Outstate - pred)^2)
er
## [1] 3749831
tss <- mean((test$Outstate - mean(test$Outstate))^2)
rsq <- 1 - er / tss
rsq
## [1] 0.7589525

d) Expend has a significant output on the non-parametric anova.

summary(f)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, 
##     df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, 
##     df = 2), data = train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7261.27 -1085.40    50.93  1284.70  7331.61 
## 
## (Dispersion Parameter for gaussian family taken to be 3378765)
## 
##     Null Deviance: 8918804182 on 542 degrees of freedom
## Residual Deviance: 1783987786 on 528 degrees of freedom
## AIC: 9720.686 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                  1 2434529847 2434529847 720.538 < 2.2e-16 ***
## s(Room.Board, df = 2)    1 1833044509 1833044509 542.519 < 2.2e-16 ***
## s(PhD, df = 2)           1  519608348  519608348 153.786 < 2.2e-16 ***
## s(perc.alumni, df = 2)   1  410750820  410750820 121.568 < 2.2e-16 ***
## s(Expend, df = 5)        1  710656072  710656072 210.330 < 2.2e-16 ***
## s(Grad.Rate, df = 2)     1   86133168   86133168  25.492 6.124e-07 ***
## Residuals              528 1783987786    3378765                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df  Npar F  Pr(F)    
## (Intercept)                                      
## Private                                          
## s(Room.Board, df = 2)        1  1.3410 0.2474    
## s(PhD, df = 2)               1  1.5414 0.2150    
## s(perc.alumni, df = 2)       1  1.2645 0.2613    
## s(Expend, df = 5)            4 28.7223 <2e-16 ***
## s(Grad.Rate, df = 2)         1  1.6963 0.1933    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1