Chapter 07 (page 297): 6, 10

Chapter 7 ex 6

set.seed(42)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
library(boot)
## Warning: package 'boot' was built under R version 4.0.5
attach(Wage)

agelims=range(age)

#Setting up models for CV
all.deltas = rep(NA, 5)
for (i in 1:5) {
  glm.fit = glm(wage~poly(age, i))
  all.deltas[i] = cv.glm(Wage, glm.fit, K=5)$delta[2]
}
plot(1:5, all.deltas, xlab="Degree", ylab="CV error", type="l", pch=20, lwd=2,ylim=c(1500, 1700))
min.point = min(all.deltas)
sd.points = sd(all.deltas)
abline(h=min.point + 0.2 * sd.points, col="darkblue", lty="dashed")
abline(h=min.point - 0.2 * sd.points, col="lightblue", lty="dashed")

#Doing fit with statistical tests to determine scale
fit=lm(wage~poly(age,5),data=Wage)
coef(summary(fit))
##                 Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1  447.06785 39.9160847  11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3  125.52169 39.9160847   3.1446392 1.679213e-03
## poly(age, 5)4  -77.91118 39.9160847  -1.9518743 5.104623e-02
## poly(age, 5)5  -35.81289 39.9160847  -0.8972045 3.696820e-01
#Reviewed model and decided to use polynomial of degree 3
fit2=lm(wage~poly(age,3),data=Wage)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit2, data.frame(age=age.grid))
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree-3 Polynomial")
lines(age.grid,preds,lwd=2,col="blue")

Lovaas answer ex 6a: From my CV chart I see that the line doesn’t go believe the 0.2 standard error until after 2 - we’ll need at least 2 degrees in this GLM model.

I fit the GLM model with 5 degrees and looked at the coefficient summary. The Pr(>|t|) was less than 0.05 (my assumed P value) for degrees 1, 2, and 3. It was near 0.05 for the forth degree but I will still leave that out for my chart.

I realize that the question technically asked for an ANOVA chart, so here ya go. note the p values are the same as in the summary of the model with 5 poly.

fit.1 = lm(wage~poly(age, 1), data=Wage)
fit.2 = lm(wage~poly(age, 2), data=Wage)
fit.3 = lm(wage~poly(age, 3), data=Wage)
fit.4 = lm(wage~poly(age, 4), data=Wage)
fit.5 = lm(wage~poly(age, 5), data=Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#CV for various cuts
all.cvs = rep(NA, 11)
for (i in 2:11) {
  Wage$age.cut = cut(Wage$age, i)
  lm.fit = glm(wage~age.cut, data=Wage)
  all.cvs[i] = cv.glm(Wage, lm.fit, K=11)$delta[2]
}

plot(2:11, all.cvs[-1], xlab="Number of cuts", ylab="CV error", type="l", pch=20, lwd=2)

# Decided to go with 8
table(cut(age,8))
## 
## (17.9,25.8] (25.8,33.5] (33.5,41.2]   (41.2,49]   (49,56.8] (56.8,64.5] 
##         231         519         671         728         503         276 
## (64.5,72.2] (72.2,80.1] 
##          54          18
fit.cut8=lm(wage∼cut (age ,8))
coef(summary(fit.cut8))
##                        Estimate Std. Error   t value      Pr(>|t|)
## (Intercept)            76.28175   2.629812 29.006542 3.110596e-163
## cut(age, 8)(25.8,33.5] 25.83329   3.161343  8.171618  4.440913e-16
## cut(age, 8)(33.5,41.2] 40.22568   3.049065 13.192791  1.136044e-38
## cut(age, 8)(41.2,49]   43.50112   3.018341 14.412262  1.406253e-45
## cut(age, 8)(49,56.8]   40.13583   3.176792 12.634076  1.098741e-35
## cut(age, 8)(56.8,64.5] 44.10243   3.564299 12.373380  2.481643e-34
## cut(age, 8)(64.5,72.2] 28.94825   6.041576  4.791505  1.736008e-06
## cut(age, 8)(72.2,80.1] 15.22418   9.781110  1.556488  1.196978e-01
#Predictions and plots
agelims=range(age) #IDK why I had to reinstate that
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit.cut8, data.frame(age=age.grid))
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("8-Cut Step Model")
lines(age.grid,preds,lwd=2,col="blue")

Lovaas answer ex 6b: The CV error is lowest around 8 or 11 cuts. It’s probably not best to use more than 10 cuts, so I’ll stick with 8. I’ll admit to stealing the code from the CV from the internet; it’s better than what I’ve found in the book.

Chapter 7 ex 10

This question relates to the College data set.

library(ISLR)
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.5
# ?College
sum(is.na(College$Apps)) # no need to fix data
## [1] 0
attach(College)
#Split data into test and train
train=sample(c(TRUE,FALSE), nrow(College),rep=TRUE)
test=(!train)
college.train = College[train,]
college.test = College[test,]

regfit.fwd=regsubsets(Outstate∼.,data = college.train, nvmax =19,method = "forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = college.train, nvmax = 19, 
##     method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  " "    " "    " "       "*"       " "        
## 9  ( 1 )  "*"        " "  " "    " "    " "       "*"       " "        
## 10  ( 1 ) "*"        " "  " "    " "    " "       "*"       " "        
## 11  ( 1 ) "*"        " "  " "    " "    " "       "*"       " "        
## 12  ( 1 ) "*"        " "  "*"    " "    " "       "*"       " "        
## 13  ( 1 ) "*"        "*"  "*"    " "    " "       "*"       " "        
## 14  ( 1 ) "*"        "*"  "*"    "*"    " "       "*"       " "        
## 15  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       " "        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 6  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 9  ( 1 )  " "         "*"        " "   "*"      " " "*"      "*"      
## 10  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 11  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 12  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 13  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 14  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 15  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 16  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    "*"      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"
reg.summary = summary(regfit.fwd)


par(mfrow = c(1, 3))
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
min.cp = min(reg.summary$cp)
std.cp = sd(reg.summary$cp)
abline(h = min.cp + 0.2 * std.cp, col = "darkgreen", lty = 2)
abline(h = min.cp - 0.2 * std.cp, col = "darkgreen", lty = 2)
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
min.bic = min(reg.summary$bic)
std.bic = sd(reg.summary$bic)
abline(h = min.bic + 0.2 * std.bic, col = "darkblue", lty = 2)
abline(h = min.bic - 0.2 * std.bic, col = "darkblue", lty = 2)
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted R2", 
    type = "l", ylim = c(0.4, 0.84))
max.adjr2 = max(reg.summary$adjr2)
std.adjr2 = sd(reg.summary$adjr2)
abline(h = max.adjr2 + 0.2 * std.adjr2, col = "purple", lty = 2)
abline(h = max.adjr2 - 0.2 * std.adjr2, col = "purple", lty = 2)

Lovaas answer ex 10a: Based on the summary of my forward stepwise selection, we can see a ranking of the variables. Using Mallow’s CP (far left in plot), we need at least around 7 variables. The lower the BIC (middle plot) is, the better, so I want 6-7 variables. The higher the r-squared the better - we may want to go with 6 or 7 variables, since the r-squared shoots up a bit around there. I’ll go with 7; there’s not much difference in CV or BIC and a small boost in r-squared.

reg.fit = regsubsets(Outstate ~ ., data = college.train, method = "forward")
coefi = coef(reg.fit, id = 7)
names(coefi)
## [1] "(Intercept)" "PrivateYes"  "Room.Board"  "Personal"    "Terminal"   
## [6] "perc.alumni" "Expend"      "Grad.Rate"

Lovaas answer ex 10a some more: The variable selected under forward stepwise selection are “Private,” “Room.Board,” “Personal,” “Terminal,” ’perc.alumni," “Expend,” “Grad.Rate.”

library(akima)
## Warning: package 'akima' was built under R version 4.0.5
library(gam)
## Warning: package 'gam' was built under R version 4.0.5
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.0.5
## Loaded gam 1.20
gam1=lm(Outstate~ Private + s(Room.Board, df = 2) + s(Personal, df = 3) + Terminal + perc.alumni + Expend + Grad.Rate,data=college.train)
par(mfrow=c(2,2))
plot(gam1, se=TRUE,col="blue")
## Warning in plot.window(...): "se" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "se" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "se" is not a
## graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "se" is not a
## graphical parameter
## Warning in box(...): "se" is not a graphical parameter
## Warning in title(...): "se" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "se" is not a graphical
## parameter
## Warning in plot.window(...): "se" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "se" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "se" is not a
## graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "se" is not a
## graphical parameter
## Warning in box(...): "se" is not a graphical parameter
## Warning in title(...): "se" is not a graphical parameter
## Warning in plot.window(...): "se" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "se" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "se" is not a
## graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "se" is not a
## graphical parameter
## Warning in box(...): "se" is not a graphical parameter
## Warning in title(...): "se" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "se" is not a graphical
## parameter
## Warning in plot.window(...): "se" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "se" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "se" is not a
## graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "se" is not a
## graphical parameter
## Warning in box(...): "se" is not a graphical parameter
## Warning in title(...): "se" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "se" is not a graphical
## parameter

Lovaas answer ex 10b: It’s weird, you can do a lot with GAM (throw some splines in there, for example), but the problem didn’t ask/tell me what to do. I would have been overwhelmed with the options if it had, selecting the knots in a spline or the degrees in a polynomial for each of my 7 variables would have been stressful.

Anyways, the residuals plot and normalized q-q plots look pretty good. There’s a bit of curvature towards the edges of the q-q plots but no worse than what I’ve seen before. The cook’ distance plot (bottom right) does list 3 or four outliers, but those could be removed.

college.pred = predict(gam1, college.test)
x = mean((college.test[,"Outstate"] - college.pred)^2)
x
## [1] 4387794
gam.tss = mean((college.test$Outstate - mean(college.test$Outstate))^2)
test.rss = 1 - x/gam.tss
test.rss
## [1] 0.7326108

Lovaas answer ex 10c: Test r-squared is 73%. I bet if I played around a bit with the imputs into the GAM model I could get a slightly better fit.

  1. For which variables, if any, is there evidence of a non-linear relationship with the response?
summary(gam1)
## 
## Call:
## lm(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(Personal, 
##     df = 3) + Terminal + perc.alumni + Expend + Grad.Rate, data = college.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7684.5 -1287.8  -101.9  1238.2  9066.2 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -2.924e+03  7.718e+02  -3.789 0.000176 ***
## PrivateYes             2.598e+03  2.805e+02   9.262  < 2e-16 ***
## s(Room.Board, df = 2)  1.048e+00  1.169e-01   8.971  < 2e-16 ***
## s(Personal, df = 3)   -4.193e-01  1.689e-01  -2.482 0.013480 *  
## Terminal               2.960e+01  8.768e+00   3.376 0.000812 ***
## perc.alumni            4.193e+01  9.861e+00   4.252 2.66e-05 ***
## Expend                 2.683e-01  2.599e-02  10.321  < 2e-16 ***
## Grad.Rate              2.457e+01  7.281e+00   3.375 0.000814 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1976 on 384 degrees of freedom
## Multiple R-squared:  0.7598, Adjusted R-squared:  0.7554 
## F-statistic: 173.5 on 7 and 384 DF,  p-value: < 2.2e-16
anova(gam1)
## Analysis of Variance Table
## 
## Response: Outstate
##                        Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                 1 1978109809 1978109809 506.627 < 2.2e-16 ***
## s(Room.Board, df = 2)   1 1499260751 1499260751 383.986 < 2.2e-16 ***
## s(Personal, df = 3)     1   75820656   75820656  19.419 1.364e-05 ***
## Terminal                1  441198603  441198603 112.998 < 2.2e-16 ***
## perc.alumni             1  275459911  275459911  70.550 8.791e-16 ***
## Expend                  1  427896942  427896942 109.591 < 2.2e-16 ***
## Grad.Rate               1   44471296   44471296  11.390 0.0008137 ***
## Residuals             384 1499316701    3904471                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Lovaas answer ex 10d: The p values for all intercepts are well below 0.05, so all could fit well under a linear model.