Problem 3

  1. Suppose we fit a curve with basis functions b1(X) = X, b2(X) = (X − 1)^2 I(X ≥ 1). (Note that I(X ≥ 1) equals 1 for X ≥ 1 and 0 otherwise.) We fit the linear regression model Y = β0 + β1 b1(X) + β2 b2(X) + ε, and obtain coefficient estimates β̂0 = 1, β̂1 = 1, β̂2 = −2. Sketch the estimated curve between X = −2 and X = 2. Note the intercepts, slopes, and other relevant information.
# For x < 1 the quadratic basis is zero, so y = 1 + x;
# for x >= 1, y = 1 + x - 2*(x - 1)^2
x <- seq(-2, 2, by = 0.01)
y <- 1 + x + -2 * (x - 1)^2 * I(x >= 1)
plot(x, y, type = "l", lwd = 2, col = "Purple")
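
For x < 1 the curve is the line y = 1 + x (intercept 1, slope 1). For x ≥ 1 it becomes y = 1 + x − 2(x − 1)², a concave quadratic that passes through (1, 2), peaks at (1.25, 2.125), and falls back to (2, 1). Both the curve and its slope are continuous at the knot x = 1, since (x − 1)² and its derivative vanish there.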

Problem 8

Fit some of the non-linear models investigated in this chapter to the Auto data set. Is there evidence for non-linear relationships in this data set? Create some informative plots to justify your answer.

library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.6.3
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
attach(Auto)
set.seed(42)
corrgram(Auto, upper.panel=panel.cor)

The correlogram shows that cylinders, displacement, horsepower, and weight are negatively associated with mpg, while year has a positive effect. We disregard origin here because it is a categorical variable.

fits <- list()
# cylinders has only five unique values, so at most a degree-4 polynomial
cyl_rss <- rep(NA, 4)
disp_rss <- rep(NA, 10)
hp_rss <- rep(NA, 10)
w_rss <- rep(NA, 10)
# Fit polynomials of increasing degree and record the RSS (deviance) of each
for (d in 1:4) {
    fits[[d]] <- lm(mpg ~ poly(cylinders, d), data = Auto)
    cyl_rss[d] <- deviance(fits[[d]])
}
cyl_anova <- anova(fits[[1]], fits[[2]], fits[[3]], fits[[4]])
for (d in 1:10) {
    fits[[d]] = lm(mpg ~ poly(displacement, d), data = Auto)
    disp_rss[d] = deviance(fits[[d]])
}
disp_anova <- anova(fits[[1]], fits[[2]], fits[[3]], fits[[4]], fits[[5]], fits[[6]], fits[[7]], fits[[8]], fits[[9]], fits[[10]])
for (d in 1:10) {
    fits[[d]] = lm(mpg ~ poly(horsepower, d), data = Auto)
    hp_rss[d] = deviance(fits[[d]])
}
hp_anova <- anova(fits[[1]], fits[[2]], fits[[3]], fits[[4]], fits[[5]], fits[[6]], fits[[7]], fits[[8]], fits[[9]], fits[[10]])
for (d in 1:10) {
    fits[[d]] = lm(mpg ~ poly(weight, d), data = Auto)
    w_rss[d] = deviance(fits[[d]])
}
w_anova <- anova(fits[[1]], fits[[2]], fits[[3]], fits[[4]], fits[[5]], fits[[6]], fits[[7]], fits[[8]], fits[[9]], fits[[10]])
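
The four loops above repeat the same pattern; as a compact alternative, a small helper (hypothetical name poly_rss, a sketch rather than part of the original analysis) would produce the same RSS vectors:

# Sketch: RSS of mpg ~ poly(var, d) for d = 1..max_deg
poly_rss <- function(var, max_deg) {
    sapply(1:max_deg, function(d)
        deviance(lm(mpg ~ poly(Auto[[var]], d), data = Auto)))
}
w_rss2 <- poly_rss("weight", 10)  # should match w_rss above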

The analysis of variance for horsepower shows a large gain at degree 2, with the degree-5 and degree-6 terms also significant at the 5% level, supporting a sixth-degree fit.

hp_anova
## Analysis of Variance Table
## 
## Model  1: mpg ~ poly(horsepower, d)
## Model  2: mpg ~ poly(horsepower, d)
## Model  3: mpg ~ poly(horsepower, d)
## Model  4: mpg ~ poly(horsepower, d)
## Model  5: mpg ~ poly(horsepower, d)
## Model  6: mpg ~ poly(horsepower, d)
## Model  7: mpg ~ poly(horsepower, d)
## Model  8: mpg ~ poly(horsepower, d)
## Model  9: mpg ~ poly(horsepower, d)
## Model 10: mpg ~ poly(horsepower, d)
##    Res.Df    RSS Df Sum of Sq        F    Pr(>F)    
## 1     390 9385.9                                    
## 2     389 7442.0  1   1943.89 104.9077 < 2.2e-16 ***
## 3     388 7426.4  1     15.59   0.8415  0.359535    
## 4     387 7399.5  1     26.91   1.4525  0.228881    
## 5     386 7223.4  1    176.15   9.5065  0.002197 ** 
## 6     385 7150.3  1     73.04   3.9417  0.047819 *  
## 7     384 7086.6  1     63.69   3.4372  0.064516 .  
## 8     383 7081.9  1      4.72   0.2548  0.614030    
## 9     382 7066.6  1     15.35   0.8285  0.363273    
## 10    381 7059.7  1      6.84   0.3689  0.543953    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit=lm(mpg~poly(horsepower,6),data=Auto)
coef(summary(fit))
##                         Estimate Std. Error    t value      Pr(>|t|)
## (Intercept)            23.445918  0.2176656 107.715325 1.449683e-289
## poly(horsepower, 6)1 -120.137744  4.3095585 -27.877042  2.164963e-94
## poly(horsepower, 6)2   44.089528  4.3095585  10.230637  6.845463e-22
## poly(horsepower, 6)3   -3.948849  4.3095585  -0.916300  3.600832e-01
## poly(horsepower, 6)4   -5.187810  4.3095585  -1.203792  2.294098e-01
## poly(horsepower, 6)5   13.272187  4.3095585   3.079709  2.220714e-03
## poly(horsepower, 6)6   -8.546238  4.3095585  -1.983089  4.806767e-02
fit2=lm(mpg~poly(horsepower,6,raw=T),data=Auto)
coef(summary(fit2))
##                                    Estimate   Std. Error   t value    Pr(>|t|)
## (Intercept)                   -1.621416e+02 7.142691e+01 -2.270035 0.023757615
## poly(horsepower, 6, raw = T)1  1.123809e+01 4.016610e+00  2.797903 0.005401987
## poly(horsepower, 6, raw = T)2 -2.436366e-01 8.969415e-02 -2.716304 0.006899198
## poly(horsepower, 6, raw = T)3  2.580141e-03 1.018801e-03  2.532528 0.011720819
## poly(horsepower, 6, raw = T)4 -1.452995e-05 6.218662e-06 -2.336507 0.019977094
## poly(horsepower, 6, raw = T)5  4.172832e-08 1.939327e-08  2.151691 0.032043069
## poly(horsepower, 6, raw = T)6 -4.803212e-11 2.422086e-11 -1.983089 0.048067671
hplims=range(horsepower)
hp.grid=seq(from=hplims[1],to=hplims[2])
preds=predict(fit,newdata=list(horsepower=hp.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
plot(horsepower,mpg,xlim=hplims,cex=.5,col="darkgrey")
lines(hp.grid,preds$fit,lwd=2,col="blue")
matlines(hp.grid,se.bands,lwd=1,col="blue",lty=3)
title("Degree-6 Polynomial")

### Cross Validation

Cross-validation is less decisive: which.min picks the ninth-degree polynomial, but the error is essentially flat from degree 6 onward, so the sixth-degree fit remains a reasonable choice.

library(boot)

cv.errs <- rep(NA, 15)
for (d in 1:15) {
    fit <-  glm(mpg ~ poly(horsepower, d), data = Auto)
    cv.errs[d] <-  cv.glm(Auto, fit, K = 10)$delta[2]
}
cv.errs
##  [1] 24.36098 19.30116 19.38650 19.17300 19.25729 18.74147 18.98289 19.03350
##  [9] 18.73875 19.27448 19.24204 19.10546 19.16816 35.12284 91.56307
which.min(cv.errs)
## [1] 9
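
Plotting the CV errors makes the flatness after degree 6 visible. A minimal sketch, omitting degrees 14–15 whose errors blow up:

plot(1:13, cv.errs[1:13], type = "b", pch = 19, xlab = "Degree", ylab = "CV error")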

### Step Function

cv.errs <- rep(NA, 10)
for (k in 2:10) {
    # cut horsepower into k equal-width intervals and fit a step function
    Auto$hp.cut <- cut(Auto$horsepower, k)
    fit <- glm(mpg ~ hp.cut, data = Auto)
    cv.errs[k] <- cv.glm(Auto, fit, K = 10)$delta[2]
}
cv.errs
##  [1]       NA 37.99124 33.53460 25.01922 22.35480 21.89851 21.23304 19.85376
##  [9] 20.95053 21.06961
which.min(cv.errs)
## [1] 8
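
Cross-validation selects eight intervals. As a visual check, here is a minimal sketch of that step-function fit; it relies on cut() producing identical breakpoints for hp.grid because the grid spans the same range as horsepower:

Auto$hp.cut <- cut(Auto$horsepower, 8)
fit.step <- lm(mpg ~ hp.cut, data = Auto)
plot(horsepower, mpg, cex = .5, col = "darkgrey")
lines(hp.grid, predict(fit.step, data.frame(hp.cut = cut(hp.grid, 8))), col = "red", lwd = 2)
title("Step Function, 8 Intervals")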

### Splines

library(splines)
# Knots at 25 and 40 lie below the minimum observed horsepower (46), which
# triggers the rank-deficient-fit warning below; quantile-based knots (see
# attr(..., "knots") further down) would avoid this.
fit=lm(mpg~bs(horsepower,knots=c(25,40,60)),data=Auto)
pred=predict(fit,newdata=list(horsepower=hp.grid),se=T)
## Warning in predict.lm(fit, newdata = list(horsepower = hp.grid), se = T):
## prediction from a rank-deficient fit may be misleading
plot(horsepower,mpg,col="gray")
lines(hp.grid,pred$fit,lwd=2, col="#de2d26")
lines(hp.grid,pred$fit+2*pred$se,lty="dashed")
lines(hp.grid,pred$fit-2*pred$se,lty="dashed")

dim(bs(horsepower,knots=c(25,40,60)))
## [1] 392   6
dim(bs(horsepower,df=6))
## [1] 392   6
attr(bs(horsepower,df=6),"knots")
##   25%   50%   75% 
##  75.0  93.5 126.0
# Natural Spline
fit2=lm(mpg~ns(horsepower,df=4),data=Auto)
pred2=predict(fit2,newdata=list(horsepower=hp.grid),se=T)
plot(horsepower,mpg,xlim=hplims,cex=.5,col="darkgrey")
lines(hp.grid, pred2$fit,col="red",lwd=2)

# Smooth Spline
fit=smooth.spline(horsepower,mpg,df=16)
fit2=smooth.spline(horsepower,mpg,cv=TRUE)
## Warning in smooth.spline(horsepower, mpg, cv = TRUE): cross-validation with non-
## unique 'x' values seems doubtful
fit2$df
## [1] 5.784761
plot(horsepower,mpg,xlim=hplims,cex=.5,col="darkgrey")
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","5.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)
title("Local Regression")

library(gam)
## Warning: package 'gam' was built under R version 3.6.3
## Loading required package: foreach
## Loaded gam 1.16.1
gam.m3=gam(mpg~s(cylinders,6)+s(horsepower,6)+ displacement, data=Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
par(mfrow=c(1,3))
plot(gam.m3, se=TRUE,col="blue")

gam.m1=gam(mpg~s(horsepower,5)+displacement,data=Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
gam.m2=gam(mpg~cylinders+s(horsepower,5)+displacement,data=Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
anova(gam.m1,gam.m2,gam.m3,test="F")
## Analysis of Deviance Table
## 
## Model 1: mpg ~ s(horsepower, 5) + displacement
## Model 2: mpg ~ cylinders + s(horsepower, 5) + displacement
## Model 3: mpg ~ s(cylinders, 6) + s(horsepower, 6) + displacement
##   Resid. Df Resid. Dev     Df Deviance      F    Pr(>F)    
## 1       385     6139.9                                     
## 2       384     6117.1 1.0000    22.75 1.4912 0.2227819    
## 3       380     5796.8 3.9999   320.31 5.2496 0.0003992 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
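
The ANOVA indicates that model 3, with a smooth term for cylinders, fits significantly better than the models that omit cylinders or include it linearly, supporting a non-linear effect of cylinders.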
preds=predict(gam.m2,newdata=Auto)
gam.lo=gam(mpg~s(cylinders,df=4)+lo(horsepower,span=0.7)+displacement,data=Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
plot.Gam(gam.lo, se=TRUE, col="green")

gam.lo.i=gam(mpg~lo(cylinders,horsepower,span=0.5)+displacement,data=Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
summary(gam.m3)
## 
## Call: gam(formula = mpg ~ s(cylinders, 6) + s(horsepower, 6) + displacement, 
##     data = Auto)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.3442  -2.1726  -0.3638   1.8283  17.5850 
## 
## (Dispersion Parameter for gaussian family taken to be 15.2547)
## 
##     Null Deviance: 23818.99 on 391 degrees of freedom
## Residual Deviance: 5796.801 on 380.0001 degrees of freedom
## AIC: 2194.417 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                   Df  Sum Sq Mean Sq F value Pr(>F)    
## s(cylinders, 6)    1 13803.2 13803.2  904.85 <2e-16 ***
## s(horsepower, 6)   1  1756.0  1756.0  115.11 <2e-16 ***
## displacement       1   239.0   239.0   15.67  9e-05 ***
## Residuals        380  5796.8    15.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                  Npar Df  Npar F     Pr(F)    
## (Intercept)                                   
## s(cylinders, 6)        3  8.9368 9.814e-06 ***
## s(horsepower, 6)       5 16.2123 1.688e-14 ***
## displacement                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
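
Both smooth terms in gam.m3 are highly significant in the nonparametric ANOVA, so the Auto data show clear evidence of non-linear relationships between mpg and horsepower, and between mpg and cylinders.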

Problem 9

This question uses the variables dis (the weighted mean of distances to five Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.

set.seed(42)
library(MASS)
attach(Boston)

(a) Use the poly() function to fit a cubic polynomial regression to predict nox using dis. Report the regression output, and plot the resulting data and polynomial fits.

fit_lm <-  lm(nox ~ poly(dis, 3), data = Boston)
summary(fit_lm)
## 
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.554695   0.002759 201.021  < 2e-16 ***
## poly(dis, 3)1 -2.003096   0.062071 -32.271  < 2e-16 ***
## poly(dis, 3)2  0.856330   0.062071  13.796  < 2e-16 ***
## poly(dis, 3)3 -0.318049   0.062071  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16

The summary output shows that every polynomial term of dis is a significant predictor of nox.

dislmt <- range(dis)
disgrd <- seq(from = dislmt[1], to = dislmt[2], by=0.1)
pred_lm <- predict(fit_lm, list(dis = disgrd))
plot(nox ~ dis, data = Boston, col = "gray")
lines(disgrd, pred_lm, col = "steelblue", lwd = 2)

The plot shows a smooth cubic curve that fits the data well.

(b) Plot the polynomial fits for a range of different polynomial degrees (say, from 1 to 10), and report the associated residual sum of squares.

db_rss <- rep(NA, 10)
for (i in 1:10) {
    fit_lm = lm(nox ~ poly(dis, i), data = Boston)
    db_rss[i] = sum(fit_lm$residuals^2)
}
db_rss
##  [1] 2.768563 2.035262 1.934107 1.932981 1.915290 1.878257 1.849484 1.835630
##  [9] 1.833331 1.832171
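
Part (b) also asks for the fits themselves. A minimal sketch overlaying three representative degrees on the scatterplot, reusing disgrd from part (a):

plot(nox ~ dis, data = Boston, col = "gray")
cols <- c("red", "steelblue", "darkgreen")
degs <- c(1, 4, 10)
for (j in 1:3) {
    fit_j <- lm(nox ~ poly(dis, degs[j]), data = Boston)
    lines(disgrd, predict(fit_j, list(dis = disgrd)), col = cols[j], lwd = 2)
}
legend("topright", legend = paste("Degree", degs), col = cols, lty = 1, lwd = 2)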

The training RSS decreases monotonically as the polynomial degree increases, as expected.

(c) Perform cross-validation or another approach to select the optimal degree for the polynomial, and explain your results.

I’ll use 10-fold cross-validation to select the best polynomial degree.

cv_err <- rep(NA, 10)
for (i in 1:10) {
    fit_lm <- glm(nox ~ poly(dis, i), data = Boston)
    cv_err[i] <- cv.glm(Boston, fit_lm, K = 10)$delta[2]
}
plot(1:10, cv_err, xlab = "Degree", ylab = "CV error", type = "l", lwd = 2)

which.min(cv_err)
## [1] 4

The CV error decreases sharply up to degree 3, is essentially flat at degree 4 (which which.min identifies as the minimum), rises over degrees 5–7, dips slightly at degree 8, and rises again after that. Since degree 3 achieves nearly the minimum error with the simplest model, I would choose the cubic polynomial.

(d) Use the bs() function to fit a regression spline to predict nox using dis. Report the output for the fit using four degrees of freedom. How did you choose the knots? Plot the resulting fit.

library(splines)
# When knots are supplied, bs() ignores the df argument: three interior knots
# plus a cubic basis give the six coefficients seen below.
fit_spl = lm(nox ~ bs(dis, df = 4, knots = c(3, 7, 11)), data = Boston)
summary(fit_spl)
## 
## Call:
## lm(formula = nox ~ bs(dis, df = 4, knots = c(3, 7, 11)), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.130710 -0.039850 -0.008357  0.027792  0.188518 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            0.714346   0.015846  45.081  < 2e-16 ***
## bs(dis, df = 4, knots = c(3, 7, 11))1 -0.006626   0.024307  -0.273   0.7853    
## bs(dis, df = 4, knots = c(3, 7, 11))2 -0.296980   0.018293 -16.234  < 2e-16 ***
## bs(dis, df = 4, knots = c(3, 7, 11))3 -0.222840   0.033763  -6.600 1.05e-10 ***
## bs(dis, df = 4, knots = c(3, 7, 11))4 -0.379811   0.042317  -8.975  < 2e-16 ***
## bs(dis, df = 4, knots = c(3, 7, 11))5 -0.222959   0.086870  -2.567   0.0106 *  
## bs(dis, df = 4, knots = c(3, 7, 11))6 -0.304346   0.063378  -4.802 2.08e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06137 on 499 degrees of freedom
## Multiple R-squared:  0.7229, Adjusted R-squared:  0.7196 
## F-statistic:   217 on 6 and 499 DF,  p-value: < 2.2e-16
pred_sp <-  predict(fit_spl, list(dis = disgrd))
plot(nox ~ dis, data = Boston, col = "darkgrey")
lines(disgrd, pred_sp, col = "steelblue", lwd = 3)

The summary shows that all but the first basis term are significant. The knots at dis = 3, 7, and 11 were chosen to spread roughly evenly across the observed range of dis (about 1.1 to 12.1). The plot confirms that, except at large values (dis > 10), the spline fits the data well.
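
For comparison, here is a sketch that honors the four-degrees-of-freedom request by letting bs() place its single interior knot automatically (at the median of dis):

fit_df4 <- lm(nox ~ bs(dis, df = 4), data = Boston)
attr(bs(Boston$dis, df = 4), "knots")  # one interior knot, at the median of dis
plot(nox ~ dis, data = Boston, col = "darkgrey")
lines(disgrd, predict(fit_df4, list(dis = disgrd)), col = "steelblue", lwd = 3)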

(e) Now fit a regression spline for a range of degrees of freedom, and plot the resulting fits and report the resulting RSS. Describe the results obtained.

rss <- rep(NA, 18)
for (i in 3:18) {
    fit_lm <- lm(nox ~ bs(dis, df = i), data = Boston)
    rss[i] <- sum(fit_lm$residuals^2)
}
rss[-c(1, 2)]
##  [1] 1.934107 1.922775 1.840173 1.833966 1.829884 1.816995 1.825653 1.792535
##  [9] 1.796992 1.788999 1.782350 1.781838 1.782798 1.783546 1.779789 1.775838

Apart from small upticks at df = 9, 11, 15, and 16, the training RSS decreases as the degrees of freedom increase.
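
The fits themselves can be compared visually. A minimal sketch overlaying three representative choices of df:

plot(nox ~ dis, data = Boston, col = "gray")
cols <- c("red", "steelblue", "darkgreen")
dfs <- c(3, 8, 16)
for (j in 1:3) {
    fit_j <- lm(nox ~ bs(dis, df = dfs[j]), data = Boston)
    lines(disgrd, predict(fit_j, list(dis = disgrd)), col = cols[j], lwd = 2)
}
legend("topright", legend = paste("df =", dfs), col = cols, lty = 1, lwd = 2)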

(f) Perform cross-validation or another approach in order to select the best degrees of freedom for a regression spline on this data. Describe your results.

cv_full <-  rep(NA, 18)
for (i in 3:18) {
    fit_lm <- glm(nox ~ bs(dis, df = i), data = Boston)
    cv_full[i] <- cv.glm(Boston, fit_lm, K = 10)$delta[2]
}

plot(3:18, cv_full[-c(1, 2)], col = "steelblue", lwd = 2, type = "l", xlab = "df", ylab = "CV error")

which.min(cv_full)
## [1] 10

Using which.min again, 10 degrees of freedom gives the lowest cross-validation error, so we select df = 10 for the regression spline.
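
As a final visual check, a sketch of the selected fit:

fit_best <- lm(nox ~ bs(dis, df = 10), data = Boston)
plot(nox ~ dis, data = Boston, col = "darkgrey")
lines(disgrd, predict(fit_best, list(dis = disgrd)), col = "steelblue", lwd = 3)
title("Regression Spline, df = 10")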