# Sketch of the piecewise curve y = 1 + x - 2*(x - 1)^2 * I(x > 1) over x in [-2, 2]
x <- -2:2
y <- 1 + x - 2 * (x - 1)^2 * I(x > 1)
plot(x, y, cex = 1.5, pch = 19, col = "purple")
Fit some of the non-linear models investigated in this chapter to the Auto data set. Is there evidence for non-linear relationships in this data set? Create some informative plots to justify your answer.
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.6.3
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
attach(Auto)
set.seed(42)
corrgram(Auto, upper.panel=panel.cor)
Cylinders, displacement, horsepower, and weight are all negatively associated with mpg, while year has a positive effect. Origin is left out of this comparison since it is really a categorical (factor) variable; see the sketch below for how it could be handled.
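Origin is coded 1 = American, 2 = European, 3 = Japanese, so if we did want to use it we would convert it to a factor first. A minimal sketch (my addition, with the hypothetical name origin_f):
# Sketch: treat origin as a categorical predictor rather than a numeric one
origin_f <- factor(Auto$origin, levels = 1:3, labels = c("American", "European", "Japanese"))
boxplot(mpg ~ origin_f, data = Auto, xlab = "Origin", ylab = "MPG", col = "lightblue")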
fits <- list()
cyl_rss <- rep(NA, 4)
disp_rss <- rep(NA, 10)
hp_rss <- rep(NA, 10)
w_rss <- rep(NA, 10)
# cylinders takes only five distinct values, so at most a degree-4 polynomial
for (d in 1:4) {
  fits[[d]] <- lm(mpg ~ poly(cylinders, d), data = Auto)
  cyl_rss[d] <- deviance(fits[[d]])
}
cyl_anova <- anova(fits[[1]], fits[[2]], fits[[3]], fits[[4]])
for (d in 1:10) {
  fits[[d]] <- lm(mpg ~ poly(displacement, d), data = Auto)
  disp_rss[d] <- deviance(fits[[d]])
}
disp_anova <- anova(fits[[1]], fits[[2]], fits[[3]], fits[[4]], fits[[5]], fits[[6]], fits[[7]], fits[[8]], fits[[9]], fits[[10]])
for (d in 1:10) {
  fits[[d]] <- lm(mpg ~ poly(horsepower, d), data = Auto)
  hp_rss[d] <- deviance(fits[[d]])
}
hp_anova <- anova(fits[[1]], fits[[2]], fits[[3]], fits[[4]], fits[[5]], fits[[6]], fits[[7]], fits[[8]], fits[[9]], fits[[10]])
for (d in 1:10) {
  fits[[d]] <- lm(mpg ~ poly(weight, d), data = Auto)
  w_rss[d] <- deviance(fits[[d]])
}
w_anova <- anova(fits[[1]], fits[[2]], fits[[3]], fits[[4]], fits[[5]], fits[[6]], fits[[7]], fits[[8]], fits[[9]], fits[[10]])
The analysis of variance suggests a sixth-degree polynomial for horsepower: the quadratic term gives a large, highly significant improvement, the degree-5 and degree-6 terms are still significant at the 5% level, and nothing beyond degree 6 adds much.
hp_anova
## Analysis of Variance Table
##
## Model 1: mpg ~ poly(horsepower, d)
## Model 2: mpg ~ poly(horsepower, d)
## Model 3: mpg ~ poly(horsepower, d)
## Model 4: mpg ~ poly(horsepower, d)
## Model 5: mpg ~ poly(horsepower, d)
## Model 6: mpg ~ poly(horsepower, d)
## Model 7: mpg ~ poly(horsepower, d)
## Model 8: mpg ~ poly(horsepower, d)
## Model 9: mpg ~ poly(horsepower, d)
## Model 10: mpg ~ poly(horsepower, d)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 390 9385.9
## 2 389 7442.0 1 1943.89 104.9077 < 2.2e-16 ***
## 3 388 7426.4 1 15.59 0.8415 0.359535
## 4 387 7399.5 1 26.91 1.4525 0.228881
## 5 386 7223.4 1 176.15 9.5065 0.002197 **
## 6 385 7150.3 1 73.04 3.9417 0.047819 *
## 7 384 7086.6 1 63.69 3.4372 0.064516 .
## 8 383 7081.9 1 4.72 0.2548 0.614030
## 9 382 7066.6 1 15.35 0.8285 0.363273
## 10 381 7059.7 1 6.84 0.3689 0.543953
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit=lm(mpg~poly(horsepower,6),data=Auto)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.445918 0.2176656 107.715325 1.449683e-289
## poly(horsepower, 6)1 -120.137744 4.3095585 -27.877042 2.164963e-94
## poly(horsepower, 6)2 44.089528 4.3095585 10.230637 6.845463e-22
## poly(horsepower, 6)3 -3.948849 4.3095585 -0.916300 3.600832e-01
## poly(horsepower, 6)4 -5.187810 4.3095585 -1.203792 2.294098e-01
## poly(horsepower, 6)5 13.272187 4.3095585 3.079709 2.220714e-03
## poly(horsepower, 6)6 -8.546238 4.3095585 -1.983089 4.806767e-02
fit2=lm(mpg~poly(horsepower,6,raw=T),data=Auto)
coef(summary(fit2))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.621416e+02 7.142691e+01 -2.270035 0.023757615
## poly(horsepower, 6, raw = T)1 1.123809e+01 4.016610e+00 2.797903 0.005401987
## poly(horsepower, 6, raw = T)2 -2.436366e-01 8.969415e-02 -2.716304 0.006899198
## poly(horsepower, 6, raw = T)3 2.580141e-03 1.018801e-03 2.532528 0.011720819
## poly(horsepower, 6, raw = T)4 -1.452995e-05 6.218662e-06 -2.336507 0.019977094
## poly(horsepower, 6, raw = T)5 4.172832e-08 1.939327e-08 2.151691 0.032043069
## poly(horsepower, 6, raw = T)6 -4.803212e-11 2.422086e-11 -1.983089 0.048067671
hplims <- range(horsepower)
hp.grid <- seq(from = hplims[1], to = hplims[2])
preds <- predict(fit, newdata = list(horsepower = hp.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
plot(horsepower, mpg, xlim = hplims, cex = .5, col = "darkgrey")
title("Degree-6 Polynomial")
lines(hp.grid, preds$fit, lwd = 2, col = "blue")
matlines(hp.grid, se.bands, lwd = 1, col = "blue", lty = 3)
### Cross Validation
Ten-fold cross-validation points in the same direction: the error drops sharply from degree 1 to 2, improves a little more up to degree 6, and the formal minimum at degree 9 is essentially tied with degree 6.
library(glmnet)
library(boot)
cv.errs <- rep(NA, 15)
for (d in 1:15) {
  fit <- glm(mpg ~ poly(horsepower, d), data = Auto)
  cv.errs[d] <- cv.glm(Auto, fit, K = 10)$delta[2]
}
cv.errs
## [1] 24.36098 19.30116 19.38650 19.17300 19.25729 18.74147 18.98289 19.03350
## [9] 18.73875 19.27448 19.24204 19.10546 19.16816 35.12284 91.56307
which.min(cv.errs)
## [1] 9
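The degree-14 and degree-15 fits blow up numerically and dominate the scale, so a quick plot of the remaining errors (a small sketch added here, not in the original output) makes the plateau after degree 6 easier to see:
# Sketch: CV error by degree, dropping the unstable degree-14 and 15 fits
plot(1:13, cv.errs[1:13], type = "b", pch = 19, xlab = "Degree", ylab = "10-fold CV error")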
### Step Function
cv.errs <- rep(NA, 10)
for (c in 2:10) {
  Auto$hp.cut <- cut(Auto$horsepower, c)
  fit <- glm(mpg ~ hp.cut, data = Auto)
  cv.errs[c] <- cv.glm(Auto, fit, K = 10)$delta[2]
}
cv.errs
## [1] NA 37.99124 33.53460 25.01922 22.35480 21.89851 21.23304 19.85376
## [9] 20.95053 21.06961
which.min(cv.errs)
## [1] 8
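To see what the selected step function looks like, here is a sketch (my own addition; the names brks, step.df, fit.step are hypothetical) that refits mpg on horsepower cut into 8 equal-width intervals and overlays the fitted step means:
# Sketch: refit with the CV-selected 8 intervals and plot the step function
brks <- seq(min(horsepower), max(horsepower), length.out = 9)
step.df <- data.frame(mpg = Auto$mpg,
                      hp.cut8 = cut(Auto$horsepower, breaks = brks, include.lowest = TRUE))
fit.step <- lm(mpg ~ hp.cut8, data = step.df)
pred.step <- predict(fit.step,
                     newdata = data.frame(hp.cut8 = cut(hp.grid, breaks = brks, include.lowest = TRUE)))
plot(horsepower, mpg, cex = .5, col = "darkgrey")
lines(hp.grid, pred.step, col = "red", lwd = 2)
title("Step Function, 8 Intervals")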
### Splines
library(splines)
fit=lm(mpg~bs(horsepower,knots=c(25,40,60)),data=Auto)
pred=predict(fit,newdata=list(horsepower=hp.grid),se=T)
## Warning in predict.lm(fit, newdata = list(horsepower = hp.grid), se = T):
## prediction from a rank-deficient fit may be misleading
plot(horsepower,mpg,col="gray")
lines(hp.grid,pred$fit,lwd=2, col="#de2d26")
lines(hp.grid,pred$fit+2*pred$se,lty="dashed")
lines(hp.grid,pred$fit-2*pred$se,lty="dashed")
dim(bs(horsepower,knots=c(25,40,60)))
## [1] 392 6
dim(bs(horsepower,df=6))
## [1] 392 6
attr(bs(horsepower,df=6),"knots")
## 25% 50% 75%
## 75.0 93.5 126.0
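The rank-deficiency warning above comes from the knot choice: horsepower runs from 46 to 230, so the knots at 25 and 40 lie below the data. A sketch (my variant, not the original fit) using the quartile knots that bs() itself picks with df = 6 avoids the warning:
# Sketch: place the interior knots at the quartiles of horsepower (75, 93.5, 126)
fit.q <- lm(mpg ~ bs(horsepower, knots = quantile(horsepower, c(.25, .5, .75))), data = Auto)
pred.q <- predict(fit.q, newdata = list(horsepower = hp.grid), se = TRUE)
plot(horsepower, mpg, col = "gray")
lines(hp.grid, pred.q$fit, lwd = 2, col = "#de2d26")
lines(hp.grid, pred.q$fit + 2 * pred.q$se.fit, lty = "dashed")
lines(hp.grid, pred.q$fit - 2 * pred.q$se.fit, lty = "dashed")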
# Natural Spline
fit2=lm(mpg~ns(horsepower,df=4),data=Auto)
pred2=predict(fit2,newdata=list(horsepower=hp.grid),se=T)
plot(horsepower,mpg,xlim=hplims,cex=.5,col="darkgrey")
lines(hp.grid, pred2$fit,col="red",lwd=2)
# Smooth Spline
fit=smooth.spline(horsepower,mpg,df=16)
fit2=smooth.spline(horsepower,mpg,cv=TRUE)
## Warning in smooth.spline(horsepower, mpg, cv = TRUE): cross-validation with non-
## unique 'x' values seems doubtful
fit2$df
## [1] 5.784761
plot(horsepower,mpg,xlim=hplims,cex=.5,col="darkgrey")
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","5.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)
title("Local Regression")
library(gam)
## Warning: package 'gam' was built under R version 3.6.3
## Loading required package: foreach
## Loaded gam 1.16.1
gam.m3=gam(mpg~s(cylinders,6)+s(horsepower,6)+ displacement, data=Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
par(mfrow=c(1,3))
plot(gam.m3, se=TRUE,col="blue")
gam.m1=gam(mpg~s(horsepower,5)+displacement,data=Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
gam.m2=gam(mpg~cylinders+s(horsepower,5)+displacement,data=Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
anova(gam.m1,gam.m2,gam.m3,test="F")
## Analysis of Deviance Table
##
## Model 1: mpg ~ s(horsepower, 5) + displacement
## Model 2: mpg ~ cylinders + s(horsepower, 5) + displacement
## Model 3: mpg ~ s(cylinders, 6) + s(horsepower, 6) + displacement
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 385 6139.9
## 2 384 6117.1 1.0000 22.75 1.4912 0.2227819
## 3 380 5796.8 3.9999 320.31 5.2496 0.0003992 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
preds=predict(gam.m2,newdata=Auto)
gam.lo=gam(mpg~s(cylinders,df=4)+lo(horsepower,span=0.7)+displacement,data=Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
plot.Gam(gam.lo, se=TRUE, col="green")
gam.lo.i=gam(mpg~lo(cylinders,horsepower,span=0.5)+displacement,data=Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
summary(gam.m3)
##
## Call: gam(formula = mpg ~ s(cylinders, 6) + s(horsepower, 6) + displacement,
## data = Auto)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -11.3442 -2.1726 -0.3638 1.8283 17.5850
##
## (Dispersion Parameter for gaussian family taken to be 15.2547)
##
## Null Deviance: 23818.99 on 391 degrees of freedom
## Residual Deviance: 5796.801 on 380.0001 degrees of freedom
## AIC: 2194.417
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(cylinders, 6) 1 13803.2 13803.2 904.85 <2e-16 ***
## s(horsepower, 6) 1 1756.0 1756.0 115.11 <2e-16 ***
## displacement 1 239.0 239.0 15.67 9e-05 ***
## Residuals 380 5796.8 15.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(cylinders, 6) 3 8.9368 9.814e-06 ***
## s(horsepower, 6) 5 16.2123 1.688e-14 ***
## displacement
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
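Weight was flagged by the correlogram but never enters the GAMs above; as a rough cross-check, here is a sketch (my addition, not part of the original models, with the hypothetical name gam.w) extending gam.m3 with a smooth term for weight:
# Sketch: add a smoothing spline in weight to the GAM
gam.w=gam(mpg~s(cylinders,6)+s(horsepower,6)+s(weight,6)+displacement,data=Auto)
par(mfrow=c(1,4))
plot(gam.w, se=TRUE, col="blue")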
This question uses the variables dis (the weighted mean of distances to five Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.
set.seed(42)
library(MASS)
attach(Boston)
(a) Use the poly() function to fit a cubic polynomial regression to predict nox using dis. Report the regression output, and plot the resulting data and polynomial fits.
fit_lm <- lm(nox ~ poly(dis, 3), data = Boston)
summary(fit_lm)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
The summary output shows that all of the polynomial terms of dis are significant predictors of nox.
dislmt <- range(dis)
disgrd <- seq(from = dislmt[1], to = dislmt[2], by=0.1)
pred_lm <- predict(fit_lm, list(dis = disgrd))
plot(nox ~ dis, data = Boston, col = "gray")
lines(disgrd, pred_lm, col = "steelblue", lwd = 2)
The plot shows the cubic polynomial traces a smooth curve that follows the data closely.
(b) Plot the polynomial fits for a range of different polynomial degrees (say, from 1 to 10), and report the associated residual sum of squares.
db_rss <- rep(NA, 10)
for (i in 1:10) {
  fit_lm <- lm(nox ~ poly(dis, i), data = Boston)
  db_rss[i] <- sum(fit_lm$residuals^2)
}
db_rss
## [1] 2.768563 2.035262 1.934107 1.932981 1.915290 1.878257 1.849484 1.835630
## [9] 1.833331 1.832171
The training RSS decreases monotonically as the polynomial degree increases; the fits themselves are overlaid in the sketch below.
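The exercise also asks for plots of the fits, so here is a sketch (my addition, with hypothetical names clrs and fit_i) overlaying the degree-1 through degree-10 polynomials:
# Sketch: overlay all ten polynomial fits on the data
plot(nox ~ dis, data = Boston, col = "darkgrey")
clrs <- rainbow(10)
for (i in 1:10) {
  fit_i <- lm(nox ~ poly(dis, i), data = Boston)
  lines(disgrd, predict(fit_i, list(dis = disgrd)), col = clrs[i], lwd = 2)
}
legend("topright", legend = paste("Degree", 1:10), col = clrs, lty = 1, cex = .7)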
(c) Perform cross-validation or another approach to select the optimal degree for the polynomial, and explain your results.
I'll run 10-fold cross-validation and select the best polynomial degree.
ad <- rep(NA, 10)
for (i in 1:10) {
  fit_lm <- glm(nox ~ poly(dis, i), data = Boston)
  ad[i] <- cv.glm(Boston, fit_lm, K = 10)$delta[2]
}
plot(1:10, ad, xlab = "Degree", ylab = "CV error", type = "l", pch = 20, lwd = 2)
which.min(ad)
## [1] 4
The CV error decreases as the polynomial degree increases up to degree 3 or 4, rises over degrees 5-7, drops slightly at degree 8, and then rises again. which.min reports degree 4, but degree 3 performs almost as well and is the simpler model, so I would choose the cubic.
(d) Use the bs() function to fit a regression spline to predict nox using dis. Report the output for the fit using four degrees of freedom. How did you choose the knots? Plot the resulting fit.
library(splines)
fit_spl = lm(nox ~ bs(dis, df = 4, knots = c(3, 7, 11)), data = Boston)
summary(fit_spl)
##
## Call:
## lm(formula = nox ~ bs(dis, df = 4, knots = c(3, 7, 11)), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.130710 -0.039850 -0.008357 0.027792 0.188518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.714346 0.015846 45.081 < 2e-16 ***
## bs(dis, df = 4, knots = c(3, 7, 11))1 -0.006626 0.024307 -0.273 0.7853
## bs(dis, df = 4, knots = c(3, 7, 11))2 -0.296980 0.018293 -16.234 < 2e-16 ***
## bs(dis, df = 4, knots = c(3, 7, 11))3 -0.222840 0.033763 -6.600 1.05e-10 ***
## bs(dis, df = 4, knots = c(3, 7, 11))4 -0.379811 0.042317 -8.975 < 2e-16 ***
## bs(dis, df = 4, knots = c(3, 7, 11))5 -0.222959 0.086870 -2.567 0.0106 *
## bs(dis, df = 4, knots = c(3, 7, 11))6 -0.304346 0.063378 -4.802 2.08e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06137 on 499 degrees of freedom
## Multiple R-squared: 0.7229, Adjusted R-squared: 0.7196
## F-statistic: 217 on 6 and 499 DF, p-value: < 2.2e-16
pred_sp <- predict(fit_spl, list(dis = disgrd))
plot(nox ~ dis, data = Boston, col = "darkgrey")
lines(disgrd, pred_sp, col = "steelblue", lwd = 3)
The summary shows that all but the first basis function are significant. The knots were placed at dis = 3, 7, and 11 to spread across the range of the predictor; note that when knots= is supplied, bs() ignores the df argument, which is why the fit has six basis functions rather than four. The plot confirms that, except at large values (dis > 10), the spline fits the data well.
(e) Now fit a regression spline for a range of degrees of freedom, and plot the resulting fits and report the resulting RSS. Describe the results obtained.
rss_spl <- rep(NA, 18)
for (i in 3:18) {
  fit_lm <- lm(nox ~ bs(dis, df = i), data = Boston)
  rss_spl[i] <- sum(fit_lm$residuals^2)
}
rss_spl[-c(1, 2)]
## [1] 1.934107 1.922775 1.840173 1.833966 1.829884 1.816995 1.825653 1.792535
## [9] 1.796992 1.788999 1.782350 1.781838 1.782798 1.783546 1.779789 1.775838
With the exception of df = 9, 11, 15, and 16, where the RSS ticks up slightly, the training RSS decreases as the degrees of freedom increase. (The fits are not nested, since the knot locations change with df, so the training RSS need not fall monotonically.) A few of the fits are overlaid in the sketch below.
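A sketch (my addition, with hypothetical names dfs, clrs, and fit_j) overlaying regression splines for a few representative df values:
# Sketch: regression splines for df = 4, 8, 12, 16
plot(nox ~ dis, data = Boston, col = "darkgrey")
dfs <- c(4, 8, 12, 16)
clrs <- c("red", "blue", "darkgreen", "purple")
for (j in seq_along(dfs)) {
  fit_j <- lm(nox ~ bs(dis, df = dfs[j]), data = Boston)
  lines(disgrd, predict(fit_j, list(dis = disgrd)), col = clrs[j], lwd = 2)
}
legend("topright", legend = paste("df =", dfs), col = clrs, lty = 1, lwd = 2, cex = .8)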
(f) Perform cross-validation or another approach in order to select the best degrees of freedom for a regression spline on this data. Describe your results.
cv_full <- rep(NA, 18)
for (i in 3:18) {
  fit_lm <- glm(nox ~ bs(dis, df = i), data = Boston)
  cv_full[i] <- cv.glm(Boston, fit_lm, K = 10)$delta[2]
}
plot(3:18, cv_full[-c(1, 2)], col = "steelblue", lwd = 2, type = "l", xlab = "df", ylab = "CV error")
which.min(cv_full)
## [1] 10
Using which.min again, 10 degrees of freedom gives the lowest cross-validation error, so that is the df I would select (the exact minimum can shift a little with different random folds). The selected fit is plotted in the sketch below.
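A sketch (my addition, with the hypothetical name fit_best) of the df = 10 regression spline chosen by cross-validation:
# Sketch: plot the regression spline with the CV-selected 10 degrees of freedom
fit_best <- lm(nox ~ bs(dis, df = 10), data = Boston)
plot(nox ~ dis, data = Boston, col = "darkgrey")
lines(disgrd, predict(fit_best, list(dis = disgrd)), col = "steelblue", lwd = 2)
title("Regression Spline, df = 10")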