library(MASS)
library(broom)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(knitr)
library(caret)
## Loading required package: lattice
attach(Boston)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
library(splines)
##The summary of the cubic fit above shows that all of the polynomial coefficients are significant in predicting nox from dis. The plot shows a smooth curve that fits the data well with confidence intervals that are small until the upper limit of the data.
polymodel <- lm(nox ~ poly(dis, 3), data = Boston)
tidy(polymodel)
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.555 0.00276 201. 0.
## 2 poly(dis, 3)1 -2.00 0.0621 -32.3 1.60e-124
## 3 poly(dis, 3)2 0.856 0.0621 13.8 6.13e- 37
## 4 poly(dis, 3)3 -0.318 0.0621 -5.12 4.27e- 7
summary(polymodel)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
dislims <- range(Boston$dis)
dis.grid <- seq(from = dislims[1], to = dislims[2], by = 0.1)
preds <- predict(polymodel, list(dis = dis.grid))
plot(nox ~ dis, data = Boston, col = "darkgrey")
lines(dis.grid, preds, col = "red", lwd = 2)
#Part II – Now we fit the polymodel (now calling it Fit2) between the degrees of 1 to 10, and compute the residual sum of squares.
##The RSS decreases as the degree of polynomial increases and thus, the highest polynomial has the lowest RSS. However, it is clear from the plots that the small and high dis values are more and more uncertain (higher confidence intervals on the extremes) with increasing polynomials.
rss <- rep(NA, 10)
for (i in 1:10) {
fit2 <- lm(nox ~ poly(dis, i), data = Boston)
rss[i] <- sum(fit2$residuals^2)
}
plot(1:10, rss, xlab = "Degree", ylab = "RSS", type = "l")
deltas <- rep(NA, 10)
for (i in 1:10) {
fit <- glm(nox ~ poly(dis, i), data = Boston)
deltas[i] <- cv.glm(Boston, fit, K = 10)$delta[2]
}
plot(1:10, deltas, xlab = "Degree", ylab = "Test MSE", type = "l")
fit_ns <- lm(nox ~ bs(dis, knots = c(4, 7, 11)), data = Boston)
summary(fit_ns)
##
## Call:
## lm(formula = nox ~ bs(dis, knots = c(4, 7, 11)), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.124567 -0.040355 -0.008702 0.024740 0.192920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73926 0.01331 55.537 < 2e-16 ***
## bs(dis, knots = c(4, 7, 11))1 -0.08861 0.02504 -3.539 0.00044 ***
## bs(dis, knots = c(4, 7, 11))2 -0.31341 0.01680 -18.658 < 2e-16 ***
## bs(dis, knots = c(4, 7, 11))3 -0.26618 0.03147 -8.459 3.00e-16 ***
## bs(dis, knots = c(4, 7, 11))4 -0.39802 0.04647 -8.565 < 2e-16 ***
## bs(dis, knots = c(4, 7, 11))5 -0.25681 0.09001 -2.853 0.00451 **
## bs(dis, knots = c(4, 7, 11))6 -0.32926 0.06327 -5.204 2.85e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06185 on 499 degrees of freedom
## Multiple R-squared: 0.7185, Adjusted R-squared: 0.7151
## F-statistic: 212.3 on 6 and 499 DF, p-value: < 2.2e-16
pred <- predict(fit_ns, list(dis = dis.grid))
plot(nox ~ dis, data = Boston, col = "black")
lines(dis.grid, preds, col = "blue", lwd = 2)
rss <- rep(NA, 16)
for (i in 3:16) {
fit_rs <- lm(nox ~ bs(dis, df = i), data = Boston)
rss[i] <- sum(fit$residuals^2)
}
plot(3:16, rss[-c(1, 2)], xlab = "Degrees of freedom", ylab = "RSS", type = "l")
# Part VI – Using Cross Validation again to report MSE results.