library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(boot)
attach(Wage)
library(gam)
## Warning: package 'gam' was built under R version 4.3.3
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.3.3
## Loaded gam 1.22-5
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
```{{r}} set.seed(15) cv.error <- rep(0,5)
for (i in 1:5){ glm.fit <- glm(wage ~ poly(age,i),data=Wage) cv.error[i]<- cv.glm(Wage,glm.fit,K=10)$delta[1] } cv.error
plot(cv.error, type=“b”, xlab=“Degree”, ylab=“Test MSE”) points(which.min(cv.error), cv.error[4], col=“red”, pch=20, cex=2)
The optimal degree chosen through cross-validation is 4.
```r
fit1 <- lm(wage ~ age, data = Wage)
fit2 <- lm(wage ~ poly(age, 2), data = Wage)
fit3 <- lm(wage ~ poly(age, 3), data = Wage)
fit4 <- lm(wage ~ poly(age, 4), data = Wage)
fit5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-values indicate that either cubic or quadratic fit provide a reasonable fit for the data.
age_lim <- range(age)
age_grid <- seq(from=age_lim[1], to=age_lim[2])
preds <- predict(fit4, newdata=list(age=age_grid),se=TRUE)
se_bands <- cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
plot(age, wage, xlim=age_lim, cex=.5, col="darkgrey")
lines(age_grid, preds$fit, lwd=2, col="blue")
matlines(age_grid, se_bands, lwd=1, col="blue", lty=3)
set.seed(2)
cv.errors <- rep(NA, 10)
for(i in 2:10){
Wage$age.cut <- cut(Wage$age,i)
glm.fit <- glm(wage ~ age.cut, data=Wage)
cv.errors[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]
}
cv.errors
## [1] NA 1734.448 1681.623 1635.869 1632.864 1621.643 1613.148 1601.850
## [9] 1610.852 1603.607
plot(2:10, cv.errors[-1], type="b", xlab="Number of cuts", ylab="Test MSE")
points(which.min(cv.errors), cv.errors[which.min(cv.errors)], col="red", pch=20, cex=2)
The optimal number of cuts is 8.
fit_step <- glm(wage ~ cut(age, 8), data=Wage)
preds <- predict(fit_step, data.frame(age = age_grid))
plot(age, wage, col="darkgray")
lines(age_grid, preds, col="darkgreen", lwd=2)
```{{r}} summary(maritl)
plot(maritl, wage) title(“Marital Status vs Wage”)
The plot shows that married men have the highest average wage, but also the highest variability in wage.
```r
summary(jobclass)
## 1. Industrial 2. Information
## 1544 1456
plot(jobclass, wage)
title("Jobclass vs Wage")
Jobs in the information sector pay more on average than those in the industrial sector.
fit0 <- gam(wage ~ lo(year, span = 0.7) + s(age, 5) + education, data = Wage)
fit1 <- gam(wage ~ lo(year, span = 0.7) + s(age, 5) + education + jobclass, data = Wage)
fit2 <- gam(wage ~ lo(year, span = 0.7) + s(age, 5) + education + maritl, data = Wage)
fit3 <- gam(wage ~ lo(year, span = 0.7) + s(age, 5) + education + jobclass + maritl, data = Wage)
anova(fit0, fit1, fit2, fit3)
## Analysis of Deviance Table
##
## Model 1: wage ~ lo(year, span = 0.7) + s(age, 5) + education
## Model 2: wage ~ lo(year, span = 0.7) + s(age, 5) + education + jobclass
## Model 3: wage ~ lo(year, span = 0.7) + s(age, 5) + education + maritl
## Model 4: wage ~ lo(year, span = 0.7) + s(age, 5) + education + jobclass +
## maritl
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2987.1 3691855
## 2 2986.1 3679689 1 12166 0.0014637 **
## 3 2983.1 3597526 3 82163 9.53e-15 ***
## 4 2982.1 3583675 1 13852 0.0006862 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Fit 3 is significantly better than the other models.
par(mfrow = c(3, 3))
plot(fit3, se = T, col = "blue")
This question uses the variables dis (the weighted mean of distances to five Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.
set.seed(1)
fit <- lm(nox ~ poly(dis, 3), data = Boston)
summary(fit)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
dislims <- range(Boston$dis)
dis.grid <- seq(from = dislims[1], to = dislims[2], by = 0.1)
preds <- predict(fit, list(dis = dis.grid))
plot(nox ~ dis, data = Boston, col = "darkgrey")
lines(dis.grid, preds, col = "red", lwd = 2)
All polynomial terms are significant.
rss <- rep(NA, 10)
for (i in 1:10) {
fit <- lm(nox ~ poly(dis, i), data = Boston)
rss[i] <- sum(fit$residuals^2)
}
plot(1:10, rss, xlab = "Degree", ylab = "RSS", type = "l")
The RSS decreases as we increase the degree of polynomial. The minimum RSS is at polynomial 10.
deltas <- rep(NA, 10)
for (i in 1:10) {
fit <- glm(nox ~ poly(dis, i), data = Boston)
deltas[i] <- cv.glm(Boston, fit, K = 10)$delta[1]
}
plot(1:10, deltas, xlab = "Degree", ylab = "Test MSE", type = "l")
A degree 4 polynomial minimizes test MSE.
fit <- lm(nox ~ bs(dis, knots = c(4, 7, 11)), data = Boston)
summary(fit)
##
## Call:
## lm(formula = nox ~ bs(dis, knots = c(4, 7, 11)), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.124567 -0.040355 -0.008702 0.024740 0.192920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73926 0.01331 55.537 < 2e-16 ***
## bs(dis, knots = c(4, 7, 11))1 -0.08861 0.02504 -3.539 0.00044 ***
## bs(dis, knots = c(4, 7, 11))2 -0.31341 0.01680 -18.658 < 2e-16 ***
## bs(dis, knots = c(4, 7, 11))3 -0.26618 0.03147 -8.459 3.00e-16 ***
## bs(dis, knots = c(4, 7, 11))4 -0.39802 0.04647 -8.565 < 2e-16 ***
## bs(dis, knots = c(4, 7, 11))5 -0.25681 0.09001 -2.853 0.00451 **
## bs(dis, knots = c(4, 7, 11))6 -0.32926 0.06327 -5.204 2.85e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06185 on 499 degrees of freedom
## Multiple R-squared: 0.7185, Adjusted R-squared: 0.7151
## F-statistic: 212.3 on 6 and 499 DF, p-value: < 2.2e-16
pred <- predict(fit, list(dis = dis.grid))
plot(nox ~ dis, data = Boston, col = "darkgrey")
lines(dis.grid, preds, col = "red", lwd = 2)
All terms in spline fit are significant.
rss <- rep(NA, 16)
for (i in 3:16) {
fit <- lm(nox ~ bs(dis, df = i), data = Boston)
rss[i] <- sum(fit$residuals^2)
}
plot(3:16, rss[-c(1, 2)], xlab = "Degrees of freedom", ylab = "RSS", type = "l")
RSS decreases until 14 and then slightly begins to increase again.
cv <- rep(NA, 16)
for (i in 3:16) {
fit <- glm(nox ~ bs(dis, df = i), data = Boston)
cv[i] <- cv.glm(Boston, fit, K = 10)$delta[1]
}
## Warning in bs(dis, degree = 3L, knots = numeric(0), Boundary.knots = c(1.1296,
## : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = numeric(0), Boundary.knots = c(1.1296,
## : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = numeric(0), Boundary.knots = c(1.137, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = numeric(0), Boundary.knots = c(1.137, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = 3.0992, Boundary.knots = c(1.137, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = 3.0992, Boundary.knots = c(1.137, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = 3.2157, Boundary.knots = c(1.1296, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = 3.2157, Boundary.knots = c(1.1296, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(2.354, 4.2474), Boundary.knots =
## c(1.137, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(dis, degree = 3L, knots = c(2.354, 4.2474), Boundary.knots =
## c(1.137, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(dis, degree = 3L, knots = c(2.4212, 4.38856666666667),
## Boundary.knots = c(1.1296, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(2.4212, 4.38856666666667),
## Boundary.knots = c(1.1296, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(2.1105, 3.2721, 5.2146),
## Boundary.knots = c(1.1296, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(2.1105, 3.2721, 5.2146),
## Boundary.knots = c(1.1296, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(2.1, 3.1323, 5.118), Boundary.knots =
## c(1.137, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(dis, degree = 3L, knots = c(2.1, 3.1323, 5.118), Boundary.knots =
## c(1.137, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(dis, degree = 3L, knots = c(1.92938, 2.55946, 3.78776, 5.51426:
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.92938, 2.55946, 3.78776, 5.51426:
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.93736, 2.59666, 3.79734, 5.5005:
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.93736, 2.59666, 3.79734, 5.5005:
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.86156666666667, 2.34846666666667, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.86156666666667, 2.34846666666667, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.79777142857143, 2.19782857142857, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.79777142857143, 2.19782857142857, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.7936, 2.16771428571429,
## 2.74465714285714, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.7936, 2.16771428571429,
## 2.74465714285714, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.734325, 2.0941, 2.522775, 3.1827, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.734325, 2.0941, 2.522775, 3.1827, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.751575, 2.1084, 2.53475, 3.2797, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.751575, 2.1084, 2.53475, 3.2797, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.71552222222222, 2.00284444444444, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.71552222222222, 2.00284444444444, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.66286666666667, 2.00613333333333, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.66286666666667, 2.00613333333333, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.62008, 1.92938, 2.22906, 2.5899, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.62008, 1.92938, 2.22906, 2.5899, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.6283, 1.9512, 2.25965, 2.6775, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.6283, 1.9512, 2.25965, 2.6775, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.61225454545455, 1.89514545454545, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.61225454545455, 1.89514545454545, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.61066363636364, 1.88728181818182, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.61066363636364, 1.88728181818182, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.60476666666667, 1.8651, 2.1103, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.60476666666667, 1.8651, 2.1103, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.5881, 1.82231666666667, 2.097575, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.5881, 1.82231666666667, 2.097575, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.58949230769231, 1.84561538461538, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.58949230769231, 1.84561538461538, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.5741, 1.8209, 2.0527, 2.271, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.5741, 1.8209, 2.0527, 2.271, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.54201428571429, 1.7936,
## 1.97882857142857, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.54201428571429, 1.7936,
## 1.97882857142857, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.5768, 1.81652857142857,
## 1.99424285714286, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(dis, degree = 3L, knots = c(1.5768, 1.81652857142857,
## 1.99424285714286, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
plot(3:16, cv[-c(1, 2)], xlab = "Degrees of freedom", ylab = "Test MSE", type = "l")
Test MSE is lowest at 13 degrees of freedom.
In Section 7.7, it was mentioned that GAMs are generally ft using a backftting approach. The idea behind backftting is actually quite simple. We will now explore backfitting in the context of multiple linear regression. Suppose that we would like to perform multiple linear regression, but we do not have software to do so. Instead, we only have software to perform simple linear regression. Therefore, we take the following iterative approach: we repeatedly hold all but one coeffcient estimate fixed at its current value, and update only that coefficient estimate using a simple linear regression. The process is continued until convergence—that is, until the coeffcient estimates stop changing. We now try this out on a toy example. (a) Generate a response Y and two predictors X1 and X2, with n = 100.
set.seed(123)
n <- 100
X1 <- rnorm(n)
X2 <- rnorm(n)
Y <- X1 + 2*X2 + rnorm(n)
beta1 <- 1
beta2 <- lm(Y - beta1*X1 ~ X2)$coef[2]
beta1 <- lm(Y - beta2*X2 ~ X1)$coef[2]
niter <- 1000
beta0_list <- numeric(niter)
beta1_list <- numeric(niter)
beta2_list <- numeric(niter)
for (i in 1:niter) {
beta2 <- lm(Y - beta1*X1 ~ X2)$coef[2]
beta1 <- lm(Y - beta2*X2 ~ X1)$coef[2]
beta0 <- mean(Y - beta1*X1 - beta2*X2)
beta0_list[i] <- beta0
beta1_list[i] <- beta1
beta2_list[i] <- beta2
}
plot(1:niter, beta0_list, col = "red", type = "l", ylim = range(c(beta0_list, beta1_list, beta2_list)), ylab = "Coefficient estimates")
lines(1:niter, beta1_list, col = "blue", type = "l")
lines(1:niter, beta2_list, col = "green", type = "l")
abline(a = mean(Y - beta2_list[niter]*X2 - beta1_list[niter]*X1), b = beta1_list[niter], col = "purple")
abline(a = mean(Y - beta2_list[niter]*X2 - beta1_list[niter]*X1), b = beta2_list[niter], col = "orange")
legend("topright", legend = c("beta0", "beta1", "beta2", "MLR beta1", "MLR beta2"), col = c("red", "blue", "green", "purple", "orange"), lty = c(1,1,1,2,2))
mlr <- lm(Y ~ X1 + X2)
summary(mlr)
##
## Call:
## lm(formula = Y ~ X1 + X2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8730 -0.6607 -0.1245 0.6214 2.0798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.13507 0.09614 1.405 0.163
## X1 0.86683 0.10487 8.266 7.29e-13 ***
## X2 2.02381 0.09899 20.444 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9513 on 97 degrees of freedom
## Multiple R-squared: 0.8291, Adjusted R-squared: 0.8256
## F-statistic: 235.3 on 2 and 97 DF, p-value: < 2.2e-16
Two iterations provides a reasonably good estimate of the coefficients.
11.undefined