data(Wage)
head(Wage)
## year age maritl race education region
## 231655 2006 18 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic
## 86582 2004 24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003 45 2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003 43 2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443 2005 50 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic
## 376662 2008 54 2. Married 1. White 4. College Grad 2. Middle Atlantic
## jobclass health health_ins logwage wage
## 231655 1. Industrial 1. <=Good 2. No 4.318063 75.04315
## 86582 2. Information 2. >=Very Good 2. No 4.255273 70.47602
## 161300 1. Industrial 1. <=Good 1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good 1. Yes 5.041393 154.68529
## 11443 2. Information 1. <=Good 1. Yes 4.318063 75.04315
## 376662 2. Information 2. >=Very Good 1. Yes 4.845098 127.11574
attach(Wage)
1. Polynomial fit
# Fit polynomial on age for each marital status group
fit_maritl <- lm(wage ~ poly(age, 4) + maritl, data = Wage)
summary(fit_maritl)
##
## Call:
## lm(formula = wage ~ poly(age, 4) + maritl, data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -102.015 -23.164 -4.176 14.663 208.484
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 102.393 1.786 57.344 < 2e-16 ***
## poly(age, 4)1 341.951 44.798 7.633 3.06e-14 ***
## poly(age, 4)2 -399.237 41.585 -9.600 < 2e-16 ***
## poly(age, 4)3 86.866 39.889 2.178 0.0295 *
## poly(age, 4)4 -63.458 39.477 -1.607 0.1081
## maritl2. Married 13.880 2.099 6.611 4.49e-11 ***
## maritl3. Widowed -4.548 9.281 -0.490 0.6241
## maritl4. Divorced -2.837 3.408 -0.832 0.4052
## maritl5. Separated -3.483 5.639 -0.618 0.5368
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.42 on 2991 degrees of freedom
## Multiple R-squared: 0.1099, Adjusted R-squared: 0.1075
## F-statistic: 46.14 on 8 and 2991 DF, p-value: < 2.2e-16
# Create grid for predictions
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(fit_maritl,
newdata = data.frame(age = age.grid,
maritl = rep("2. Married", length(age.grid))),
se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit,
preds$fit - 2 * preds$se.fit)
plot(age, wage, col = "darkgrey", cex = 0.5,
main = "Polynomial Fit: Wage ~ Age (Married)")
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)

2. ANOVA to choose degree
fit.1 <- lm(wage ~ maritl, data = Wage)
fit.2 <- lm(wage ~ poly(age, 2) + maritl, data = Wage)
fit.3 <- lm(wage ~ poly(age, 3) + maritl, data = Wage)
fit.4 <- lm(wage ~ poly(age, 4) + maritl, data = Wage)
fit.5 <- lm(wage ~ poly(age, 5) + maritl, data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ maritl
## Model 2: wage ~ poly(age, 2) + maritl
## Model 3: wage ~ poly(age, 3) + maritl
## Model 4: wage ~ poly(age, 4) + maritl
## Model 5: wage ~ poly(age, 5) + maritl
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2995 4858941
## 2 2993 4659679 2 199262 64.0943 <2e-16 ***
## 3 2992 4652387 1 7292 4.6909 0.0304 *
## 4 2991 4648371 1 4016 2.5833 0.1081
## 5 2990 4647801 1 570 0.3668 0.5448
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
3. Step function
table(cut(age, 4))
##
## (17.9,33.5] (33.5,49] (49,64.5] (64.5,80.1]
## 750 1399 779 72
fit_step <- lm(wage ~ cut(age, 4) + maritl, data = Wage)
preds_step <- predict(fit_step,
newdata = data.frame(age = age.grid,
maritl = rep("2. Married", length(age.grid))),
se = TRUE)
plot(age, wage, col = "darkgrey", cex = 0.5,
main = "Step Function: Wage ~ Age + Marital Status")
lines(age.grid, preds_step$fit, lwd = 2, col = "red")

4. Natural spline
fit_ns <- lm(wage ~ ns(age, df = 4) + jobclass, data = Wage)
preds_ns <- predict(fit_ns,
newdata = data.frame(age = age.grid,
jobclass = rep("1. Industrial", length(age.grid))),
se = TRUE)
se.bands_ns <- cbind(preds_ns$fit + 2 * preds_ns$se.fit,
preds_ns$fit - 2 * preds_ns$se.fit)
plot(age, wage, col = "darkgrey", cex = 0.5,
main = "Natural Spline: Wage ~ Age + Job Class")
lines(age.grid, preds_ns$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands_ns, lwd = 1, col = "blue", lty = 3)

5. Smoothing spline
fit_smooth <- smooth.spline(age, wage, cv = TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with
## non-unique 'x' values seems doubtful
fit_smooth$df
## [1] 6.794596
plot(age, wage, col = "darkgrey", cex = 0.5,
main = "Smoothing Spline: Wage ~ Age")
lines(fit_smooth, col = "red", lwd = 2)

6. Local regression
fit_lo1 <- loess(wage ~ age, span = 0.2, data = Wage)
fit_lo2 <- loess(wage ~ age, span = 0.5, data = Wage)
plot(age, wage, col = "darkgrey", cex = 0.5,
main = "Local Regression (LOESS): Wage ~ Age")
lines(age.grid, predict(fit_lo1, data.frame(age = age.grid)),
col = "red", lwd = 2)
lines(age.grid, predict(fit_lo2, data.frame(age = age.grid)),
col = "blue", lwd = 2)
legend("topright", legend = c("Span = 0.2", "Span = 0.5"),
col = c("red", "blue"), lty = 1, lwd = 2)

7. GAM
gam_m1 <- gam(wage ~ s(age, 4) + maritl, data = Wage)
gam_m2 <- gam(wage ~ s(age, 4) + jobclass, data = Wage)
gam_m3 <- gam(wage ~ s(age, 4) + maritl + jobclass, data = Wage)
anova(gam_m1, gam_m3, test = "F")
## Analysis of Deviance Table
##
## Model 1: wage ~ s(age, 4) + maritl
## Model 2: wage ~ s(age, 4) + maritl + jobclass
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 2991 4647171
## 2 2990 4476501 1 170670 114 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1, 3))
plot(gam_m3, se = TRUE, col = "blue")
