data(Wage)
head(Wage)
##        year age           maritl     race       education             region
## 231655 2006  18 1. Never Married 1. White    1. < HS Grad 2. Middle Atlantic
## 86582  2004  24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003  45       2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003  43       2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443  2005  50      4. Divorced 1. White      2. HS Grad 2. Middle Atlantic
## 376662 2008  54       2. Married 1. White 4. College Grad 2. Middle Atlantic
##              jobclass         health health_ins  logwage      wage
## 231655  1. Industrial      1. <=Good      2. No 4.318063  75.04315
## 86582  2. Information 2. >=Very Good      2. No 4.255273  70.47602
## 161300  1. Industrial      1. <=Good     1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good     1. Yes 5.041393 154.68529
## 11443  2. Information      1. <=Good     1. Yes 4.318063  75.04315
## 376662 2. Information 2. >=Very Good     1. Yes 4.845098 127.11574
attach(Wage)

1. Polynomial fit

# Fit polynomial on age for each marital status group
fit_maritl <- lm(wage ~ poly(age, 4) + maritl, data = Wage)
summary(fit_maritl)
## 
## Call:
## lm(formula = wage ~ poly(age, 4) + maritl, data = Wage)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -102.015  -23.164   -4.176   14.663  208.484 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         102.393      1.786  57.344  < 2e-16 ***
## poly(age, 4)1       341.951     44.798   7.633 3.06e-14 ***
## poly(age, 4)2      -399.237     41.585  -9.600  < 2e-16 ***
## poly(age, 4)3        86.866     39.889   2.178   0.0295 *  
## poly(age, 4)4       -63.458     39.477  -1.607   0.1081    
## maritl2. Married     13.880      2.099   6.611 4.49e-11 ***
## maritl3. Widowed     -4.548      9.281  -0.490   0.6241    
## maritl4. Divorced    -2.837      3.408  -0.832   0.4052    
## maritl5. Separated   -3.483      5.639  -0.618   0.5368    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.42 on 2991 degrees of freedom
## Multiple R-squared:  0.1099, Adjusted R-squared:  0.1075 
## F-statistic: 46.14 on 8 and 2991 DF,  p-value: < 2.2e-16
# Create grid for predictions
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])

preds <- predict(fit_maritl, 
                 newdata = data.frame(age = age.grid, 
                 maritl = rep("2. Married", length(age.grid))),
                 se = TRUE)


se.bands <- cbind(preds$fit + 2 * preds$se.fit,
                  preds$fit - 2 * preds$se.fit)

plot(age, wage, col = "darkgrey", cex = 0.5,
     main = "Polynomial Fit: Wage ~ Age (Married)")
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)

2. ANOVA to choose degree

fit.1 <- lm(wage ~ maritl, data = Wage)
fit.2 <- lm(wage ~ poly(age, 2) + maritl,   data = Wage)
fit.3 <- lm(wage ~ poly(age, 3) + maritl,   data = Wage)
fit.4 <- lm(wage ~ poly(age, 4) + maritl,   data = Wage)
fit.5 <- lm(wage ~ poly(age, 5) + maritl,   data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ maritl
## Model 2: wage ~ poly(age, 2) + maritl
## Model 3: wage ~ poly(age, 3) + maritl
## Model 4: wage ~ poly(age, 4) + maritl
## Model 5: wage ~ poly(age, 5) + maritl
##   Res.Df     RSS Df Sum of Sq       F Pr(>F)    
## 1   2995 4858941                                
## 2   2993 4659679  2    199262 64.0943 <2e-16 ***
## 3   2992 4652387  1      7292  4.6909 0.0304 *  
## 4   2991 4648371  1      4016  2.5833 0.1081    
## 5   2990 4647801  1       570  0.3668 0.5448    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3. Step function

table(cut(age, 4))
## 
## (17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
##         750        1399         779          72
fit_step <- lm(wage ~ cut(age, 4) + maritl, data = Wage)

preds_step <- predict(fit_step,
                      newdata = data.frame(age = age.grid,
                                           maritl = rep("2. Married", length(age.grid))),
                      se = TRUE)

plot(age, wage, col = "darkgrey", cex = 0.5,
     main = "Step Function: Wage ~ Age + Marital Status")
lines(age.grid, preds_step$fit, lwd = 2, col = "red")

4. Natural spline

fit_ns <- lm(wage ~ ns(age, df = 4) + jobclass, data = Wage)

preds_ns <- predict(fit_ns,
                    newdata = data.frame(age = age.grid,
                                         jobclass = rep("1. Industrial", length(age.grid))),
                    se = TRUE)
se.bands_ns <- cbind(preds_ns$fit + 2 * preds_ns$se.fit,
                     preds_ns$fit - 2 * preds_ns$se.fit)

plot(age, wage, col = "darkgrey", cex = 0.5,
     main = "Natural Spline: Wage ~ Age + Job Class")
lines(age.grid, preds_ns$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands_ns, lwd = 1, col = "blue", lty = 3)

5. Smoothing spline

fit_smooth <- smooth.spline(age, wage, cv = TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with
## non-unique 'x' values seems doubtful
fit_smooth$df
## [1] 6.794596
plot(age, wage, col = "darkgrey", cex = 0.5,
     main = "Smoothing Spline: Wage ~ Age")
lines(fit_smooth, col = "red", lwd = 2)

6. Local regression

fit_lo1 <- loess(wage ~ age, span = 0.2, data = Wage)
fit_lo2 <- loess(wage ~ age, span = 0.5, data = Wage)

plot(age, wage, col = "darkgrey", cex = 0.5,
     main = "Local Regression (LOESS): Wage ~ Age")
lines(age.grid, predict(fit_lo1, data.frame(age = age.grid)),
      col = "red", lwd = 2)
lines(age.grid, predict(fit_lo2, data.frame(age = age.grid)),
      col = "blue", lwd = 2)
legend("topright", legend = c("Span = 0.2", "Span = 0.5"),
       col = c("red", "blue"), lty = 1, lwd = 2)

7. GAM

gam_m1 <- gam(wage ~ s(age, 4) + maritl,            data = Wage)
gam_m2 <- gam(wage ~ s(age, 4) + jobclass,          data = Wage)
gam_m3 <- gam(wage ~ s(age, 4) + maritl + jobclass, data = Wage)

anova(gam_m1, gam_m3, test = "F")
## Analysis of Deviance Table
## 
## Model 1: wage ~ s(age, 4) + maritl
## Model 2: wage ~ s(age, 4) + maritl + jobclass
##   Resid. Df Resid. Dev Df Deviance   F    Pr(>F)    
## 1      2991    4647171                              
## 2      2990    4476501  1   170670 114 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1, 3))
plot(gam_m3, se = TRUE, col = "blue")