R Markdown: Non-Linear Modeling

This code comes from the textbook lab in chapter 7. Feel free to reference it for further details.

We will be using data from the ISLR package:

library(ISLR)
attach(Wage)

From book: In this application, we examine a number of factors that relate to wages for a group of males from the Atlantic region of the United States. In particular, we wish to understand the association between an employee’s age and education, as well as the calendar year, on his wage.

Visualize the Data

library(tidyverse)

ggplot(Wage, aes(age, wage))+
  geom_point()

Polynomial Regression

In this section we will see that there are many ways to specify polynomial regression models.

Using the poly() function

Gives a matrix whose columns are a basis of orthogonal polynomials.

# polynomial regression 
# poly uses matrix of orthogonal polynomials
fit=lm(wage~poly(age ,4) ,data=Wage)
coef(summary (fit))
##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287409 153.283015 0.000000e+00
## poly(age, 4)1  447.06785 39.9147851  11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3  125.52169 39.9147851   3.144742 1.678622e-03
## poly(age, 4)4  -77.91118 39.9147851  -1.951938 5.103865e-02

Raw polynomials

It doesn’t matter if we use the raw polynomials or the orthogonal ones. The inference of the models will be the same in the end.

# fit raw powers
fit2=lm(wage~poly(age ,4,raw=T),data=Wage)
coef(summary(fit2))
##                             Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)            -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = T)1  2.124552e+01 5.886748e+00  3.609042 0.0003123618
## poly(age, 4, raw = T)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = T)3  6.810688e-03 3.065931e-03  2.221409 0.0263977518
## poly(age, 4, raw = T)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498

I() wrapper to exponentiate

# use the I() wrapper to expoentiate
fit2a=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
coef(fit2a)
##   (Intercept)           age      I(age^2)      I(age^3)      I(age^4) 
## -1.841542e+02  2.124552e+01 -5.638593e-01  6.810688e-03 -3.203830e-05

Creating a matrix with the powers

# uses cbind 
fit2b=lm(wage~cbind(age ,age^2,age^3, age ^4),data=Wage)
coef(fit2b)
##                        (Intercept) cbind(age, age^2, age^3, age^4)age 
##                      -1.841542e+02                       2.124552e+01 
##    cbind(age, age^2, age^3, age^4)    cbind(age, age^2, age^3, age^4) 
##                      -5.638593e-01                       6.810688e-03 
##    cbind(age, age^2, age^3, age^4) 
##                      -3.203830e-05

Predicting values with approximate confidence intervals

# grid values and predict
agelims =range(age)
age.grid=seq(from=agelims [1],to=agelims [2])
preds=predict (fit ,newdata =list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit +2*preds$se.fit ,preds$fit -2*preds$se.fit)

# plot
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(age ,wage ,xlim=agelims ,cex =.5,col=" darkgrey ")
title(" Degree-4 Polynomial ",outer=T)
lines(age.grid ,preds$fit ,lwd=2,col="blue")
matlines (age.grid ,se.bands ,lwd=1, col=" blue",lty=3)

Comparing Models with ANOVA

# compare models with ANOVA
fit.1=lm(wage~age ,data=Wage)
fit.2=lm(wage~poly(age ,2),data=Wage)
fit.3=lm(wage~poly(age ,3),data=Wage)
fit.4=lm(wage~poly(age ,4),data=Wage)
fit.5=lm(wage~poly(age ,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Same as inference from 5th degree polynomial.

# same inference
coef(summary (fit.5))
##                 Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1  447.06785 39.9160847  11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3  125.52169 39.9160847   3.1446392 1.679213e-03
## poly(age, 5)4  -77.91118 39.9160847  -1.9518743 5.104623e-02
## poly(age, 5)5  -35.81289 39.9160847  -0.8972045 3.696820e-01

Inducing education and age in the model.

# anova works with orthogonal polys or not
fit.1=lm(wage~education+age ,data=Wage)
fit.2=lm(wage~education+poly(age ,2) ,data=Wage)
fit.3=lm(wage~education+poly(age ,3) ,data=Wage)
anova(fit.1,fit.2,fit.3)
## Analysis of Variance Table
## 
## Model 1: wage ~ education + age
## Model 2: wage ~ education + poly(age, 2)
## Model 3: wage ~ education + poly(age, 3)
##   Res.Df     RSS Df Sum of Sq        F Pr(>F)    
## 1   2994 3867992                                 
## 2   2993 3725395  1    142597 114.6969 <2e-16 ***
## 3   2992 3719809  1      5587   4.4936 0.0341 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Polynomial Logistic Regression

# logistic regression 
# wage greater than 250
fit=glm(I(wage >250)~poly(age ,4),data=Wage , family=binomial )
summary(fit)
## 
## Call:
## glm(formula = I(wage > 250) ~ poly(age, 4), family = binomial, 
##     data = Wage)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3110  -0.2607  -0.2488  -0.1791   3.7859  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -4.3012     0.3451 -12.465  < 2e-16 ***
## poly(age, 4)1  71.9642    26.1176   2.755  0.00586 ** 
## poly(age, 4)2 -85.7729    35.9043  -2.389  0.01690 *  
## poly(age, 4)3  34.1626    19.6890   1.735  0.08272 .  
## poly(age, 4)4 -47.4008    24.0909  -1.968  0.04912 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 730.53  on 2999  degrees of freedom
## Residual deviance: 701.22  on 2995  degrees of freedom
## AIC: 711.22
## 
## Number of Fisher Scoring iterations: 9

Prediction, back transformatin, and confidence intervals

# predict
preds=predict(fit ,newdata =list(age=age.grid),se=T)

# back transform
pfit=exp(preds$fit)/(1+exp(preds$fit))
se.bands.logit = cbind(preds$fit +2*preds$se.fit , preds$fit -2*preds$se.fit)
se.bands = exp(se.bands.logit)/(1+exp(se.bands.logit))

preds=predict (fit ,newdata =list(age=age.grid),type="response",
               se=T)

# plot logistic
plot(age ,I(wage >250),xlim=agelims ,type="n",ylim=c(0,.2))
points(jitter(age), I((wage >250)/5),cex=.5,pch ="|",
         col="darkgrey")
lines(age.grid ,pfit ,lwd=2, col ="blue")
matlines (age.grid ,se.bands ,lwd=1, col=" blue",lty=3)

Step-wise Regression

# Step function 
table(cut(age ,4))
## 
## (17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
##         750        1399         779          72
fit=lm(wage~cut(age ,4),data=Wage)
coef(summary (fit))
##                         Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)            94.158392   1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49]   24.053491   1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5]   23.664559   2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1]  7.640592   4.987424  1.531972 1.256350e-01

Splines

Basis functions

Defining our own knots:

# Splines
library(splines)

#basis functions
par(mfrow=c(1,1))
fit=lm(wage~bs(age ,knots=c(25,40,60) ),data=Wage)
pred=predict (fit ,newdata =list(age=age.grid),se=T)
plot(age ,wage ,col="gray")
lines(age.grid ,pred$fit ,lwd=2)
lines(age.grid ,pred$fit+2*pred$se ,lty="dashed")
lines(age.grid ,pred$fit-2*pred$se ,lty="dashed")

R defined knots:

# knots chosen by R
attr(bs(age ,df=6) ,"knots")
##   25%   50%   75% 
## 33.75 42.00 51.00

Natural Splines

# natural spline
fit2=lm(wage~ns(age ,df=4),data=Wage)

plot(age ,wage ,xlim=agelims ,cex =.5,col=" darkgrey ")
title("Smoothing Spline")
fit=smooth.spline(age ,wage ,df=16)
fit2=smooth.spline(age ,wage ,cv=TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-
## unique 'x' values seems doubtful
fit2$df
## [1] 6.794596
lines(fit ,col="red",lwd =2)
lines(fit2 ,col="blue",lwd=2)
legend ("topright",legend=c("16 DF" ,"6.8 DF"),
          col=c("red","blue"),lty=1,lwd=2, cex =.8)

LOESS

# LOESS
plot(age ,wage ,xlim=agelims ,cex =.5,col=" darkgrey ")
title("Local Regression")
fit=loess(wage~age ,span=.2,data=Wage)
fit2=loess(wage~age ,span=.5,data=Wage)
lines(age.grid ,predict (fit ,data.frame(age=age.grid)),
        col="red",lwd=2)
lines(age.grid ,predict (fit2 ,data.frame(age=age.grid)),
        col="blue",lwd=2)
legend ("topright",legend=c("Span=0.2"," Span=0.5"),
          col=c("red","blue"),lty=1,lwd=2, cex =.8)

We can even use ggplot

ggplot(Wage, aes(age, wage))+
  geom_point(alpha=0.5)+
  geom_smooth(span=0.5)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'