This code comes from the lab in Chapter 7 of the textbook; refer to it for further details.
We will be using data from the ISLR package:
library(ISLR)
attach(Wage)
From the book: "In this application, we examine a number of factors that relate to wages for a group of males from the Atlantic region of the United States. In particular, we wish to understand the association between an employee’s age and education, as well as the calendar year, on his wage."
library(tidyverse)
ggplot(Wage, aes(age, wage))+
geom_point()
In this section we will see that there are several ways to specify a polynomial regression model.
poly() returns a matrix whose columns form a basis of orthogonal polynomials.
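A quick check of that claim (output omitted): the basis columns are mutually orthogonal with unit norm, and each is orthogonal to the intercept.
# crossprod of the poly() basis is (numerically) the 4x4 identity matrix
round(crossprod(poly(age,4)),10)
# each column sums to zero, i.e. is orthogonal to the intercept
round(colSums(poly(age,4)),10)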
# polynomial regression
# poly uses matrix of orthogonal polynomials
fit=lm(wage~poly(age,4),data=Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.70361 0.7287409 153.283015 0.000000e+00
## poly(age, 4)1 447.06785 39.9147851 11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3 125.52169 39.9147851 3.144742 1.678622e-03
## poly(age, 4)4 -77.91118 39.9147851 -1.951938 5.103865e-02
It does not matter whether we use the raw polynomials or the orthogonal ones: the two bases span the same model space, so the fitted values are identical. The individual coefficient estimates and p-values differ between the bases, apart from the highest-order term.
# fit raw powers
fit2=lm(wage~poly(age,4,raw=T),data=Wage)
coef(summary(fit2))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = T)1 2.124552e+01 5.886748e+00 3.609042 0.0003123618
## poly(age, 4, raw = T)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = T)3 6.810688e-03 3.065931e-03 2.221409 0.0263977518
## poly(age, 4, raw = T)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498
# wrap each power in I() so ^ is treated as exponentiation, not as a formula operator
fit2a=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
coef(fit2a)
## (Intercept) age I(age^2) I(age^3) I(age^4)
## -1.841542e+02 2.124552e+01 -5.638593e-01 6.810688e-03 -3.203830e-05
# equivalently, build the powers inside cbind(), which also protects ^
fit2b=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage)
coef(fit2b)
## (Intercept) cbind(age, age^2, age^3, age^4)age
## -1.841542e+02 2.124552e+01
## cbind(age, age^2, age^3, age^4) cbind(age, age^2, age^3, age^4)
## -5.638593e-01 6.810688e-03
## cbind(age, age^2, age^3, age^4)
## -3.203830e-05
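Whichever basis we use, the fitted values are identical; a quick check (output omitted, but both differences are essentially zero):
# maximum absolute differences between fitted values across specifications
max(abs(fitted(fit)-fitted(fit2)))
max(abs(fitted(fit2a)-fitted(fit2b)))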
# grid of age values, then predict
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
# fit +/- 2*SE gives approximate 95% pointwise confidence bands
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
# plot the fit and confidence bands
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0))
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree-4 Polynomial",outer=TRUE)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
# compare nested models with ANOVA
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Because the polynomials are orthogonal, the same inference is available directly from the degree-5 fit: the t-test p-values below match the ANOVA p-values above.
# same p-values as the ANOVA
coef(summary(fit.5))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.70361 0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1 447.06785 39.9160847 11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3 125.52169 39.9160847 3.1446392 1.679213e-03
## poly(age, 5)4 -77.91118 39.9160847 -1.9518743 5.104623e-02
## poly(age, 5)5 -35.81289 39.9160847 -0.8972045 3.696820e-01
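In fact each F-statistic in the ANOVA table is the square of the corresponding t-statistic in the degree-5 fit; for the quartic term:
# t-statistic for poly(age,5)4, squared, reproduces the F for model 4 vs model 3
(-1.9518743)^2 # = 3.8098, the F-statistic above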
Including education as well as age in the model:
# anova() works whether or not we use orthogonal polynomials
fit.1=lm(wage~education+age,data=Wage)
fit.2=lm(wage~education+poly(age,2),data=Wage)
fit.3=lm(wage~education+poly(age,3),data=Wage)
anova(fit.1,fit.2,fit.3)
## Analysis of Variance Table
##
## Model 1: wage ~ education + age
## Model 2: wage ~ education + poly(age, 2)
## Model 3: wage ~ education + poly(age, 3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2994 3867992
## 2 2993 3725395 1 142597 114.6969 <2e-16 ***
## 3 2992 3719809 1 5587 4.4936 0.0341 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# logistic regression: indicator for wage > 250
fit=glm(I(wage>250)~poly(age,4),data=Wage,family=binomial)
summary(fit)
##
## Call:
## glm(formula = I(wage > 250) ~ poly(age, 4), family = binomial,
## data = Wage)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3110 -0.2607 -0.2488 -0.1791 3.7859
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.3012 0.3451 -12.465 < 2e-16 ***
## poly(age, 4)1 71.9642 26.1176 2.755 0.00586 **
## poly(age, 4)2 -85.7729 35.9043 -2.389 0.01690 *
## poly(age, 4)3 34.1626 19.6890 1.735 0.08272 .
## poly(age, 4)4 -47.4008 24.0909 -1.968 0.04912 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 730.53 on 2999 degrees of freedom
## Residual deviance: 701.22 on 2995 degrees of freedom
## AIC: 711.22
##
## Number of Fisher Scoring iterations: 9
# predict on the link (logit) scale
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
# back-transform to the probability scale via the inverse logit
pfit=exp(preds$fit)/(1+exp(preds$fit))
se.bands.logit=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se.bands=exp(se.bands.logit)/(1+exp(se.bands.logit))
# alternatively, predict directly on the probability scale
preds=predict(fit,newdata=list(age=age.grid),type="response",se=TRUE)
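The hand-rolled back-transform is just the inverse logit, which base R also provides as plogis(); we work on the logit scale first because bands computed directly on the probability scale could fall outside [0,1]. As a sanity check (output omitted), the two routes agree:
# hand-computed probabilities equal predict(...,type="response")
max(abs(pfit-preds$fit))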
# plot the fitted probabilities
plot(age,I(wage>250),xlim=agelims,type="n",ylim=c(0,.2))
# rug of observations: the 0/1 indicator is divided by 5 so the 1s fit inside ylim
points(jitter(age),I((wage>250)/5),cex=.5,pch="|",
col="darkgrey")
lines(age.grid,pfit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
# step function: cut() bins age into 4 equal-width intervals
table(cut(age,4))
##
## (17.9,33.5] (33.5,49] (49,64.5] (64.5,80.1]
## 750 1399 779 72
fit=lm(wage~cut(age,4),data=Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94.158392 1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49] 24.053491 1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5] 23.664559 2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1] 7.640592 4.987424 1.531972 1.256350e-01
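Here cut() picked the four bins automatically; the breaks argument lets us place the cutpoints ourselves. The cutpoints below are arbitrary, purely for illustration:
# explicit (hypothetical) cutpoints instead of 4 equal-width bins
table(cut(age,breaks=c(17,30,45,60,81)))
fit.custom=lm(wage~cut(age,breaks=c(17,30,45,60,81)),data=Wage)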
Defining our own knots:
# Splines
library(splines)
# cubic spline basis with knots at ages 25, 40 and 60
par(mfrow=c(1,1))
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage)
pred=predict(fit,newdata=list(age=age.grid),se=TRUE)
plot(age,wage,col="gray")
lines(age.grid,pred$fit,lwd=2)
lines(age.grid,pred$fit+2*pred$se.fit,lty="dashed")
lines(age.grid,pred$fit-2*pred$se.fit,lty="dashed")
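For reference, a cubic spline with three interior knots has 3 (knots) + 3 (degree) = 6 basis functions, plus the intercept that lm() adds, for seven coefficients in total; the dimensions of the basis matrix confirm this (output omitted):
# 3000 observations by 6 basis columns; lm() supplies the intercept
dim(bs(age,knots=c(25,40,60)))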
Letting R choose the knots:
# with df=6, bs() places the knots itself
attr(bs(age,df=6),"knots")
## 25% 50% 75%
## 33.75 42.00 51.00
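These are simply the quartiles of age: with df=6 and a cubic basis, bs() places df - degree = 3 interior knots at uniform quantiles of the data.
# the automatic knots are the 25th, 50th and 75th percentiles of age
quantile(age,probs=c(.25,.5,.75))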
# natural spline with 4 df, drawn on the previous plot
fit2=lm(wage~ns(age,df=4),data=Wage)
pred2=predict(fit2,newdata=list(age=age.grid),se=TRUE)
lines(age.grid,pred2$fit,col="red",lwd=2)
# smoothing splines: one with 16 df, one with df chosen by cross-validation
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16)
fit2=smooth.spline(age,wage,cv=TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-
## unique 'x' values seems doubtful
fit2$df
## [1] 6.794596
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),
col=c("red","blue"),lty=1,lwd=2,cex=.8)
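With cv=TRUE, smooth.spline() selects the roughness penalty by ordinary leave-one-out cross-validation (hence the warning about non-unique x values); the chosen penalty is stored in the fit object:
# smoothing parameter selected by leave-one-out CV
fit2$lambda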
# local regression (loess) with two spans
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage)
fit2=loess(wage~age,span=.5,data=Wage)
lines(age.grid,predict(fit,data.frame(age=age.grid)),
col="red",lwd=2)
lines(age.grid,predict(fit2,data.frame(age=age.grid)),
col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),
col=c("red","blue"),lty=1,lwd=2,cex=.8)
We can even use ggplot:
ggplot(Wage, aes(age, wage))+
geom_point(alpha=0.5)+
geom_smooth(span=0.5)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
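Note the message: with 1000 or more observations geom_smooth() defaults to method = 'gam', and the span argument is only used by the loess method, so it had no effect above. To honour span=0.5, request loess explicitly:
# force the loess smoother so that span is actually used
ggplot(Wage, aes(age, wage))+
geom_point(alpha=0.5)+
geom_smooth(method="loess", span=0.5)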