Roses are red. Violets are blue. Now the spring meets me in the fine age, All I need is you.
This a example of the spline regression method used in the Statistical Learning course.
data(Wage,package = "ISLR")
head(Wage)
## year age sex maritl race education
## 231655 2006 18 1. Male 1. Never Married 1. White 1. < HS Grad
## 86582 2004 24 1. Male 1. Never Married 1. White 4. College Grad
## 161300 2003 45 1. Male 2. Married 1. White 3. Some College
## 155159 2003 43 1. Male 2. Married 3. Asian 4. College Grad
## 11443 2005 50 1. Male 4. Divorced 1. White 2. HS Grad
## 376662 2008 54 1. Male 2. Married 1. White 4. College Grad
## region jobclass health health_ins
## 231655 2. Middle Atlantic 1. Industrial 1. <=Good 2. No
## 86582 2. Middle Atlantic 2. Information 2. >=Very Good 2. No
## 161300 2. Middle Atlantic 1. Industrial 1. <=Good 1. Yes
## 155159 2. Middle Atlantic 2. Information 2. >=Very Good 1. Yes
## 11443 2. Middle Atlantic 2. Information 1. <=Good 1. Yes
## 376662 2. Middle Atlantic 2. Information 2. >=Very Good 1. Yes
## logwage wage
## 231655 4.318063 75.04315
## 86582 4.255273 70.47602
## 161300 4.875061 130.98218
## 155159 5.041393 154.68529
## 11443 4.318063 75.04315
## 376662 4.845098 127.11574
names(Wage)#to check all the columns
## [1] "year" "age" "sex" "maritl" "race"
## [6] "education" "region" "jobclass" "health" "health_ins"
## [11] "logwage" "wage"
library(splines)
cubic.model=lm(wage~bs(age,knots=c(25,40,60)),data=Wage) #3 knots, seven parameters
attach(Wage)
agelims=range(age)#get the upper and under border
agelims#18,80
## [1] 18 80
age.grid=seq(from=agelims[1],to=agelims[2])#18-80
pred=predict(cubic.model,newdata=list(age=age.grid),se=T)#need to return the standard error of the predict
plot(age,wage,col="gray")#plot the scatter plot of age and wage
title("Cubic Splines")
lines(age.grid,pred$fit,lwd=2)#plot the fitted spline curve #line width
lines(age.grid,pred$fit+2*pred$se,lty="dashed")#+2 sigma #line type
lines(age.grid,pred$fit-2*pred$se,lty="dashed")#-2 sigma #line type
dim(bs(age,knots=c(25,40,60))) #fix the number of knots is 3,df =7-1 (intercept is 1)
## [1] 3000 6
dim(bs(age,df=6))#fix the df directly
## [1] 3000 6
attr(bs(age,df=6),"knots") #get the quantiles of 4th quantiles
## 25% 50% 75%
## 33.75 42.00 51.00
head(ns(age,df=4))
## 1 2 3 4
## [1,] 0.00000000 0.00000000 0.00000000 0.00000000
## [2,] 0.01731602 -0.13795411 0.31872157 -0.18076746
## [3,] 0.75108560 0.16605047 0.09131609 -0.05061290
## [4,] 0.78017192 0.07222489 0.11002552 -0.06235889
## [5,] 0.52933205 0.38879374 0.13708340 -0.05540437
## [6,] 0.34484721 0.49710028 0.19449432 -0.03644180
head(bs(age,df=6))
## 1 2 3 4 5 6
## [1,] 0.000000 0.0000000000 0.00000000 0.0000000 0.000000e+00 0.000000000
## [2,] 0.537145 0.2083075655 0.01731602 0.0000000 0.000000e+00 0.000000000
## [3,] 0.000000 0.0421607378 0.75108560 0.2046761 2.077562e-03 0.000000000
## [4,] 0.000000 0.0999365637 0.78017192 0.1198146 7.694675e-05 0.000000000
## [5,] 0.000000 0.0001951886 0.52933205 0.4310760 3.939674e-02 0.000000000
## [6,] 0.000000 0.0000000000 0.34484721 0.5257560 1.282898e-01 0.001107056
natrual.model=lm(wage~ns(age,df=4),data=Wage) #choose 4 knots as a example
pred2=predict(natrual.model,newdata=list(age=age.grid),se=T)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Natrual Splines")
lines(age.grid, pred2$fit,col="red",lwd=2)
smooth.1=smooth.spline(age,wage,df=16) #specify the df, not accurate,will display warning
smooth.2=smooth.spline(age,wage,cv=TRUE)# use cross validation method
smooth.2$df #the final df achieved by cross validation
## [1] 6.794596
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
lines(smooth.1,col="red",lwd=2)
lines(smooth.2,col="blue",lwd=2)#cross validation
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)