Reference

Spline regression

Dataset

library(splines) 
library(ggplot2)

df <- data.frame(x=1:15,
                 y=c(2, 10, 20, 25, 20, 10, 19, 15, 19, 10, 6, 20, 31, 31, 40))

#create scatter plot with line of best fit
ggplot(df, aes(x=x, y=y)) +
  geom_point() +
  geom_smooth(method=loess, se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'

Fitted curve with high order terms

# plot a line
m1 <- lm(y~x,data=df)
plot(y~x,df)
lines(df$x,predict(m1),col="blue")

# # square
m2_1 <- lm(y~  I(x**2)+x,data=df)
plot(y~x,df)
lines(df$x,predict(m2_1),col="blue")

# a cube curve 
m2 <- lm(y~ I(x**3)+I(x**2)+x,data=df)
plot(y~x,df)
lines(df$x,predict(m2),col="blue")

Spline curve by bins/ sections (one power)

# divide datasets into bins, cut off 4 and 11
newdf = data.frame(x = df$x, 
                   y = df$y, 
                   tt1 = (df$x - 4)  * (df$x > 4),
                   tt2 = (df$x - 11)  * (df$x > 11))

# spline curve with line sharp join connection , if tt1 is one power of x. tt1 has a big influence to the original cube curve. 
# cubic
m3 <- lm(y~ I(x**3)+I(x**2)+x+tt1+tt2,data=newdf)
plot(y~x,newdf)
lines(newdf$x,predict(m3),col="blue")

# square
m3_2 <- lm(y~  I(x**2)+x+tt1+tt2,data=newdf)
plot(y~x,newdf)
lines(newdf$x,predict(m3_2),col="blue")

Spline curve by bins (high order term, more smooth in connection)

independent variables and spline terms should be a third-degree polynomial.

# divide datasets into bins, 4, 11
newdf = data.frame(x = df$x, 
                   y = df$y, 
                   tt1 = (df$x - 4)**1  * (df$x > 4),
                   tt2 = (df$x - 11)**1  * (df$x > 11))

# spline curve with smooth join connection , if tt1 is cube of x. tt1 has a little influence to the original cube curve. 
# cubic
m3 <- lm(y~ I(x**3)+I(x**2)+x+I(tt1**3)+I(tt2**3),data=newdf)
summary(m3)
## 
## Call:
## lm(formula = y ~ I(x^3) + I(x^2) + x + I(tt1^3) + I(tt2^3), data = newdf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.848 -1.995  0.686  2.177  7.345 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -16.22068   18.59306  -0.872    0.406
## I(x^3)        0.21546    0.57347   0.376    0.716
## I(x^2)       -3.82574    6.19736  -0.617    0.552
## x            21.14499   20.23340   1.045    0.323
## I(tt1^3)     -0.07695    0.63034  -0.122    0.906
## I(tt2^3)     -0.39690    0.34543  -1.149    0.280
## 
## Residual standard error: 5.399 on 9 degrees of freedom
## Multiple R-squared:  0.8229, Adjusted R-squared:  0.7246 
## F-statistic: 8.367 on 5 and 9 DF,  p-value: 0.003385
plot(y~x,newdf)
lines(newdf$x,predict(m3),col="blue")

#################### square
m3_3 <- lm(y~  I(x**2)+x+I(tt1**3)+I(tt2**3),data=newdf)
summary(m3_3)
## 
## Call:
## lm(formula = y ~ I(x^2) + x + I(tt1^3) + I(tt2^3), data = newdf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4288 -1.3394  0.6158  2.3573  7.5122 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -9.83433    7.20456  -1.365  0.20217   
## I(x^2)      -1.50265    0.40298  -3.729  0.00392 **
## x           13.68074    3.66713   3.731  0.00391 **
## I(tt1^3)     0.15928    0.04294   3.710  0.00404 **
## I(tt2^3)    -0.46721    0.27759  -1.683  0.12327   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.162 on 10 degrees of freedom
## Multiple R-squared:  0.8202, Adjusted R-squared:  0.7482 
## F-statistic:  11.4 on 4 and 10 DF,  p-value: 0.0009593
plot(y~x,newdf)
lines(newdf$x,predict(m3_3),col="blue")

Using a function

# creating b-spline term in a regression model 
# Remember that the default spline model in R is a third-degree polynomial. This is because it is hard for the eye to detect the discontinuity at the knots.

model <- lm(y~ bs(x, knots=c(4,11)),data=newdf)
summary(model)
## 
## Call:
## lm(formula = y ~ bs(x, knots = c(4, 11)), data = newdf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.848 -1.995  0.686  2.177  7.345 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 1.314      5.303   0.248 0.809863    
## bs(x, knots = c(4, 11))1   14.140      9.701   1.458 0.178948    
## bs(x, knots = c(4, 11))2   29.480     10.178   2.896 0.017703 *  
## bs(x, knots = c(4, 11))3   -6.919     11.464  -0.604 0.561025    
## bs(x, knots = c(4, 11))4   31.787      8.756   3.630 0.005483 ** 
## bs(x, knots = c(4, 11))5   38.226      7.456   5.127 0.000622 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.399 on 9 degrees of freedom
## Multiple R-squared:  0.8229, Adjusted R-squared:  0.7246 
## F-statistic: 8.367 on 5 and 9 DF,  p-value: 0.003385
# plot x and predictions
lims <- range(df$x)
lims.grid <- seq(from=lims[1],to=lims[2])
pred <- predict(model,newdata=list(x=lims.grid),se=T)
# head(pred)

# plot with std error area
plot(y~x,newdf)
lines(newdf$x,pred$fit,col="blue")
lines(newdf$x,pred$fit+2*pred$se.fit,col="green")
lines(newdf$x,pred$fit-2*pred$se.fit,col="green")

# how to determine K knots
# A somewhat more objective approach is to use cross-validation.
# This procedure can be repeated for different numbers of knots K. Then the value of K giving the smallest RSS is chosen.