What

I don’t really understand how to interpret the rcs (= restricted cubic spline) terms in details, but the linear version (lsp) is simpler enough that one can get a rough idea about the nonlinear version.

I consulted these resources:

https://stats.stackexchange.com/questions/282591/restricted-cubic-spline-interpretation
https://stats.stackexchange.com/questions/97845/interpreting-spline-results
https://stats.stackexchange.com/questions/155573/visualizing-nonlinear-regression/396125
Harrell’s book, Regression Modeling Strategies, which has no example interpretation of rcs I can find, so not that useful

Init

options(digits = 3)
library(pacman)
p_load(kirkegaard, rms)

Simulate data

#simulate data with varying knots
d = tibble(
  x = 1:100,
  
  #even spline
  k2_even = discretize(x, breaks = 3, labels = "number"),
  
  #default split used in rms - 10, 50, 90th centiles
  k2_default = c(rep(1, 10), rep(2, 80), rep(3, 10)),
  
  #generate linear spline values
  linear_spline = case_when(k2_even == 1 ~ x*1,
                 k2_even == 2 ~ x*0 + 34,
                 k2_even == 3 ~ x*2 - 100),
  
  #natural spline values -- actually a polynomial
  natural_spline = 1*x + 5*x^2 - .1*x^3
)

Plots and fits

Linear spline

#linear spline
#fit spline
i_fit = ols(as.formula(glue::glue("linear_spline ~ lsp(x, c(34, 67))")), data = d)
i_fit

## Linear Regression Model
##  
##  ols(formula = as.formula(glue::glue("linear_spline ~ lsp(x, c(34, 67))")), 
##      data = d)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     100    LR chi2       Inf    R2       1.000    
##  sigma0.0000    d.f.            3    R2 adj   1.000    
##  d.f.     96    Pr(> chi2) 0.0000    g       26.407    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -1.511e-13 -1.628e-15  4.905e-18  1.216e-15  1.190e-13 
##  
##  
##            Coef    S.E.   t         Pr(>|t|)
##  Intercept  0.0000 0.0000  0.00e+00 1.0000  
##  x          1.0000 0.0000  3.53e+15 <0.0001 
##  x'        -1.0000 0.0000 -2.15e+15 <0.0001 
##  x''        2.0000 0.0000  4.29e+15 <0.0001 
##

d$fit = predict(i_fit)
  
#plot
ggplot(d) +
  geom_point(mapping = aes(x, linear_spline, color = ordered(k2_even))) +
  geom_line(mapping = aes(x, fit), color = "orange") +
  theme_bw()

For the linear case, we simply add up the coefficients as we go along. Thus, the first segment has a coefficient of 1, the second has 0 (1 + -1), and the last has 2 (1 + -1 + 2).

Natural spline

#natural spline
#fit spline
i_fit = ols(as.formula(glue::glue("natural_spline ~ rcs(x, 3)")), data = d)
i_fit

## Linear Regression Model
##  
##  ols(formula = as.formula(glue::glue("natural_spline ~ rcs(x, 3)")), 
##      data = d)
##  
##                    Model Likelihood     Discrimination    
##                       Ratio Test           Indexes        
##  Obs        100    LR chi2    385.82    R2       0.979    
##  sigma2087.0395    d.f.            2    R2 adj   0.978    
##  d.f.        97    Pr(> chi2) 0.0000    g    14783.190    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -6786.99 -1538.44   -86.51  1766.52  3030.20 
##  
##  
##            Coef       S.E.     t      Pr(>|t|)
##  Intercept -2863.7787 591.4845  -4.84 <0.0001 
##  x           245.7267  19.5578  12.56 <0.0001 
##  x'         -873.0223  24.2298 -36.03 <0.0001 
##

d$fit = predict(i_fit)
i_fit$Design$parms$x

## [1] 10.9 50.5 90.1

#function
Function(i_fit)

## function (x = NA) 
## {
##     -2863.7787 + 245.72672 * x - 0.1391794 * pmax(x - 10.9, 0)^3 + 
##         0.27835881 * pmax(x - 50.5, 0)^3 - 0.1391794 * pmax(x - 
##         90.1, 0)^3
## }
## <environment: 0x56088710d5d0>

#plot
ggplot(d) +
  geom_point(mapping = aes(x, natural_spline, color = ordered(k2_default))) +
  geom_line(mapping = aes(x, fit), color = "orange") +
  theme_bw()

Trying to apply the same interpretation to the natural spline: the first slope is somewhat positive (52), then second is negative (about -80), and the last part is very negative (about -1800). If we actually try to match up the values with something specific, we find that they don’t entirely match anything we try.