Polynomials and Splines: Head acceleration in motorcycle accidents

References

Load packages

## Functiona to install necessary packages
inst <- function (PKG) {
    if (!(PKG %in% rownames(installed.packages()))) {
        install.packages(pkgs = PKG, dependencies = TRUE)
    }
}
## Install these if not already installed
inst("knitr")
inst("ggplot2")
inst("gridExtra")
inst("gam")
inst("mfp")
## Load MASS for dataset
library(MASS)
## Load ggplot2 for plotting
library(ggplot2)
## Load gridExtra for fine tuning
library(gridExtra)
## Multiple Fractional Polynomial Model
library(mfp)
## Generalized Additive Models
library(mgcv)
## Multivariate Adaptive Regression Spline (MARS) Models
library(earth)

Load dataset

Data from a Simulated Motorcycle Accident

Description:
     A data frame giving a series of measurements of head acceleration
     in a simulated motorcycle accident, used to test crash helmets.

Format:
     ‘times’ in milliseconds after impact.
     ‘accel’ in g.

Source:
     Silverman, B. W. (1985) Some aspects of the spline smoothing
     approach to non-parametric curve fitting.  _Journal of the Royal
     Statistical Society series B_ *47*, 1-52.
## Load
data(mcycle)
## Check quantiles
quantile(mcycle$times, probs = seq(from = 0, to = 1, by = 0.2))
##    0%   20%   40%   60%   80%  100% 
##  2.40 14.68 18.44 26.52 36.20 57.60
## Set internal 4 thresholds for 5 categories
thresholds <- c(12,24,36,48)
## Thresholds for quintiles
thresholdsQuint <- c(14.68, 18.44, 26.52, 36.20)
## Thresholds for deciles
thresholdsDec <- quantile(mcycle$times, probs = seq(from = 0, to = 1, by = 0.1))[c(-1,-11)]
## Define a function to create a dataset for splines
CreateIndicatorAndSplines <- function(x, y, posKnots) {
    ## Add 0 at the beginning of posKnots
    posKnots <- c(0, posKnots)
    ## Create a data frame
    dataset <- data.frame(x = x, y = y)
    ## Create a category indicator variable
    dataset["indicator"] <- cut(x, breaks = c(-Inf, posKnots, Inf))
    ## Create variables for spline: named s1, s2, ...
    for (i in seq_along(posKnots)) {
        dataset[paste0("s",i)] <- pmax(x - posKnots[i], 0)
    }
    ## Return a dataset
    dataset
}
## Create datasets with different thresholds
mcycle1     <- CreateIndicatorAndSplines(x = mcycle$times, y = mcycle$accel, posKnots = thresholds)
mcycleQuint <- CreateIndicatorAndSplines(x = mcycle$times, y = mcycle$accel, posKnots = thresholdsQuint)
mcycleDec   <- CreateIndicatorAndSplines(x = mcycle$times, y = mcycle$accel, posKnots = thresholdsDec)
## Plot raw data as a scatter plot (Used as a base plot)
plotBase <- ggplot(data = mcycle1,
                      mapping = aes(x = x, y = y)) +
    labs(title = "Motor Cycle dataset") +
    layer(geom = "point") +
    scale_x_continuous(breaks = seq(from = 0, to = 100, by = 20)) +
    theme_bw() +
    theme(legend.key = element_blank())
plotBase

plot of chunk unnamed-chunk-4

## Load another dataset more suitable for polynomials
fpeg  <- read.table("http://www.umass.edu/statdata/statdata/data/fpexample.dat", col.names = c("ID","X","Y"))
thresholdsFpeg <- c(20,40,60,80)
fpeg1 <- CreateIndicatorAndSplines(x = fpeg$X, y = fpeg$Y, posKnots = thresholdsFpeg)
plotBaseFpeg <- ggplot(data = fpeg1,
       mapping = aes(x = x, y = y)) +
    labs(title = "Fractional Polynomial Example dataset") +
    layer(geom = "point") +
    theme_bw() +
    theme(legend.key = element_blank())
plotBaseFpeg

plot of chunk unnamed-chunk-5

Define functions for plotting and prediction

## Define a helper function for ggplot2
stat_fun <- function(fun, args, mapping) {
    stat_function(fun = fun, args = args, mapping = mapping, n = 1001, size = 1.5, alpha = 1)
}

## Define prediction function
PredFun <- function(x, model, posKnots) {
    ## Create data for prediction
    newData <- CreateIndicatorAndSplines(x = x, y = NA, posKnots = posKnots)
    ## Predict for it
    predict(object = model, newdata = newData)
}

## Define a "fit and plot" function (dependent on thresholds)
FitLayer <- function(formula, data = mcycle1, fitFun = lm, posKnots = thresholds, print = FALSE, ...) {
    ## Fit model
    modelFit <- fitFun(formula = formula,
                       data    = data, ...)
    ## Print result
    if (print == TRUE) print(summary(modelFit))
    ## Return a layer
    stat_fun(fun = PredFun, args = list(model = modelFit, posKnots = posKnots), mapping = aes(x = x))
}

Live demonstration

### Naive methods
## Linear term only
plotBase + FitLayer(y ~ x)

plot of chunk unnamed-chunk-7

## Category indicator (5 equal-width categories)
plotBase + FitLayer(y ~ indicator)

plot of chunk unnamed-chunk-7

## Category indicator (Quintiles perform better here)
plotBase + FitLayer(y ~ indicator, data = mcycleQuint, posKnots = thresholdsQuint)

plot of chunk unnamed-chunk-7

## Category indicator (Deciles perform better here)
plotBase + FitLayer(y ~ indicator, data = mcycleDec, posKnots = thresholdsDec)

plot of chunk unnamed-chunk-7



### "Polynomial family"
## Linear term only
plotBase + FitLayer(y ~ x)

plot of chunk unnamed-chunk-7

## Quadratic
plotBase + FitLayer(y ~ x + I(x^2))

plot of chunk unnamed-chunk-7

## Cubic
plotBase + FitLayer(y ~ x + I(x^2) + I(x^3))

plot of chunk unnamed-chunk-7

## Fractional
plotBase + FitLayer(y ~ x + I(x^2) + I(x^(1/2)))

plot of chunk unnamed-chunk-7

## Multiple Fractional Polynomial Model (Automatic)
plotBase + FitLayer(y ~ fp(x), fitFun = mfp, print = T)
## 
## Call:
## glm(formula = y ~ I((x/10)^0.5) + I((x/10)^0.5 * log((x/10))), 
##     data = data)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -92.15  -29.06    2.11   31.35  101.96  
## 
## Coefficients:
##                             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)                    244.3       62.7    3.89   0.00016 ***
## I((x/10)^0.5)                 -280.5       61.2   -4.59 0.0000105 ***
## I((x/10)^0.5 * log((x/10)))    111.7       22.0    5.07 0.0000013 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1872)
## 
##     Null deviance: 308223  on 132  degrees of freedom
## Residual deviance: 243420  on 130  degrees of freedom
## AIC: 1385
## 
## Number of Fisher Scoring iterations: 2

plot of chunk unnamed-chunk-7

## Multiple Fractional Polynomial Model (FP data)
plotBaseFpeg + FitLayer(y ~ fp(x), data = fpeg1, fitFun = mfp, posKnots = thresholdsFpeg, print = T)
## 
## Call:
## glm(formula = y ~ I((x/100)^-1) + I((x/100)^-1 * log((x/100))), 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -10.370   -3.312   -0.143    3.553   10.830  
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     9.674      1.402     6.9  5.4e-10 ***
## I((x/100)^-1)                  10.050      0.747    13.4  < 2e-16 ***
## I((x/100)^-1 * log((x/100)))    3.138      0.233    13.5  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 23.56)
## 
##     Null deviance: 6576.8  on 99  degrees of freedom
## Residual deviance: 2285.3  on 97  degrees of freedom
## AIC: 604.7
## 
## Number of Fisher Scoring iterations: 2

plot of chunk unnamed-chunk-7

## 5th Power
plotBase + FitLayer(y ~ poly(x, degree = 5, raw = TRUE))

plot of chunk unnamed-chunk-7

## 10th Power
plotBase + FitLayer(y ~ poly(x, degree = 10, raw = TRUE))

plot of chunk unnamed-chunk-7



### "Spline family"
## Category indicator ("Intercept" spline)
plotBase + FitLayer(y ~ indicator)

plot of chunk unnamed-chunk-7

## Linear spline
plotBase + FitLayer(y ~ x + s2 + s3 + s4 + s5)

plot of chunk unnamed-chunk-7

## Linear spline (Quintiles)
plotBase + FitLayer(y ~ bs(x, df = NULL, knots = thresholdsQuint, degree = 1))

plot of chunk unnamed-chunk-7


## Quadratic spline (unrestricted)
plotBase + FitLayer(y ~ x + I(x^2) + I(s2^2) + I(s3^2) + I(s4^2) + I(s5^2))

plot of chunk unnamed-chunk-7

## Quadratic spline restricted with 1st segment linear restriction
plotBase + FitLayer(y ~ x +        + I(s2^2) + I(s3^2) + I(s4^2) + I(s5^2))

plot of chunk unnamed-chunk-7

## Quadratic spline restricted with 1st & last segment linear restriction
plotBase + FitLayer(y ~ x +        + I(s2^2 - s5^2) + I(s3^2 - s5^2) + I(s4^2 - s5^2))

plot of chunk unnamed-chunk-7


## Cubic spline (unrestricted)
plotBase + FitLayer(y ~ x + I(x^2) + I(x^3) + I(s2^3) + I(s3^3) + I(s4^3) + I(s5^3))

plot of chunk unnamed-chunk-7



## Spline with splines::bs(): Linear spline with 1 equally-sized segment (df-degree knots)
plotBase + FitLayer(y ~ bs(x, df = 1, knots = NULL, degree = 1))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Linear spline with 2 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 2, knots = NULL, degree = 1))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Linear spline with 3 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 3, knots = NULL, degree = 1))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Linear spline with 4 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 4, knots = NULL, degree = 1))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Linear spline with 5 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 5, knots = NULL, degree = 1))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Linear spline with fixed boundaries
plotBase + FitLayer(y ~ bs(x, df = NULL, knots = c(15,20,33,40), degree = 1))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Linear spline with 10 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 10, knots = NULL, degree = 1))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Linear spline with 20 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 20, knots = NULL, degree = 1))

plot of chunk unnamed-chunk-7



## Spline with splines::bs(): Quadratic spline with 1 equally-sized segment (df-degree knots)
plotBase + FitLayer(y ~ bs(x, df = 2, knots = NULL, degree = 2))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Quadratic spline with 2 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 3, knots = NULL, degree = 2))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Quadratic spline with 3 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 4, knots = NULL, degree = 2))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Quadratic spline with 4 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 5, knots = NULL, degree = 2))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Quadratic spline with 5 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 6, knots = NULL, degree = 2))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Quadratic spline with 10 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 11, knots = NULL, degree = 2))

plot of chunk unnamed-chunk-7

## Spline with splines::bs(): Quadratic spline with 20 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 21, knots = NULL, degree = 2))

plot of chunk unnamed-chunk-7



## Generalized Additive Model (Nonparametric smoother)
plotBase + FitLayer(y ~ s(x, bs = "cr"), fitFun = gam, print = T)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## y ~ s(x, bs = "cr")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -25.55       1.95   -13.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##       edf Ref.df    F p-value    
## s(x) 8.39   8.87 53.8  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.783   Deviance explained = 79.7%
## GCV score = 544.48  Scale est. = 506.04    n = 133

plot of chunk unnamed-chunk-7



## LOESS span = 0.75
plotBase + FitLayer(y ~ x, fitFun = loess)

plot of chunk unnamed-chunk-7

## LOESS span = 0.40
plotBase + FitLayer(y ~ x, fitFun = loess, span = 0.4)

plot of chunk unnamed-chunk-7



## Multivariate Adaptive Regression Spline (MARS) Models
plotBase + FitLayer(y ~ x, fitFun = earth, print = T)
## Call: earth(formula=formula, data=data)
## 
##             coefficients
## (Intercept)      -90.993
## h(x-19.4)          9.250
## h(19.4-x)          8.073
## h(x-31.2)        -10.236
## 
## Selected 4 of 6 terms, and 1 of 1 predictors 
## Importance: x
## Number of terms at each degree of interaction: 1 3 (additive model)
## GCV 1120    RSS 133670    GRSq 0.524    RSq 0.5663

plot of chunk unnamed-chunk-7