Greenland S. Dose-response and trend analysis in epidemiology: alternatives to categorical analysis. http://www.ncbi.nlm.nih.gov/pubmed/7548341
Silverman. Some aspects of the spline smoothing approach to non-parametric curve fitting.: http://www-personal.umich.edu/~jizhu/jizhu/wuke/Silverman-JRSSB85.pdf
mcycle dataset in CSV: http://vincentarelbundock.github.io/Rdatasets/datasets.html
Fractional polynomial example dataset: http://www.umass.edu/statdata/statdata/data/fpexample.txt
## Functiona to install necessary packages
inst <- function (PKG) {
if (!(PKG %in% rownames(installed.packages()))) {
install.packages(pkgs = PKG, dependencies = TRUE)
}
}
## Install these if not already installed
inst("knitr")
inst("ggplot2")
inst("gridExtra")
inst("gam")
inst("mfp")
## Load MASS for dataset
library(MASS)
## Load ggplot2 for plotting
library(ggplot2)
## Load gridExtra for fine tuning
library(gridExtra)
## Multiple Fractional Polynomial Model
library(mfp)
## Generalized Additive Models
library(mgcv)
## Multivariate Adaptive Regression Spline (MARS) Models
library(earth)
Data from a Simulated Motorcycle Accident
Description:
A data frame giving a series of measurements of head acceleration
in a simulated motorcycle accident, used to test crash helmets.
Format:
‘times’ in milliseconds after impact.
‘accel’ in g.
Source:
Silverman, B. W. (1985) Some aspects of the spline smoothing
approach to non-parametric curve fitting. _Journal of the Royal
Statistical Society series B_ *47*, 1-52.
## Load
data(mcycle)
## Check quantiles
quantile(mcycle$times, probs = seq(from = 0, to = 1, by = 0.2))
## 0% 20% 40% 60% 80% 100%
## 2.40 14.68 18.44 26.52 36.20 57.60
## Set internal 4 thresholds for 5 categories
thresholds <- c(12,24,36,48)
## Thresholds for quintiles
thresholdsQuint <- c(14.68, 18.44, 26.52, 36.20)
## Thresholds for deciles
thresholdsDec <- quantile(mcycle$times, probs = seq(from = 0, to = 1, by = 0.1))[c(-1,-11)]
## Define a function to create a dataset for splines
CreateIndicatorAndSplines <- function(x, y, posKnots) {
## Add 0 at the beginning of posKnots
posKnots <- c(0, posKnots)
## Create a data frame
dataset <- data.frame(x = x, y = y)
## Create a category indicator variable
dataset["indicator"] <- cut(x, breaks = c(-Inf, posKnots, Inf))
## Create variables for spline: named s1, s2, ...
for (i in seq_along(posKnots)) {
dataset[paste0("s",i)] <- pmax(x - posKnots[i], 0)
}
## Return a dataset
dataset
}
## Create datasets with different thresholds
mcycle1 <- CreateIndicatorAndSplines(x = mcycle$times, y = mcycle$accel, posKnots = thresholds)
mcycleQuint <- CreateIndicatorAndSplines(x = mcycle$times, y = mcycle$accel, posKnots = thresholdsQuint)
mcycleDec <- CreateIndicatorAndSplines(x = mcycle$times, y = mcycle$accel, posKnots = thresholdsDec)
## Plot raw data as a scatter plot (Used as a base plot)
plotBase <- ggplot(data = mcycle1,
mapping = aes(x = x, y = y)) +
labs(title = "Motor Cycle dataset") +
layer(geom = "point") +
scale_x_continuous(breaks = seq(from = 0, to = 100, by = 20)) +
theme_bw() +
theme(legend.key = element_blank())
plotBase
## Load another dataset more suitable for polynomials
fpeg <- read.table("http://www.umass.edu/statdata/statdata/data/fpexample.dat", col.names = c("ID","X","Y"))
thresholdsFpeg <- c(20,40,60,80)
fpeg1 <- CreateIndicatorAndSplines(x = fpeg$X, y = fpeg$Y, posKnots = thresholdsFpeg)
plotBaseFpeg <- ggplot(data = fpeg1,
mapping = aes(x = x, y = y)) +
labs(title = "Fractional Polynomial Example dataset") +
layer(geom = "point") +
theme_bw() +
theme(legend.key = element_blank())
plotBaseFpeg
## Define a helper function for ggplot2
stat_fun <- function(fun, args, mapping) {
stat_function(fun = fun, args = args, mapping = mapping, n = 1001, size = 1.5, alpha = 1)
}
## Define prediction function
PredFun <- function(x, model, posKnots) {
## Create data for prediction
newData <- CreateIndicatorAndSplines(x = x, y = NA, posKnots = posKnots)
## Predict for it
predict(object = model, newdata = newData)
}
## Define a "fit and plot" function (dependent on thresholds)
FitLayer <- function(formula, data = mcycle1, fitFun = lm, posKnots = thresholds, print = FALSE, ...) {
## Fit model
modelFit <- fitFun(formula = formula,
data = data, ...)
## Print result
if (print == TRUE) print(summary(modelFit))
## Return a layer
stat_fun(fun = PredFun, args = list(model = modelFit, posKnots = posKnots), mapping = aes(x = x))
}
### Naive methods
## Linear term only
plotBase + FitLayer(y ~ x)
## Category indicator (5 equal-width categories)
plotBase + FitLayer(y ~ indicator)
## Category indicator (Quintiles perform better here)
plotBase + FitLayer(y ~ indicator, data = mcycleQuint, posKnots = thresholdsQuint)
## Category indicator (Deciles perform better here)
plotBase + FitLayer(y ~ indicator, data = mcycleDec, posKnots = thresholdsDec)
### "Polynomial family"
## Linear term only
plotBase + FitLayer(y ~ x)
## Quadratic
plotBase + FitLayer(y ~ x + I(x^2))
## Cubic
plotBase + FitLayer(y ~ x + I(x^2) + I(x^3))
## Fractional
plotBase + FitLayer(y ~ x + I(x^2) + I(x^(1/2)))
## Multiple Fractional Polynomial Model (Automatic)
plotBase + FitLayer(y ~ fp(x), fitFun = mfp, print = T)
##
## Call:
## glm(formula = y ~ I((x/10)^0.5) + I((x/10)^0.5 * log((x/10))),
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -92.15 -29.06 2.11 31.35 101.96
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 244.3 62.7 3.89 0.00016 ***
## I((x/10)^0.5) -280.5 61.2 -4.59 0.0000105 ***
## I((x/10)^0.5 * log((x/10))) 111.7 22.0 5.07 0.0000013 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1872)
##
## Null deviance: 308223 on 132 degrees of freedom
## Residual deviance: 243420 on 130 degrees of freedom
## AIC: 1385
##
## Number of Fisher Scoring iterations: 2
## Multiple Fractional Polynomial Model (FP data)
plotBaseFpeg + FitLayer(y ~ fp(x), data = fpeg1, fitFun = mfp, posKnots = thresholdsFpeg, print = T)
##
## Call:
## glm(formula = y ~ I((x/100)^-1) + I((x/100)^-1 * log((x/100))),
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -10.370 -3.312 -0.143 3.553 10.830
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.674 1.402 6.9 5.4e-10 ***
## I((x/100)^-1) 10.050 0.747 13.4 < 2e-16 ***
## I((x/100)^-1 * log((x/100))) 3.138 0.233 13.5 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 23.56)
##
## Null deviance: 6576.8 on 99 degrees of freedom
## Residual deviance: 2285.3 on 97 degrees of freedom
## AIC: 604.7
##
## Number of Fisher Scoring iterations: 2
## 5th Power
plotBase + FitLayer(y ~ poly(x, degree = 5, raw = TRUE))
## 10th Power
plotBase + FitLayer(y ~ poly(x, degree = 10, raw = TRUE))
### "Spline family"
## Category indicator ("Intercept" spline)
plotBase + FitLayer(y ~ indicator)
## Linear spline
plotBase + FitLayer(y ~ x + s2 + s3 + s4 + s5)
## Linear spline (Quintiles)
plotBase + FitLayer(y ~ bs(x, df = NULL, knots = thresholdsQuint, degree = 1))
## Quadratic spline (unrestricted)
plotBase + FitLayer(y ~ x + I(x^2) + I(s2^2) + I(s3^2) + I(s4^2) + I(s5^2))
## Quadratic spline restricted with 1st segment linear restriction
plotBase + FitLayer(y ~ x + + I(s2^2) + I(s3^2) + I(s4^2) + I(s5^2))
## Quadratic spline restricted with 1st & last segment linear restriction
plotBase + FitLayer(y ~ x + + I(s2^2 - s5^2) + I(s3^2 - s5^2) + I(s4^2 - s5^2))
## Cubic spline (unrestricted)
plotBase + FitLayer(y ~ x + I(x^2) + I(x^3) + I(s2^3) + I(s3^3) + I(s4^3) + I(s5^3))
## Spline with splines::bs(): Linear spline with 1 equally-sized segment (df-degree knots)
plotBase + FitLayer(y ~ bs(x, df = 1, knots = NULL, degree = 1))
## Spline with splines::bs(): Linear spline with 2 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 2, knots = NULL, degree = 1))
## Spline with splines::bs(): Linear spline with 3 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 3, knots = NULL, degree = 1))
## Spline with splines::bs(): Linear spline with 4 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 4, knots = NULL, degree = 1))
## Spline with splines::bs(): Linear spline with 5 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 5, knots = NULL, degree = 1))
## Spline with splines::bs(): Linear spline with fixed boundaries
plotBase + FitLayer(y ~ bs(x, df = NULL, knots = c(15,20,33,40), degree = 1))
## Spline with splines::bs(): Linear spline with 10 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 10, knots = NULL, degree = 1))
## Spline with splines::bs(): Linear spline with 20 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 20, knots = NULL, degree = 1))
## Spline with splines::bs(): Quadratic spline with 1 equally-sized segment (df-degree knots)
plotBase + FitLayer(y ~ bs(x, df = 2, knots = NULL, degree = 2))
## Spline with splines::bs(): Quadratic spline with 2 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 3, knots = NULL, degree = 2))
## Spline with splines::bs(): Quadratic spline with 3 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 4, knots = NULL, degree = 2))
## Spline with splines::bs(): Quadratic spline with 4 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 5, knots = NULL, degree = 2))
## Spline with splines::bs(): Quadratic spline with 5 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 6, knots = NULL, degree = 2))
## Spline with splines::bs(): Quadratic spline with 10 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 11, knots = NULL, degree = 2))
## Spline with splines::bs(): Quadratic spline with 20 equally-sized segment
plotBase + FitLayer(y ~ bs(x, df = 21, knots = NULL, degree = 2))
## Generalized Additive Model (Nonparametric smoother)
plotBase + FitLayer(y ~ s(x, bs = "cr"), fitFun = gam, print = T)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## y ~ s(x, bs = "cr")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -25.55 1.95 -13.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(x) 8.39 8.87 53.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.783 Deviance explained = 79.7%
## GCV score = 544.48 Scale est. = 506.04 n = 133
## LOESS span = 0.75
plotBase + FitLayer(y ~ x, fitFun = loess)
## LOESS span = 0.40
plotBase + FitLayer(y ~ x, fitFun = loess, span = 0.4)
## Multivariate Adaptive Regression Spline (MARS) Models
plotBase + FitLayer(y ~ x, fitFun = earth, print = T)
## Call: earth(formula=formula, data=data)
##
## coefficients
## (Intercept) -90.993
## h(x-19.4) 9.250
## h(19.4-x) 8.073
## h(x-31.2) -10.236
##
## Selected 4 of 6 terms, and 1 of 1 predictors
## Importance: x
## Number of terms at each degree of interaction: 1 3 (additive model)
## GCV 1120 RSS 133670 GRSq 0.524 RSq 0.5663