# *************************************
# CRP 245 Part 2 Day 4 Linearity
# *************************************
# *************************************
# * Sample R Code for Testing Linearity between
# * a Continuous Response / Continuous Predictor
# * Using Restricted Cubic Splines and a Lack-of-Fit Test
# *************************************
# Author: Megan Neely, PhD (with some modifications from Steve)
# Download and load the linearity dataset:
load(url("http://www.duke.edu/~sgrambow/crp241data/linearity.RData"))
# Plot the data to see the relationship between x and y:
# - The dots represent the observed data, which contain noise (i.e., error).
#   The underlying relationship between x and y is what we are trying to
#   estimate from the data (i.e., via regression).
plot(linearity$x,linearity$y,pch=19,main='Scatterplot for X and Y',
xlab='X', ylab='Y')

# What happens if we "fit a line" to these data to estimate
# the relationship between x and y?
# - That is, what if we fit a linear regression line and assume the
#   relationship between x and y is linear?
# (1) Fit the regression line
# NOTE: the p-value for x is greater than 0.05, which says there is no
# relationship between x and y; but we know that is not true from looking
# at the data. This happens because the analysis ignored the non-linear
# relationship between x and y.
fit <- lm(y~x,data=linearity)
summary(fit)
##
## Call:
## lm(formula = y ~ x, data = linearity)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -4.4729 -1.8603  0.1531  1.9353  4.5427
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.19872    0.89892   14.68 3.09e-15 ***
## x           -0.07083    0.04754   -1.49    0.147
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.483 on 30 degrees of freedom
## Multiple R-squared: 0.06889, Adjusted R-squared: 0.03785
## F-statistic: 2.22 on 1 and 30 DF, p-value: 0.1467
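# Aside: a Pearson correlation test has the same blind spot, since
# correlation only measures *linear* association. A quick sketch:
cor.test(linearity$x, linearity$y)
# Like the regression above, this reports a weak, non-significant linear
# association even though a strong curved relationship is clearly visible.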
# (2) Plot the fitted regression line (blue) and again the data (dots)
plot(linearity$x, linearity$y, pch=19,
     main='Scatterplot for X & Y with Simple Linear Regression Line',
     xlab='X', ylab='Y')
abline(fit, col='blue', lty=2, lwd=2)

# check the diagnostics
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit)

par(mfrow=c(1,1)) # Change back to 1 x 1
# Both the diagnostics and the scatterplot suggest a nonlinear
# (curvilinear) relationship.
# How do we fit this? Multiple possibilities:
# 1. Use simple polynomials (x^2)
# 2. Use loess curve fitting (local polynomial regression): a method
#    for fitting a smooth curve between two variables, or a smooth
#    surface between an outcome and multiple predictor variables.
# 3. Fit a linear spline
# 4. Fit a restricted cubic spline (RCS)
# 1. Simple Polynomials (X^2)
# create x2 to be x*x
linearity$x2 <- linearity$x * linearity$x
# now fit the polynomial model
fit.poly <- lm(y~x+x2,data=linearity)
summary(fit.poly)
##
## Call:
## lm(formula = y ~ x + x2, data = linearity)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -2.13152 -0.64060  0.09362  0.72246  2.46334
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  7.872156   0.616323   12.77 1.96e-13 ***
## x            0.869151   0.086104   10.09 5.33e-11 ***
## x2          -0.028484   0.002531  -11.25 4.25e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.09 on 29 degrees of freedom
## Multiple R-squared: 0.8265, Adjusted R-squared: 0.8145
## F-statistic: 69.07 on 2 and 29 DF, p-value: 9.341e-12
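# Note: an equivalent fit without creating x2 by hand (a sketch using
# base R; poly() uses orthogonal polynomials by default, so the
# coefficients differ from above, but the fitted values and the overall
# F-test are identical):
fit.poly2 <- lm(y ~ poly(x, 2), data=linearity)
# Or, to reproduce the raw x + x^2 coefficients exactly:
fit.poly.raw <- lm(y ~ x + I(x^2), data=linearity)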
# (2) Plot the fitted quadratic curve (blue) and again the data (dots)
# get predicted values for Y_hat
linearity$y.quad_pred <- predict(fit.poly)
# plot the data
plot( linearity$x,linearity$y,pch=19,main='Scatterplot w/ Quadratic Fit',
xlab='X',ylab='Y')
lines(linearity$x,linearity$y.quad_pred,col='blue',lty=2,lwd=2)

# check the diagnostics for polynomial model
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit.poly)

par(mfrow=c(1,1)) # Change back to 1 x 1
# 2. Use loess Curve Fitting (Local Polynomial Regression)
# loess is a method for fitting a smooth curve between two variables.
fit.loess <- loess(linearity$y ~ linearity$x)
summary(fit.loess)
## Call:
## loess(formula = linearity$y ~ linearity$x)
##
## Number of Observations: 32
## Equivalent Number of Parameters: 4.35
## Residual Standard Error: 1.13
## Trace of smoother matrix: 4.76 (exact)
##
## Control settings:
##   span       : 0.75
##   degree     : 2
##   family     : gaussian
##   surface    : interpolate   cell = 0.2
##   normalize  : TRUE
##   parametric : FALSE
##   drop.square: FALSE
# get predicted values of y
y.pred_loess <- predict(fit.loess)
# Plot the data
plot( linearity$x,linearity$y,pch=19,main='Scatterplot w/ loess fit',
xlab='X',ylab='Y')
lines(linearity$x,y.pred_loess,col='dark green',lwd=2, lty=6)

# superimpose the quadratic (polynomial) fit on the loess fit
plot(linearity$x, linearity$y, pch=19,
     main='Scatterplot w/ loess & quadratic fits',
     xlab='X', ylab='Y')
lines(linearity$x,y.pred_loess,col='dark green',lwd=2, lty=6)
lines(linearity$x,linearity$y.quad_pred,col='blue',lty=2,lwd=2)
legend('bottomleft',
legend=c('loess','Quadratic'),
col=c('dark green','blue'),
lwd=2,lty=c(6,2),bty='n')
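# Sketch: the loess fit depends on span, the fraction of the data used
# for each local fit (default 0.75). Smaller spans give wigglier curves,
# larger spans smoother ones; overlay a couple to check sensitivity:
fit.loess.s5 <- loess(linearity$y ~ linearity$x, span=0.5)
fit.loess.s9 <- loess(linearity$y ~ linearity$x, span=0.9)
lines(linearity$x, predict(fit.loess.s5), col='gray40', lty=3)
lines(linearity$x, predict(fit.loess.s9), col='gray70', lty=3)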

# 3. Linear Splines
# The idea here is to approximate the non-linear relationship using
# linear splines, which represent it as a series of connected linear
# segments. Here, the parabolic relationship could be approximated
# using a linear spline with one knot at x = 17 (ish).
# That is, there appears to be a positive linear relationship from
# x = 0 to x = 17 and then a negative linear relationship from x = 17
# to x = 32. Linear splines are much more interpretable clinically.
# See the code below for how to fit a linear spline; again, I would
# only do this after a non-linear association has been found and after
# plotting the relationship to see if approximating it with a linear
# spline is reasonable.
# - Make linear spline terms:
linearity$x17 <- ( linearity$x - 17 )
linearity$x17[ linearity$x17<0 ] <- 0
# - Fit linear spline model
fit3.lin.spline <- lm(y ~ x + x17,data=linearity)
summary(fit3.lin.spline)
##
## Call:
## lm(formula = y ~ x + x17, data = linearity)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -3.3468 -0.7455  0.0425  0.8522  2.3381
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  9.44859    0.58590  16.127 5.11e-16 ***
## x            0.36368    0.05032   7.227 5.88e-08 ***
## x17         -0.91180    0.09351  -9.751 1.17e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.221 on 29 degrees of freedom
## Multiple R-squared: 0.7824, Adjusted R-squared: 0.7674
## F-statistic: 52.13 on 2 and 29 DF, p-value: 2.492e-10
# Plot linear spline fit
# Note: the generated data are already sorted in ascending order and
# there are no replicate values; that will not be true for real data,
# so we make a sequence of plotting x-values. The code below does that,
# but you may need to adjust the increment in the seq() function to
# make sense for your data (e.g., by=0.1 or by=5).
# Note: From the plot, you can see that the linear spline fit (magenta)
# tracks the curvature much better than the straight-line fit (blue),
# though it overestimates the relationship for some values of x and
# underestimates it for others; this is the trade-off between model
# fit and interpretability.
x.range <- range(linearity$x)
x.seq <- seq(x.range[1], x.range[2], by=1)
x.seq17 <- x.seq - 17   # build the spline term from x.seq, not linearity$x
x.seq17[ x.seq17 < 0 ] <- 0
yhat.ls <- predict(fit3.lin.spline, data.frame(x=x.seq, x17=x.seq17))
plot( linearity$x, linearity$y ,pch=19)
title("Data with simple linear spline fit")
abline(fit,col='blue',lty=2, lwd=2)
lines(x.seq, yhat.ls, col='magenta', lty=2, lwd=2)
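# Optional sketch: add pointwise 95% confidence bands for the linear
# spline; predict() with interval='confidence' returns columns fit,
# lwr, and upr.
ci.ls <- predict(fit3.lin.spline,
                 newdata=data.frame(x=x.seq, x17=x.seq17),
                 interval='confidence')
matlines(x.seq, ci.ls[, c('lwr','upr')], col='magenta', lty=3, lwd=1)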
# So how can we determine if there is evidence of a
# non-linear relationship? There are many ways, but a simple
# and powerful one is to compare the fit of a
# regression model that assumes a linear relationship
# to the fit of one that assumes a non-linear relationship.
# A common way of modeling a non-linear relationship between
# a response and a predictor is through a transformation of
# the predictor called a Restricted Cubic Spline (RCS).
# For simplicity's sake, you can think of the RCS as adding
# squared and cubed terms based on the predictor to the model
# statement. Then a lack-of-fit test is performed to see if the
# RCS model fits the data better than the linear model.
# If so, there is evidence of a non-linear relationship.
# To determine the nature of the non-linear relationship,
# you can plot the predicted values from the RCS model as a
# function of x. The code below shows how to do this: first a
# quick rms-based aside, then the step-by-step Hmisc approach.
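# Aside: Frank Harrell's rms package offers a one-step alternative.
# A sketch, assuming rms is installed; anova() on an rms fit reports
# a test of nonlinearity for the rcs() term directly.
# install.packages('rms')   # once, if needed
library(rms)
fit.rms <- ols(y ~ rcs(x, 5), data=linearity)
anova(fit.rms)   # check the 'Nonlinear' row for x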
# Install and load the package with the function to create the
# RCS variables. NOTE: Uncomment the install.packages() line to
# install the package ONCE; then comment it back out and only use
# the library() command to load it.
#install.packages('Hmisc')
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units

# - Create the RCS variables
rcs.x <- rcspline.eval(linearity$x, nk=5, inclx=TRUE)
colnames(rcs.x) <- c('x','x1','x2','x3')
# - Fit the regression model with RCS terms
fit3.rcs <- lm(y ~ rcs.x,data=linearity)
summary(fit3.rcs)
##
## Call:
## lm(formula = y ~ rcs.x, data = linearity)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -2.28581 -0.54905  0.02518  0.60831  2.28503
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   8.6924     0.7764  11.196 1.19e-11 ***
## rcs.xx        0.5177     0.1268   4.084 0.000354 ***
## rcs.xx1      -1.1693     1.1372  -1.028 0.312964
## rcs.xx2       1.1357     2.6086   0.435 0.666758
## rcs.xx3      -0.1688     2.9150  -0.058 0.954236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.119 on 27 degrees of freedom
## Multiple R-squared: 0.8298, Adjusted R-squared: 0.8046
## F-statistic: 32.92 on 4 and 27 DF, p-value: 5.052e-10
# plot the data
y.rcs.pred <- predict(fit3.rcs) # get predicted y's
plot( linearity$x,linearity$y,pch=19,main='Scatterplot w/ RCS fit',
xlab='X',ylab='Y')
lines(linearity$x,y.rcs.pred,col='dark orange',lwd=2, lty=6)
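# Sketch: to predict at new x values (e.g., on a finer grid), the spline
# basis must be rebuilt with the SAME knots. rcspline.eval() stores the
# knots it chose as an attribute of the matrix it returns:
knots.x <- attr(rcs.x, 'knots')
rcs.grid <- rcspline.eval(x.seq, knots=knots.x, inclx=TRUE)
colnames(rcs.grid) <- c('x','x1','x2','x3')   # match the training colnames
yhat.rcs.grid <- predict(fit3.rcs, newdata=data.frame(rcs.x=I(rcs.grid)))
lines(x.seq, yhat.rcs.grid, col='dark orange', lty=3)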

# - Perform lack-of-fit test between linear and RCS model:
# - Note: p-value < 0.05; so reject H0: linear model is adequate;
# there is evidence of a non-linear relationship!
lof.test <- anova(fit,fit3.rcs)
lof.test
## Analysis of Variance Table
##
## Model 1: y ~ x
## Model 2: y ~ rcs.x
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)
## 1     30 184.982
## 2     27  33.808  3    151.17 40.244 4.211e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
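# Sketch: the lack-of-fit F statistic can be reproduced by hand from the
# residual sums of squares in the table above:
rss.lin <- sum(resid(fit)^2)       # 184.982 on 30 df
rss.rcs <- sum(resid(fit3.rcs)^2)  # 33.808 on 27 df
F.lof <- ((rss.lin - rss.rcs) / 3) / (rss.rcs / 27)
pf(F.lof, 3, 27, lower.tail=FALSE) # matches Pr(>F) above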
# Alternatively, we can use a likelihood ratio test (LRT).
# The first time you use lmtest, you must
# install the package!
# install.packages("lmtest")
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# With this test, you are comparing two nested models:
# - the FULL model includes the RCS terms
# - the REDUCED model is the simple linear fit
# For the LRT, both models must be fit to the same observations,
# so watch out for missing data.
FULL.model <- fit3.rcs
REDUCED.model <- fit
# now compare models
lrtest(FULL.model,REDUCED.model)
## Likelihood ratio test
##
## Model 1: y ~ rcs.x
## Model 2: y ~ x
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   6 -46.285
## 2   3 -73.478 -3 54.386  9.283e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
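# Sketch: the LRT chi-square can be reproduced by hand from the two
# log-likelihoods reported above:
chisq.lrt <- as.numeric(2 * (logLik(FULL.model) - logLik(REDUCED.model)))
pchisq(chisq.lrt, df=3, lower.tail=FALSE)  # matches Pr(>Chisq) above
# Both the lack-of-fit F test and the LRT lead to the same conclusion:
# strong evidence of a non-linear relationship between x and y.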