# *************************************
#  CRP 245 Part 2 Day 4 Linearity 
# *************************************

# *************************************
# *      Sample R Code for Testing Linearity between       
# *      a Continuous Response and a Continuous Predictor  
# * Using Restricted Cubic Splines and a Lack-of-Fit Test 
# *************************************
# Author: Megan Neely, PhD (with some modifications from Steve)

# Download and load the linearity dataset: 
load(url("http://www.duke.edu/~sgrambow/crp241data/linearity.RData"))

# Plot the data to see the relationship between x and y: 
# - The dots represent the observed data, which contain noise (i.e., error); 
#   we are trying to estimate the underlying relationship (i.e., via regression)
plot(linearity$x,linearity$y,pch=19,main='Scatterplot for X and Y',
      xlab='X', ylab='Y')

# What happens if we "fit a line" to these data to estimate 
# the relationship between x and y?
# - That is, we fit a linear regression line and assume the relationship 
#   between x and y is linear?
# (1) Fit the regression line
#  NOTE: the p-value for x is greater than 0.05, which suggests there is 
#  no relationship between x and y; but we know that is not true from 
#  looking at the data. This happens because the analysis ignores the 
#  non-linear relationship between x and y. 
fit <- lm(y~x,data=linearity) 
summary(fit)
## 
## Call:
## lm(formula = y ~ x, data = linearity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4729 -1.8603  0.1531  1.9353  4.5427 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.19872    0.89892   14.68 3.09e-15 ***
## x           -0.07083    0.04754   -1.49    0.147    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.483 on 30 degrees of freedom
## Multiple R-squared:  0.06889,    Adjusted R-squared:  0.03785 
## F-statistic:  2.22 on 1 and 30 DF,  p-value: 0.1467
# (2) Plot the fitted regression line (blue) and again the data (dots)
plot(linearity$x,linearity$y,pch=19,
     main='Scatterplot for X & Y with\nSimple Linear Regression Line',
     xlab='X', ylab='Y')
abline(fit,col='blue',lty=2, lwd=2)

# check the diagnostics
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit)

par(mfrow=c(1,1)) # Change back to 1 x 1

# looks like a nonlinear (curvilinear) relationship in the diagnostics
# and in the scatter plot
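
# If you only want the residuals-vs-fitted panel, plot.lm's 'which' 
# argument selects individual diagnostic plots (an optional shortcut): 
plot(fit, which = 1)  # the curved smooth in this panel suggests nonlinearity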

# How do we fit this? Multiple possibilities:
# 1. Use simple polynomials (x^2)
# 2. Use loess curve fitting (local polynomial regression): 
#    a method for fitting a smooth curve between two variables, or a 
#    smooth surface between an outcome and multiple predictor variables
# 3. Fit a linear spline
# 4. Fit a restricted cubic spline (RCS)

# 1. Simple Polynomials (x^2)
# create x2 to be x^2
linearity$x2 <- linearity$x^2
# now fit the polynomial model
fit.poly <- lm(y~x+x2,data=linearity) 
summary(fit.poly)
## 
## Call:
## lm(formula = y ~ x + x2, data = linearity)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.13152 -0.64060  0.09362  0.72246  2.46334 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.872156   0.616323   12.77 1.96e-13 ***
## x            0.869151   0.086104   10.09 5.33e-11 ***
## x2          -0.028484   0.002531  -11.25 4.25e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.09 on 29 degrees of freedom
## Multiple R-squared:  0.8265, Adjusted R-squared:  0.8145 
## F-statistic: 69.07 on 2 and 29 DF,  p-value: 9.341e-12
# (2) Plot the fitted regression line (blue) and again the data (dots) 

# get predicted values for Y_hat
linearity$y.quad_pred <- predict(fit.poly)
# plot the data
plot( linearity$x,linearity$y,pch=19,main='Scatterplot w/ Quadratic Fit',
      xlab='X',ylab='Y')
lines(linearity$x,linearity$y.quad_pred,col='blue',lty=2,lwd=2)

# check the diagnostics for polynomial model
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit.poly)

par(mfrow=c(1,1)) # Change back to 1 x 1
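
# An equivalent polynomial fit, without creating x2 by hand, uses I() 
# inside the formula; the coefficients match fit.poly above 
# (an optional alternative): 
fit.poly2 <- lm(y ~ x + I(x^2), data = linearity)
summary(fit.poly2)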

# 2. Use loess curve fitting (local polynomial regression): 
#    a method for fitting a smooth curve between two variables
fit.loess <- loess(linearity$y ~ linearity$x)
summary(fit.loess)
## Call:
## loess(formula = linearity$y ~ linearity$x)
## 
## Number of Observations: 32 
## Equivalent Number of Parameters: 4.35 
## Residual Standard Error: 1.13 
## Trace of smoother matrix: 4.76  (exact)
## 
## Control settings:
##   span     :  0.75 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE
# get predicted values of y
y.pred_loess <- predict(fit.loess)
# Plot the data
plot( linearity$x,linearity$y,pch=19,main='Scatterplot w/ loess fit',
      xlab='X',ylab='Y')
lines(linearity$x,y.pred_loess,col='dark green',lwd=2, lty=6)

#superimpose polynomial 
plot( linearity$x,linearity$y,pch=19,
      main='Scatterplot w/ loess & quadratic fits',
      xlab='X',ylab='Y')
lines(linearity$x,y.pred_loess,col='dark green',lwd=2, lty=6)
lines(linearity$x,linearity$y.quad_pred,col='blue',lty=2,lwd=2)
legend('bottomleft',
       legend=c('loess','Quadratic'),
       col=c('dark green','blue'),
       lwd=2,lty=c(6,2),bty='n')
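
# The loess summary above shows the default span of 0.75; smaller spans 
# give wigglier fits and larger spans give smoother ones. A quick sketch 
# of adjusting it (span = 0.5 is an arbitrary illustrative choice): 
fit.loess2 <- loess(linearity$y ~ linearity$x, span = 0.5)
lines(linearity$x, predict(fit.loess2), col = 'red', lwd = 2, lty = 3)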

# 3. Linear Splines
# The idea here is to approximate the non-linear relationship using 
# linear splines, which represent it as a series of connected linear 
# pieces. Here, the parabolic relationship could be 
# approximated using a linear spline with one knot at x=17 (ish). 
# That is, there appears to be a positive linear relationship from 
# x = 0 to x = 17 and then a negative linear relationship from x = 17 to 
# x = 32. Linear splines are much more interpretable clinically. 
# See the code below for how to fit a linear spline; again, I would 
# only do this after a non-linear association has been found and after 
# plotting the relationship to see if approximating it with a linear 
# spline is reasonable. 

# - Make linear spline terms: 
linearity$x17 <- ( linearity$x - 17 )
linearity$x17[ linearity$x17<0 ] <- 0
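
# Equivalently, the two steps above can be collapsed into one line with 
# base R's pmax() (this recomputes the same x17 values): 
linearity$x17 <- pmax(linearity$x - 17, 0)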

# - Fit linear spline model 
fit3.lin.spline <- lm(y ~ x + x17,data=linearity)
summary(fit3.lin.spline)
## 
## Call:
## lm(formula = y ~ x + x17, data = linearity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3468 -0.7455  0.0425  0.8522  2.3381 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.44859    0.58590  16.127 5.11e-16 ***
## x            0.36368    0.05032   7.227 5.88e-08 ***
## x17         -0.91180    0.09351  -9.751 1.17e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.221 on 29 degrees of freedom
## Multiple R-squared:  0.7824, Adjusted R-squared:  0.7674 
## F-statistic: 52.13 on 2 and 29 DF,  p-value: 2.492e-10
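# Interpreting the spline coefficients (a worked example using the 
# output above): the slope for x below the knot is the coefficient on 
# x; the slope above the knot is the sum of the two slope coefficients. 
b <- coef(fit3.lin.spline)
b['x']             # slope before the knot:  0.364
b['x'] + b['x17']  # slope after the knot:   0.364 - 0.912 = -0.548
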
# Plot linear spline fit
# Note: The generated data are already sorted in ascending order and 
# there are no replicate values; that will not be true for real data, 
# so we make a sequence of plotting x-values. The code below does that, 
# but you may need to adjust the increment in the seq() function to 
# make sense for your data (e.g. by=0.1 or by=5). 
# Note: From the plot, you can see that the linear spline fit (magenta) 
# tracks the curvature far better than the simple linear fit (blue), 
# though it overestimates the relationship for some values of x and 
# underestimates it for others; this is the trade-off between model 
# fit and interpretability. 
x.range <- range(linearity$x)
x.seq <- seq(x.range[1],x.range[2],by=1)
x.seq17 <- ( x.seq - 17 )
x.seq17[ x.seq17<0 ] <- 0
yhat.ls <- predict(fit3.lin.spline,data.frame(x=x.seq,x17=x.seq17))

plot( linearity$x, linearity$y ,pch=19)
title("Data with  simple linear spline fit")
abline(fit,col='blue',lty=2, lwd=2)
lines(x.seq, yhat.ls, col='magenta', lty=2, lwd=2)

# So how can we determine if there is evidence of a 
# non-linear relationship? There are many ways, but a simple 
# and powerful one is to compare the fit of a 
# regression model that assumes a linear relationship 
# to the fit of one that assumes a non-linear relationship. 
# A common way of modeling a non-linear relationship between 
# a response and a predictor is through a transformation of 
# the predictor called a Restricted Cubic Spline (RCS). 
# For simplicity's sake, you can think of the RCS as adding 
# squared and cubed terms based on the predictor to the model 
# statement. Then a lack-of-fit test is performed to see if the
# RCS model fits the data better than the linear model. 
# If so, then there is evidence of a non-linear relationship. 
# To determine the nature of the non-linear relationship, 
# you can plot the predicted values from the RCS model as a 
# function of x.  Code below shows you how to do this. 

# Install and load the package with the function to create the 
# RCS variables. NOTE: Uncomment the install.packages() line to 
# install package ONCE; then comment back out and only use 
# the library() command to load it.
#install.packages('Hmisc')
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units

# - Create the RCS variables: with nk=5 knots, rcspline.eval() returns 
#   nk-2 = 3 non-linear spline basis terms; inclx=TRUE also includes x 
#   itself as the first column, giving 4 columns total 
rcs.x <- rcspline.eval(linearity$x, nk=5, inclx=TRUE)
colnames(rcs.x) <- c('x','x1','x2','x3')

# - Fit the regression model with RCS terms
fit3.rcs <- lm(y ~ rcs.x,data=linearity)
summary(fit3.rcs)
## 
## Call:
## lm(formula = y ~ rcs.x, data = linearity)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.28581 -0.54905  0.02518  0.60831  2.28503 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.6924     0.7764  11.196 1.19e-11 ***
## rcs.xx        0.5177     0.1268   4.084 0.000354 ***
## rcs.xx1      -1.1693     1.1372  -1.028 0.312964    
## rcs.xx2       1.1357     2.6086   0.435 0.666758    
## rcs.xx3      -0.1688     2.9150  -0.058 0.954236    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.119 on 27 degrees of freedom
## Multiple R-squared:  0.8298, Adjusted R-squared:  0.8046 
## F-statistic: 32.92 on 4 and 27 DF,  p-value: 5.052e-10
# plot the data 
y.rcs.pred <- predict(fit3.rcs) # get predicted y's
plot( linearity$x,linearity$y,pch=19,main='Scatterplot w/ RCS fit',
      xlab='X',ylab='Y')
lines(linearity$x,y.rcs.pred,col='dark orange',lwd=2, lty=6)
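
# For real (unsorted) data, order by x before drawing the line so it 
# does not zig-zag (a defensive sketch; unnecessary here because x is 
# already sorted): 
ord <- order(linearity$x)
lines(linearity$x[ord], y.rcs.pred[ord], col='dark orange', lwd=2, lty=6)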

# - Perform lack-of-fit test between linear and RCS model: 
#   - Note: p-value < 0.05; so reject H0: linear model is adequate; 
#           there is evidence of a non-linear relationship!
lof.test <- anova(fit,fit3.rcs)
lof.test
## Analysis of Variance Table
## 
## Model 1: y ~ x
## Model 2: y ~ rcs.x
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1     30 184.982                                  
## 2     27  33.808  3    151.17 40.244 4.211e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
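
# The F statistic above can be reproduced by hand from the residual 
# sums of squares in the table (a worked check on the lack-of-fit test): 
F.stat <- ((184.982 - 33.808)/3) / (33.808/27)
F.stat  # ~40.24, matching the anova() output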
# Alternatively, we can use the likelihood ratio test. 
# The first time you use lmtest, you must 
# install the package! 
# install.packages("lmtest")
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# With this test, you are comparing two nested 
# models: 
# the FULL model includes the RCS terms, 
# the REDUCED model is the simple linear fit. 
# For the LRT, both models must be fit on the 
# same set of observations (watch out for this 
# when a predictor has missing values). 
FULL.model <- fit3.rcs
REDUCED.model <- fit
# now compare models
lrtest(FULL.model,REDUCED.model)
## Likelihood ratio test
## 
## Model 1: y ~ rcs.x
## Model 2: y ~ x
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   6 -46.285                         
## 2   3 -73.478 -3 54.386  9.283e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
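
# As one more quick comparison (an optional check, not a formal test), 
# AIC also strongly favors the RCS model; from the log-likelihoods 
# above, AIC is about 104.6 for the RCS fit vs. 153.0 for the line: 
AIC(fit, fit3.rcs)  # lower AIC indicates a better fit/complexity trade-off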