# ~ CRP 245 Part 2 Day 5 More Linearity ~
# Repeats some code from Part 2 Day 4
# with a few changes to simplify coding
# Download and load the linearity dataset:
load(url("http://www.duke.edu/~sgrambow/crp241data/linearity.RData"))
# Plot the data to see the relationship between x and y:
# - The dots represent the observed data, which contain noise (i.e. error)
# we are trying to estimate using the data (i.e. via regression)
library(ggplot2)
ggplot(linearity, aes(x=x, y=y))+
geom_point()

# What happens if we "fit a line" to these data to estimate
# the relationship between x and y?
# - That is, we fit a linear regression line and assume the relationship
# between x and y is linear?
# (1) Fit the regression line
# NOTE: p-value for x is greater than 0.05; says there is no relationship
# between x and y, but we know that is not true from looking at the
# data. Happens because this analysis ignored the non-linear
# relationship between x and y.
fit <- lm(y~x,data=linearity)
summary(fit)
##
## Call:
## lm(formula = y ~ x, data = linearity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4729 -1.8603 0.1531 1.9353 4.5427
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.19872 0.89892 14.68 3.09e-15 ***
## x -0.07083 0.04754 -1.49 0.147
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.483 on 30 degrees of freedom
## Multiple R-squared: 0.06889, Adjusted R-squared: 0.03785
## F-statistic: 2.22 on 1 and 30 DF, p-value: 0.1467
# plot the fit
# plot the data with RCS fit
ggplot(linearity, aes(x=x, y=y))+
geom_point()+
geom_smooth(method=lm,data=linearity,
formula= y~x,se=FALSE)

# Using restricted cubic splines (RCS) to assess
# linearity for continuous variables. Can also be used
# as a way to model continuous variables in regression
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# - Install and load package with the function to create
# the RCS variables:
# NOTE: Uncomment install.packages() line to install
# package ONCE; then comment back out and only use
# the library() command to load it.
# Need rms package
# install.packages("rms")
library(rms)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
# In the Rscript on day 4 we used Hmisc library
# Today we use RMS package which has very simple
# coding for RCS variables
# - Create the RCS variables
# we can use rcs() function in RMS package
# to calculate restricted cubic spline version
# of continuous variable in model.
# Just use rcs(x)
# This will use a restricted cubic spline with
# 5 knots. if you want fewer knots, enter a
# 2nd argument after comma -- rcs(x,3)
# - Fit the regression model with a restricted
# cubic spline for x
fit.rcs <- lm(y ~ rcs(x),data=linearity)
summary(fit.rcs)
##
## Call:
## lm(formula = y ~ rcs(x), data = linearity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.28581 -0.54905 0.02518 0.60831 2.28503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.6924 0.7764 11.196 1.19e-11 ***
## rcs(x)x 0.5177 0.1268 4.084 0.000354 ***
## rcs(x)x' -1.1693 1.1372 -1.028 0.312964
## rcs(x)x'' 1.1357 2.6086 0.435 0.666758
## rcs(x)x''' -0.1688 2.9150 -0.058 0.954236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.119 on 27 degrees of freedom
## Multiple R-squared: 0.8298, Adjusted R-squared: 0.8046
## F-statistic: 32.92 on 4 and 27 DF, p-value: 5.052e-10
# plot the data with RCS fit
ggplot(linearity, aes(x=x, y=y))+
geom_point()+
geom_smooth(method=lm,data=linearity,
formula= y~rcs(x,3),se=FALSE)

# Then a lack-of-fit test is performed to see
# if the RCS model fits the data better than
# the linear model. If so, then there is evidence
# of a non-linear relationship.
# - Perform lack-of-fit test between linear and RCS model:
# Use the anova() function
# 1st argument is reduced model
# 2nd argument is more complex model
# significant p-value suggests 2nd model better than 1st
lof.test <- anova(fit,fit.rcs)
lof.test
## Analysis of Variance Table
##
## Model 1: y ~ x
## Model 2: y ~ rcs(x)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 30 184.982
## 2 27 33.808 3 151.17 40.244 4.211e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# - Note: p-value < 0.05; so reject H0: linear model is adequate;
# there is evidence of a non-linear relationship!
# Significant p-value suggests RCS fit is superior.
# Let's say we want to stick with the RCS fit.
# What do we do next?
# The only downside to using RCS is that the actual
# RCS terms are not easily interpretable in a clinical
# setting.
# One approach is to try to approximate the non-linear
# relationship using linear splines which approximates
# the non-linear relationship as a series of linear ones.
# - Here, the parabolic relationship could be approximated
# using a linear spline with one knot at x=17 (ish).
# That is, there appears to be a positive linear relationship
# from x = 0 to x = 17 and then a negative linear relationship
# from x = 17 to x = 32. Linear splines are much more
# interpretable clinically.
# - See the code below for how to fit a linear spline;
# Two key points we should mention
# (1) Only consider this AFTER a non-linear association
# has been found
# (2) Only consider this AFTER plotting the relationship
# to see if approximating it with a linear spline is
# reasonable.
# we can easily fit linear spline term using
# lspline package
# remember to install first time you use it
# install.packages("lspline")
library(lspline)
## Warning: package 'lspline' was built under R version 4.0.4
# we want to use lspline() function
# 1st argument -- continuous variable of interest
# 2nd argument -- knots = numeric vector of knot positions
# we want a single knot at x=17
# - Fit linear spline model
fit.lsp <- lm(y ~ lspline(x,knots=c(17)) ,data=linearity)
summary(fit.lsp)
##
## Call:
## lm(formula = y ~ lspline(x, knots = c(17)), data = linearity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3468 -0.7455 0.0425 0.8522 2.3381
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.44859 0.58590 16.127 5.11e-16 ***
## lspline(x, knots = c(17))1 0.36368 0.05032 7.227 5.88e-08 ***
## lspline(x, knots = c(17))2 -0.54812 0.05424 -10.105 5.21e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.221 on 29 degrees of freedom
## Multiple R-squared: 0.7824, Adjusted R-squared: 0.7674
## F-statistic: 52.13 on 2 and 29 DF, p-value: 2.492e-10
confint(fit.lsp)
## 2.5 % 97.5 %
## (Intercept) 8.2502858 10.6468963
## lspline(x, knots = c(17))1 0.2607595 0.4665931
## lspline(x, knots = c(17))2 -0.6590618 -0.4371787
# - Plot linear spline fit
# - Note: From the plot, you can see that the RCS fit (orange)
# is flexible but that linear spline fit (purple)
# is not as close (it overestimates the true relationhip
# for some values of x and under estimates for others);
# but this is the trade-off between model fit and
# interpretability.
ggplot(linearity, aes(x=x, y=y))+
geom_point()+
geom_smooth(method=lm,data=linearity,
formula= y~lspline(x,knots=c(17)),se=FALSE)+
geom_smooth(method=lm,data=linearity,
formula= y~rcs(x),col='green',se=FALSE)

# Reporting coefficients like in diabetes paper
# for x < 17 slope coefficient is 0.36
# for x > 17 slope coefficient is -0.55
#######################################
# ~
# RScript for Fitting Cox PH Model #
# to CGD data and checking the #
# Proportional Hazards Assumption #
#######################################
# Survival Regression Example -- CGD Data
# Data are from a placebo controlled trial of gamma interferon in
# chronic granulotomous disease (CGD). Outcome of interest is time to
# first serious infection observed through end of study for each patient.
# Data Set: cgd
# Data Dictionary:
# (1) id: subject identifiction number
# (2) treat: placebo or gamma interferon (1 = gamma; 0 = placebo)
# (3) sex: sex (1 = Male; 0 = Female)
# (4) age: age in years, at study entry
# (5) height: height in cm at study entry
# (6) weight: weight in kg at study entry
# (7) inherit: pattern of inheritance (1 = X-linked; 0 = autosomal)
# (8) propylac: use of prophylactic antibiotics at study entry (1 = yes; 0 = no)
# (9) tstop: time to fist serious infection or censoring
# (10) status: binary indicator of event or censoring (1 = event, 0 = censor)
## Download and load the data file = bpdata
load(url("http://www.duke.edu/~sgrambow/crp241data/cgd.RData"))
# Install and load package for Survival Analysis
#install.packages('survival')
library(survival)
# Compare survival curves by treatment arm using Cox PH regression
fit.mcox <- coxph(Surv(tstop, status) ~ treat+age, data=cgd)
summary(fit.mcox)
## Call:
## coxph(formula = Surv(tstop, status) ~ treat + age, data = cgd)
##
## n= 128, number of events= 44
##
## coef exp(coef) se(coef) z Pr(>|z|)
## treat -1.15715 0.31438 0.33740 -3.430 0.000604 ***
## age -0.02832 0.97208 0.01714 -1.652 0.098485 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## treat 0.3144 3.181 0.1623 0.609
## age 0.9721 1.029 0.9400 1.005
##
## Concordance= 0.656 (se = 0.045 )
## Likelihood ratio test= 14.71 on 2 df, p=6e-04
## Wald test = 13.41 on 2 df, p=0.001
## Score (logrank) test = 14.39 on 2 df, p=8e-04
# (1) Overall Model Fit
# Check the Likelihood ratio test which is analagous to F-test in linear
# regression; simultaenously testing that all # beta coefficients are
# zero (null hypothesis) vs at least one is not zero.
# This tells you that your model is doing something...
# RESULT: for this model, F=14.71 with 2 DF, p-value =0.0006
# All betas are NOT zero
# (2) Generalized R-Squared
# Analagous to multiple R-squared from linear regression
# RESULT: Here we see R2 = 0.109.
# Checking Porportional Hazards Assumption
fit.mcox_zph <- cox.zph(fit.mcox)
fit.mcox_zph
## chisq df p
## treat 0.0369 1 0.85
## age 0.5071 1 0.48
## GLOBAL 0.5158 2 0.77
# if we examine the function, it provides a p-value for
# each variable in the model. The p-value
# corresponds to the following hypothesis
# H0: Proportional Hazards satisfied
# H1: Proportional Hazards not satisfied
# Therefore, significant p-value suggests nonproportional hazards
# examining the p-values, both are > 0.05, so no nonproportional hazards
# we can also generate plots to visualized non-proportionality
# plots should display a roughly horizontal line.
plot(fit.mcox_zph,var="treat")

plot(fit.mcox_zph,var="age")

# end of file