# ~ CRP 245 Part 2 Day 5 More Linearity ~
# Repeats some code from Part 2 Day 4
# with a few changes to simplify coding

# Download and load the linearity dataset: 
load(url("http://www.duke.edu/~sgrambow/crp241data/linearity.RData"))

# Plot the data to see the relationship between x and y: 
# - The dots represent the observed data, which contain noise (i.e. error)
#   we are trying to estimate using the data (i.e. via regression)
library(ggplot2)
ggplot(linearity, aes(x=x, y=y))+
   geom_point()

# What happens if we "fit a line" to these data to estimate 
# the relationship between x and y?
# - That is, we fit a linear regression line and assume the relationship 
#   between x and y is linear?
# (1) Fit the regression line
#  NOTE: p-value for x is greater than 0.05; says there is no relationship 
#  between x and y, but we know that is not true from looking at the 
#  data. Happens because this analysis ignored the non-linear 
#  relationship between x and y. 
fit <- lm(y~x,data=linearity) 
summary(fit)
## 
## Call:
## lm(formula = y ~ x, data = linearity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4729 -1.8603  0.1531  1.9353  4.5427 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.19872    0.89892   14.68 3.09e-15 ***
## x           -0.07083    0.04754   -1.49    0.147    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.483 on 30 degrees of freedom
## Multiple R-squared:  0.06889,    Adjusted R-squared:  0.03785 
## F-statistic:  2.22 on 1 and 30 DF,  p-value: 0.1467
# plot the fit
# plot the data with RCS fit
ggplot(linearity, aes(x=x, y=y))+
   geom_point()+
   geom_smooth(method=lm,data=linearity,
               formula= y~x,se=FALSE)

# Using restricted cubic splines (RCS) to assess
# linearity for continuous variables. Can also be used
# as a way to model continuous variables in regression
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# - Install and load package with the function to create 
#   the RCS variables: 
#   NOTE: Uncomment install.packages() line to install 
#   package ONCE; then comment back out and only use 
#   the library() command to load it.
# Need rms package
# install.packages("rms")
library(rms)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
# In the Rscript on day 4 we used Hmisc library
# Today we use RMS package which has very simple
# coding for RCS variables
# - Create the RCS variables 
# we can use rcs() function in RMS package
# to calculate restricted cubic spline version
# of continuous variable in model.
# Just use rcs(x)
# This will use a restricted cubic spline with
# 5 knots.  if you want fewer knots, enter a 
# 2nd argument after comma -- rcs(x,3)

# - Fit the regression model with a restricted
# cubic spline for x
fit.rcs <- lm(y ~ rcs(x),data=linearity)
summary(fit.rcs)
## 
## Call:
## lm(formula = y ~ rcs(x), data = linearity)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.28581 -0.54905  0.02518  0.60831  2.28503 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.6924     0.7764  11.196 1.19e-11 ***
## rcs(x)x       0.5177     0.1268   4.084 0.000354 ***
## rcs(x)x'     -1.1693     1.1372  -1.028 0.312964    
## rcs(x)x''     1.1357     2.6086   0.435 0.666758    
## rcs(x)x'''   -0.1688     2.9150  -0.058 0.954236    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.119 on 27 degrees of freedom
## Multiple R-squared:  0.8298, Adjusted R-squared:  0.8046 
## F-statistic: 32.92 on 4 and 27 DF,  p-value: 5.052e-10
# plot the data with RCS fit
ggplot(linearity, aes(x=x, y=y))+
   geom_point()+
   geom_smooth(method=lm,data=linearity,
               formula= y~rcs(x,3),se=FALSE)

# Then a lack-of-fit test is performed to see 
# if the RCS model fits the data better than 
# the linear model. If so, then there is evidence 
# of a non-linear relationship. 

# - Perform lack-of-fit test between linear and RCS model: 
#   Use the anova() function
#   1st argument is reduced model
#   2nd argument is more complex model
#   significant p-value suggests 2nd model better than 1st
lof.test <- anova(fit,fit.rcs)

lof.test
## Analysis of Variance Table
## 
## Model 1: y ~ x
## Model 2: y ~ rcs(x)
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1     30 184.982                                  
## 2     27  33.808  3    151.17 40.244 4.211e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#   - Note: p-value < 0.05; so reject H0: linear model is adequate; 
#           there is evidence of a non-linear relationship!
# Significant p-value suggests RCS fit is superior.

# Let's say we want to stick with the RCS fit.  
# What do we do next?
# The only downside to using RCS is that the actual 
# RCS terms are not easily interpretable in a clinical 
# setting. 
# One approach is to try to approximate the non-linear 
# relationship using linear splines which approximates 
# the non-linear relationship as a series of linear ones.
# - Here, the parabolic relationship could be approximated 
#   using a linear spline with one knot at x=17 (ish). 
#   That is, there appears to be a positive linear relationship 
#   from x = 0 to x = 17 and then a negative linear relationship 
#   from x = 17 to x = 32. Linear splines are much more 
#   interpretable clinically. 
# - See the code below for how to fit a linear spline;
# Two key points we should mention
# (1) Only consider this AFTER a non-linear association 
#     has been found 
# (2) Only consider this AFTER plotting the relationship 
#     to see if approximating it with a linear spline is 
#     reasonable. 

# we can easily fit linear spline term using 
# lspline package
# remember to install first time you use it
# install.packages("lspline")
library(lspline)
## Warning: package 'lspline' was built under R version 4.0.4
# we want to use lspline() function
# 1st argument -- continuous variable of interest
# 2nd argument -- knots = numeric vector of knot positions
# we want a single knot at x=17

# - Fit linear spline model 
fit.lsp <- lm(y ~ lspline(x,knots=c(17)) ,data=linearity)
summary(fit.lsp)
## 
## Call:
## lm(formula = y ~ lspline(x, knots = c(17)), data = linearity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3468 -0.7455  0.0425  0.8522  2.3381 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 9.44859    0.58590  16.127 5.11e-16 ***
## lspline(x, knots = c(17))1  0.36368    0.05032   7.227 5.88e-08 ***
## lspline(x, knots = c(17))2 -0.54812    0.05424 -10.105 5.21e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.221 on 29 degrees of freedom
## Multiple R-squared:  0.7824, Adjusted R-squared:  0.7674 
## F-statistic: 52.13 on 2 and 29 DF,  p-value: 2.492e-10
confint(fit.lsp)
##                                 2.5 %     97.5 %
## (Intercept)                 8.2502858 10.6468963
## lspline(x, knots = c(17))1  0.2607595  0.4665931
## lspline(x, knots = c(17))2 -0.6590618 -0.4371787
# - Plot linear spline fit
# - Note: From the plot, you can see that the RCS fit (orange) 
#     is flexible but that linear spline fit (purple) 
#     is not as close (it overestimates the true relationhip 
#     for some values of x and under estimates for others); 
#     but this is the trade-off between model fit and 
#     interpretability. 

ggplot(linearity, aes(x=x, y=y))+
   geom_point()+
   geom_smooth(method=lm,data=linearity,
               formula= y~lspline(x,knots=c(17)),se=FALSE)+
   geom_smooth(method=lm,data=linearity,
               formula= y~rcs(x),col='green',se=FALSE)

# Reporting coefficients like in diabetes paper
# for x < 17 slope coefficient is 0.36
# for x > 17 slope coefficient is -0.55




####################################### 
#        ~
# RScript for Fitting Cox PH Model #
# to CGD data and checking the     #
# Proportional Hazards Assumption  #
####################################### 
# Survival Regression Example -- CGD Data 

# Data are from a placebo controlled trial of gamma interferon in 
# chronic granulotomous disease (CGD). Outcome of interest is time to 
# first serious infection observed through end of study for each patient.

# Data Set: cgd

# Data Dictionary: 
# (1)  id:           subject identifiction number
# (2)  treat:        placebo or gamma interferon (1 = gamma; 0 = placebo)
# (3)  sex:          sex (1 = Male; 0 = Female)
# (4)  age:          age in years, at study entry
# (5)  height:       height in cm at study entry
# (6)  weight:       weight in kg at study entry
# (7)  inherit:      pattern of inheritance (1 = X-linked; 0 = autosomal)
# (8)  propylac:     use of prophylactic antibiotics at study entry (1 = yes; 0 = no)
# (9)  tstop:        time to fist serious infection or censoring 
# (10) status:       binary indicator of event or censoring (1 = event, 0 = censor)

## Download and load the data file = bpdata
load(url("http://www.duke.edu/~sgrambow/crp241data/cgd.RData"))
# Install and load package for Survival Analysis 
#install.packages('survival')
library(survival)

# Compare survival curves by treatment arm using Cox PH regression 
fit.mcox <- coxph(Surv(tstop, status) ~ treat+age, data=cgd)
summary(fit.mcox)
## Call:
## coxph(formula = Surv(tstop, status) ~ treat + age, data = cgd)
## 
##   n= 128, number of events= 44 
## 
##           coef exp(coef) se(coef)      z Pr(>|z|)    
## treat -1.15715   0.31438  0.33740 -3.430 0.000604 ***
## age   -0.02832   0.97208  0.01714 -1.652 0.098485 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##       exp(coef) exp(-coef) lower .95 upper .95
## treat    0.3144      3.181    0.1623     0.609
## age      0.9721      1.029    0.9400     1.005
## 
## Concordance= 0.656  (se = 0.045 )
## Likelihood ratio test= 14.71  on 2 df,   p=6e-04
## Wald test            = 13.41  on 2 df,   p=0.001
## Score (logrank) test = 14.39  on 2 df,   p=8e-04
# (1) Overall Model Fit
# Check the Likelihood ratio test which is analagous to F-test in linear
# regression; simultaenously testing that all # beta coefficients are 
# zero (null hypothesis) vs at least one is not zero.  
# This tells you that your model is doing something...
# RESULT: for this model, F=14.71 with 2 DF, p-value =0.0006
#         All betas are NOT zero

# (2) Generalized R-Squared
# Analagous to multiple R-squared from linear regression
# RESULT: Here we see R2 = 0.109.  

# Checking Porportional Hazards Assumption
fit.mcox_zph <- cox.zph(fit.mcox)
fit.mcox_zph
##         chisq df    p
## treat  0.0369  1 0.85
## age    0.5071  1 0.48
## GLOBAL 0.5158  2 0.77
# if we examine the function, it provides a p-value for 
# each variable in the model.  The p-value
# corresponds to the following hypothesis
# H0: Proportional Hazards satisfied
# H1: Proportional Hazards not satisfied
# Therefore, significant p-value suggests nonproportional hazards
# examining the p-values, both are > 0.05, so no nonproportional hazards
# we can also generate plots to visualized non-proportionality
# plots should display a roughly horizontal line.
plot(fit.mcox_zph,var="treat")

plot(fit.mcox_zph,var="age")

# end of file