#
# CRP245 Spring 2021 Final Exam
#
# R code for Question 8 and 9
# Prediction for Heart Failure Patients -- Pakistan Case Study
# n=299 patients, 13 variables
#
# Variables
#
# Time Follow-up time in days
# Event Mortality or censor
# Gender (*using variable name from original study)
# Smoking Smoking status (0=no; 1=yes)
# Diabetes Dabetes status (0=no; 1=yes)
# SBP Higher than normal Systolic BP (0=no; 1=yes)
# Anaemia Anaemia status (0=no; 1=yes)
# Age Age in years
# Eject.Fr Eection fraction
# Sodium serum sodium
# Creatinine serum creatinine
# Platelets platelets count
# CPK creatinine phosphokinase
## Download and load the data file = prevend
load(url("http://www.duke.edu/~sgrambow/crp241data/predicthf.RData"))
# structure of the data
str(predicthf)
## 'data.frame': 299 obs. of 13 variables:
## $ Time : int 97 180 31 87 113 10 250 27 87 87 ...
## $ Event : int 0 0 1 0 0 1 0 1 0 0 ...
## $ Gender : int 0 1 1 1 1 1 1 1 1 1 ...
## $ Smoking : int 0 1 1 0 0 0 1 0 0 1 ...
## $ Diabetes : int 0 1 0 0 0 0 0 1 0 0 ...
## $ SBP : int 0 0 1 0 0 0 0 1 1 0 ...
## $ Anaemia : int 1 1 0 1 0 1 0 0 0 0 ...
## $ Age : num 43 73 70 65 64 75 70 94 75 80 ...
## $ Eject.Fr : int 50 30 20 25 60 15 40 38 45 25 ...
## $ Sodium : int 135 142 134 141 137 137 136 134 137 144 ...
## $ Creatinine: num 1.3 1.18 1.83 1.1 1 1.2 2.7 1.83 1.18 1.1 ...
## $ Platelets : num 237000 160000 263358 298000 242000 ...
## $ CPK : int 358 231 582 305 1610 246 582 582 582 898 ...
# summary of overall data -- any missing values?
summary(predicthf)
## Time Event Gender Smoking
## Min. : 4.0 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 73.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :115.0 Median :0.0000 Median :1.0000 Median :0.0000
## Mean :130.3 Mean :0.3211 Mean :0.6488 Mean :0.3211
## 3rd Qu.:203.0 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :285.0 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Diabetes SBP Anaemia Age
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :40.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:51.00
## Median :0.0000 Median :0.0000 Median :0.0000 Median :60.00
## Mean :0.4181 Mean :0.3512 Mean :0.4314 Mean :60.83
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:70.00
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :95.00
## Eject.Fr Sodium Creatinine Platelets
## Min. :14.00 Min. :113.0 Min. :0.500 Min. : 25100
## 1st Qu.:30.00 1st Qu.:134.0 1st Qu.:0.900 1st Qu.:212500
## Median :38.00 Median :137.0 Median :1.100 Median :262000
## Mean :38.08 Mean :136.6 Mean :1.394 Mean :263358
## 3rd Qu.:45.00 3rd Qu.:140.0 3rd Qu.:1.400 3rd Qu.:303500
## Max. :80.00 Max. :148.0 Max. :9.400 Max. :850000
## CPK
## Min. : 23.0
## 1st Qu.: 116.5
## Median : 250.0
## Mean : 581.8
## 3rd Qu.: 582.0
## Max. :7861.0
# visualize distribution of survival times
# use jitter to avoid stacking points
stripchart(predicthf$Time,
main="time to death or censoring (days)",
xlab="days",
ylab="time to death-censoring ",
method="jitter",
col="blue",
pch=19
)
# standard KM figure
# install.packages('survminer')
# we need this package for the ggsurvplot
# command used below to create the KM plot
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
library(survival) # need this for standard survival functions
# Obtain survival estimates using survfit() function
fit.km <- survfit(Surv(predicthf$Time,predicthf$Event) ~ 1 , data=predicthf)
# plot KM
ggsurvplot(fit.km, data = predicthf,
risk.table = TRUE,
surv.median.line = "hv")
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, : Median
## survival not reached.


# lasso regression
y <- Surv(predicthf$Time,predicthf$Event)
x <- model.matrix(y~.,predicthf)[,c(-1:-3)]
# install.packages("rms")
# load rms package
library(rms)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
# run standard cox model with all covariates
cph(y ~ x)
## Cox Proportional Hazards Model
##
## cph(formula = y ~ x)
##
## Model Tests Discrimination
## Indexes
## Obs 299 LR chi2 81.95 R2 0.248
## Events 96 d.f. 11 Dxy 0.482
## Center -4.3059 Pr(> chi2) 0.0000 g 1.132
## Score chi2 88.39 gr 3.103
## Pr(> chi2) 0.0000
##
## Coef S.E. Wald Z Pr(>|Z|)
## Gender -0.2377 0.2516 -0.94 0.3448
## Smoking 0.1291 0.2512 0.51 0.6073
## Diabetes 0.1400 0.2232 0.63 0.5304
## SBP 0.4759 0.2162 2.20 0.0277
## Anaemia 0.4603 0.2169 2.12 0.0338
## Age 0.0464 0.0093 4.98 <0.0001
## Eject.Fr -0.0489 0.0105 -4.67 <0.0001
## Sodium -0.0442 0.0233 -1.90 0.0577
## Creatinine 0.3218 0.0701 4.59 <0.0001
## Platelets 0.0000 0.0000 -0.41 0.6804
## CPK 0.0002 0.0001 2.23 0.0260
##
# note that c-index is equal to Dxy/2+.5
# which is (0.482/2)+.5 = 0.741
# LASSO REGRESSION MODELING
# install.packages("glmnet")
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.4
## Loading required package: Matrix
## Loaded glmnet 4.1-1
# Perform Lasso model
lasso.mod=glmnet(x,y,family = "cox", alpha=1)
# install.packages("plotmo")
library(plotmo) # for plot_glmnet
## Warning: package 'plotmo' was built under R version 4.0.4
## Loading required package: plotrix
## Loading required package: TeachingDemos
## Warning: package 'TeachingDemos' was built under R version 4.0.4
##
## Attaching package: 'TeachingDemos'
## The following objects are masked from 'package:Hmisc':
##
## cnvrt.coords, subplot
# plot the solution path for lasso model
plot_glmnet(lasso.mod)

# perform cross-validation for lasso model
# set seed for reproducibility
set.seed(100)
# perform cross-validation
cv.out.lasso=cv.glmnet(x,y,alpha=1,family = "cox")
# plot the cross-validation curve
plot(cv.out.lasso)

# print out coefficients for lambda.1se
coef(cv.out.lasso,s="lambda.1se")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## Gender .
## Smoking .
## Diabetes .
## SBP .
## Anaemia .
## Age 0.01700736
## Eject.Fr -0.01429952
## Sodium .
## Creatinine 0.18480318
## Platelets .
## CPK .
# print out AUC (c-index) for the lambda.1se lasso model
# install.packages("survAUC")
library(survAUC)
## Warning: package 'survAUC' was built under R version 4.0.5
pred.lasso=predict(cv.out.lasso,newx=x,s="lambda.1se",type="response")
r.AUC <- AUC.uno(y,y,(pred.lasso),seq(0,250.1,1))
r.AUC$iauc
## [1] 0.7304817
# PERFORM RIDGE REGRESSION MODELING
# Perform Ridge Regression -- recall alpha=0 for ridge
ridge.model <- glmnet(x,y,family = "cox", alpha=0)
# plot solution path
plot_glmnet(ridge.model)

# set seed for reproducibility of cross-validation
set.seed(200)
cv.out.ridge=cv.glmnet(x,y,alpha=0,family = "cox")
# plot the cross-validation curve
plot(cv.out.ridge)

# print model coefficients for lambda.1se
coef(cv.out.ridge,s="lambda.1se")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## Gender -7.289374e-04
## Smoking -1.611772e-04
## Diabetes -4.514277e-03
## SBP 6.569394e-02
## Anaemia 5.096381e-02
## Age 5.959247e-03
## Eject.Fr -5.259709e-03
## Sodium -1.060097e-02
## Creatinine 7.096712e-02
## Platelets -8.688418e-08
## CPK 1.705966e-05
# print out AUC (c-index) for the lambda.1se ridge model
pred.ridge=predict(cv.out.ridge,newx=x,s="lambda.1se",type="response")
r.AUC <- AUC.uno(y,y,(pred.ridge),seq(0,250.1,1))
r.AUC$iauc
## [1] 0.7670144
# END OF FILE