#
# CRP245 Spring 2021 Final Exam
#
# R code for Question 8 and 9
# Prediction for Heart Failure Patients -- Pakistan Case Study
# n=299 patients, 13 variables
#
# Variables
#
# Time           Follow-up time in days
# Event          Mortality or censor
# Gender         (*using variable name from original study)
# Smoking        Smoking status (0=no; 1=yes)
# Diabetes       Dabetes status (0=no; 1=yes)
# SBP            Higher than normal Systolic BP (0=no; 1=yes)
# Anaemia        Anaemia status (0=no; 1=yes)
# Age            Age in years
# Eject.Fr       Eection fraction
# Sodium         serum sodium
# Creatinine     serum creatinine
# Platelets      platelets count
# CPK            creatinine phosphokinase

## Download and load the data file = prevend
load(url("http://www.duke.edu/~sgrambow/crp241data/predicthf.RData"))

# structure of the data 
str(predicthf)
## 'data.frame':    299 obs. of  13 variables:
##  $ Time      : int  97 180 31 87 113 10 250 27 87 87 ...
##  $ Event     : int  0 0 1 0 0 1 0 1 0 0 ...
##  $ Gender    : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ Smoking   : int  0 1 1 0 0 0 1 0 0 1 ...
##  $ Diabetes  : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ SBP       : int  0 0 1 0 0 0 0 1 1 0 ...
##  $ Anaemia   : int  1 1 0 1 0 1 0 0 0 0 ...
##  $ Age       : num  43 73 70 65 64 75 70 94 75 80 ...
##  $ Eject.Fr  : int  50 30 20 25 60 15 40 38 45 25 ...
##  $ Sodium    : int  135 142 134 141 137 137 136 134 137 144 ...
##  $ Creatinine: num  1.3 1.18 1.83 1.1 1 1.2 2.7 1.83 1.18 1.1 ...
##  $ Platelets : num  237000 160000 263358 298000 242000 ...
##  $ CPK       : int  358 231 582 305 1610 246 582 582 582 898 ...
# summary of overall data -- any missing values?
summary(predicthf)
##       Time           Event            Gender          Smoking      
##  Min.   :  4.0   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 73.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :115.0   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :130.3   Mean   :0.3211   Mean   :0.6488   Mean   :0.3211  
##  3rd Qu.:203.0   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :285.0   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     Diabetes           SBP            Anaemia            Age       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :40.00  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:51.00  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :60.00  
##  Mean   :0.4181   Mean   :0.3512   Mean   :0.4314   Mean   :60.83  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:70.00  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :95.00  
##     Eject.Fr         Sodium        Creatinine      Platelets     
##  Min.   :14.00   Min.   :113.0   Min.   :0.500   Min.   : 25100  
##  1st Qu.:30.00   1st Qu.:134.0   1st Qu.:0.900   1st Qu.:212500  
##  Median :38.00   Median :137.0   Median :1.100   Median :262000  
##  Mean   :38.08   Mean   :136.6   Mean   :1.394   Mean   :263358  
##  3rd Qu.:45.00   3rd Qu.:140.0   3rd Qu.:1.400   3rd Qu.:303500  
##  Max.   :80.00   Max.   :148.0   Max.   :9.400   Max.   :850000  
##       CPK        
##  Min.   :  23.0  
##  1st Qu.: 116.5  
##  Median : 250.0  
##  Mean   : 581.8  
##  3rd Qu.: 582.0  
##  Max.   :7861.0
# visualize distribution of survival times
# use jitter to avoid stacking points
stripchart(predicthf$Time,
           main="time to death or censoring (days)",
           xlab="days",
           ylab="time to death-censoring ",
           method="jitter",
           col="blue",
           pch=19
)

# standard KM figure
# install.packages('survminer')
# we need this package for the ggsurvplot
# command used below to create the KM plot
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
library(survival) # need this for standard survival functions
# Obtain survival estimates using survfit() function
fit.km <- survfit(Surv(predicthf$Time,predicthf$Event) ~ 1 , data=predicthf)
# plot KM 
ggsurvplot(fit.km, data = predicthf, 
           risk.table = TRUE, 
           surv.median.line = "hv")
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, : Median
## survival not reached.

# lasso regression
y <- Surv(predicthf$Time,predicthf$Event)
x <- model.matrix(y~.,predicthf)[,c(-1:-3)]

# install.packages("rms")
# load rms package
library(rms)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
# run standard cox model with all covariates
cph(y ~ x)
## Cox Proportional Hazards Model
##  
##  cph(formula = y ~ x)
##  
##                          Model Tests    Discrimination    
##                                                Indexes    
##  Obs        299    LR chi2     81.95    R2       0.248    
##  Events      96    d.f.           11    Dxy      0.482    
##  Center -4.3059    Pr(> chi2) 0.0000    g        1.132    
##                    Score chi2  88.39    gr       3.103    
##                    Pr(> chi2) 0.0000                      
##  
##             Coef    S.E.   Wald Z Pr(>|Z|)
##  Gender     -0.2377 0.2516 -0.94  0.3448  
##  Smoking     0.1291 0.2512  0.51  0.6073  
##  Diabetes    0.1400 0.2232  0.63  0.5304  
##  SBP         0.4759 0.2162  2.20  0.0277  
##  Anaemia     0.4603 0.2169  2.12  0.0338  
##  Age         0.0464 0.0093  4.98  <0.0001 
##  Eject.Fr   -0.0489 0.0105 -4.67  <0.0001 
##  Sodium     -0.0442 0.0233 -1.90  0.0577  
##  Creatinine  0.3218 0.0701  4.59  <0.0001 
##  Platelets   0.0000 0.0000 -0.41  0.6804  
##  CPK         0.0002 0.0001  2.23  0.0260  
## 
# note that c-index is equal to Dxy/2+.5
# which is (0.482/2)+.5 = 0.741

# LASSO REGRESSION MODELING

# install.packages("glmnet")
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.4
## Loading required package: Matrix
## Loaded glmnet 4.1-1
# Perform Lasso model
lasso.mod=glmnet(x,y,family = "cox", alpha=1)

# install.packages("plotmo")
library(plotmo) # for plot_glmnet
## Warning: package 'plotmo' was built under R version 4.0.4
## Loading required package: plotrix
## Loading required package: TeachingDemos
## Warning: package 'TeachingDemos' was built under R version 4.0.4
## 
## Attaching package: 'TeachingDemos'
## The following objects are masked from 'package:Hmisc':
## 
##     cnvrt.coords, subplot
# plot the solution path for lasso model
plot_glmnet(lasso.mod)

# perform cross-validation for lasso model
# set seed for reproducibility
set.seed(100)
# perform cross-validation
cv.out.lasso=cv.glmnet(x,y,alpha=1,family = "cox")
# plot the cross-validation curve
plot(cv.out.lasso)

# print out coefficients for lambda.1se
coef(cv.out.lasso,s="lambda.1se")
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                      1
## Gender      .         
## Smoking     .         
## Diabetes    .         
## SBP         .         
## Anaemia     .         
## Age         0.01700736
## Eject.Fr   -0.01429952
## Sodium      .         
## Creatinine  0.18480318
## Platelets   .         
## CPK         .
# print out AUC (c-index) for the lambda.1se lasso model
# install.packages("survAUC")
library(survAUC)
## Warning: package 'survAUC' was built under R version 4.0.5
pred.lasso=predict(cv.out.lasso,newx=x,s="lambda.1se",type="response")
r.AUC <- AUC.uno(y,y,(pred.lasso),seq(0,250.1,1))
r.AUC$iauc
## [1] 0.7304817
# PERFORM RIDGE REGRESSION MODELING

# Perform Ridge Regression -- recall alpha=0 for ridge
ridge.model <- glmnet(x,y,family = "cox", alpha=0)

# plot solution path
plot_glmnet(ridge.model)

# set seed for reproducibility of cross-validation
set.seed(200)
cv.out.ridge=cv.glmnet(x,y,alpha=0,family = "cox")
# plot the cross-validation curve
plot(cv.out.ridge)

# print model coefficients for lambda.1se 
coef(cv.out.ridge,s="lambda.1se")
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## Gender     -7.289374e-04
## Smoking    -1.611772e-04
## Diabetes   -4.514277e-03
## SBP         6.569394e-02
## Anaemia     5.096381e-02
## Age         5.959247e-03
## Eject.Fr   -5.259709e-03
## Sodium     -1.060097e-02
## Creatinine  7.096712e-02
## Platelets  -8.688418e-08
## CPK         1.705966e-05
# print out AUC (c-index) for the lambda.1se ridge model
pred.ridge=predict(cv.out.ridge,newx=x,s="lambda.1se",type="response")
r.AUC <- AUC.uno(y,y,(pred.ridge),seq(0,250.1,1))
r.AUC$iauc
## [1] 0.7670144
# END OF FILE