In survival analysis, we use statistical approaches to identify the time it takes for a certain event to occur. This event could be customer churn in marketing or failure time in machines.  

Basic Settings and Data Import

Let’s begin by loading the required libraries and importing the data set we are going to use for this model.

options(scipen = 999)

# set working directory
setwd("C:/Users/awani/Desktop/50daysofAnalytics")

# load libraries
if (!require("pacman")) install.packages("pacman")
pacman::p_load(survival, ranger, ggplot2,stringr, dplyr,ggfortify, tidyr, sas7bdat, knitr)

#read data
data = read.csv("surv_data.csv") 

 

Explanatory Data Analysis

Before we jump into any kind of model training, it is essential that we understand data well. A quick eyeballing uni-variate and bi-variate plots can make us familiar with the data set.

#- univariate - numerical variables
summary(data[,c("dur","nit","hemo","age","calc")])
##       dur             nit             hemo           age       
##  Min.   : 1.00   Min.   :0.778   Min.   : 4.9   Min.   :38.00  
##  1st Qu.: 7.00   1st Qu.:1.146   1st Qu.: 8.8   1st Qu.:51.00  
##  Median :15.00   Median :1.322   Median :10.2   Median :60.00  
##  Mean   :24.02   Mean   :1.392   Mean   :10.2   Mean   :60.15  
##  3rd Qu.:35.00   3rd Qu.:1.568   3rd Qu.:12.0   3rd Qu.:67.00  
##  Max.   :92.00   Max.   :2.236   Max.   :14.6   Max.   :82.00  
##       calc      
##  Min.   : 7.00  
##  1st Qu.: 9.00  
##  Median :10.00  
##  Mean   :10.12  
##  3rd Qu.:10.00  
##  Max.   :18.00
#plot
ggplot(gather(data[,c("dur","nit","hemo","age","calc")]), aes(value)) + 
  geom_histogram(bins = 5, fill = "blue", alpha = 0.6) + 
  facet_wrap(~key, scales = 'free_x')

#- Univariate - Categorical variables

# Event variable - dead
kable(table(data$dead),
      col.names = c("dead", "Frequency"), align = 'l')
dead Frequency
0 17
1 48
#gender
table(data$sex)
## 
##  0  1 
## 38 27
# - Bivariate analysis - let's look at Hemoglobin by gender
newdata = data[,c("hemo","sex")]

#density plot - hemo
ggplot(data, aes(x=hemo)) + geom_density(aes(group=factor(sex), 
                                                fill = factor(sex), color = factor(sex)), alpha = 0.3)

#density plot - nit
ggplot(data, aes(x=nit)) + geom_density(aes(group=factor(sex), 
                                             fill = factor(sex), color = factor(sex)),  alpha = 0.3)

 

Kaplan Meier Estimator

The Kaplan-Meier estimator is a non-parametric statistic used to estimate the survival function from lifetime data. In medical research, it is often used to measure the fraction of patients living for a certain amount of time after treatment.

km_fit = survfit(Surv(dur, dead) ~ 1, data=data)
summary(km_fit)
## Call: survfit(formula = Surv(dur, dead) ~ 1, data = data)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     65       2   0.9692  0.0214      0.92814        1.000
##     2     63       3   0.9231  0.0331      0.86052        0.990
##     3     60       1   0.9077  0.0359      0.83998        0.981
##     5     57       2   0.8758  0.0411      0.79888        0.960
##     6     55       4   0.8121  0.0489      0.72171        0.914
##     7     51       3   0.7644  0.0533      0.66681        0.876
##     9     45       1   0.7474  0.0547      0.64749        0.863
##    11     44       5   0.6625  0.0603      0.55429        0.792
##    13     36       1   0.6441  0.0613      0.53441        0.776
##    14     34       1   0.6251  0.0624      0.51406        0.760
##    15     33       1   0.6062  0.0633      0.49398        0.744
##    16     32       2   0.5683  0.0648      0.45453        0.711
##    17     29       2   0.5291  0.0660      0.41439        0.676
##    18     27       1   0.5095  0.0664      0.39470        0.658
##    19     26       2   0.4703  0.0668      0.35603        0.621
##    24     22       1   0.4489  0.0671      0.33493        0.602
##    25     21       1   0.4275  0.0672      0.31417        0.582
##    26     20       1   0.4062  0.0672      0.29373        0.562
##    32     18       1   0.3836  0.0671      0.27224        0.541
##    35     17       1   0.3610  0.0669      0.25115        0.519
##    37     16       1   0.3385  0.0664      0.23046        0.497
##    41     15       1   0.3159  0.0657      0.21018        0.475
##    42     13       1   0.2916  0.0650      0.18844        0.451
##    51     12       1   0.2673  0.0639      0.16727        0.427
##    52     11       1   0.2430  0.0626      0.14671        0.403
##    54      9       1   0.2160  0.0612      0.12400        0.376
##    58      7       1   0.1851  0.0597      0.09841        0.348
##    66      6       1   0.1543  0.0572      0.07463        0.319
##    67      5       1   0.1234  0.0534      0.05285        0.288
##    88      3       1   0.0823  0.0490      0.02564        0.264
##    89      2       1   0.0411  0.0380      0.00673        0.252
##    92      1       1   0.0000     NaN           NA           NA
autoplot(km_fit)

You might have noticed that KM treats time as the only independent variable in the model. The effect of other variables can be incorporated by treating them as different groups or strata.

km_sex_fit = survfit(Surv(dur, dead) ~ sex, data=data)
autoplot(km_sex_fit)

To assess the impact of a numeric variable, it needs to be first converted into categorical and included in the model.

# define a new variable with catergorical nit value
data$nit_cat = ifelse(data$nit > median(data$nit), "More than Median", "Less than Median")

km_nit_fit = survfit(Surv(dur, dead) ~ nit_cat, data=data)
summary(km_nit_fit)
## Call: survfit(formula = Surv(dur, dead) ~ nit_cat, data = data)
## 
##                 nit_cat=Less than Median 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     2     36       1   0.9722  0.0274       0.9200        1.000
##     6     35       1   0.9444  0.0382       0.8725        1.000
##     7     34       2   0.8889  0.0524       0.7919        0.998
##    11     30       4   0.7704  0.0714       0.6423        0.924
##    13     25       1   0.7396  0.0749       0.6063        0.902
##    16     24       1   0.7087  0.0779       0.5714        0.879
##    17     22       1   0.6765  0.0807       0.5354        0.855
##    19     21       2   0.6121  0.0849       0.4663        0.803
##    24     17       1   0.5761  0.0872       0.4281        0.775
##    25     16       1   0.5401  0.0889       0.3911        0.746
##    26     15       1   0.5041  0.0900       0.3553        0.715
##    32     13       1   0.4653  0.0910       0.3171        0.683
##    35     12       1   0.4265  0.0913       0.2803        0.649
##    41     11       1   0.3878  0.0909       0.2449        0.614
##    42     10       1   0.3490  0.0897       0.2109        0.578
##    52      9       1   0.3102  0.0877       0.1782        0.540
##    54      7       1   0.2659  0.0856       0.1414        0.500
##    58      5       1   0.2127  0.0834       0.0986        0.459
##    67      4       1   0.1595  0.0777       0.0614        0.414
##    88      2       1   0.0798  0.0685       0.0148        0.429
##    89      1       1   0.0000     NaN           NA           NA
## 
##                 nit_cat=More than Median 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     29       2   0.9310  0.0471       0.8432        1.000
##     2     27       2   0.8621  0.0640       0.7453        0.997
##     3     25       1   0.8276  0.0701       0.7009        0.977
##     5     22       2   0.7524  0.0815       0.6085        0.930
##     6     20       3   0.6395  0.0917       0.4828        0.847
##     7     17       1   0.6019  0.0937       0.4436        0.817
##     9     15       1   0.5618  0.0956       0.4024        0.784
##    11     14       1   0.5216  0.0969       0.3625        0.751
##    14     10       1   0.4695  0.1002       0.3089        0.713
##    15      9       1   0.4173  0.1018       0.2587        0.673
##    16      8       1   0.3651  0.1015       0.2117        0.630
##    17      7       1   0.3130  0.0995       0.1678        0.584
##    18      6       1   0.2608  0.0956       0.1271        0.535
##    37      5       1   0.2087  0.0896       0.0899        0.484
##    51      3       1   0.1391  0.0824       0.0435        0.444
##    66      2       1   0.0696  0.0642       0.0114        0.424
##    92      1       1   0.0000     NaN           NA           NA
autoplot(km_nit_fit)

In acturial estimations, idea is very similar, but time-periods are not defined by event times. Generally, the time periods are of equal length based on some decision-context.

#-- Acturial estimation
# To define intervals use the parameter 'times' in summary
summary(km_fit, times = seq(from = 1, to = 100, by = 10))
## Call: survfit(formula = Surv(dur, dead) ~ 1, data = data)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     65       2   0.9692  0.0214      0.92814        1.000
##    11     44      19   0.6625  0.0603      0.55429        0.792
##    21     22      10   0.4703  0.0668      0.35603        0.621
##    31     18       3   0.4062  0.0672      0.29373        0.562
##    41     15       4   0.3159  0.0657      0.21018        0.475
##    51     12       2   0.2673  0.0639      0.16727        0.427
##    61      6       3   0.1851  0.0597      0.09841        0.348
##    71      4       2   0.1234  0.0534      0.05285        0.288
##    81      3       0   0.1234  0.0534      0.05285        0.288
##    91      1       2   0.0411  0.0380      0.00673        0.252

Cox Regression Model

David Cox proposed a proportional hazard model, together with a partial likelihood method that handles survival analysis pretty much like a regression. In a proportional hazards model, the unique effect of a unit increase in a covariate is multiplicative with respect to the hazard rate. For example, taking a drug may reduced the hazard rate for a stroke occurring.

cox = coxph(Surv(dur, dead) ~ nit + hemo + sex + age + calc , data = data)
summary(cox)
## Call:
## coxph(formula = Surv(dur, dead) ~ nit + hemo + sex + age + calc, 
##     data = data)
## 
##   n= 65, number of events= 48 
## 
##          coef exp(coef) se(coef)      z Pr(>|z|)   
## nit   1.81459   6.13855  0.65279  2.780  0.00544 **
## hemo -0.14826   0.86221  0.06191 -2.395  0.01663 * 
## sex  -0.17923   0.83591  0.32382 -0.553  0.57993   
## age  -0.02206   0.97818  0.01624 -1.358  0.17451   
## calc  0.14311   1.15386  0.10201  1.403  0.16066   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## nit     6.1386     0.1629    1.7077   22.0660
## hemo    0.8622     1.1598    0.7637    0.9734
## sex     0.8359     1.1963    0.4431    1.5769
## age     0.9782     1.0223    0.9475    1.0098
## calc    1.1539     0.8667    0.9448    1.4092
## 
## Concordance= 0.676  (se = 0.051 )
## Rsquare= 0.218   (max possible= 0.991 )
## Likelihood ratio test= 16  on 5 df,   p=0.006831
## Wald test            = 16.88  on 5 df,   p=0.004739
## Score (logrank) test = 17.92  on 5 df,   p=0.003045
# plot the model
cox_fit = survfit(cox)
plot(cox_fit, main = "cph model", xlab="Days")

autoplot(cox_fit)

Aalen’s additive regression model for censored data

The additive model provides an alternative to Cox regression. It can provide detailed information about temporal influence of each of the covariates not available in Cox’s model.

aa_fit = aareg(Surv(dur, dead) ~ nit + hemo +
                 sex + calc + age , 
               data = data)
# plot
summary(aa_fit)
## $table
##                  slope          coef     se(coef)          z          p
## Intercept -0.111172034 -0.0104916864 0.0574591518 -0.1825938 0.85511673
## nit        0.214634493  0.0465006931 0.0243874701  1.9067453 0.05655358
## hemo      -0.007410443 -0.0027510714 0.0016316280 -1.6860898 0.09177850
## sex       -0.009768252 -0.0049324027 0.0083961527 -0.5874599 0.55689490
## calc       0.017982211  0.0044702099 0.0040079907  1.1153244 0.26471143
## age       -0.001104918 -0.0004796267 0.0004551339 -1.0538145 0.29196786
## 
## $test
## [1] "aalen"
## 
## $test.statistic
##   Intercept         nit        hemo         sex        calc         age 
##  -0.1011581   3.7892752 -24.4427435  -1.5262736  12.2388196 -60.9025576 
## 
## $test.var
##            b0                                                          
## b0  0.3069227  -0.5527844  -3.433719 -0.6477151   -3.791427   -7.995316
##    -0.5527844   3.9493619   0.344662  1.3945779    4.598795  -41.590371
##    -3.4337193   0.3446620 210.154398 12.6006870  -15.403601  313.328792
##    -0.6477151   1.3945779  12.600687  6.7500719    6.469844   -7.175451
##    -3.7914270   4.5987950 -15.403601  6.4698438  120.413910 -165.893604
##    -7.9953157 -41.5903709 313.328792 -7.1754507 -165.893604 3339.971229
## 
## $test.var2
## NULL
## 
## $chisq
##          [,1]
## [1,] 7.576173
## 
## $n
## [1] 65 22 32
## 
## attr(,"class")
## [1] "summary.aareg"
autoplot(aa_fit)