In survival analysis, we use statistical approaches to identify the time it takes for a certain event to occur. This event could be customer churn in marketing or failure time in machines.
Basic Settings and Data Import
Let’s begin by loading the required libraries and importing the data set we are going to use for this model.
options(scipen = 999)
# set working directory
setwd("C:/Users/awani/Desktop/50daysofAnalytics")
# load libraries
if (!require("pacman")) install.packages("pacman")
pacman::p_load(survival, ranger, ggplot2,stringr, dplyr,ggfortify, tidyr, sas7bdat, knitr)
#read data
data = read.csv("surv_data.csv")
Explanatory Data Analysis
Before we jump into any kind of model training, it is essential that we understand data well. A quick eyeballing uni-variate and bi-variate plots can make us familiar with the data set.
#- univariate - numerical variables
summary(data[,c("dur","nit","hemo","age","calc")])
## dur nit hemo age
## Min. : 1.00 Min. :0.778 Min. : 4.9 Min. :38.00
## 1st Qu.: 7.00 1st Qu.:1.146 1st Qu.: 8.8 1st Qu.:51.00
## Median :15.00 Median :1.322 Median :10.2 Median :60.00
## Mean :24.02 Mean :1.392 Mean :10.2 Mean :60.15
## 3rd Qu.:35.00 3rd Qu.:1.568 3rd Qu.:12.0 3rd Qu.:67.00
## Max. :92.00 Max. :2.236 Max. :14.6 Max. :82.00
## calc
## Min. : 7.00
## 1st Qu.: 9.00
## Median :10.00
## Mean :10.12
## 3rd Qu.:10.00
## Max. :18.00
#plot
ggplot(gather(data[,c("dur","nit","hemo","age","calc")]), aes(value)) +
geom_histogram(bins = 5, fill = "blue", alpha = 0.6) +
facet_wrap(~key, scales = 'free_x')
#- Univariate - Categorical variables
# Event variable - dead
kable(table(data$dead),
col.names = c("dead", "Frequency"), align = 'l')
| dead | Frequency |
|---|---|
| 0 | 17 |
| 1 | 48 |
#gender
table(data$sex)
##
## 0 1
## 38 27
# - Bivariate analysis - let's look at Hemoglobin by gender
newdata = data[,c("hemo","sex")]
#density plot - hemo
ggplot(data, aes(x=hemo)) + geom_density(aes(group=factor(sex),
fill = factor(sex), color = factor(sex)), alpha = 0.3)
#density plot - nit
ggplot(data, aes(x=nit)) + geom_density(aes(group=factor(sex),
fill = factor(sex), color = factor(sex)), alpha = 0.3)
Kaplan Meier Estimator
The Kaplan-Meier estimator is a non-parametric statistic used to estimate the survival function from lifetime data. In medical research, it is often used to measure the fraction of patients living for a certain amount of time after treatment.
km_fit = survfit(Surv(dur, dead) ~ 1, data=data)
summary(km_fit)
## Call: survfit(formula = Surv(dur, dead) ~ 1, data = data)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 65 2 0.9692 0.0214 0.92814 1.000
## 2 63 3 0.9231 0.0331 0.86052 0.990
## 3 60 1 0.9077 0.0359 0.83998 0.981
## 5 57 2 0.8758 0.0411 0.79888 0.960
## 6 55 4 0.8121 0.0489 0.72171 0.914
## 7 51 3 0.7644 0.0533 0.66681 0.876
## 9 45 1 0.7474 0.0547 0.64749 0.863
## 11 44 5 0.6625 0.0603 0.55429 0.792
## 13 36 1 0.6441 0.0613 0.53441 0.776
## 14 34 1 0.6251 0.0624 0.51406 0.760
## 15 33 1 0.6062 0.0633 0.49398 0.744
## 16 32 2 0.5683 0.0648 0.45453 0.711
## 17 29 2 0.5291 0.0660 0.41439 0.676
## 18 27 1 0.5095 0.0664 0.39470 0.658
## 19 26 2 0.4703 0.0668 0.35603 0.621
## 24 22 1 0.4489 0.0671 0.33493 0.602
## 25 21 1 0.4275 0.0672 0.31417 0.582
## 26 20 1 0.4062 0.0672 0.29373 0.562
## 32 18 1 0.3836 0.0671 0.27224 0.541
## 35 17 1 0.3610 0.0669 0.25115 0.519
## 37 16 1 0.3385 0.0664 0.23046 0.497
## 41 15 1 0.3159 0.0657 0.21018 0.475
## 42 13 1 0.2916 0.0650 0.18844 0.451
## 51 12 1 0.2673 0.0639 0.16727 0.427
## 52 11 1 0.2430 0.0626 0.14671 0.403
## 54 9 1 0.2160 0.0612 0.12400 0.376
## 58 7 1 0.1851 0.0597 0.09841 0.348
## 66 6 1 0.1543 0.0572 0.07463 0.319
## 67 5 1 0.1234 0.0534 0.05285 0.288
## 88 3 1 0.0823 0.0490 0.02564 0.264
## 89 2 1 0.0411 0.0380 0.00673 0.252
## 92 1 1 0.0000 NaN NA NA
autoplot(km_fit)
You might have noticed that KM treats time as the only independent variable in the model. The effect of other variables can be incorporated by treating them as different groups or strata.
km_sex_fit = survfit(Surv(dur, dead) ~ sex, data=data)
autoplot(km_sex_fit)
To assess the impact of a numeric variable, it needs to be first converted into categorical and included in the model.
# define a new variable with catergorical nit value
data$nit_cat = ifelse(data$nit > median(data$nit), "More than Median", "Less than Median")
km_nit_fit = survfit(Surv(dur, dead) ~ nit_cat, data=data)
summary(km_nit_fit)
## Call: survfit(formula = Surv(dur, dead) ~ nit_cat, data = data)
##
## nit_cat=Less than Median
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 2 36 1 0.9722 0.0274 0.9200 1.000
## 6 35 1 0.9444 0.0382 0.8725 1.000
## 7 34 2 0.8889 0.0524 0.7919 0.998
## 11 30 4 0.7704 0.0714 0.6423 0.924
## 13 25 1 0.7396 0.0749 0.6063 0.902
## 16 24 1 0.7087 0.0779 0.5714 0.879
## 17 22 1 0.6765 0.0807 0.5354 0.855
## 19 21 2 0.6121 0.0849 0.4663 0.803
## 24 17 1 0.5761 0.0872 0.4281 0.775
## 25 16 1 0.5401 0.0889 0.3911 0.746
## 26 15 1 0.5041 0.0900 0.3553 0.715
## 32 13 1 0.4653 0.0910 0.3171 0.683
## 35 12 1 0.4265 0.0913 0.2803 0.649
## 41 11 1 0.3878 0.0909 0.2449 0.614
## 42 10 1 0.3490 0.0897 0.2109 0.578
## 52 9 1 0.3102 0.0877 0.1782 0.540
## 54 7 1 0.2659 0.0856 0.1414 0.500
## 58 5 1 0.2127 0.0834 0.0986 0.459
## 67 4 1 0.1595 0.0777 0.0614 0.414
## 88 2 1 0.0798 0.0685 0.0148 0.429
## 89 1 1 0.0000 NaN NA NA
##
## nit_cat=More than Median
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 29 2 0.9310 0.0471 0.8432 1.000
## 2 27 2 0.8621 0.0640 0.7453 0.997
## 3 25 1 0.8276 0.0701 0.7009 0.977
## 5 22 2 0.7524 0.0815 0.6085 0.930
## 6 20 3 0.6395 0.0917 0.4828 0.847
## 7 17 1 0.6019 0.0937 0.4436 0.817
## 9 15 1 0.5618 0.0956 0.4024 0.784
## 11 14 1 0.5216 0.0969 0.3625 0.751
## 14 10 1 0.4695 0.1002 0.3089 0.713
## 15 9 1 0.4173 0.1018 0.2587 0.673
## 16 8 1 0.3651 0.1015 0.2117 0.630
## 17 7 1 0.3130 0.0995 0.1678 0.584
## 18 6 1 0.2608 0.0956 0.1271 0.535
## 37 5 1 0.2087 0.0896 0.0899 0.484
## 51 3 1 0.1391 0.0824 0.0435 0.444
## 66 2 1 0.0696 0.0642 0.0114 0.424
## 92 1 1 0.0000 NaN NA NA
autoplot(km_nit_fit)
In acturial estimations, idea is very similar, but time-periods are not defined by event times. Generally, the time periods are of equal length based on some decision-context.
#-- Acturial estimation
# To define intervals use the parameter 'times' in summary
summary(km_fit, times = seq(from = 1, to = 100, by = 10))
## Call: survfit(formula = Surv(dur, dead) ~ 1, data = data)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 65 2 0.9692 0.0214 0.92814 1.000
## 11 44 19 0.6625 0.0603 0.55429 0.792
## 21 22 10 0.4703 0.0668 0.35603 0.621
## 31 18 3 0.4062 0.0672 0.29373 0.562
## 41 15 4 0.3159 0.0657 0.21018 0.475
## 51 12 2 0.2673 0.0639 0.16727 0.427
## 61 6 3 0.1851 0.0597 0.09841 0.348
## 71 4 2 0.1234 0.0534 0.05285 0.288
## 81 3 0 0.1234 0.0534 0.05285 0.288
## 91 1 2 0.0411 0.0380 0.00673 0.252
Cox Regression Model
David Cox proposed a proportional hazard model, together with a partial likelihood method that handles survival analysis pretty much like a regression. In a proportional hazards model, the unique effect of a unit increase in a covariate is multiplicative with respect to the hazard rate. For example, taking a drug may reduced the hazard rate for a stroke occurring.
cox = coxph(Surv(dur, dead) ~ nit + hemo + sex + age + calc , data = data)
summary(cox)
## Call:
## coxph(formula = Surv(dur, dead) ~ nit + hemo + sex + age + calc,
## data = data)
##
## n= 65, number of events= 48
##
## coef exp(coef) se(coef) z Pr(>|z|)
## nit 1.81459 6.13855 0.65279 2.780 0.00544 **
## hemo -0.14826 0.86221 0.06191 -2.395 0.01663 *
## sex -0.17923 0.83591 0.32382 -0.553 0.57993
## age -0.02206 0.97818 0.01624 -1.358 0.17451
## calc 0.14311 1.15386 0.10201 1.403 0.16066
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## nit 6.1386 0.1629 1.7077 22.0660
## hemo 0.8622 1.1598 0.7637 0.9734
## sex 0.8359 1.1963 0.4431 1.5769
## age 0.9782 1.0223 0.9475 1.0098
## calc 1.1539 0.8667 0.9448 1.4092
##
## Concordance= 0.676 (se = 0.051 )
## Rsquare= 0.218 (max possible= 0.991 )
## Likelihood ratio test= 16 on 5 df, p=0.006831
## Wald test = 16.88 on 5 df, p=0.004739
## Score (logrank) test = 17.92 on 5 df, p=0.003045
# plot the model
cox_fit = survfit(cox)
plot(cox_fit, main = "cph model", xlab="Days")
autoplot(cox_fit)
Aalen’s additive regression model for censored data
The additive model provides an alternative to Cox regression. It can provide detailed information about temporal influence of each of the covariates not available in Cox’s model.
aa_fit = aareg(Surv(dur, dead) ~ nit + hemo +
sex + calc + age ,
data = data)
# plot
summary(aa_fit)
## $table
## slope coef se(coef) z p
## Intercept -0.111172034 -0.0104916864 0.0574591518 -0.1825938 0.85511673
## nit 0.214634493 0.0465006931 0.0243874701 1.9067453 0.05655358
## hemo -0.007410443 -0.0027510714 0.0016316280 -1.6860898 0.09177850
## sex -0.009768252 -0.0049324027 0.0083961527 -0.5874599 0.55689490
## calc 0.017982211 0.0044702099 0.0040079907 1.1153244 0.26471143
## age -0.001104918 -0.0004796267 0.0004551339 -1.0538145 0.29196786
##
## $test
## [1] "aalen"
##
## $test.statistic
## Intercept nit hemo sex calc age
## -0.1011581 3.7892752 -24.4427435 -1.5262736 12.2388196 -60.9025576
##
## $test.var
## b0
## b0 0.3069227 -0.5527844 -3.433719 -0.6477151 -3.791427 -7.995316
## -0.5527844 3.9493619 0.344662 1.3945779 4.598795 -41.590371
## -3.4337193 0.3446620 210.154398 12.6006870 -15.403601 313.328792
## -0.6477151 1.3945779 12.600687 6.7500719 6.469844 -7.175451
## -3.7914270 4.5987950 -15.403601 6.4698438 120.413910 -165.893604
## -7.9953157 -41.5903709 313.328792 -7.1754507 -165.893604 3339.971229
##
## $test.var2
## NULL
##
## $chisq
## [,1]
## [1,] 7.576173
##
## $n
## [1] 65 22 32
##
## attr(,"class")
## [1] "summary.aareg"
autoplot(aa_fit)