NCSU CISCO Data Science: Survival Analysis

We will use the following packages

require(datasets)
require(survival)

## Loading required package: survival
## Loading required package: splines

require(plyr)

## Loading required package: plyr

First let’s take a look at some data

head(lung)

##   inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1    3  306      2  74   1       1       90       100     1175      NA
## 2    3  455      2  68   1       0       90        90     1225      15
## 3    3 1010      1  56   1       0       90        90       NA      15
## 4    5  210      2  57   1       1       90        60     1150      11
## 5    1  883      2  60   1       0      100        90       NA       0
## 6   12 1022      1  74   1       1       50        80      513       0

This is data for lung cancer patients, more information on the dataset is found with:

help(lung)

NCCTG Lung Cancer Data

Let’s put the data into a dataframe.

survData <- lung

If we were loading this data from a csv, we can use R Studios Import Dataset feature or the code below.

survData <- read.csv("~/sampleSurvivalData.csv", stringsAsFactors=FALSE)

Now we will create a Survival Object from the data.

help(Surv)

Surv

We will use right censored duration data so the following will create a survival object for the lung data. Note the ‘+’ after some entries, these are durations that have been marked as censored

survObj <- Surv(time=survData$time, event=survData$status==2, type='right')
head(survObj)

## [1]  306   455  1010+  210   883  1022+

Now to fit a Kaplan-Meier survival curve. The survfit function takes an R formula, this is used in the same was as the lm package for linear modeling. For now, we will use all the data from survObj with ~ 1

fit <- survfit(survObj~1)
print(fit)

## Call: survfit(formula = survObj ~ 1)
## 
## records   n.max n.start  events  median 0.95LCL 0.95UCL 
##     228     228     228     165     310     285     363

We can get the survival table for specific times:

summary(fit, times=c(5,11,12,13,15,26,30))

## Call: survfit(formula = survObj ~ 1)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     5    228       1    0.996 0.00438        0.987        1.000
##    11    227       3    0.982 0.00869        0.966        1.000
##    12    224       1    0.978 0.00970        0.959        0.997
##    13    223       2    0.969 0.01142        0.947        0.992
##    15    221       1    0.965 0.01219        0.941        0.989
##    26    220       1    0.961 0.01290        0.936        0.986
##    30    219       1    0.956 0.01356        0.930        0.983

The full survival table is found with:

summary(fit)

We can see the K-M plot as follows:

plot(fit, main="K-M Plot for Lung Data", xlab="Time in Days", ylab="Proportion Surviving")

Using Variables

We can now use formulas to look for duration differences between groups based on other variables in the data. Here we replace the “1” from the above fit with the “sex” variable to see if there are differences between male and female patients.

fit <- survfit(survObj~survData$sex==1)
print(fit)

## Call: survfit(formula = survObj ~ survData$sex == 1)
## 
##                         records n.max n.start events median 0.95LCL
## survData$sex == 1=FALSE      90    90      90     53    426     348
## survData$sex == 1=TRUE      138   138     138    112    270     212
##                         0.95UCL
## survData$sex == 1=FALSE     550
## survData$sex == 1=TRUE      310

plot(fit, main="K-M Plot for Lung Data", xlab="Time in Days", ylab="Proportion Surviving", col=c(1:2))
legend('topright', c("Male","Female"), lty=1, col=c(1:2))

We can use coxph to fit a Cox Proportional Hazards regression model:

help(coxph)

coxph

coxph takes a formula in the same way as survfit:

coxFit <- coxph(survObj ~ survData$sex==1)
coxFit

## Call:
## coxph(formula = survObj ~ survData$sex == 1)
## 
## 
##                        coef exp(coef) se(coef)    z      p
## survData$sex == 1TRUE 0.531       1.7    0.167 3.18 0.0015
## 
## Likelihood ratio test=10.6  on 1 df, p=0.00111  n= 228, number of events= 165

summary(coxFit)

## Call:
## coxph(formula = survObj ~ survData$sex == 1)
## 
##   n= 228, number of events= 165 
## 
##                         coef exp(coef) se(coef)     z Pr(>|z|)   
## survData$sex == 1TRUE 0.5310    1.7007   0.1672 3.176  0.00149 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                       exp(coef) exp(-coef) lower .95 upper .95
## survData$sex == 1TRUE     1.701      0.588     1.226      2.36
## 
## Concordance= 0.579  (se = 0.022 )
## Rsquare= 0.046   (max possible= 0.999 )
## Likelihood ratio test= 10.63  on 1 df,   p=0.001111
## Wald test            = 10.09  on 1 df,   p=0.001491
## Score (logrank) test = 10.33  on 1 df,   p=0.001312

Since the p value is < .05 we can say the results are statistically significant. With the hazzard for males being 1.7 that of females, it would seem to be practically significant as well.

Accelerated Failure Time

We can also look for differences in lifespan rather than for differences in hazzard.

To do this we will use the survreg package:

help(survreg)

survreg

survreg takes a formula in the same way as survfit and coxph, but it also requires a parametric distribution. We will assume that the data is from a log-normal distribution.

aftFit <- survreg(survObj ~ survData$sex==1, dist="lognormal")
aftFit

## Call:
## survreg(formula = survObj ~ survData$sex == 1, dist = "lognormal")
## 
## Coefficients:
##           (Intercept) survData$sex == 1TRUE 
##             6.0179630            -0.5713763 
## 
## Scale= 1.07113 
## 
## Loglik(model)= -1162.6   Loglik(intercept only)= -1169.3
##  Chisq= 13.31 on 1 degrees of freedom, p= 0.00026 
## n= 228

summary(aftFit)

## 
## Call:
## survreg(formula = survObj ~ survData$sex == 1, dist = "lognormal")
##                         Value Std. Error     z        p
## (Intercept)            6.0180     0.1267 47.51 0.000000
## survData$sex == 1TRUE -0.5714     0.1567 -3.65 0.000266
## Log(scale)             0.0687     0.0561  1.22 0.220917
## 
## Scale= 1.07 
## 
## Log Normal distribution
## Loglik(model)= -1162.6   Loglik(intercept only)= -1169.3
##  Chisq= 13.31 on 1 degrees of freedom, p= 0.00026 
## Number of Newton-Raphson Iterations: 3 
## n= 228

Again the result is statistically significant. We can find the \(\theta\) value with:

exp(coef(aftFit)[2])

## survData$sex == 1TRUE 
##             0.5647476

This means that males have 56% of the lifespan as females in this dataset!

Plotting the survival curves for AFT models isn’t as direct as the K-M plots. The following code should help. This code expects a survfit object, a survreg object, and a x-axis max value. Note you would need to add an additional curve for each additional variable you add to the model.

samplePlots <- function(fit, sr, max){
  plot(fit, xlim=c(0,max), col=c("orange", "blue"), lwd=3, main="Kaplan-Meier Plot w/ AFT Curves", ylab = "Percent of Surviving Patients", xlab="Time in Days", mark=8,cex=2,cex.lab=1.4)
  curve(plnorm(x, coef(sr)[1], sr$scale, lower.tail=FALSE), from=0, to=max(max), col="orange", ylab =expression(hat(S)(t)), xlab='Minutes', ylim=c(0,1), lwd=2, add=T, lty=5)
  curve(plnorm(x, coef(sr)[1]+ coef(sr)[2], sr$scale, lower.tail=FALSE), from=0, to=max(max), add=T, col="blue", lty=5, lwd=2)
  legend('topright',c("Male", "Female"), col=c("orange", "blue"), lty=c(1,1), lwd=c(5,5), cex=1.2, inset=c(.05,0), bty="n")
}

samplePlots(fit, aftFit, 1000)

Homework Tips

Problem One

Load data:

subscriptions <- read.csv("subscriptions.csv", stringsAsFactors=FALSE)

Create a new duration column:

subscriptions$dur <- difftime(subscriptions$cancel_date, subscriptions$start_date, units="days")

For this dataset Event is cancelation of a subscription, so it would be any row that is not “N” for cancel_type.

subscriptions$Event <- subscriptions != "N"
head(subscriptions)

##   X customer_id rate_plan monthly_fee     market channel
## 1 1    93378898    Bottom          30 Smallville  Dealer
## 2 2    93276037    Bottom          40 Smallville  Dealer
## 3 3    93263531    Bottom          30 Smallville  Dealer
## 4 4    93263762    Bottom          30 Smallville  Dealer
## 5 5     7638484    Bottom          25     Gotham  Dealer
## 6 6    11850521    Bottom          25     Gotham  Dealer
##            start_date         cancel_date cancel_type           dur
## 1 1996-08-08 23:00:00 2015-04-20 00:00:00           N 6828.042 days
## 2 1997-01-06 22:00:00 2015-04-20 00:00:00           N 6677.042 days
## 3 1997-01-14 22:00:00 2014-09-18 23:00:00           V 6456.000 days
## 4 1997-02-09 22:00:00 2015-04-20 00:00:00           N 6643.042 days
## 5 1997-03-02 23:00:00 2013-03-19 00:00:00           V 5860.000 days
## 6 1997-03-25 23:00:00 2015-04-20 00:00:00           N 6599.000 days
##   Event.X Event.customer_id Event.rate_plan Event.monthly_fee Event.market
## 1    TRUE              TRUE            TRUE              TRUE         TRUE
## 2    TRUE              TRUE            TRUE              TRUE         TRUE
## 3    TRUE              TRUE            TRUE              TRUE         TRUE
## 4    TRUE              TRUE            TRUE              TRUE         TRUE
## 5    TRUE              TRUE            TRUE              TRUE         TRUE
## 6    TRUE              TRUE            TRUE              TRUE         TRUE
##   Event.channel Event.start_date Event.cancel_date Event.cancel_type
## 1          TRUE             TRUE              TRUE             FALSE
## 2          TRUE             TRUE              TRUE             FALSE
## 3          TRUE             TRUE              TRUE              TRUE
## 4          TRUE             TRUE              TRUE             FALSE
## 5          TRUE             TRUE              TRUE              TRUE
## 6          TRUE             TRUE              TRUE             FALSE
##   Event.dur
## 1      TRUE
## 2      TRUE
## 3      TRUE
## 4      TRUE
## 5      TRUE
## 6      TRUE

Problem Two

Problem Two requires some data manipulation.

First, we need to load the data and collapse the transaction data into one row per customer: (You will probably want to include more than just married_couple)

allState <- read.csv("AllStateTrainingData.csv", stringsAsFactors=FALSE)
survData <- ddply(allState, .(customer_ID), summarize, first = time[1], last=time[length(time)], first_day=day[1], last_day=day[length(day)], married=married_couple[1])

We need to adjust the data to handle the day variable

survData$last_day[survData$last_day<survData$first_day] <- survData$last_day[survData$last_day<survData$first_day] + 7
survData$days <- abs(survData$first_day - survData$last_day)

Transform the string time into date-time objects:

survData$first <- strptime(survData$first, format="%H:%M")
survData$last <- strptime(survData$last, format="%H:%M") + (86400 * survData$days)

Finally, we will get the duration:

survData$dur <- as.numeric(difftime(survData$last, survData$first, units="mins"))
head(survData)

##   customer_ID               first                last first_day last_day
## 1    10000000 2015-04-21 08:35:00 2015-04-21 12:07:00         0        0
## 2    10000005 2015-04-21 08:56:00 2015-04-21 09:09:00         3        3
## 3    10000007 2015-04-21 08:35:00 2015-04-21 14:26:00         4        4
## 4    10000013 2015-04-21 16:31:00 2015-04-23 09:31:00         2        4
## 5    10000014 2015-04-21 16:30:00 2015-04-25 17:50:00         4        8
## 6    10000016 2015-04-21 13:34:00 2015-04-22 13:15:00         2        3
##   married days  dur
## 1       1    0  212
## 2       0    0   13
## 3       0    0  351
## 4       1    2 2460
## 5       1    4 5840
## 6       0    1 1421

As this dataset contains no censored cases (it only includes data form customrs who purchased,) we don’t need to include the censored variable in the Surv model.

fit <- Surv(survData$dur)
plot(survfit(fit~1))