Feature Engineering for Survival Analysis

With feature engineering we accomplish several tasks:

customer.data.ta = ta.data.frame("cust_metrics_summary_june2016", schemaName = "p17")
survival.columns = c('cust_id','plan','region','data_usage_avg','sms_usage_avg','voice_minutes_avg','cust_days',
                     'close_dt','open_dt')
survival.data.ta = ta.transform(customer.data.ta[, survival.columns],
                                voice_24hours_avg = voice_minutes_avg / (24 * 60),
                                sms100_usage_avg = sms_usage_avg / 100,
                                duration = if(!is.null(close_dt)) close_dt - open_dt
                                            else '1/31/2017' - open_dt,
                                event = if(!is.null(close_dt)) 1 else 0)

ta.dropTable("customers_for_coxph", schemaName = "p17")
## [1] -2
ta.create(survival.data.ta[, union(setdiff(survival.columns, c('close_dt','open_dt')), 
                                   c('voice_24hours_avg','sms100_usage_avg','duration','event'))],
          table = 'customers_for_coxph', schemaName = 'p17', 
          tableType = 'fact', partitionKey = 'cust_id', row.names = TRUE,
          colTypes = c(event='int'))


customer.data.ta = ta.data.frame("customers_for_coxph", schemaName = "p17")

Creating Survival Model

all.features = setdiff(ta.colnames(customer.data.ta), 
                       c("row_names", "cust_id", "event", "cust_days", "duration",
                         "voice_minutes_avg", "sms_usage_avg"))
categorical.features = intersect(all.features, c("plan","region"))

cox.model = aa.coxph(data = customer.data.ta, 
                     feature.columns = all.features,
                     categorical.columns = categorical.features,
                     time.interval.column = "cust_days",
                     event.column = "event",
                     accumulate = "cust_id")

names(cox.model)
## [1] "coefficient.table"      "linear.predictor.table"
## [3] "output"
ta.head(cox.model$coefficient.table)
## Warning in ta.show(tadf, maxRows): Printing rows in random order since base
## table/view is neither ordered nor have row_names column.
##   id        predictor category  coefficient     exp_coef    std_error
## 1  4             plan Offer 20  0.000000000 1.0000000000 0.000000e+00
## 2  3 sms100_usage_avg     <NA>  0.004797805 1.0048093325 2.332993e-04
## 3 14             plan Offer 99 -8.769738940 0.0001553641 3.255242e+01
## 4 13             plan Offer 55  0.338764680 1.4032131022 1.895832e-01
## 5 11             plan Offer 40 -0.704473669 0.4943687073 2.890598e-01
## 6 21             plan  Plan 50  0.143465489 1.1542669747 2.011758e-02
##      z_score      p_value significance
## 1        NaN          NaN             
## 2 20.5650233 0.000000e+00          ***
## 3 -0.2694036 7.876191e-01             
## 4  1.7868920 7.395495e-02            .
## 5 -2.4371211 1.480472e-02            *
## 6  7.1313477 9.940000e-13          ***
ta.head(cox.model$linear.predictor.table)
## Warning in ta.show(tadf, maxRows): Printing rows in random order since base
## table/view is neither ordered nor have row_names column.
##   linear_predictor event time_interval    cust_id
## 1        0.1082385     0            37 1027518795
## 2       -0.3268490     0            37 1026538795
## 3        2.0575348     1            37 1034208790
## 4        0.2077554     0            37 1032768795
## 5        0.2147397     0            37 1027438795
## 6        0.5399243     0            37 1027578795
coeffs = as.data.frame(cox.model$coefficient.table)

Hazard Ratios

Hazad Ratio: Plan (Categorical)

data = coeffs[coeffs$predictor=='plan' &
                (coeffs$p_value <= 0.05 | is.nan(coeffs$p_value)), ]
data$category = factor(data$category, levels = data$category[order(data$exp_coef)], ordered = TRUE)
createHazardRatioPlot(data, 
                      title = expression(paste("Log Hazard Ratios ", exp(beta), " for Plans")),
                      subtitle = "Plan 'Offer 20' as baseline at 1. Only plans with significant values included")

createHazardRatioPlot(data, value = "coefficient", 
                      title = "Hazard Ratio Coefficients for Plans",
                      subtitle = "Plan 'Offer 20' as baseline at 0. Only plans with signficant values included")

Hazard Ratio: Region (Categorical)

data = coeffs[coeffs$predictor=='region' &
                (coeffs$p_value <= 0.05 | is.nan(coeffs$p_value)), ]
data$category = factor(data$category, levels = data$category[order(data$exp_coef)], ordered = TRUE)
createHazardRatioPlot(data, 
                      title = expression(paste("Log Hazard Ratio ", exp(beta), " for Regions")),
                      subtitle = "Region R1 as baseline at 1. Only regions with significant values included")

createHazardRatioPlot(data, value = 'coefficient', angle = 0,
                      title = "Hazard Ratio Coefficients for Regions",
                      subtitle = 'Region R1 as baseline at 0. Only plans with significant values included')

Hazard Ratio: Usage (Numerical Features)

data = coeffs[!coeffs$predictor %in% c('region','plan'), ]
data$predictor = factor(data$predictor, levels = data$predictor[order(data$exp_coef)], ordered = TRUE)
createHazardRatioPlot(data, 'predictor', 
                      title = expression(paste("Log Hazard Ratio ", exp(beta), " of Numeric Predictors")),
                      subtitle = "Voice, Data, and Text Usage Changes by One Unit", angle=0)

createHazardRatioPlot(data, 'predictor', value = 'coefficient', 
                      title = expression(paste("Hazard Ratio ", beta, " of Numeric Predictors")),
                      subtitle = "Voice, Data, and Text Usage Changes by One Unit", angle=0)

Comparing Customer Survivability

survival.probs = aa.cox.survfit(object = cox.model$coefficient.table,
               cox.model.table = cox.model$linear.predictor.table,
               predict.table = ta.subset(customer.data.ta, cust_id %in% c('1050527710','1064467365')),
               predict.feature.columns = all.features,
               predict.feature.names = all.features,
               accumulate = 'cust_id')

data = as.data.frame(survival.probs$survival.probability)
data$cust_id = factor(data$cust_id)
ggplot(data) +
  geom_line(aes(time_interval, survival_prob, group=cust_id, color=cust_id), size=1) +
  scale_color_solarized(name="Customer") +
  labs(x="Days", y="Proportion Not Canceling", 
       title = "Survival Curves by Customers", subtitle = NULL) +
  theme_minimal(base_size = 14, base_family = 'mono') +
  theme(legend.position = "bottom")

Hazard Ratios by Customers

Reference Customer for Proportional Hazard

Let’s evaluate hazard ratios between all customers and reference mean “customer”. This is scoring of customers using computed Cox PH model.

First, create “average” customer to use as a reference (a baseline) to compute proportional hazards. Such customer could be specificly designed based on research use case, e.g. analyzing certain demographics, customer experience, etc.

customer_numeric_means = ta.colMeans(customer.data.ta[,setdiff(all.features, 
                                                            categorical.features)])
customer_all_means = cbind(as.data.frame(t(customer_numeric_means)),
                           data.frame(plan="Plan 30", region="R2", 
                                      stringsAsFactors = FALSE))
ta.dropTable("customer_means", schemaName = "p17")
## [1] -2
ta.create(data=customer_all_means, table="customer_means", schemaName = "p17", 
          tableType = "dimension")
customer.means.ta = ta.data.frame("customer_means", schemaName = "p17")

Scoring Customers

# compute ratios
customer.ratios = aa.cox.predict(
  object = cox.model, 
  predicts = customer.data.ta,
  refs = customer.means.ta,
  predicts.partition.column = "1",
  predict.feature.columns = all.features,
  predict.feature.names = all.features,
  ref.feature.columns = all.features,
  refs.partition.column = c('1'),
  accumulate = c("cust_id")
  )

predictions.coxph = customer.ratios[[1]]
n = ta.dim(predictions.coxph)[[1]]

Analyzing Proportional Hazard Customers

# Get just bottom and top N customers
N = 100
percentiles = aa.percentile(predictions.coxph, data.partition.column = "1",
                            target.columns = 'hazardratio',
                            percentile = c(N/n * 100, (1.0 - N/n)*100))
percentiles.values = as.matrix(percentiles$result)
bottom.value = percentiles.values[[1,2]]
top.value = percentiles.values[[2,2]]

bottom_and_top_customers = as.data.frame(
  ta.transform(ta.subset(predictions.coxph, hazardratio <= bottom.value | hazardratio > top.value), 
               group = if(hazardratio <= bottom.value) 'bottom' else 'top'),
  stringsAsFactors = FALSE)
bottom_and_top_customers$cust_id =
  factor(bottom_and_top_customers$cust_id, 
         levels = bottom_and_top_customers$cust_id[order(bottom_and_top_customers$hazardratio)], 
         ordered = TRUE)

bottom_and_top_customers$group = factor(bottom_and_top_customers$group, 
                                        levels=c('top','bottom'), 
                                        labels=c('High Risk','Low Risk'),
                                        ordered = TRUE)

Customers with Regions

Besides each bar stands concrete customer sorted by their proportional hazard ratio. We always select bottom (low riks) and top (high risk) to display.

Customers with Plans

Customers with Phone Usages