With feature engineering we accomplish several tasks: express average voice minutes as a fraction of a 24-hour day, rescale average SMS usage to hundreds of messages, compute each customer's duration (from open date to close date, or to the censoring date 1/31/2017 if the account is still open), and derive the event indicator (1 if the account was closed, 0 otherwise):
customer.data.ta = ta.data.frame("cust_metrics_summary_june2016", schemaName = "p17")
survival.columns = c('cust_id','plan','region','data_usage_avg','sms_usage_avg','voice_minutes_avg','cust_days',
'close_dt','open_dt')
survival.data.ta = ta.transform(customer.data.ta[, survival.columns],
voice_24hours_avg = voice_minutes_avg / (24 * 60),
sms100_usage_avg = sms_usage_avg / 100,
duration = if(!is.null(close_dt)) close_dt - open_dt
else '1/31/2017' - open_dt,
event = if(!is.null(close_dt)) 1 else 0)
ta.dropTable("customers_for_coxph", schemaName = "p17")
## [1] -2
ta.create(survival.data.ta[, union(setdiff(survival.columns, c('close_dt','open_dt')),
c('voice_24hours_avg','sms100_usage_avg','duration','event'))],
table = 'customers_for_coxph', schemaName = 'p17',
tableType = 'fact', partitionKey = 'cust_id', row.names = TRUE,
colTypes = c(event='int'))
customer.data.ta = ta.data.frame("customers_for_coxph", schemaName = "p17")
all.features = setdiff(ta.colnames(customer.data.ta),
c("row_names", "cust_id", "event", "cust_days", "duration",
"voice_minutes_avg", "sms_usage_avg"))
categorical.features = intersect(all.features, c("plan","region"))
cox.model = aa.coxph(data = customer.data.ta,
feature.columns = all.features,
categorical.columns = categorical.features,
time.interval.column = "cust_days",
event.column = "event",
accumulate = "cust_id")
names(cox.model)
## [1] "coefficient.table" "linear.predictor.table"
## [3] "output"
ta.head(cox.model$coefficient.table)
## Warning in ta.show(tadf, maxRows): Printing rows in random order since base
## table/view is neither ordered nor have row_names column.
## id predictor category coefficient exp_coef std_error
## 1 4 plan Offer 20 0.000000000 1.0000000000 0.000000e+00
## 2 3 sms100_usage_avg <NA> 0.004797805 1.0048093325 2.332993e-04
## 3 14 plan Offer 99 -8.769738940 0.0001553641 3.255242e+01
## 4 13 plan Offer 55 0.338764680 1.4032131022 1.895832e-01
## 5 11 plan Offer 40 -0.704473669 0.4943687073 2.890598e-01
## 6 21 plan Plan 50 0.143465489 1.1542669747 2.011758e-02
## z_score p_value significance
## 1 NaN NaN
## 2 20.5650233 0.000000e+00 ***
## 3 -0.2694036 7.876191e-01
## 4 1.7868920 7.395495e-02 .
## 5 -2.4371211 1.480472e-02 *
## 6 7.1313477 9.940000e-13 ***
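Here exp_coef is simply the exponentiated coefficient, interpreted as a hazard ratio per one-unit change in the predictor (for categorical predictors, relative to the baseline category, e.g. plan 'Offer 20'). Taking the sms100_usage_avg row as a worked example:

$$ e^{\hat\beta} = e^{0.004797805} \approx 1.0048 $$

so each additional 100 SMS messages in average usage is associated with roughly a 0.48% higher hazard of canceling, other predictors held fixed.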
ta.head(cox.model$linear.predictor.table)
## Warning in ta.show(tadf, maxRows): Printing rows in random order since base
## table/view is neither ordered nor have row_names column.
## linear_predictor event time_interval cust_id
## 1 0.1082385 0 37 1027518795
## 2 -0.3268490 0 37 1026538795
## 3 2.0575348 1 37 1034208790
## 4 0.2077554 0 37 1032768795
## 5 0.2147397 0 37 1027438795
## 6 0.5399243 0 37 1027578795
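The linear predictor is each customer's risk score under the fitted model. Recall that the Cox proportional hazards model specifies the hazard for a customer with feature vector $x$ as

$$ h(t \mid x) = h_0(t)\, e^{\beta^\top x} $$

so linear_predictor corresponds to $\eta = \beta^\top x$ (up to whatever centering aa.coxph applies internally): the larger the value, the higher the instantaneous risk of canceling at any point in time.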
coeffs = as.data.frame(cox.model$coefficient.table)
data = coeffs[coeffs$predictor=='plan' &
(coeffs$p_value <= 0.05 | is.nan(coeffs$p_value)), ]
data$category = factor(data$category, levels = data$category[order(data$exp_coef)], ordered = TRUE)
createHazardRatioPlot(data,
title = expression(paste("Log Hazard Ratios ", exp(beta), " for Plans")),
subtitle = "Plan 'Offer 20' as baseline at 1. Only plans with significant values included")
createHazardRatioPlot(data, value = "coefficient",
title = "Hazard Ratio Coefficients for Plans",
subtitle = "Plan 'Offer 20' as baseline at 0. Only plans with signficant values included")
data = coeffs[coeffs$predictor=='region' &
(coeffs$p_value <= 0.05 | is.nan(coeffs$p_value)), ]
data$category = factor(data$category, levels = data$category[order(data$exp_coef)], ordered = TRUE)
createHazardRatioPlot(data,
title = expression(paste("Log Hazard Ratio ", exp(beta), " for Regions")),
subtitle = "Region R1 as baseline at 1. Only regions with significant values included")
createHazardRatioPlot(data, value = 'coefficient', angle = 0,
title = "Hazard Ratio Coefficients for Regions",
subtitle = 'Region R1 as baseline at 0. Only regions with significant values included')
data = coeffs[!coeffs$predictor %in% c('region','plan'), ]
data$predictor = factor(data$predictor, levels = data$predictor[order(data$exp_coef)], ordered = TRUE)
createHazardRatioPlot(data, 'predictor',
title = expression(paste("Log Hazard Ratio ", exp(beta), " of Numeric Predictors")),
subtitle = "Voice, Data, and Text Usage Changes by One Unit", angle=0)
createHazardRatioPlot(data, 'predictor', value = 'coefficient',
title = expression(paste("Hazard Ratio ", beta, " of Numeric Predictors")),
subtitle = "Voice, Data, and Text Usage Changes by One Unit", angle=0)
survival.probs = aa.cox.survfit(object = cox.model$coefficient.table,
cox.model.table = cox.model$linear.predictor.table,
predict.table = ta.subset(customer.data.ta, cust_id %in% c('1050527710','1064467365')),
predict.feature.columns = all.features,
predict.feature.names = all.features,
accumulate = 'cust_id')
data = as.data.frame(survival.probs$survival.probability)
data$cust_id = factor(data$cust_id)
ggplot(data) +
geom_line(aes(time_interval, survival_prob, group=cust_id, color=cust_id), size=1) +
scale_color_solarized(name="Customer") +
labs(x="Days", y="Proportion Not Canceling",
title = "Survival Curves by Customers", subtitle = NULL) +
theme_minimal(base_size = 14, base_family = 'mono') +
theme(legend.position = "bottom")
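In the standard Cox formulation these curves are the baseline survival function raised to the exponentiated risk score,

$$ S(t \mid x) = S_0(t)^{\exp(\beta^\top x)} $$

so each curve shows the estimated probability that the customer has not yet canceled by day $t$; customers with higher risk scores decay faster.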
Let’s evaluate hazard ratios between all customers and a reference mean “customer”. This amounts to scoring customers with the computed Cox PH model.
First, we create an “average” customer to use as a reference (a baseline) for computing proportional hazards. Such a customer could also be designed specifically for the research use case, e.g. to analyze certain demographics, customer experience, and so on.
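Given a reference customer with features $x_{\mathrm{ref}}$, the proportional hazard ratio of customer $i$ against that reference does not depend on time and reduces to

$$ \mathrm{HR}_i = \frac{h(t \mid x_i)}{h(t \mid x_{\mathrm{ref}})} = e^{\beta^\top (x_i - x_{\mathrm{ref}})} $$

which is the hazardratio value aa.cox.predict returns for every customer below.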
customer_numeric_means = ta.colMeans(customer.data.ta[,setdiff(all.features,
categorical.features)])
customer_all_means = cbind(as.data.frame(t(customer_numeric_means)),
data.frame(plan="Plan 30", region="R2",
stringsAsFactors = FALSE))
ta.dropTable("customer_means", schemaName = "p17")
## [1] -2
ta.create(data=customer_all_means, table="customer_means", schemaName = "p17",
tableType = "dimension")
customer.means.ta = ta.data.frame("customer_means", schemaName = "p17")
# compute ratios
customer.ratios = aa.cox.predict(
object = cox.model,
predicts = customer.data.ta,
refs = customer.means.ta,
predicts.partition.column = "1",
predict.feature.columns = all.features,
predict.feature.names = all.features,
ref.feature.columns = all.features,
refs.partition.column = c('1'),
accumulate = c("cust_id")
)
predictions.coxph = customer.ratios[[1]]
n = ta.dim(predictions.coxph)[[1]]
# Get just bottom and top N customers
N = 100
percentiles = aa.percentile(predictions.coxph, data.partition.column = "1",
target.columns = 'hazardratio',
percentile = c(N/n * 100, (1.0 - N/n)*100))
percentiles.values = as.matrix(percentiles$result)
bottom.value = percentiles.values[[1,2]]
top.value = percentiles.values[[2,2]]
bottom_and_top_customers = as.data.frame(
ta.transform(ta.subset(predictions.coxph, hazardratio <= bottom.value | hazardratio > top.value),
group = if(hazardratio <= bottom.value) 'bottom' else 'top'),
stringsAsFactors = FALSE)
bottom_and_top_customers$cust_id =
factor(bottom_and_top_customers$cust_id,
levels = bottom_and_top_customers$cust_id[order(bottom_and_top_customers$hazardratio)],
ordered = TRUE)
bottom_and_top_customers$group = factor(bottom_and_top_customers$group,
levels=c('top','bottom'),
labels=c('High Risk','Low Risk'),
ordered = TRUE)
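The prepared data frame can then be rendered as a bar chart of hazard ratios, for example with ggplot2. This is a minimal sketch: the original figure may use the createHazardRatioPlot helper instead, and the geom_col/scale_fill_solarized choices here are this sketch's own assumptions.
ggplot(bottom_and_top_customers) +
  geom_col(aes(cust_id, hazardratio, fill = group)) +
  scale_fill_solarized(name = "Customer Group") +
  labs(x = "Customers (ordered by hazard ratio)", y = "Hazard Ratio vs. Mean Customer",
       title = "Customers with Lowest and Highest Hazard Ratios",
       subtitle = paste("Bottom and top", N, "customers relative to the mean customer")) +
  theme_minimal(base_size = 14, base_family = 'mono') +
  theme(axis.text.x = element_blank(),   # too many customer ids to label individually
        legend.position = "bottom")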
Behind each bar stands a concrete customer, and customers are sorted by their proportional hazard ratio. We always select the bottom (low risk) and top (high risk) customers to display.