Retail Churn Prediction Model

Nir Regev
Principal Data Scientist
Sisense Ltd.

May 16th, 2016

library(ade4)
## Warning: package 'ade4' was built under R version 3.2.5
library(cluster)
library(fpc)
## Warning: package 'fpc' was built under R version 3.2.5
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(Causata)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library(partykit)
## 
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner,
##     node_surv, node_terminal
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.2.5
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.2.5
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.2.5
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.5
library(caret)
library(XML)
library(Causata)
library(foreign)
library(tidyr)
library(rpart)
set.seed(3456)

A Generic Model For Retail Churn Classificaiton

Features Engineering

# aggregate some measure on customer transactions
order.measures <- transactional_product_data[,c('CustomerID','OrderHeaderSubTotal','order_detail_UnitPrice','order_detail_UnitPriceDiscount','product_list_price')]
cust.aggr <- aggregate(. ~ CustomerID, order.measures, function(x) c(mean = mean(x)))
# customer dates analysis. gaps between ship time and due date time, to reveal late deliveries
cust.order.dates <- transactional_product_data[,c('CustomerID','OrderDate','DueDate','ShipDate')]
cust.order.dates$OrderDate <- as.Date(as.character(cust.order.dates$OrderDate), format="%m/%d/%Y")
cust.order.dates$DueDate <- as.Date(as.character(cust.order.dates$DueDate), format="%m/%d/%Y")
cust.order.dates$ShipDate <- as.Date(as.character(cust.order.dates$ShipDate), format="%m/%d/%Y")
cust.order.dates$late <- difftime(cust.order.dates$ShipDate,cust.order.dates$DueDate, units="days")  
# other orders attributes
orders.attributes <- transactional_product_data[
,c('CustomerID',
    'OnlineOrderFlag',
    'ProductCategoryName',
    'product_name',
    'Order_territory_name',
    'product_class',
    'Product_Subcategory_Name',
    'order_detail_UnitPrice',
    'order_detail_UnitPriceDiscount',
    'product_list_price',
    'margin',
    'OrderHeaderSubTotal',
    'order_detail_OrderQty'
   )]
orders.attributes.agg <- aggregate(. ~ CustomerID+OnlineOrderFlag+Order_territory_name+product_class+Product_Subcategory_Name+ProductCategoryName+product_name, orders.attributes, function(x) c(mean = mean(x)))
# preparing feature : online purchase
order.online.flag <- orders.attributes.agg[,c('CustomerID','OnlineOrderFlag','order_detail_UnitPrice')]
order.online.flag.nodup <- order.online.flag[!duplicated(order.online.flag), ]
order.online.flag.nodup.agg <- aggregate(. ~ CustomerID + OnlineOrderFlag,order.online.flag.nodup,mean)
order.online.flag.wide <- spread(order.online.flag.nodup.agg,OnlineOrderFlag,order_detail_UnitPrice)
colnames(order.online.flag.wide)[2] <- "offline_purshace"
colnames(order.online.flag.wide)[3] <- "online_purshace"

# preparing feature : purchase category
order.cat<- orders.attributes.agg[,c('CustomerID','ProductCategoryName','order_detail_UnitPrice')]
order.cat <- order.cat[!duplicated(order.cat), ]
order.cat.nodup.agg <- aggregate(. ~ CustomerID + ProductCategoryName,order.cat,mean)
order.cat.wide <- spread(order.cat.nodup.agg,ProductCategoryName,order_detail_UnitPrice)

# preparing feature : TerritoryID
TerritoryName <- orders.attributes.agg[,c('CustomerID','Order_territory_name','order_detail_UnitPrice')]
TerritoryName <- TerritoryName[!duplicated(TerritoryName), ]
TerritoryName.agg <- aggregate(. ~ CustomerID + Order_territory_name,TerritoryName,mean)
TerritoryName.wide <- spread(TerritoryName.agg,Order_territory_name,order_detail_UnitPrice)
for (i in 2:ncol(TerritoryName.wide)){
  colnames(TerritoryName.wide)[i] <- paste("Teritory_",i-1)
}

# preparing feature : ProductSubcategoryName
ProductSubcategoryName <- orders.attributes.agg[,c('CustomerID','Product_Subcategory_Name','order_detail_UnitPrice')]
ProductSubcategoryName <- ProductSubcategoryName[!duplicated(ProductSubcategoryName), ]
ProductSubcategoryName.agg <- aggregate(. ~ CustomerID + Product_Subcategory_Name,ProductSubcategoryName,mean)
ProductSubcategoryName.wide <- spread(ProductSubcategoryName.agg,Product_Subcategory_Name,order_detail_UnitPrice)

Integrating Customer meta data and transactional data

churn.dataset <- merge(customers_demo,order.online.flag.wide,all=TRUE)
churn.dataset <- merge(churn.dataset,order.cat.wide,all=TRUE)
churn.dataset <- merge(churn.dataset,TerritoryName.wide,all=TRUE)
churn.dataset <- merge(churn.dataset,ProductSubcategoryName.wide,all=TRUE)
churn.dataset <- merge(churn.dataset,orders.attributes.agg,all=TRUE)
churn.dataset <- merge(churn.dataset,churn_target,all=TRUE)
colnames(churn.dataset)[17] <- "NA_Category"

Feature selection - Analyze features variance

# remove zero variance variables
nzv <- nearZeroVar(churn.dataset, saveMetrics = TRUE)
zerovar <- nzv[nzv[,"zeroVar"] > 0, ]
zerovar.list <-rownames(zerovar)
`%ni%` <- Negate(`%in%`)
churn.dataset <- subset(churn.dataset,select = names(churn.dataset) %ni% zerovar.list)
# check which factors have only 1 level and remove if so
#churn.dataset <- churn.dataset[, sapply(churn.dataset, function(x) nlevels(x) > 1)]
churn.dataset$churn <- as.factor(churn.dataset$churn)
churn.dataset$Demographics <- NULL
churn.dataset$Demographics.1 <- NULL
churn.dataset$CustomerID <- NULL
churn.dataset$TerritoryID <- NULL
churn.dataset[,grep(pattern = "NA",names(churn.dataset))] <- NULL
churn.dataset[,grep(pattern ="ID",names(churn.dataset))] <- NULL
churn.dataset <- churn.dataset[,-c(grep(pattern ="Date",names(churn.dataset)))] 
#churn.dataset$churn <- NULL
target <- names(churn.dataset) %in% c("churn")
predictors <- churn.dataset[,!target]  

Paritition to training and testing set

trainIndex <- createDataPartition(churn.dataset$churn, p = .8,
                                  list = FALSE,
                                  times = 1)
Train <- churn.dataset[ trainIndex,]
Test  <- churn.dataset[-trainIndex,]
Train$churn_fact <- factor( ifelse(Train$churn==1, "YES", "NO") )
Test$churn_fact <- factor( ifelse(Test$churn==1, "YES", "NO") )
Train$churn <- NULL
Test$churn <- NULL

Fitting a classification decision tree

setwd('C:/Users/Nir.Regev/Documents/AdventureWorksUseCases/Presentation_Outputs')

fit <- rpart(churn_fact ~ .,
             method="class", data=Train,
             control=rpart.control(minsplit=5, cp=0.01))
# getting the optimal complexity param to minimuze cross validation error
opt.cp <- fit$cptable[which.min(fit$cptable[,"xerror"]),"CP"]
# prune tree according to optimal tree complexity
fit <- prune(fit, cp= opt.cp)
plot.party(as.party(fit))

plot(as.party(fit), main="Conditional Inference Tree")

Retail churn classificaiton tree Retail churn enhanced classificaiton tree

Churn prediction on new customers (that the model had not seen before)

# prediction on new data
# predict on new data
ctree.testing.set.predict <- predict(fit, type="class", newdata = Test) # predicted values
ctree.testing.set.predict.prob <- predict(fit, type="prob", newdata = Test) # predicted probs
ctree.testing.set.predict.prob.df <- data.frame(matrix(unlist(ctree.testing.set.predict.prob), nrow=nrow(Test), byrow=T))
# confusion matrix

cm <- confusionMatrix(ctree.testing.set.predict, Test$churn_fact)
cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   NO  YES
##        NO  9749  613
##        YES  648 4876
##                                           
##                Accuracy : 0.9206          
##                  95% CI : (0.9163, 0.9248)
##     No Information Rate : 0.6545          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8248          
##  Mcnemar's Test P-Value : 0.3383          
##                                           
##             Sensitivity : 0.9377          
##             Specificity : 0.8883          
##          Pos Pred Value : 0.9408          
##          Neg Pred Value : 0.8827          
##              Prevalence : 0.6545          
##          Detection Rate : 0.6137          
##    Detection Prevalence : 0.6523          
##       Balanced Accuracy : 0.9130          
##                                           
##        'Positive' Class : NO              
## 
# Estimated class probabilities
# plot ROC
roc_pred <- prediction(ctree.testing.set.predict.prob.df[,2], Test$churn_fact)
png(filename=paste("decision_tree_roc.png"),width = 1200, height = 800)
plot(performance(roc_pred, measure="sens", x.measure="spec"), colorize=TRUE)
auc.perf = performance(roc_pred, measure = "auc")
title(unlist(auc.perf@y.values))
dev.off()
## png 
##   2

Retail churn classificaiton tree