Nir Regev
Principal Data Scientist
Sisense Ltd.
May 16th, 2016
library(ade4)## Warning: package 'ade4' was built under R version 3.2.5
library(cluster)
library(fpc)## Warning: package 'fpc' was built under R version 3.2.5
library(caret)## Loading required package: lattice
## Loading required package: ggplot2
library(Causata)
library(party)## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library(partykit)##
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner,
## node_surv, node_terminal
library(corrgram)## Warning: package 'corrgram' was built under R version 3.2.5
library(corrplot)## Warning: package 'corrplot' was built under R version 3.2.5
library(ROCR)## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(lubridate)## Warning: package 'lubridate' was built under R version 3.2.5
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(tidyr)## Warning: package 'tidyr' was built under R version 3.2.5
library(caret)
library(XML)
library(Causata)
library(foreign)
library(tidyr)
library(rpart)
set.seed(3456)# aggregate some measure on customer transactions
order.measures <- transactional_product_data[,c('CustomerID','OrderHeaderSubTotal','order_detail_UnitPrice','order_detail_UnitPriceDiscount','product_list_price')]
cust.aggr <- aggregate(. ~ CustomerID, order.measures, function(x) c(mean = mean(x)))
# customer dates analysis. gaps between ship time and due date time, to reveal late deliveries
cust.order.dates <- transactional_product_data[,c('CustomerID','OrderDate','DueDate','ShipDate')]
cust.order.dates$OrderDate <- as.Date(as.character(cust.order.dates$OrderDate), format="%m/%d/%Y")
cust.order.dates$DueDate <- as.Date(as.character(cust.order.dates$DueDate), format="%m/%d/%Y")
cust.order.dates$ShipDate <- as.Date(as.character(cust.order.dates$ShipDate), format="%m/%d/%Y")
cust.order.dates$late <- difftime(cust.order.dates$ShipDate,cust.order.dates$DueDate, units="days")
# other orders attributes
orders.attributes <- transactional_product_data[
,c('CustomerID',
'OnlineOrderFlag',
'ProductCategoryName',
'product_name',
'Order_territory_name',
'product_class',
'Product_Subcategory_Name',
'order_detail_UnitPrice',
'order_detail_UnitPriceDiscount',
'product_list_price',
'margin',
'OrderHeaderSubTotal',
'order_detail_OrderQty'
)]
orders.attributes.agg <- aggregate(. ~ CustomerID+OnlineOrderFlag+Order_territory_name+product_class+Product_Subcategory_Name+ProductCategoryName+product_name, orders.attributes, function(x) c(mean = mean(x)))
# preparing feature : online purchase
order.online.flag <- orders.attributes.agg[,c('CustomerID','OnlineOrderFlag','order_detail_UnitPrice')]
order.online.flag.nodup <- order.online.flag[!duplicated(order.online.flag), ]
order.online.flag.nodup.agg <- aggregate(. ~ CustomerID + OnlineOrderFlag,order.online.flag.nodup,mean)
order.online.flag.wide <- spread(order.online.flag.nodup.agg,OnlineOrderFlag,order_detail_UnitPrice)
colnames(order.online.flag.wide)[2] <- "offline_purshace"
colnames(order.online.flag.wide)[3] <- "online_purshace"
# preparing feature : purchase category
order.cat<- orders.attributes.agg[,c('CustomerID','ProductCategoryName','order_detail_UnitPrice')]
order.cat <- order.cat[!duplicated(order.cat), ]
order.cat.nodup.agg <- aggregate(. ~ CustomerID + ProductCategoryName,order.cat,mean)
order.cat.wide <- spread(order.cat.nodup.agg,ProductCategoryName,order_detail_UnitPrice)
# preparing feature : TerritoryID
TerritoryName <- orders.attributes.agg[,c('CustomerID','Order_territory_name','order_detail_UnitPrice')]
TerritoryName <- TerritoryName[!duplicated(TerritoryName), ]
TerritoryName.agg <- aggregate(. ~ CustomerID + Order_territory_name,TerritoryName,mean)
TerritoryName.wide <- spread(TerritoryName.agg,Order_territory_name,order_detail_UnitPrice)
for (i in 2:ncol(TerritoryName.wide)){
colnames(TerritoryName.wide)[i] <- paste("Teritory_",i-1)
}
# preparing feature : ProductSubcategoryName
ProductSubcategoryName <- orders.attributes.agg[,c('CustomerID','Product_Subcategory_Name','order_detail_UnitPrice')]
ProductSubcategoryName <- ProductSubcategoryName[!duplicated(ProductSubcategoryName), ]
ProductSubcategoryName.agg <- aggregate(. ~ CustomerID + Product_Subcategory_Name,ProductSubcategoryName,mean)
ProductSubcategoryName.wide <- spread(ProductSubcategoryName.agg,Product_Subcategory_Name,order_detail_UnitPrice)churn.dataset <- merge(customers_demo,order.online.flag.wide,all=TRUE)
churn.dataset <- merge(churn.dataset,order.cat.wide,all=TRUE)
churn.dataset <- merge(churn.dataset,TerritoryName.wide,all=TRUE)
churn.dataset <- merge(churn.dataset,ProductSubcategoryName.wide,all=TRUE)
churn.dataset <- merge(churn.dataset,orders.attributes.agg,all=TRUE)
churn.dataset <- merge(churn.dataset,churn_target,all=TRUE)
colnames(churn.dataset)[17] <- "NA_Category"# remove zero variance variables
nzv <- nearZeroVar(churn.dataset, saveMetrics = TRUE)
zerovar <- nzv[nzv[,"zeroVar"] > 0, ]
zerovar.list <-rownames(zerovar)
`%ni%` <- Negate(`%in%`)
churn.dataset <- subset(churn.dataset,select = names(churn.dataset) %ni% zerovar.list)
# check which factors have only 1 level and remove if so
#churn.dataset <- churn.dataset[, sapply(churn.dataset, function(x) nlevels(x) > 1)]
churn.dataset$churn <- as.factor(churn.dataset$churn)
churn.dataset$Demographics <- NULL
churn.dataset$Demographics.1 <- NULL
churn.dataset$CustomerID <- NULL
churn.dataset$TerritoryID <- NULL
churn.dataset[,grep(pattern = "NA",names(churn.dataset))] <- NULL
churn.dataset[,grep(pattern ="ID",names(churn.dataset))] <- NULL
churn.dataset <- churn.dataset[,-c(grep(pattern ="Date",names(churn.dataset)))]
#churn.dataset$churn <- NULL
target <- names(churn.dataset) %in% c("churn")
predictors <- churn.dataset[,!target] trainIndex <- createDataPartition(churn.dataset$churn, p = .8,
list = FALSE,
times = 1)
Train <- churn.dataset[ trainIndex,]
Test <- churn.dataset[-trainIndex,]
Train$churn_fact <- factor( ifelse(Train$churn==1, "YES", "NO") )
Test$churn_fact <- factor( ifelse(Test$churn==1, "YES", "NO") )
Train$churn <- NULL
Test$churn <- NULLsetwd('C:/Users/Nir.Regev/Documents/AdventureWorksUseCases/Presentation_Outputs')
fit <- rpart(churn_fact ~ .,
method="class", data=Train,
control=rpart.control(minsplit=5, cp=0.01))
# getting the optimal complexity param to minimuze cross validation error
opt.cp <- fit$cptable[which.min(fit$cptable[,"xerror"]),"CP"]
# prune tree according to optimal tree complexity
fit <- prune(fit, cp= opt.cp)
plot.party(as.party(fit))plot(as.party(fit), main="Conditional Inference Tree")
# prediction on new data
# predict on new data
ctree.testing.set.predict <- predict(fit, type="class", newdata = Test) # predicted values
ctree.testing.set.predict.prob <- predict(fit, type="prob", newdata = Test) # predicted probs
ctree.testing.set.predict.prob.df <- data.frame(matrix(unlist(ctree.testing.set.predict.prob), nrow=nrow(Test), byrow=T))
# confusion matrix
cm <- confusionMatrix(ctree.testing.set.predict, Test$churn_fact)
cm## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 9749 613
## YES 648 4876
##
## Accuracy : 0.9206
## 95% CI : (0.9163, 0.9248)
## No Information Rate : 0.6545
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8248
## Mcnemar's Test P-Value : 0.3383
##
## Sensitivity : 0.9377
## Specificity : 0.8883
## Pos Pred Value : 0.9408
## Neg Pred Value : 0.8827
## Prevalence : 0.6545
## Detection Rate : 0.6137
## Detection Prevalence : 0.6523
## Balanced Accuracy : 0.9130
##
## 'Positive' Class : NO
##
# Estimated class probabilities
# plot ROC
roc_pred <- prediction(ctree.testing.set.predict.prob.df[,2], Test$churn_fact)
png(filename=paste("decision_tree_roc.png"),width = 1200, height = 800)
plot(performance(roc_pred, measure="sens", x.measure="spec"), colorize=TRUE)
auc.perf = performance(roc_pred, measure = "auc")
title(unlist(auc.perf@y.values))
dev.off()## png
## 2