Accessing the acquisitionRetention data, cleanup, and split

library(SMCRM)
data("acquisitionRetention")
ret = acquisitionRetention[,c(2:15)]
ret = na.omit(ret)
ret$acquisition = as.factor(ret$acquisition)
ret$industry = as.factor(ret$industry)
str(ret)
## 'data.frame':    500 obs. of  14 variables:
##  $ acquisition: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
##  $ duration   : num  1635 1039 1288 0 1631 ...
##  $ profit     : num  6134 3524 4081 -638 5446 ...
##  $ acq_exp    : num  694 460 249 638 589 ...
##  $ ret_exp    : num  972 450 805 0 920 ...
##  $ acq_exp_sq : num  480998 211628 62016 407644 346897 ...
##  $ ret_exp_sq : num  943929 202077 648089 0 846106 ...
##  $ freq       : num  6 11 21 0 2 7 15 13 0 0 ...
##  $ freq_sq    : num  36 121 441 0 4 49 225 169 0 0 ...
##  $ crossbuy   : num  5 6 6 0 9 4 5 5 0 0 ...
##  $ sow        : num  95 22 90 0 80 48 51 23 0 0 ...
##  $ industry   : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 2 1 2 ...
##  $ revenue    : num  47.2 45.1 29.1 40.6 48.7 ...
##  $ employees  : num  898 686 1423 181 631 ...
ret.dur = ret[ret$duration != 0,c(2:14)]
ret.acq = ret[,c(1,4,6,12:14)]

set.seed(42) # make the train/test splits reproducible

train.dur.samp = sample(nrow(ret.dur), .8*(nrow(ret.dur)))
train.dur = ret.dur[train.dur.samp,]
test.dur = ret.dur[-train.dur.samp,]

train.acq.samp = sample(nrow(ret.acq), .8*(nrow(ret.acq)))
train.acq = ret.acq[train.acq.samp,]
test.acq = ret.acq[-train.acq.samp,]
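
The duration models use only customers who were actually acquired (duration != 0), while the acquisition models use all 500 observations. As a quick sanity check (a convenience sketch, not part of the original output), the split sizes can be confirmed; test.dur should have 68 rows, matching the predicted-vs-actual plots below, and test.acq should have 100:

dim(train.dur); dim(test.dur)
dim(train.acq); dim(test.acq)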

Decision Tree

library(tree)
set.seed(42)

tree.ret = tree(duration ~., data = train.dur)

plot(tree.ret)
text(tree.ret,pretty=0)

tree.pred = predict(tree.ret, test.dur)

tree.mse = mean((tree.pred - test.dur$duration)^2)
tree.mse
## [1] 3754.787
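
MSE is in squared units, so taking the square root puts the error back on the duration scale (a convenience sketch, not in the original output); here sqrt(3754.787) is roughly 61.3:

sqrt(tree.mse)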

Cross-validation and tree pruning

set.seed(42)

cv.ret = cv.tree(tree.ret)
plot(cv.ret$size[c(1:7)], cv.ret$dev[c(1:7)], type = "b")

cv.ret$dev
## [1]  1333105  1700741  1681019  2075847  3299651  5262614 12830740
prune.ret = prune.tree(tree.ret, best = 6)
plot(prune.ret)
text(prune.ret, pretty = 0)

prune.pred = predict(prune.ret, test.dur)

prune.mse = mean((prune.pred - test.dur$duration)^2)
prune.mse
## [1] 4383.626

Bagging

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(42)

bag.ret = randomForest(duration ~ ., data = train.dur, mtry = 12, ntree = 5000, importance = TRUE)

bag.pred = predict(bag.ret, newdata = test.dur)
bag.mse = mean((bag.pred - test.dur$duration)^2)
bag.mse
## [1] 702.3303
importance(bag.ret)
##               %IncMSE IncNodePurity
## profit      31.500249    278265.008
## acq_exp      4.641504     16989.319
## ret_exp     94.256012   5963809.346
## acq_exp_sq   5.385089     16647.213
## ret_exp_sq  94.881197   5961043.738
## freq       106.373864    168657.091
## freq_sq    108.394395    167675.052
## crossbuy     1.609548     14115.846
## sow          2.625875     26887.186
## industry    15.181202      9621.868
## revenue     -9.517640     27534.350
## employees    3.369105     34777.078
varImpPlot(bag.ret)
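
With 5000 trees the OOB error should have stabilized well before the end of the run; the built-in plot method for a randomForest object shows OOB MSE as a function of the number of trees (a quick sketch):

plot(bag.ret, main = "Bagging: OOB MSE vs. number of trees")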

Boosting

library(gbm)
## Loaded gbm 2.1.8
set.seed(42)

boost.ret = gbm(duration ~., data = train.dur, distribution = "gaussian", n.trees = 5000, shrinkage = .01, verbose = F)

boost.pred = predict(boost.ret, newdata = test.dur)
## Using 5000 trees...
boost.mse = mean((boost.pred - test.dur$duration)^2)
boost.mse
## [1] 871.9039
summary(boost.ret)

##                   var    rel.inf
## ret_exp       ret_exp 86.3401189
## profit         profit  8.1026812
## freq             freq  3.8329582
## employees   employees  0.5134737
## revenue       revenue  0.3333241
## acq_exp       acq_exp  0.3310670
## sow               sow  0.3014918
## industry     industry  0.1407230
## crossbuy     crossbuy  0.1041621
## acq_exp_sq acq_exp_sq  0.0000000
## ret_exp_sq ret_exp_sq  0.0000000
## freq_sq       freq_sq  0.0000000
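
The tree count and shrinkage are fixed here; gbm can also pick the number of trees by internal cross-validation via cv.folds and gbm.perf() (a hedged sketch, not part of the original run, so the resulting MSE may differ from the one above):

boost.cv = gbm(duration ~., data = train.dur, distribution = "gaussian",
               n.trees = 5000, shrinkage = .01, cv.folds = 5, verbose = F)
best.iter = gbm.perf(boost.cv, method = "cv") # CV-estimated optimal tree count
boost.cv.pred = predict(boost.cv, newdata = test.dur, n.trees = best.iter)
mean((boost.cv.pred - test.dur$duration)^2)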

Random Forest Default

set.seed(42)

rf.ret = randomForest(duration ~., data = train.dur, importance = TRUE)

rf.pred = predict(rf.ret, newdata = test.dur)
rf.mse = mean((rf.pred - test.dur$duration)^2)
rf.mse
## [1] 862.1942
importance(rf.ret)
##              %IncMSE IncNodePurity
## profit     16.386152    2416024.63
## acq_exp     5.221105     132161.24
## ret_exp    26.230854    4362545.28
## acq_exp_sq  5.530906     135101.07
## ret_exp_sq 28.504272    4743173.26
## freq       16.998284     199361.12
## freq_sq    17.493410     184323.47
## crossbuy    2.624369      52687.68
## sow         1.448555      97948.64
## industry    1.239394      15447.49
## revenue     1.384401     115147.43
## employees   2.741892     142703.00
varImpPlot(rf.ret)

Predicted and Actual

plot(y = test.dur$duration, x = 1:nrow(test.dur), xlab = "Observation", ylab = "Duration")
points(rf.pred, col = "blue")
legend("topleft", legend = c("Actual", "Predicted"), col = c("black", "blue"), pch = 1)

Hyperparameter Tuning

set.seed(42)

mtry.value = seq(2,12,1)
ntree.values = seq(3e3,10e3,1e3)

# create a data frame with all combinations
hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)

# create an empty vector to store OOB error values
oob_err = c()

# loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {
  model = randomForest(duration ~., data = train.dur, importance = T, mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
  oob_err[i] = model$mse[length(model$mse)]
}

opt_i = which.min(oob_err)
hyper_grid[opt_i,]
##    mtry ntree
## 26    5  5000
which.min(oob_err)
## [1] 26
rf.ret$mse[length(rf.ret$mse)]
## [1] 1520.612
oob_err
##  [1] 2485.677 1715.504 1526.015 1461.765 1467.993 1519.595 1542.214 1577.500
##  [9] 1579.147 1611.733 1622.010 2422.281 1670.717 1519.661 1464.843 1479.989
## [17] 1517.039 1537.621 1576.896 1571.150 1600.795 1622.799 2458.436 1689.432
## [25] 1498.109 1458.938 1479.696 1511.394 1534.262 1568.098 1588.315 1609.375
## [33] 1635.325 2427.629 1707.550 1496.731 1475.207 1471.640 1517.657 1541.333
## [41] 1559.871 1589.980 1605.197 1618.321 2502.558 1696.165 1511.563 1461.452
## [49] 1494.961 1502.952 1534.366 1566.257 1584.290 1599.283 1626.851 2467.471
## [57] 1707.204 1516.271 1466.615 1484.000 1505.864 1543.147 1566.748 1585.253
## [65] 1601.753 1617.048 2474.625 1695.499 1499.349 1468.541 1475.916 1505.897
## [73] 1544.727 1569.609 1598.773 1603.766 1616.838 2440.367 1695.764 1494.981
## [81] 1476.864 1476.816 1519.636 1541.672 1567.086 1581.791 1599.879 1627.422
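
Because expand.grid varies mtry fastest, the 88 OOB values fold back into an 11 x 8 matrix (rows = mtry, columns = ntree); plotting it shows that mtry drives the error far more than ntree does (a visualization sketch built from the objects above):

oob.mat = matrix(oob_err, nrow = length(mtry.value)) # rows = mtry, cols = ntree
matplot(mtry.value, oob.mat, type = "b", pch = 1, col = 1:8, lty = 1,
        xlab = "mtry", ylab = "OOB MSE")
legend("topright", legend = ntree.values, col = 1:8, lty = 1, title = "ntree", cex = .7)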

Selecting the best hyperparameters

set.seed(42)

hype.rf.ret = randomForest(duration ~ ., data = train.dur, mtry = 5, ntree = 9000, importance = T)

hype.rf.pred = predict(hype.rf.ret, newdata = test.dur)
hype.rf.mse = mean((hype.rf.pred - test.dur$duration)^2)
hype.rf.mse
## [1] 809.4861
importance(hype.rf.ret)
##               %IncMSE IncNodePurity
## profit      57.073714    1907598.34
## acq_exp     13.379535      82173.63
## ret_exp    120.738322    5085157.53
## acq_exp_sq  15.584653      80644.08
## ret_exp_sq 117.623536    4866164.03
## freq        95.810592     180056.86
## freq_sq     92.775624     182458.78
## crossbuy     4.917525      34345.33
## sow          1.472791      62291.94
## industry    11.814035      12446.84
## revenue     -2.904411      77274.92
## employees    7.971473      92228.11
varImpPlot(hype.rf.ret)

Predicted and Actual

plot(y = test.dur$duration, x = 1:nrow(test.dur), xlab = "Observation", ylab = "Duration")
points(hype.rf.pred, col = "blue")
legend("topleft", legend = c("Actual", "Predicted"), col = c("black", "blue"), pch = 1)
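
Collecting the test MSEs computed so far gives a compact comparison of the duration models (a summary sketch built from the objects above):

data.frame(model = c("tree", "pruned tree", "bagging", "boosting",
                     "default RF", "tuned RF"),
           test.mse = c(tree.mse, prune.mse, bag.mse, boost.mse, rf.mse, hype.rf.mse))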

Random Forest Classification

set.seed(42)

rf.class.ret = randomForest(acquisition ~., data = train.acq, importance = TRUE)

rf.class.pred = predict(rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, rf.class.pred)
##    rf.class.pred
##      0  1
##   0 12 17
##   1  7 64
rf.class.err = mean(rf.class.pred != test.acq$acquisition)
rf.class.err
## [1] 0.24
importance(rf.class.ret)
##                    0         1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp     9.726736 17.440556             22.38722         32.29424
## acq_exp_sq  8.602296 16.288328             20.61477         32.81734
## industry   24.244376 10.856302             24.34224         11.25669
## revenue    10.399403  7.082151             12.23199         33.73914
## employees  42.676453 34.337802             49.22097         67.11708
varImpPlot(rf.class.ret)
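
For a classification forest the plot method shows the OOB error overall and per class as trees accumulate (a quick sketch; the legend labels come straight from the err.rate matrix):

plot(rf.class.ret, main = "OOB error by class")
legend("topright", colnames(rf.class.ret$err.rate), col = 1:3, lty = 1:3)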

Hyperparameter Tuning

set.seed(42)

mtry.value = seq(2,5,1)
ntree.values = seq(3e3,10e3,1e3)

hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)

oob_err = c()

for (i in 1:nrow(hyper_grid)) {
  model = randomForest(acquisition ~., data = train.acq, importance = T, mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
  oob_err[i] = model$err.rate[nrow(model$err.rate), "OOB"]
}

opt_i = which.min(oob_err)
hyper_grid[opt_i,]
##   mtry ntree
## 4    5  3000

Selecting the best hyperparameters

set.seed(42)

hype.rf.class.ret = randomForest(acquisition ~ ., data = train.acq, mtry = 2, ntree = 3000, importance = T)

hype.rf.class.pred = predict(hype.rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, hype.rf.class.pred)
##    hype.rf.class.pred
##      0  1
##   0 12 17
##   1  8 63
hype.rf.class.err = mean(hype.rf.class.pred != test.acq$acquisition)
hype.rf.class.err
## [1] 0.25
importance(hype.rf.class.ret)
##                    0        1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp     24.94702 40.39002             52.76963         32.85903
## acq_exp_sq  24.11866 40.96689             52.45389         33.09297
## industry    56.17918 26.52844             55.37370         11.35234
## revenue     25.87034 17.87255             29.67675         32.94006
## employees  110.13824 86.27468            127.93717         66.85890
varImpPlot(hype.rf.class.ret)

Logistic Regression

class.log = glm(acquisition ~ ., data = train.acq, family = binomial)
summary(class.log)
## 
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = train.acq)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2853  -0.4747   0.1912   0.5299   2.3726  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.413e+01  1.720e+00  -8.216  < 2e-16 ***
## acq_exp      3.033e-02  5.211e-03   5.819 5.91e-09 ***
## acq_exp_sq  -3.070e-05  5.118e-06  -5.999 1.99e-09 ***
## industry1    2.195e+00  3.526e-01   6.226 4.79e-10 ***
## revenue      6.720e-02  1.622e-02   4.144 3.41e-05 ***
## employees    7.645e-03  9.221e-04   8.291  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 508.75  on 399  degrees of freedom
## Residual deviance: 285.16  on 394  degrees of freedom
## AIC: 297.16
## 
## Number of Fisher Scoring iterations: 6
class.log.PredProb = predict.glm(class.log, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 13 16
##   1  4 67
class.log.error = mean(class.log.PredSur != test.acq$acquisition)
class.log.error
## [1] 0.2

Diagnostic plot: Cook's distance

library(car)
## Loading required package: carData
cook.log = cooks.distance(class.log)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/100, lty = 2, col = "blue")

large.cook = which(cooks.distance(class.log)>(4/100))

train.acq.cook = train.acq[-large.cook,]

vif(class.log)
##    acq_exp acq_exp_sq   industry    revenue  employees 
##  35.068611  35.951278   1.287413   1.041055   1.219445
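
The huge VIFs for acq_exp and acq_exp_sq simply reflect that a raw quadratic term is strongly correlated with its linear term. The refit below drops the squared term; an orthogonal polynomial via poly() is a common alternative that keeps the curvature without the collinearity (a hedged sketch, not part of the original analysis):

class.log.poly = glm(acquisition ~ poly(acq_exp, 2) + industry + revenue + employees,
                     data = train.acq.cook, family = binomial)
vif(class.log.poly) # car::vif reports GVIF for the 2-df poly term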

Refit model

log.data = train.acq.cook[,c(1:2,4:6)]

class.log2 = glm(acquisition ~ ., data = log.data, family = binomial)
summary(class.log2)
## 
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = log.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5402  -0.6281   0.2545   0.6651   2.2311  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.9140921  0.9484407  -7.290 3.10e-13 ***
## acq_exp     -0.0004338  0.0007984  -0.543    0.587    
## industry1    1.5982617  0.2949718   5.418 6.02e-08 ***
## revenue      0.0636390  0.0152426   4.175 2.98e-05 ***
## employees    0.0074663  0.0008641   8.640  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 506.31  on 396  degrees of freedom
## Residual deviance: 331.64  on 392  degrees of freedom
## AIC: 341.64
## 
## Number of Fisher Scoring iterations: 5

Refit model with selected predictors

class.log3 = glm(acquisition ~ industry + revenue + employees, data = log.data, family = binomial)
summary(class.log3)
## 
## Call:
## glm(formula = acquisition ~ industry + revenue + employees, family = binomial, 
##     data = log.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4824  -0.6375   0.2510   0.6790   2.2527  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.0971905  0.8892065  -7.981 1.45e-15 ***
## industry1    1.5800737  0.2921598   5.408 6.36e-08 ***
## revenue      0.0629833  0.0151537   4.156 3.23e-05 ***
## employees    0.0074656  0.0008628   8.652  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 506.31  on 396  degrees of freedom
## Residual deviance: 331.93  on 393  degrees of freedom
## AIC: 339.93
## 
## Number of Fisher Scoring iterations: 5
class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 15 14
##   1  7 64
class.log.error2 = mean(class.log.PredSur != test.acq$acquisition)
class.log.error2
## [1] 0.21

Finding the ideal cutoff

library(ROCR)
pred = prediction(predict(class.log3, newdata = log.data, type = "response"), log.data$acquisition)

auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")

perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='blue', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
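
The accuracy object created above is never plotted; accuracy as a function of the cutoff is another way to sanity-check the chosen threshold (a one-line sketch, using a fresh performance() call so the x-axis is the cutoff rather than the error rate):

plot(performance(pred, "acc"), main = "Accuracy vs. cutoff")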

Optimal threshold

class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .6998,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 21  8
##   1 10 61
class.log.error3 = mean(class.log.PredSur != test.acq$acquisition)
class.log.error3
## [1] 0.18

Odds Ratio

OR = exp(class.log3$coefficients)
round(OR, 3)
## (Intercept)   industry1     revenue   employees 
##       0.001       4.855       1.065       1.007
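
Profile-likelihood confidence intervals for the odds ratios follow directly from confint() (a sketch; confint() on a glm profiles the likelihood, so it may take a moment):

round(exp(confint(class.log3)), 3)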

Assumption validation

par(mfrow=c(2,2))
plot(class.log3, which=c(1:4))

Check for the linearity assumption

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
probabilities <- predict(class.log3, type = "response")
predicted.classes <- ifelse(probabilities > 0.68044, 1, 0)

# Select only numeric predictors
mydata = dplyr::select_if(log.data[,c(3:5)], is.numeric) 
predictors <- colnames(mydata)
# Bind the logit and tidy the data for plotting
mydata = mutate(mydata, logit = log(probabilities/(1-probabilities))) 
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)

# making the plot
ggplot(mydata, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'
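
A final check in the same spirit (a hedged sketch): standardized deviance residuals beyond |3| would flag remaining extreme observations, and after the Cook's-distance filtering above ideally none appear.

std.res = rstandard(class.log3)
which(abs(std.res) > 3) # expect integer(0) if no extreme residuals remain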