Loading, Cleaning, and Splitting the Data

library(SMCRM)
## Warning: package 'SMCRM' was built under R version 4.0.3
data("acquisitionRetention")
ret = acquisitionRetention[,c(2:15)]   # drop the customer id column
ret = na.omit(ret)
ret$acquisition = as.factor(ret$acquisition)
ret$industry = as.factor(ret$industry)
str(ret)
## 'data.frame':    500 obs. of  14 variables:
##  $ acquisition: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
##  $ duration   : num  1635 1039 1288 0 1631 ...
##  $ profit     : num  6134 3524 4081 -638 5446 ...
##  $ acq_exp    : num  694 460 249 638 589 ...
##  $ ret_exp    : num  972 450 805 0 920 ...
##  $ acq_exp_sq : num  480998 211628 62016 407644 346897 ...
##  $ ret_exp_sq : num  943929 202077 648089 0 846106 ...
##  $ freq       : num  6 11 21 0 2 7 15 13 0 0 ...
##  $ freq_sq    : num  36 121 441 0 4 49 225 169 0 0 ...
##  $ crossbuy   : num  5 6 6 0 9 4 5 5 0 0 ...
##  $ sow        : num  95 22 90 0 80 48 51 23 0 0 ...
##  $ industry   : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 2 1 2 ...
##  $ revenue    : num  47.2 45.1 29.1 40.6 48.7 ...
##  $ employees  : num  898 686 1423 181 631 ...
ret.dur = ret[ret$duration != 0,c(2:14)]   # acquired customers only; drop the acquisition indicator
ret.acq = ret[,c(1,4,6,12:14)]             # acquisition outcome plus pre-acquisition predictors

train.dur.samp = sample(nrow(ret.dur), .8*(nrow(ret.dur)))
train.dur = ret.dur[train.dur.samp,]
test.dur = ret.dur[-train.dur.samp,]

train.acq.samp = sample(nrow(ret.acq), .8*(nrow(ret.acq)))
train.acq = ret.acq[train.acq.samp,]
test.acq = ret.acq[-train.acq.samp,]
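
As a quick sanity check (not part of the original output), the split sizes and the class balance of the acquisition samples can be verified:

# verify the 80/20 split sizes and the class balance (illustrative; output not shown)
dim(train.dur); dim(test.dur)
prop.table(table(train.acq$acquisition))
prop.table(table(test.acq$acquisition))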

Making the Tree

library(tree)
## Warning: package 'tree' was built under R version 4.0.4
set.seed(42)

tree.ret = tree(duration ~., data = train.dur)

plot(tree.ret)
text(tree.ret,pretty=0)

tree.pred = predict(tree.ret, test.dur)

tree.mse = mean((tree.pred - test.dur$duration)^2)
tree.mse
## [1] 3511.037
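
The test MSE is in squared duration units; taking the square root gives an RMSE on the original scale, which is easier to interpret (not shown in the original output):

# test RMSE on the original duration scale
sqrt(tree.mse)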

Cross-Validation and Pruning

set.seed(42)

cv.ret = cv.tree(tree.ret)
plot(cv.ret$size[c(1:7)], cv.ret$dev[c(1:7)], type = "b")

cv.ret$dev
## [1]  1279811  1471858  1471858  1661240  2224364  3717248  5025307 12662540
prune.ret = prune.tree(tree.ret, best = 6)
plot(prune.ret)
text(prune.ret, pretty = 0)

prune.pred = predict(prune.ret, test.dur)

prune.mse = mean((prune.pred - test.dur$duration)^2)
prune.mse
## [1] 5817.635

Bagging

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(42)

bag.ret = randomForest(duration ~ ., data = train.dur, mtry = 12, ntree = 5000, importance = TRUE)

bag.pred = predict(bag.ret, newdata = test.dur)
bag.mse = mean((bag.pred - test.dur$duration)^2)
bag.mse
## [1] 1103.648
importance(bag.ret)
##               %IncMSE IncNodePurity
## profit      35.481899    286213.183
## acq_exp      3.889448     11022.473
## ret_exp     96.273333   5942721.770
## acq_exp_sq   2.518997     11204.971
## ret_exp_sq  95.590983   5886583.279
## freq       106.215934    157630.581
## freq_sq    108.633000    157577.560
## crossbuy     6.364134     14100.902
## sow          2.969734     22442.890
## industry    12.421181      5846.837
## revenue     -4.796642     21712.905
## employees    5.110906     25523.980
varImpPlot(bag.ret)
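
To gauge whether 5,000 trees is more than enough, the out-of-bag error of the bagged model can be plotted against the number of trees; a brief sketch (plot not reproduced here):

# OOB error as a function of the number of trees in the bagged model
plot(bag.ret, main = "Bagging: error vs. number of trees")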

Boosting

library(gbm)
## Warning: package 'gbm' was built under R version 4.0.5
## Loaded gbm 2.1.8
set.seed(42)

boost.ret = gbm(duration ~., data = train.dur, distribution = "gaussian", n.trees = 5000, shrinkage = .01, verbose = F)

boost.pred = predict(boost.ret, newdata = test.dur)
## Using 5000 trees...
boost.mse = mean((boost.pred - test.dur$duration)^2)
boost.mse
## [1] 624.2513
summary(boost.ret)

##                   var    rel.inf
## ret_exp       ret_exp 86.2180692
## profit         profit  8.3452925
## freq             freq  3.4942448
## employees   employees  0.5348091
## acq_exp       acq_exp  0.3923935
## revenue       revenue  0.3722611
## sow               sow  0.3194178
## industry     industry  0.1681210
## crossbuy     crossbuy  0.1553911
## acq_exp_sq acq_exp_sq  0.0000000
## ret_exp_sq ret_exp_sq  0.0000000
## freq_sq       freq_sq  0.0000000
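
The number of boosting iterations is fixed at 5,000 above; gbm can also choose it by internal cross-validation through the cv.folds argument and gbm.perf(). A hedged sketch, with illustrative settings rather than the ones used in this report:

# refit with 5-fold CV and let gbm.perf() suggest the number of trees
boost.cv = gbm(duration ~ ., data = train.dur, distribution = "gaussian",
               n.trees = 5000, shrinkage = .01, cv.folds = 5, verbose = F)
best.iter = gbm.perf(boost.cv, method = "cv")   # CV-optimal iteration count
boost.cv.pred = predict(boost.cv, newdata = test.dur, n.trees = best.iter)
mean((boost.cv.pred - test.dur$duration)^2)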

Random Forest with Default Settings

set.seed(42)

rf.ret = randomForest(duration ~., data = train.dur, importance = TRUE)

rf.pred = predict(rf.ret, newdata = test.dur)
rf.mse = mean((rf.pred - test.dur$duration)^2)
rf.mse
## [1] 1114.455
importance(rf.ret)
##               %IncMSE IncNodePurity
## profit     15.9465834    2372825.02
## acq_exp     2.9564141     139619.14
## ret_exp    27.8300568    4648114.59
## acq_exp_sq  2.7901489     138458.79
## ret_exp_sq 26.1578371    4379885.16
## freq       16.9549218     173264.10
## freq_sq    18.3525275     180103.48
## crossbuy    1.3137901      60174.71
## sow        -1.1106748      69863.61
## industry    0.1399243      17126.99
## revenue     1.0837839     127661.05
## employees   0.3413243     125514.27
varImpPlot(rf.ret)

Predicted vs Actual

plot(y = test.dur$duration, x = c(1:68), xlab = "Observation", ylab = "Duration")
points(rf.pred, col = "red")
legend("topleft", legend = c("actual", "predicted"), col = c("black", "red"), pch = 1)

Tuning Hyperparameters

set.seed(42)

mtry.value = seq(2,12,1)
ntree.values = seq(3e3,10e3,1e3)

# create a data frame with all combinations
hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)

# create an empty vector to store OOB error values
oob_err = c()

# loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {
  model = randomForest(duration ~., data = train.dur, importance = T, mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
  oob_err[i] = model$mse[length(model$mse)]
}

opt_i = which.min(oob_err)
hyper_grid[opt_i,]
##   mtry ntree
## 5    6  3000
which.min(oob_err)
## [1] 5
rf.ret$mse[length(rf.ret$mse)]
## [1] 1504.455
oob_err
##  [1] 2440.944 1679.957 1449.954 1376.562 1337.684 1353.420 1372.981 1391.576
##  [9] 1396.448 1416.638 1432.442 2505.179 1661.299 1445.048 1361.320 1360.374
## [17] 1371.831 1377.425 1383.343 1395.031 1406.813 1422.945 2445.977 1670.957
## [25] 1454.155 1370.990 1351.241 1359.659 1376.264 1384.049 1386.539 1401.286
## [33] 1419.747 2477.064 1677.889 1440.427 1358.358 1362.599 1366.592 1362.924
## [41] 1389.376 1392.028 1406.480 1425.267 2481.388 1662.341 1439.103 1375.266
## [49] 1356.510 1352.257 1370.049 1390.298 1391.718 1408.607 1421.691 2483.051
## [57] 1679.543 1454.202 1381.416 1351.353 1356.221 1368.804 1381.206 1397.682
## [65] 1406.357 1422.621 2465.051 1657.144 1431.988 1372.106 1354.225 1363.341
## [73] 1373.824 1388.480 1399.498 1407.759 1427.377 2443.405 1671.853 1440.828
## [81] 1367.964 1353.174 1363.885 1366.092 1380.814 1395.972 1405.757 1416.772
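
The flat oob_err vector is easier to read when arranged as an mtry-by-ntree grid; because expand.grid() varies mtry fastest, the vector fills such a matrix column by column. A small reshaping sketch (output not shown):

# arrange the OOB errors as an mtry x ntree grid for easier inspection
oob_grid = matrix(oob_err, nrow = length(mtry.value),
                  dimnames = list(mtry = mtry.value, ntree = ntree.values))
round(oob_grid, 1)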

Using the Best Hyperparameters

set.seed(42)

hype.rf.ret = randomForest(duration ~ ., data = train.dur, mtry = 6, ntree = 9000, importance = T)

hype.rf.pred = predict(hype.rf.ret, newdata = test.dur)
hype.rf.mse = mean((hype.rf.pred - test.dur$duration)^2)
hype.rf.mse
## [1] 1052.546
importance(hype.rf.ret)
##               %IncMSE IncNodePurity
## profit      53.934825   1605306.453
## acq_exp      8.141277     52909.817
## ret_exp    122.294222   5191409.722
## acq_exp_sq   9.946018     53345.804
## ret_exp_sq 121.662335   5139220.527
## freq       113.150314    157722.789
## freq_sq    113.347037    160052.848
## crossbuy     5.097345     26339.124
## sow          2.647907     37686.712
## industry     8.285336      8956.825
## revenue     -1.420130     60268.219
## employees    6.962723     55040.843
varImpPlot(hype.rf.ret)

Predicted vs Actual

plot(y = test.dur$duration, x = c(1:68), xlab = "Observation", ylab = "Duration")
points(hype.rf.pred, col = "red")
legend("topleft", legend = c("actual", "predicted"), col = c("black", "red"), pch = 1)
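
Before moving on to classification, the test MSEs of the duration models can be collected into one small table for a side-by-side comparison (output not shown in the original):

# side-by-side comparison of the test MSEs of the duration models
data.frame(model = c("tree", "pruned tree", "bagging", "boosting",
                     "default RF", "tuned RF"),
           test.mse = c(tree.mse, prune.mse, bag.mse, boost.mse,
                        rf.mse, hype.rf.mse))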

Random Forest for Classification

set.seed(42)

rf.class.ret = randomForest(acquisition ~., data = train.acq, importance = TRUE)

rf.class.pred = predict(rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, rf.class.pred)
##    rf.class.pred
##      0  1
##   0 24 12
##   1 12 52
rf.class.err = 1 - (24+52)/100
rf.class.err
## [1] 0.24
importance(rf.class.ret)
##                    0         1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp     8.229173 17.063262             20.18638        31.508390
## acq_exp_sq  8.340276 17.442358             21.23271        31.283270
## industry   15.055798 14.096685             19.40672         9.902336
## revenue    11.264978  9.942957             14.90323        34.391032
## employees  41.635058 33.931811             50.88006        65.439793
varImpPlot(rf.class.ret)

Tuning Hyperparameters

set.seed(42)

mtry.value = seq(2,5,1)
ntree.values = seq(3e3,10e3,1e3)

hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)

oob_err = c()

for (i in 1:nrow(hyper_grid)) {
  model = randomForest(acquisition ~., data = train.acq, importance = T, mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
  oob_err[i] = model$err.rate[length(model$err.rate)]
}

opt_i = which.min(oob_err)
hyper_grid[opt_i,]
##    mtry ntree
## 30    3 10000

Using the Best Hyperparameters

set.seed(42)

hype.rf.class.ret = randomForest(acquisition ~ ., data = train.acq, mtry = 2, ntree = 3000, importance = T)

hype.rf.class.pred = predict(hype.rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, hype.rf.class.pred)
##    hype.rf.class.pred
##      0  1
##   0 23 13
##   1 12 52
hype.rf.class.err = 1 - (23+52)/100
hype.rf.class.err
## [1] 0.25
importance(hype.rf.class.ret)
##                    0        1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp     18.38338 41.67203             48.42113        31.846718
## acq_exp_sq  17.12800 42.17626             49.93758        31.376392
## industry    35.99992 35.52591             47.93580         9.792362
## revenue     26.54830 21.88429             33.12885        33.944486
## employees  101.57816 78.67935            117.87492        65.213049
varImpPlot(hype.rf.class.ret)

Logistic Model

class.log = glm(acquisition ~ ., data = train.acq, family = binomial)
summary(class.log)
## 
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = train.acq)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4741  -0.3804   0.2051   0.5090   2.5403  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.545e+01  1.797e+00  -8.596  < 2e-16 ***
## acq_exp      3.326e-02  5.175e-03   6.427 1.30e-10 ***
## acq_exp_sq  -3.329e-05  5.168e-06  -6.443 1.17e-10 ***
## industry1    2.101e+00  3.498e-01   6.008 1.88e-09 ***
## revenue      7.563e-02  1.685e-02   4.489 7.16e-06 ***
## employees    7.834e-03  9.526e-04   8.224  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 498.43  on 399  degrees of freedom
## Residual deviance: 272.06  on 394  degrees of freedom
## AIC: 284.06
## 
## Number of Fisher Scoring iterations: 6
class.log.PredProb = predict.glm(class.log, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 25 11
##   1 14 50
class.log.error = 1 - (25+50)/100
class.log.error
## [1] 0.25

Checking Cook’s Distance and Multicollinearity

library(car)
## Loading required package: carData
cook.log = cooks.distance(class.log)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/100, lty = 2, col = "steelblue")

large.cook = which(cooks.distance(class.log)>(4/100))

train.acq.cook = train.acq[-large.cook,]

vif(class.log)
##    acq_exp acq_exp_sq   industry    revenue  employees 
##  31.864631  32.170157   1.254930   1.074226   1.243289

Refitting the Model without acq_exp_sq

log.data = train.acq.cook[,c(1:2,4:6)]

class.log2 = glm(acquisition ~ ., data = log.data, family = binomial)
summary(class.log2)
## 
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = log.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3702  -0.5317   0.2700   0.6082   2.1541  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.9681158  1.0290352  -7.743 9.69e-15 ***
## acq_exp      0.0001726  0.0008618   0.200    0.841    
## industry1    1.5498798  0.3006423   5.155 2.53e-07 ***
## revenue      0.0742721  0.0156395   4.749 2.04e-06 ***
## employees    0.0079550  0.0009115   8.727  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 494.61  on 396  degrees of freedom
## Residual deviance: 312.28  on 392  degrees of freedom
## AIC: 322.28
## 
## Number of Fisher Scoring iterations: 5
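
To confirm that dropping acq_exp_sq resolved the collinearity flagged above, the VIFs of the refit model can be re-checked (not shown in the original output):

# re-check multicollinearity after dropping the squared term
vif(class.log2)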

Refitting the Model without acq_exp

class.log3 = glm(acquisition ~ industry + revenue + employees, data = log.data, family = binomial)
summary(class.log3)
## 
## Call:
## glm(formula = acquisition ~ industry + revenue + employees, family = binomial, 
##     data = log.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3906  -0.5338   0.2712   0.6127   2.1597  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.8917387  0.9539643  -8.273  < 2e-16 ***
## industry1    1.5563312  0.2991210   5.203 1.96e-07 ***
## revenue      0.0745287  0.0155960   4.779 1.76e-06 ***
## employees    0.0079479  0.0009107   8.727  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 494.61  on 396  degrees of freedom
## Residual deviance: 312.32  on 393  degrees of freedom
## AIC: 320.32
## 
## Number of Fisher Scoring iterations: 5
class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 25 11
##   1 13 51
class.log.error2 = 1 - (25+51)/100
class.log.error2
## [1] 0.24

Finding the Ideal Cutoff Point

library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.3
pred = prediction(predict(class.log3, newdata = log.data, type = "response"), log.data$acquisition)

auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")

perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

Using the Optimal Threshold

class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .6998,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 33  3
##   1 19 45
class.log.error3 = 1 - (33+45)/100
class.log.error3
## [1] 0.22
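
At this cutoff the errors are asymmetric (only 3 false positives but 19 false negatives), so sensitivity and specificity are worth reporting alongside the overall error rate; a small sketch computed from the confusion table above (output not shown):

# sensitivity and specificity at the .6998 cutoff
conf = table(test.acq$acquisition, class.log.PredSur)
sens = conf["1", "1"] / sum(conf["1", ])   # true positive rate
spec = conf["0", "0"] / sum(conf["0", ])   # true negative rate
c(sensitivity = sens, specificity = spec)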

Odds Ratio

OR = exp(class.log3$coefficients)
round(OR, 3)
## (Intercept)   industry1     revenue   employees 
##       0.000       4.741       1.077       1.008
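
Wald confidence intervals give a sense of the precision of these odds ratios; a hedged sketch using confint.default() (not part of the original output):

# 95% Wald confidence intervals for the odds ratios
round(exp(confint.default(class.log3)), 3)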

Checking Basic Assumptions

par(mfrow=c(2,2))
plot(class.log3, which=c(1:4))

Checking Linearity of Data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
probabilities <- predict(class.log3, type = "response")
predicted.classes <- ifelse(probabilities > 0.68044, 1, 0)

# Select only numeric predictors
mydata = dplyr::select_if(log.data[,c(3:5)], is.numeric) 
predictors <- colnames(mydata)
# Bind the logit and tidy the data for plotting
mydata = mutate(mydata, logit = log(probabilities/(1-probabilities))) 
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)

# make the plot
ggplot(mydata, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'