Accessing the acquisitionRetention data, cleanup, and split

library(SMCRM)
data("acquisitionRetention")
ret = acquisitionRetention[,c(2:15)]
ret = na.omit(ret)
ret$acquisition = as.factor(ret$acquisition)
ret$industry = as.factor(ret$industry)
str(ret)
## 'data.frame':    500 obs. of  14 variables:
##  $ acquisition: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
##  $ duration   : num  1635 1039 1288 0 1631 ...
##  $ profit     : num  6134 3524 4081 -638 5446 ...
##  $ acq_exp    : num  694 460 249 638 589 ...
##  $ ret_exp    : num  972 450 805 0 920 ...
##  $ acq_exp_sq : num  480998 211628 62016 407644 346897 ...
##  $ ret_exp_sq : num  943929 202077 648089 0 846106 ...
##  $ freq       : num  6 11 21 0 2 7 15 13 0 0 ...
##  $ freq_sq    : num  36 121 441 0 4 49 225 169 0 0 ...
##  $ crossbuy   : num  5 6 6 0 9 4 5 5 0 0 ...
##  $ sow        : num  95 22 90 0 80 48 51 23 0 0 ...
##  $ industry   : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 2 1 2 ...
##  $ revenue    : num  47.2 45.1 29.1 40.6 48.7 ...
##  $ employees  : num  898 686 1423 181 631 ...
ret.dur = ret[ret$duration != 0,c(2:14)]
ret.acq = ret[,c(1,4,6,12:14)]

set.seed(42) # make the train/test splits reproducible

train.dur.samp = sample(nrow(ret.dur), .8*(nrow(ret.dur)))
train.dur = ret.dur[train.dur.samp,]
test.dur = ret.dur[-train.dur.samp,]

train.acq.samp = sample(nrow(ret.acq), .8*(nrow(ret.acq)))
train.acq = ret.acq[train.acq.samp,]
test.acq = ret.acq[-train.acq.samp,]
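
The duration models use only customers who were actually acquired (duration != 0), while the acquisition models use all 500 observations. As a quick sanity check (a convenience sketch, not part of the original output), the split sizes can be confirmed; test.dur should have 68 rows, matching the predicted-vs-actual plots below, and test.acq should have 100:

dim(train.dur); dim(test.dur)
dim(train.acq); dim(test.acq)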

Decision Tree

library(tree)
set.seed(42)

tree.ret = tree(duration ~., data = train.dur)

plot(tree.ret)
text(tree.ret,pretty=0)

tree.pred = predict(tree.ret, test.dur)

tree.mse = mean((tree.pred - test.dur$duration)^2)
tree.mse
## [1] 3754.787
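
MSE is in squared units, so taking the square root puts the error back on the duration scale (a convenience sketch, not in the original output); here sqrt(3754.787) is roughly 61.3:

sqrt(tree.mse)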

Cross-validation and tree pruning

set.seed(42)

cv.ret = cv.tree(tree.ret)
plot(cv.ret$size[c(1:7)], cv.ret$dev[c(1:7)], type = "b")

cv.ret$dev
## [1]  1333105  1700741  1681019  2075847  3299651  5262614 12830740
prune.ret = prune.tree(tree.ret, best = 6)
plot(prune.ret)
text(prune.ret, pretty = 0)

prune.pred = predict(prune.ret, test.dur)

prune.mse = mean((prune.pred - test.dur$duration)^2)
prune.mse
## [1] 4383.626

Bagging

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(42)

bag.ret = randomForest(duration ~ ., data = train.dur, mtry = 12, ntree = 5000, importance = TRUE)

bag.pred = predict(bag.ret, newdata = test.dur)
bag.mse = mean((bag.pred - test.dur$duration)^2)
bag.mse
## [1] 702.3303
importance(bag.ret)
##               %IncMSE IncNodePurity
## profit      31.500249    278265.008
## acq_exp      4.641504     16989.319
## ret_exp     94.256012   5963809.346
## acq_exp_sq   5.385089     16647.213
## ret_exp_sq  94.881197   5961043.738
## freq       106.373864    168657.091
## freq_sq    108.394395    167675.052
## crossbuy     1.609548     14115.846
## sow          2.625875     26887.186
## industry    15.181202      9621.868
## revenue     -9.517640     27534.350
## employees    3.369105     34777.078
varImpPlot(bag.ret)
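
With 5000 trees the OOB error should have stabilized well before the end of the run; the built-in plot method for a randomForest object shows OOB MSE as a function of the number of trees (a quick sketch):

plot(bag.ret, main = "Bagging: OOB MSE vs. number of trees")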

Boosting

library(gbm)
## Loaded gbm 2.1.8
set.seed(42)

boost.ret = gbm(duration ~., data = train.dur, distribution = "gaussian", n.trees = 5000, shrinkage = .01, verbose = F)

boost.pred = predict(boost.ret, newdata = test.dur)
## Using 5000 trees...
boost.mse = mean((boost.pred - test.dur$duration)^2)
boost.mse
## [1] 871.9039
summary(boost.ret)

##                   var    rel.inf
## ret_exp       ret_exp 86.3401189
## profit         profit  8.1026812
## freq             freq  3.8329582
## employees   employees  0.5134737
## revenue       revenue  0.3333241
## acq_exp       acq_exp  0.3310670
## sow               sow  0.3014918
## industry     industry  0.1407230
## crossbuy     crossbuy  0.1041621
## acq_exp_sq acq_exp_sq  0.0000000
## ret_exp_sq ret_exp_sq  0.0000000
## freq_sq       freq_sq  0.0000000
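
The tree count and shrinkage are fixed here; gbm can also pick the number of trees by internal cross-validation via cv.folds and gbm.perf() (a hedged sketch, not part of the original run, so the resulting MSE may differ from the one above):

boost.cv = gbm(duration ~., data = train.dur, distribution = "gaussian",
               n.trees = 5000, shrinkage = .01, cv.folds = 5, verbose = F)
best.iter = gbm.perf(boost.cv, method = "cv") # CV-estimated optimal tree count
boost.cv.pred = predict(boost.cv, newdata = test.dur, n.trees = best.iter)
mean((boost.cv.pred - test.dur$duration)^2)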

Random Forest Default

set.seed(42)

rf.ret = randomForest(duration ~., data = train.dur, importance = TRUE)

rf.pred = predict(rf.ret, newdata = test.dur)
rf.mse = mean((rf.pred - test.dur$duration)^2)
rf.mse
## [1] 862.1942
importance(rf.ret)
##              %IncMSE IncNodePurity
## profit     16.386152    2416024.63
## acq_exp     5.221105     132161.24
## ret_exp    26.230854    4362545.28
## acq_exp_sq  5.530906     135101.07
## ret_exp_sq 28.504272    4743173.26
## freq       16.998284     199361.12
## freq_sq    17.493410     184323.47
## crossbuy    2.624369      52687.68
## sow         1.448555      97948.64
## industry    1.239394      15447.49
## revenue     1.384401     115147.43
## employees   2.741892     142703.00
varImpPlot(rf.ret)

Predicted and Actual

plot(y = test.dur$duration, x = 1:nrow(test.dur), xlab = "Observation", ylab = "Duration")
points(rf.pred, col = "blue")
legend("topleft", legend = c("Actual", "Predicted"), col = c("black", "blue"), pch = 1)

Hyperparameter Tuning

set.seed(42)

mtry.value = seq(2,12,1)
ntree.values = seq(3e3,10e3,1e3)

# create a data frame with all combinations
hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)

# create an empty vector to store OOB error values
oob_err = c()

# loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {
  model = randomForest(duration ~., data = train.dur, importance = T, mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
  oob_err[i] = model$mse[length(model$mse)]
}

opt_i = which.min(oob_err)
hyper_grid[opt_i,]
##    mtry ntree
## 26    5  5000
which.min(oob_err)
## [1] 26
rf.ret$mse[length(rf.ret$mse)]
## [1] 1520.612
oob_err
##  [1] 2485.677 1715.504 1526.015 1461.765 1467.993 1519.595 1542.214 1577.500
##  [9] 1579.147 1611.733 1622.010 2422.281 1670.717 1519.661 1464.843 1479.989
## [17] 1517.039 1537.621 1576.896 1571.150 1600.795 1622.799 2458.436 1689.432
## [25] 1498.109 1458.938 1479.696 1511.394 1534.262 1568.098 1588.315 1609.375
## [33] 1635.325 2427.629 1707.550 1496.731 1475.207 1471.640 1517.657 1541.333
## [41] 1559.871 1589.980 1605.197 1618.321 2502.558 1696.165 1511.563 1461.452
## [49] 1494.961 1502.952 1534.366 1566.257 1584.290 1599.283 1626.851 2467.471
## [57] 1707.204 1516.271 1466.615 1484.000 1505.864 1543.147 1566.748 1585.253
## [65] 1601.753 1617.048 2474.625 1695.499 1499.349 1468.541 1475.916 1505.897
## [73] 1544.727 1569.609 1598.773 1603.766 1616.838 2440.367 1695.764 1494.981
## [81] 1476.864 1476.816 1519.636 1541.672 1567.086 1581.791 1599.879 1627.422
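
Because expand.grid varies mtry fastest, the 88 OOB values fold back into an 11 x 8 matrix (rows = mtry, columns = ntree); plotting it shows that mtry drives the error far more than ntree does (a visualization sketch built from the objects above):

oob.mat = matrix(oob_err, nrow = length(mtry.value)) # rows = mtry, cols = ntree
matplot(mtry.value, oob.mat, type = "b", pch = 1, col = 1:8, lty = 1,
        xlab = "mtry", ylab = "OOB MSE")
legend("topright", legend = ntree.values, col = 1:8, lty = 1, title = "ntree", cex = .7)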

Selecting the best hyperparameters

set.seed(42)

hype.rf.ret = randomForest(duration ~ ., data = train.dur, mtry = 5, ntree = 9000, importance = T)

hype.rf.pred = predict(hype.rf.ret, newdata = test.dur)
hype.rf.mse = mean((hype.rf.pred - test.dur$duration)^2)
hype.rf.mse
## [1] 809.4861
importance(hype.rf.ret)
##               %IncMSE IncNodePurity
## profit      57.073714    1907598.34
## acq_exp     13.379535      82173.63
## ret_exp    120.738322    5085157.53
## acq_exp_sq  15.584653      80644.08
## ret_exp_sq 117.623536    4866164.03
## freq        95.810592     180056.86
## freq_sq     92.775624     182458.78
## crossbuy     4.917525      34345.33
## sow          1.472791      62291.94
## industry    11.814035      12446.84
## revenue     -2.904411      77274.92
## employees    7.971473      92228.11
varImpPlot(hype.rf.ret)

Predicted and Actual

plot(y = test.dur$duration, x = 1:nrow(test.dur), xlab = "Observation", ylab = "Duration")
points(hype.rf.pred, col = "blue")
legend("topleft", legend = c("Actual", "Predicted"), col = c("black", "blue"), pch = 1)
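
Collecting the test MSEs computed so far gives a compact comparison of the duration models (a summary sketch built from the objects above):

data.frame(model = c("tree", "pruned tree", "bagging", "boosting",
                     "default RF", "tuned RF"),
           test.mse = c(tree.mse, prune.mse, bag.mse, boost.mse, rf.mse, hype.rf.mse))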

Random Forest Classification

set.seed(42)

rf.class.ret = randomForest(acquisition ~., data = train.acq, importance = TRUE)

rf.class.pred = predict(rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, rf.class.pred)
##    rf.class.pred
##      0  1
##   0 12 17
##   1  7 64
rf.class.err = mean(rf.class.pred != test.acq$acquisition)
rf.class.err
## [1] 0.24
importance(rf.class.ret)
##                    0         1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp     9.726736 17.440556             22.38722         32.29424
## acq_exp_sq  8.602296 16.288328             20.61477         32.81734
## industry   24.244376 10.856302             24.34224         11.25669
## revenue    10.399403  7.082151             12.23199         33.73914
## employees  42.676453 34.337802             49.22097         67.11708
varImpPlot(rf.class.ret)
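
For a classification forest the plot method shows the OOB error overall and per class as trees accumulate (a quick sketch; the legend labels come straight from the err.rate matrix):

plot(rf.class.ret, main = "OOB error by class")
legend("topright", colnames(rf.class.ret$err.rate), col = 1:3, lty = 1:3)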

Hyperparameter Tuning

set.seed(42)

mtry.value = seq(2,5,1)
ntree.values = seq(3e3,10e3,1e3)

hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)

oob_err = c()

for (i in 1:nrow(hyper_grid)) {
  model = randomForest(acquisition ~., data = train.acq, importance = T, mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
  oob_err[i] = model$err.rate[nrow(model$err.rate), "OOB"]
}

opt_i = which.min(oob_err)
hyper_grid[opt_i,]
##   mtry ntree
## 4    5  3000

Selecting the best hyperparameters

set.seed(42)

hype.rf.class.ret = randomForest(acquisition ~ ., data = train.acq, mtry = 2, ntree = 3000, importance = T)

hype.rf.class.pred = predict(hype.rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, hype.rf.class.pred)
##    hype.rf.class.pred
##      0  1
##   0 12 17
##   1  8 63
hype.rf.class.err = mean(hype.rf.class.pred != test.acq$acquisition)
hype.rf.class.err
## [1] 0.25
importance(hype.rf.class.ret)
##                    0        1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp     24.94702 40.39002             52.76963         32.85903
## acq_exp_sq  24.11866 40.96689             52.45389         33.09297
## industry    56.17918 26.52844             55.37370         11.35234
## revenue     25.87034 17.87255             29.67675         32.94006
## employees  110.13824 86.27468            127.93717         66.85890
varImpPlot(hype.rf.class.ret)

Logistic Regression

class.log = glm(acquisition ~ ., data = train.acq, family = binomial)
summary(class.log)
## 
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = train.acq)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2853  -0.4747   0.1912   0.5299   2.3726  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.413e+01  1.720e+00  -8.216  < 2e-16 ***
## acq_exp      3.033e-02  5.211e-03   5.819 5.91e-09 ***
## acq_exp_sq  -3.070e-05  5.118e-06  -5.999 1.99e-09 ***
## industry1    2.195e+00  3.526e-01   6.226 4.79e-10 ***
## revenue      6.720e-02  1.622e-02   4.144 3.41e-05 ***
## employees    7.645e-03  9.221e-04   8.291  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 508.75  on 399  degrees of freedom
## Residual deviance: 285.16  on 394  degrees of freedom
## AIC: 297.16
## 
## Number of Fisher Scoring iterations: 6
class.log.PredProb = predict.glm(class.log, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 13 16
##   1  4 67
class.log.error = mean(class.log.PredSur != test.acq$acquisition)
class.log.error
## [1] 0.2

Diagnostic plot: Cook's distance

library(car)
## Loading required package: carData
cook.log = cooks.distance(class.log)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/100, lty = 2, col = "blue")

large.cook = which(cooks.distance(class.log)>(4/100))

train.acq.cook = train.acq[-large.cook,]

vif(class.log)
##    acq_exp acq_exp_sq   industry    revenue  employees 
##  35.068611  35.951278   1.287413   1.041055   1.219445
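
The huge VIFs for acq_exp and acq_exp_sq simply reflect that a raw quadratic term is strongly correlated with its linear term. The refit below drops the squared term; an orthogonal polynomial via poly() is a common alternative that keeps the curvature without the collinearity (a hedged sketch, not part of the original analysis):

class.log.poly = glm(acquisition ~ poly(acq_exp, 2) + industry + revenue + employees,
                     data = train.acq.cook, family = binomial)
vif(class.log.poly) # car::vif reports GVIF for the 2-df poly term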

Refit model

log.data = train.acq.cook[,c(1:2,4:6)]

class.log2 = glm(acquisition ~ ., data = log.data, family = binomial)
summary(class.log2)
## 
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = log.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5402  -0.6281   0.2545   0.6651   2.2311  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.9140921  0.9484407  -7.290 3.10e-13 ***
## acq_exp     -0.0004338  0.0007984  -0.543    0.587    
## industry1    1.5982617  0.2949718   5.418 6.02e-08 ***
## revenue      0.0636390  0.0152426   4.175 2.98e-05 ***
## employees    0.0074663  0.0008641   8.640  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 506.31  on 396  degrees of freedom
## Residual deviance: 331.64  on 392  degrees of freedom
## AIC: 341.64
## 
## Number of Fisher Scoring iterations: 5

Refit model with selected predictors

class.log3 = glm(acquisition ~ industry + revenue + employees, data = log.data, family = binomial)
summary(class.log3)
## 
## Call:
## glm(formula = acquisition ~ industry + revenue + employees, family = binomial, 
##     data = log.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4824  -0.6375   0.2510   0.6790   2.2527  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.0971905  0.8892065  -7.981 1.45e-15 ***
## industry1    1.5800737  0.2921598   5.408 6.36e-08 ***
## revenue      0.0629833  0.0151537   4.156 3.23e-05 ***
## employees    0.0074656  0.0008628   8.652  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 506.31  on 396  degrees of freedom
## Residual deviance: 331.93  on 393  degrees of freedom
## AIC: 339.93
## 
## Number of Fisher Scoring iterations: 5
class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 15 14
##   1  7 64
class.log.error2 = mean(class.log.PredSur != test.acq$acquisition)
class.log.error2
## [1] 0.21

Finding the ideal cutoff

library(ROCR)
pred = prediction(predict(class.log3, newdata = log.data, type = "response"), log.data$acquisition)

auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")

perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='blue', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
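
The accuracy object created above is never plotted; accuracy as a function of the cutoff is another way to sanity-check the chosen threshold (a one-line sketch, using a fresh performance() call so the x-axis is the cutoff rather than the error rate):

plot(performance(pred, "acc"), main = "Accuracy vs. cutoff")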

Optimal threshold

class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .6998,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 21  8
##   1 10 61
class.log.error3 = mean(class.log.PredSur != test.acq$acquisition)
class.log.error3
## [1] 0.18

Odds Ratio

OR = exp(class.log3$coefficients)
round(OR, 3)
## (Intercept)   industry1     revenue   employees 
##       0.001       4.855       1.065       1.007
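
Profile-likelihood confidence intervals for the odds ratios follow directly from confint() (a sketch; confint() on a glm profiles the likelihood, so it may take a moment):

round(exp(confint(class.log3)), 3)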

Assumption validation

par(mfrow=c(2,2))
plot(class.log3, which=c(1:4))

Check for the linearity assumption

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
probabilities <- predict(class.log3, type = "response")
predicted.classes <- ifelse(probabilities > 0.68044, 1, 0)

# Select only numeric predictors
mydata = dplyr::select_if(log.data[,c(3:5)], is.numeric) 
predictors <- colnames(mydata)
# Bind the logit and tidy the data for plotting
mydata = mutate(mydata, logit = log(probabilities/(1-probabilities))) 
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)

# making the plot
ggplot(mydata, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'
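
A final check in the same spirit (a hedged sketch): standardized deviance residuals beyond |3| would flag remaining extreme observations, and after the Cook's-distance filtering above ideally none appear.

std.res = rstandard(class.log3)
which(abs(std.res) > 3) # expect integer(0) if no extreme residuals remain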