Access the acquisitionRetention data, clean up, and split
library(SMCRM)
data("acquisitionRetention")
ret = acquisitionRetention[,c(2:15)]
ret = na.omit(ret)
ret$acquisition = as.factor(ret$acquisition)
ret$industry = as.factor(ret$industry)
str(ret)
## 'data.frame': 500 obs. of 14 variables:
## $ acquisition: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
## $ duration : num 1635 1039 1288 0 1631 ...
## $ profit : num 6134 3524 4081 -638 5446 ...
## $ acq_exp : num 694 460 249 638 589 ...
## $ ret_exp : num 972 450 805 0 920 ...
## $ acq_exp_sq : num 480998 211628 62016 407644 346897 ...
## $ ret_exp_sq : num 943929 202077 648089 0 846106 ...
## $ freq : num 6 11 21 0 2 7 15 13 0 0 ...
## $ freq_sq : num 36 121 441 0 4 49 225 169 0 0 ...
## $ crossbuy : num 5 6 6 0 9 4 5 5 0 0 ...
## $ sow : num 95 22 90 0 80 48 51 23 0 0 ...
## $ industry : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 2 1 2 ...
## $ revenue : num 47.2 45.1 29.1 40.6 48.7 ...
## $ employees : num 898 686 1423 181 631 ...
# duration models: acquired customers only (duration > 0), acquisition flag dropped
ret.dur = ret[ret$duration != 0,c(2:14)]
# acquisition model: acquisition flag plus the pre-acquisition predictors
ret.acq = ret[,c(1,4,6,12:14)]
# seed the sampling so the 80/20 splits are reproducible
set.seed(42)
train.dur.samp = sample(nrow(ret.dur), .8*nrow(ret.dur))
train.dur = ret.dur[train.dur.samp,]
test.dur = ret.dur[-train.dur.samp,]
set.seed(42)
train.acq.samp = sample(nrow(ret.acq), .8*nrow(ret.acq))
train.acq = ret.acq[train.acq.samp,]
test.acq = ret.acq[-train.acq.samp,]
Decision Tree
library(tree)
set.seed(42)
tree.ret = tree(duration ~., data = train.dur)
plot(tree.ret)
text(tree.ret,pretty=0)

tree.pred = predict(tree.ret, test.dur)
tree.mse = mean((tree.pred - test.dur$duration)^2)
tree.mse
## [1] 3754.787
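Since MSE is on the squared scale, the square root puts the error back in the original units of duration (a quick derived check; nothing new is fitted):
tree.rmse = sqrt(tree.mse)
tree.rmse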
Cross-validation and tree pruning
set.seed(42)
cv.ret = cv.tree(tree.ret)
plot(cv.ret$size[c(1:7)], cv.ret$dev[c(1:7)], type = "b")

cv.ret$dev
## [1] 1333105 1700741 1681019 2075847 3299651 5262614 12830740
prune.ret = prune.tree(tree.ret, best = 6)
plot(prune.ret)
text(prune.ret, pretty = 0)

prune.pred = predict(prune.ret, test.dur)
prune.mse = mean((prune.pred - test.dur$duration)^2)
prune.mse
## [1] 4383.626
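Put side by side, the pruned tree does worse on the test set than the full tree, so the unpruned fit stands as the baseline:
c(unpruned = tree.mse, pruned = prune.mse)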
Bagging
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(42)
# mtry = 12 uses all predictors at every split, which makes this bagging
bag.ret = randomForest(duration ~ ., data = train.dur, mtry = 12, ntree = 5000, importance = TRUE)
bag.pred = predict(bag.ret, newdata = test.dur)
bag.mse = mean((bag.pred - test.dur$duration)^2)
bag.mse
## [1] 702.3303
importance(bag.ret)
## %IncMSE IncNodePurity
## profit 31.500249 278265.008
## acq_exp 4.641504 16989.319
## ret_exp 94.256012 5963809.346
## acq_exp_sq 5.385089 16647.213
## ret_exp_sq 94.881197 5961043.738
## freq 106.373864 168657.091
## freq_sq 108.394395 167675.052
## crossbuy 1.609548 14115.846
## sow 2.625875 26887.186
## industry 15.181202 9621.868
## revenue -9.517640 27534.350
## employees 3.369105 34777.078
varImpPlot(bag.ret)

Boosting
library(gbm)
## Loaded gbm 2.1.8
set.seed(42)
boost.ret = gbm(duration ~., data = train.dur, distribution = "gaussian", n.trees = 5000, shrinkage = .01, verbose = F)
boost.pred = predict(boost.ret, newdata = test.dur)
## Using 5000 trees...
boost.mse = mean((boost.pred - test.dur$duration)^2)
boost.mse
## [1] 871.9039
summary(boost.ret)

## var rel.inf
## ret_exp ret_exp 86.3401189
## profit profit 8.1026812
## freq freq 3.8329582
## employees employees 0.5134737
## revenue revenue 0.3333241
## acq_exp acq_exp 0.3310670
## sow sow 0.3014918
## industry industry 0.1407230
## crossbuy crossbuy 0.1041621
## acq_exp_sq acq_exp_sq 0.0000000
## ret_exp_sq ret_exp_sq 0.0000000
## freq_sq freq_sq 0.0000000
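The 5,000 boosting iterations above were fixed by hand. As a sketch, gbm's built-in cross-validation can check whether fewer trees would do (the choice of 5 folds is an assumption; cv.folds and gbm.perf are standard gbm features):
set.seed(42)
boost.cv = gbm(duration ~., data = train.dur, distribution = "gaussian", n.trees = 5000, shrinkage = .01, cv.folds = 5, verbose = F)
# gbm.perf() returns the iteration count that minimizes the cross-validation error
gbm.perf(boost.cv, method = "cv")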
Random Forest with Default Settings
set.seed(42)
rf.ret = randomForest(duration ~., data = train.dur, importance = TRUE)
rf.pred = predict(rf.ret, newdata = test.dur)
rf.mse = mean((rf.pred - test.dur$duration)^2)
rf.mse
## [1] 862.1942
importance(rf.ret)
## %IncMSE IncNodePurity
## profit 16.386152 2416024.63
## acq_exp 5.221105 132161.24
## ret_exp 26.230854 4362545.28
## acq_exp_sq 5.530906 135101.07
## ret_exp_sq 28.504272 4743173.26
## freq 16.998284 199361.12
## freq_sq 17.493410 184323.47
## crossbuy 2.624369 52687.68
## sow 1.448555 97948.64
## industry 1.239394 15447.49
## revenue 1.384401 115147.43
## employees 2.741892 142703.00
varImpPlot(rf.ret)

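Before tuning, it helps to line up the test MSEs of the models fitted so far (computed from the objects above; bagging is well ahead at this point):
round(c(tree = tree.mse, pruned = prune.mse, bagging = bag.mse, boosting = boost.mse, rf = rf.mse), 1)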
Predicted and Actual
plot(y = test.dur$duration, x = 1:nrow(test.dur), xlab = "Observation", ylab = "Duration") # black points = actual
points(rf.pred, col = "blue") # blue points = predicted

Hyperparameter Tuning
set.seed(42)
mtry.value = seq(2,12,1)
ntree.values = seq(3e3,10e3,1e3)
# create a data frame with all combinations
hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)
# create an empty vector to store OOB error values
oob_err = c()
# loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {
model = randomForest(duration ~., data = train.dur, importance = T, mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
oob_err[i] = model$mse[length(model$mse)]
}
opt_i = which.min(oob_err)
hyper_grid[opt_i,]
## mtry ntree
## 26 5 5000
which.min(oob_err)
## [1] 26
rf.ret$mse[length(rf.ret$mse)]
## [1] 1520.612
oob_err
## [1] 2485.677 1715.504 1526.015 1461.765 1467.993 1519.595 1542.214 1577.500
## [9] 1579.147 1611.733 1622.010 2422.281 1670.717 1519.661 1464.843 1479.989
## [17] 1517.039 1537.621 1576.896 1571.150 1600.795 1622.799 2458.436 1689.432
## [25] 1498.109 1458.938 1479.696 1511.394 1534.262 1568.098 1588.315 1609.375
## [33] 1635.325 2427.629 1707.550 1496.731 1475.207 1471.640 1517.657 1541.333
## [41] 1559.871 1589.980 1605.197 1618.321 2502.558 1696.165 1511.563 1461.452
## [49] 1494.961 1502.952 1534.366 1566.257 1584.290 1599.283 1626.851 2467.471
## [57] 1707.204 1516.271 1466.615 1484.000 1505.864 1543.147 1566.748 1585.253
## [65] 1601.753 1617.048 2474.625 1695.499 1499.349 1468.541 1475.916 1505.897
## [73] 1544.727 1569.609 1598.773 1603.766 1616.838 2440.367 1695.764 1494.981
## [81] 1476.864 1476.816 1519.636 1541.672 1567.086 1581.791 1599.879 1627.422
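Reading the optimum off a flat vector is awkward; since expand.grid varies mtry fastest, the errors reshape cleanly into a matrix for a visual scan (a base-graphics sketch):
# rows = mtry values, columns = ntree values
oob_mat = matrix(oob_err, nrow = length(mtry.value))
matplot(mtry.value, oob_mat, type = "b", pch = 20, xlab = "mtry", ylab = "OOB MSE")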
Selecting the best hyperparameters
set.seed(42)
hype.rf.ret = randomForest(duration ~ ., data = train.dur, mtry = 5, ntree = 9000, importance = T)
hype.rf.pred = predict(hype.rf.ret, newdata = test.dur)
hype.rf.mse = mean((hype.rf.pred - test.dur$duration)^2)
hype.rf.mse
## [1] 809.4861
importance(hype.rf.ret)
## %IncMSE IncNodePurity
## profit 57.073714 1907598.34
## acq_exp 13.379535 82173.63
## ret_exp 120.738322 5085157.53
## acq_exp_sq 15.584653 80644.08
## ret_exp_sq 117.623536 4866164.03
## freq 95.810592 180056.86
## freq_sq 92.775624 182458.78
## crossbuy 4.917525 34345.33
## sow 1.472791 62291.94
## industry 11.814035 12446.84
## revenue -2.904411 77274.92
## employees 7.971473 92228.11
varImpPlot(hype.rf.ret)

Predicted and Actual
plot(y = test.dur$duration, x = 1:nrow(test.dur), xlab = "Observation", ylab = "Duration") # black points = actual
points(hype.rf.pred, col = "blue") # blue points = predicted

Random Forest Classification
set.seed(42)
rf.class.ret = randomForest(acquisition ~., data = train.acq, importance = TRUE)
rf.class.pred = predict(rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, rf.class.pred)
## rf.class.pred
## 0 1
## 0 12 17
## 1 7 64
rf.class.err = 1 - (12+64)/100
rf.class.err
## [1] 0.24
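The same rate can be computed directly from the predictions, which avoids transcribing counts from the table by hand:
mean(rf.class.pred != test.acq$acquisition)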
importance(rf.class.ret)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp 9.726736 17.440556 22.38722 32.29424
## acq_exp_sq 8.602296 16.288328 20.61477 32.81734
## industry 24.244376 10.856302 24.34224 11.25669
## revenue 10.399403 7.082151 12.23199 33.73914
## employees 42.676453 34.337802 49.22097 67.11708
varImpPlot(rf.class.ret)

Hyperparameter Tuning
set.seed(42)
mtry.value = seq(2,5,1)
ntree.values = seq(3e3,10e3,1e3)
hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)
oob_err = c()
for (i in 1:nrow(hyper_grid)) {
model = randomForest(acquisition ~., data = train.acq, importance = T, mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
# err.rate is a matrix; take the OOB column at the final tree, not the last matrix element
oob_err[i] = model$err.rate[nrow(model$err.rate), "OOB"]
}
opt_i = which.min(oob_err)
hyper_grid[opt_i,]
## mtry ntree
## 4 5 3000
Selecting the best hyperparameters
set.seed(42)
hype.rf.class.ret = randomForest(acquisition ~ ., data = train.acq, mtry = 2, ntree = 3000, importance = T)
hype.rf.class.pred = predict(hype.rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, hype.rf.class.pred)
## hype.rf.class.pred
## 0 1
## 0 12 17
## 1 8 63
hype.rf.class.err = 1 - (12+63)/100
hype.rf.class.err
## [1] 0.25
importance(hype.rf.class.ret)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp 24.94702 40.39002 52.76963 32.85903
## acq_exp_sq 24.11866 40.96689 52.45389 33.09297
## industry 56.17918 26.52844 55.37370 11.35234
## revenue 25.87034 17.87255 29.67675 32.94006
## employees 110.13824 86.27468 127.93717 66.85890
varImpPlot(hype.rf.class.ret)

Logistic Regression
class.log = glm(acquisition ~ ., data = train.acq, family = binomial)
summary(class.log)
##
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = train.acq)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2853 -0.4747 0.1912 0.5299 2.3726
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.413e+01 1.720e+00 -8.216 < 2e-16 ***
## acq_exp 3.033e-02 5.211e-03 5.819 5.91e-09 ***
## acq_exp_sq -3.070e-05 5.118e-06 -5.999 1.99e-09 ***
## industry1 2.195e+00 3.526e-01 6.226 4.79e-10 ***
## revenue 6.720e-02 1.622e-02 4.144 3.41e-05 ***
## employees 7.645e-03 9.221e-04 8.291 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 508.75 on 399 degrees of freedom
## Residual deviance: 285.16 on 394 degrees of freedom
## AIC: 297.16
##
## Number of Fisher Scoring iterations: 6
class.log.PredProb = predict.glm(class.log, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
## class.log.PredSur
## 0 1
## 0 13 16
## 1 4 67
class.log.error = 1 - (13+67)/100
class.log.error
## [1] 0.2
Diagnostic plot: Cook's distance
library(car)
## Loading required package: carData
cook.log = cooks.distance(class.log)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/100, lty = 2, col = "blue")

large.cook = which(cooks.distance(class.log)>(4/100))
train.acq.cook = train.acq[-large.cook,]
vif(class.log)
## acq_exp acq_exp_sq industry revenue employees
## 35.068611 35.951278 1.287413 1.041055 1.219445
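The very large VIFs for acq_exp and acq_exp_sq are structural: a positive variable and its square are nearly collinear. A standard remedy, sketched here for illustration only (these centered columns are not used below), is to center before squaring:
acq_c = train.acq$acq_exp - mean(train.acq$acq_exp)
cor(train.acq$acq_exp, train.acq$acq_exp_sq) # near 1 for the raw terms
cor(acq_c, acq_c^2) # much smaller after centering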
Refit model without influential points and acq_exp_sq
log.data = train.acq.cook[,c(1:2,4:6)]
class.log2 = glm(acquisition ~ ., data = log.data, family = binomial)
summary(class.log2)
##
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = log.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5402 -0.6281 0.2545 0.6651 2.2311
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.9140921 0.9484407 -7.290 3.10e-13 ***
## acq_exp -0.0004338 0.0007984 -0.543 0.587
## industry1 1.5982617 0.2949718 5.418 6.02e-08 ***
## revenue 0.0636390 0.0152426 4.175 2.98e-05 ***
## employees 0.0074663 0.0008641 8.640 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 506.31 on 396 degrees of freedom
## Residual deviance: 331.64 on 392 degrees of freedom
## AIC: 341.64
##
## Number of Fisher Scoring iterations: 5
Refit model with selected predictors (dropping the non-significant acq_exp)
class.log3 = glm(acquisition ~ industry + revenue + employees, data = log.data, family = binomial)
summary(class.log3)
##
## Call:
## glm(formula = acquisition ~ industry + revenue + employees, family = binomial,
## data = log.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4824 -0.6375 0.2510 0.6790 2.2527
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.0971905 0.8892065 -7.981 1.45e-15 ***
## industry1 1.5800737 0.2921598 5.408 6.36e-08 ***
## revenue 0.0629833 0.0151537 4.156 3.23e-05 ***
## employees 0.0074656 0.0008628 8.652 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 506.31 on 396 degrees of freedom
## Residual deviance: 331.93 on 393 degrees of freedom
## AIC: 339.93
##
## Number of Fisher Scoring iterations: 5
class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
## class.log.PredSur
## 0 1
## 0 15 14
## 1 7 64
class.log.error2 = 1 - (15+64)/100
class.log.error2
## [1] 0.21
Find the ideal cutoff
library(ROCR)
pred = prediction(predict(class.log3, newdata = log.data, type = "response"), log.data$acquisition)
auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")
perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))

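The accuracy object above is computed but never inspected; a small sketch in standard ROCR usage reads off the accuracy-maximizing cutoff:
acc.perf = performance(pred, "acc")
# x.values hold the cutoffs, y.values the corresponding accuracies
unlist(acc.perf@x.values)[which.max(unlist(acc.perf@y.values))]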
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='blue', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

Optimal threshold
class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .6998,1,0)
table(test.acq$acquisition, class.log.PredSur)
## class.log.PredSur
## 0 1
## 0 21 8
## 1 10 61
class.log.error3 = 1 - (21+61)/100
class.log.error3
## [1] 0.18
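Since the cutoff was chosen to balance sensitivity and specificity, both are worth reading off the new confusion matrix (computed from the objects above; assumes both classes appear among the predictions):
cm = table(test.acq$acquisition, class.log.PredSur)
c(sensitivity = cm["1","1"]/sum(cm["1",]), specificity = cm["0","0"]/sum(cm["0",]))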
Odds Ratio
OR= exp(class.log3$coefficients)
round(OR, 3)
## (Intercept) industry1 revenue employees
## 0.001 4.855 1.065 1.007
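Odds ratios are more informative with uncertainty attached; Wald 95% intervals on the same scale come from confint.default(), which is base stats:
round(exp(confint.default(class.log3)), 3)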
Assumption validation
par(mfrow=c(2,2))
plot(class.log3, which=c(1:4))

Check for the linearity assumption
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
probabilities <- predict(class.log3, type = "response")
predicted.classes <- ifelse(probabilities > 0.68044, 1, 0)
# Select only numeric predictors
mydata = dplyr::select_if(log.data[,c(3:5)], is.numeric)
predictors <- colnames(mydata)
# Bind the logit and tidying the data for plot
mydata = mutate(mydata, logit = log(probabilities/(1-probabilities)))
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)
#making the plot
ggplot(mydata, aes(logit, predictor.value))+
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "loess") +
theme_bw() +
facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'
