Loading, Cleaning, and Splitting the Data

library(SMCRM)
## Warning: package 'SMCRM' was built under R version 4.0.3
data("acquisitionRetention")
ret = acquisitionRetention[,c(2:15)]   # drop the customer id column
ret = na.omit(ret)
ret$acquisition = as.factor(ret$acquisition)
ret$industry = as.factor(ret$industry)
str(ret)
## 'data.frame':    500 obs. of  14 variables:
##  $ acquisition: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
##  $ duration   : num  1635 1039 1288 0 1631 ...
##  $ profit     : num  6134 3524 4081 -638 5446 ...
##  $ acq_exp    : num  694 460 249 638 589 ...
##  $ ret_exp    : num  972 450 805 0 920 ...
##  $ acq_exp_sq : num  480998 211628 62016 407644 346897 ...
##  $ ret_exp_sq : num  943929 202077 648089 0 846106 ...
##  $ freq       : num  6 11 21 0 2 7 15 13 0 0 ...
##  $ freq_sq    : num  36 121 441 0 4 49 225 169 0 0 ...
##  $ crossbuy   : num  5 6 6 0 9 4 5 5 0 0 ...
##  $ sow        : num  95 22 90 0 80 48 51 23 0 0 ...
##  $ industry   : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 2 1 2 ...
##  $ revenue    : num  47.2 45.1 29.1 40.6 48.7 ...
##  $ employees  : num  898 686 1423 181 631 ...
ret.dur = ret[ret$duration != 0,c(2:14)]   # acquired customers only; drop the acquisition indicator
ret.acq = ret[,c(1,4,6,12:14)]             # acquisition outcome plus pre-acquisition predictors

train.dur.samp = sample(nrow(ret.dur), .8*(nrow(ret.dur)))
train.dur = ret.dur[train.dur.samp,]
test.dur = ret.dur[-train.dur.samp,]

train.acq.samp = sample(nrow(ret.acq), .8*(nrow(ret.acq)))
train.acq = ret.acq[train.acq.samp,]
test.acq = ret.acq[-train.acq.samp,]
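
As a quick sanity check (not part of the original output), the split sizes and the class balance of the acquisition samples can be verified:

# verify the 80/20 split sizes and the class balance (illustrative; output not shown)
dim(train.dur); dim(test.dur)
prop.table(table(train.acq$acquisition))
prop.table(table(test.acq$acquisition))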

Making the Tree

library(tree)
## Warning: package 'tree' was built under R version 4.0.4
set.seed(42)

tree.ret = tree(duration ~., data = train.dur)

plot(tree.ret)
text(tree.ret,pretty=0)

tree.pred = predict(tree.ret, test.dur)

tree.mse = mean((tree.pred - test.dur$duration)^2)
tree.mse
## [1] 3511.037
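
The test MSE is in squared duration units; taking the square root gives an RMSE on the original scale, which is easier to interpret (not shown in the original output):

# test RMSE on the original duration scale
sqrt(tree.mse)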

Cross-Validation and Pruning

set.seed(42)

cv.ret = cv.tree(tree.ret)
plot(cv.ret$size[c(1:7)], cv.ret$dev[c(1:7)], type = "b")

cv.ret$dev
## [1]  1279811  1471858  1471858  1661240  2224364  3717248  5025307 12662540
prune.ret = prune.tree(tree.ret, best = 6)
plot(prune.ret)
text(prune.ret, pretty = 0)

prune.pred = predict(prune.ret, test.dur)

prune.mse = mean((prune.pred - test.dur$duration)^2)
prune.mse
## [1] 5817.635

Bagging

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(42)

bag.ret = randomForest(duration ~ ., data = train.dur, mtry = 12, ntree = 5000, importance = TRUE)

bag.pred = predict(bag.ret, newdata = test.dur)
bag.mse = mean((bag.pred - test.dur$duration)^2)
bag.mse
## [1] 1103.648
importance(bag.ret)
##               %IncMSE IncNodePurity
## profit      35.481899    286213.183
## acq_exp      3.889448     11022.473
## ret_exp     96.273333   5942721.770
## acq_exp_sq   2.518997     11204.971
## ret_exp_sq  95.590983   5886583.279
## freq       106.215934    157630.581
## freq_sq    108.633000    157577.560
## crossbuy     6.364134     14100.902
## sow          2.969734     22442.890
## industry    12.421181      5846.837
## revenue     -4.796642     21712.905
## employees    5.110906     25523.980
varImpPlot(bag.ret)
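
To gauge whether 5,000 trees is more than enough, the out-of-bag error of the bagged model can be plotted against the number of trees; a brief sketch (plot not reproduced here):

# OOB error as a function of the number of trees in the bagged model
plot(bag.ret, main = "Bagging: error vs. number of trees")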

Boosting

library(gbm)
## Warning: package 'gbm' was built under R version 4.0.5
## Loaded gbm 2.1.8
set.seed(42)

boost.ret = gbm(duration ~., data = train.dur, distribution = "gaussian", n.trees = 5000, shrinkage = .01, verbose = F)

boost.pred = predict(boost.ret, newdata = test.dur)
## Using 5000 trees...
boost.mse = mean((boost.pred - test.dur$duration)^2)
boost.mse
## [1] 624.2513
summary(boost.ret)

##                   var    rel.inf
## ret_exp       ret_exp 86.2180692
## profit         profit  8.3452925
## freq             freq  3.4942448
## employees   employees  0.5348091
## acq_exp       acq_exp  0.3923935
## revenue       revenue  0.3722611
## sow               sow  0.3194178
## industry     industry  0.1681210
## crossbuy     crossbuy  0.1553911
## acq_exp_sq acq_exp_sq  0.0000000
## ret_exp_sq ret_exp_sq  0.0000000
## freq_sq       freq_sq  0.0000000
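
The number of boosting iterations is fixed at 5,000 above; gbm can also choose it by internal cross-validation through the cv.folds argument and gbm.perf(). A hedged sketch, with illustrative settings rather than the ones used in this report:

# refit with 5-fold CV and let gbm.perf() suggest the number of trees
boost.cv = gbm(duration ~ ., data = train.dur, distribution = "gaussian",
               n.trees = 5000, shrinkage = .01, cv.folds = 5, verbose = F)
best.iter = gbm.perf(boost.cv, method = "cv")   # CV-optimal iteration count
boost.cv.pred = predict(boost.cv, newdata = test.dur, n.trees = best.iter)
mean((boost.cv.pred - test.dur$duration)^2)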

Random Forest with Default Settings

set.seed(42)

rf.ret = randomForest(duration ~., data = train.dur, importance = TRUE)

rf.pred = predict(rf.ret, newdata = test.dur)
rf.mse = mean((rf.pred - test.dur$duration)^2)
rf.mse
## [1] 1114.455
importance(rf.ret)
##               %IncMSE IncNodePurity
## profit     15.9465834    2372825.02
## acq_exp     2.9564141     139619.14
## ret_exp    27.8300568    4648114.59
## acq_exp_sq  2.7901489     138458.79
## ret_exp_sq 26.1578371    4379885.16
## freq       16.9549218     173264.10
## freq_sq    18.3525275     180103.48
## crossbuy    1.3137901      60174.71
## sow        -1.1106748      69863.61
## industry    0.1399243      17126.99
## revenue     1.0837839     127661.05
## employees   0.3413243     125514.27
varImpPlot(rf.ret)

Predicted vs Actual

plot(y = test.dur$duration, x = c(1:68), xlab = "Observation", ylab = "Duration")
points(rf.pred, col = "red")
legend("topleft", legend = c("actual", "predicted"), col = c("black", "red"), pch = 1)

Tuning Hyperparameters

set.seed(42)

mtry.value = seq(2,12,1)
ntree.values = seq(3e3,10e3,1e3)

# create a data frame with all combinations
hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)

# create an empty vector to store OOB error values
oob_err = c()

# loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {
  model = randomForest(duration ~., data = train.dur, importance = T, mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
  oob_err[i] = model$mse[length(model$mse)]
}

opt_i = which.min(oob_err)
hyper_grid[opt_i,]
##   mtry ntree
## 5    6  3000
which.min(oob_err)
## [1] 5
rf.ret$mse[length(rf.ret$mse)]
## [1] 1504.455
oob_err
##  [1] 2440.944 1679.957 1449.954 1376.562 1337.684 1353.420 1372.981 1391.576
##  [9] 1396.448 1416.638 1432.442 2505.179 1661.299 1445.048 1361.320 1360.374
## [17] 1371.831 1377.425 1383.343 1395.031 1406.813 1422.945 2445.977 1670.957
## [25] 1454.155 1370.990 1351.241 1359.659 1376.264 1384.049 1386.539 1401.286
## [33] 1419.747 2477.064 1677.889 1440.427 1358.358 1362.599 1366.592 1362.924
## [41] 1389.376 1392.028 1406.480 1425.267 2481.388 1662.341 1439.103 1375.266
## [49] 1356.510 1352.257 1370.049 1390.298 1391.718 1408.607 1421.691 2483.051
## [57] 1679.543 1454.202 1381.416 1351.353 1356.221 1368.804 1381.206 1397.682
## [65] 1406.357 1422.621 2465.051 1657.144 1431.988 1372.106 1354.225 1363.341
## [73] 1373.824 1388.480 1399.498 1407.759 1427.377 2443.405 1671.853 1440.828
## [81] 1367.964 1353.174 1363.885 1366.092 1380.814 1395.972 1405.757 1416.772
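
The flat oob_err vector is easier to read when arranged as an mtry-by-ntree grid; because expand.grid() varies mtry fastest, the vector fills such a matrix column by column. A small reshaping sketch (output not shown):

# arrange the OOB errors as an mtry x ntree grid for easier inspection
oob_grid = matrix(oob_err, nrow = length(mtry.value),
                  dimnames = list(mtry = mtry.value, ntree = ntree.values))
round(oob_grid, 1)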

Using the Best Hyperparameters

set.seed(42)

hype.rf.ret = randomForest(duration ~ ., data = train.dur, mtry = 6, ntree = 9000, importance = T)

hype.rf.pred = predict(hype.rf.ret, newdata = test.dur)
hype.rf.mse = mean((hype.rf.pred - test.dur$duration)^2)
hype.rf.mse
## [1] 1052.546
importance(hype.rf.ret)
##               %IncMSE IncNodePurity
## profit      53.934825   1605306.453
## acq_exp      8.141277     52909.817
## ret_exp    122.294222   5191409.722
## acq_exp_sq   9.946018     53345.804
## ret_exp_sq 121.662335   5139220.527
## freq       113.150314    157722.789
## freq_sq    113.347037    160052.848
## crossbuy     5.097345     26339.124
## sow          2.647907     37686.712
## industry     8.285336      8956.825
## revenue     -1.420130     60268.219
## employees    6.962723     55040.843
varImpPlot(hype.rf.ret)

Predicted vs Actual

plot(y = test.dur$duration, x = c(1:68), xlab = "Observation", ylab = "Duration")
points(hype.rf.pred, col = "red")
legend("topleft", legend = c("actual", "predicted"), col = c("black", "red"), pch = 1)
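
Before moving on to classification, the test MSEs of the duration models can be collected into one small table for a side-by-side comparison (output not shown in the original):

# side-by-side comparison of the test MSEs of the duration models
data.frame(model = c("tree", "pruned tree", "bagging", "boosting",
                     "default RF", "tuned RF"),
           test.mse = c(tree.mse, prune.mse, bag.mse, boost.mse,
                        rf.mse, hype.rf.mse))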

Random Forest for Classification

set.seed(42)

rf.class.ret = randomForest(acquisition ~., data = train.acq, importance = TRUE)

rf.class.pred = predict(rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, rf.class.pred)
##    rf.class.pred
##      0  1
##   0 24 12
##   1 12 52
rf.class.err = 1 - (24+52)/100
rf.class.err
## [1] 0.24
importance(rf.class.ret)
##                    0         1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp     8.229173 17.063262             20.18638        31.508390
## acq_exp_sq  8.340276 17.442358             21.23271        31.283270
## industry   15.055798 14.096685             19.40672         9.902336
## revenue    11.264978  9.942957             14.90323        34.391032
## employees  41.635058 33.931811             50.88006        65.439793
varImpPlot(rf.class.ret)

Tuning Hyperparameters

set.seed(42)

mtry.value = seq(2,5,1)
ntree.values = seq(3e3,10e3,1e3)

hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)

oob_err = c()

for (i in 1:nrow(hyper_grid)) {
  model = randomForest(acquisition ~., data = train.acq, importance = T, mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
  oob_err[i] = model$err.rate[length(model$err.rate)]
}

opt_i = which.min(oob_err)
hyper_grid[opt_i,]
##    mtry ntree
## 30    3 10000

Using the Best Hyperparameters

set.seed(42)

hype.rf.class.ret = randomForest(acquisition ~ ., data = train.acq, mtry = 2, ntree = 3000, importance = T)

hype.rf.class.pred = predict(hype.rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, hype.rf.class.pred)
##    hype.rf.class.pred
##      0  1
##   0 23 13
##   1 12 52
hype.rf.class.err = 1 - (23+52)/100
hype.rf.class.err
## [1] 0.25
importance(hype.rf.class.ret)
##                    0        1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp     18.38338 41.67203             48.42113        31.846718
## acq_exp_sq  17.12800 42.17626             49.93758        31.376392
## industry    35.99992 35.52591             47.93580         9.792362
## revenue     26.54830 21.88429             33.12885        33.944486
## employees  101.57816 78.67935            117.87492        65.213049
varImpPlot(hype.rf.class.ret)

Logistic Model

class.log = glm(acquisition ~ ., data = train.acq, family = binomial)
summary(class.log)
## 
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = train.acq)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4741  -0.3804   0.2051   0.5090   2.5403  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.545e+01  1.797e+00  -8.596  < 2e-16 ***
## acq_exp      3.326e-02  5.175e-03   6.427 1.30e-10 ***
## acq_exp_sq  -3.329e-05  5.168e-06  -6.443 1.17e-10 ***
## industry1    2.101e+00  3.498e-01   6.008 1.88e-09 ***
## revenue      7.563e-02  1.685e-02   4.489 7.16e-06 ***
## employees    7.834e-03  9.526e-04   8.224  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 498.43  on 399  degrees of freedom
## Residual deviance: 272.06  on 394  degrees of freedom
## AIC: 284.06
## 
## Number of Fisher Scoring iterations: 6
class.log.PredProb = predict.glm(class.log, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 25 11
##   1 14 50
class.log.error = 1 - (25+50)/100
class.log.error
## [1] 0.25

Checking Cook’s Distance and Multicollinearity

library(car)
## Loading required package: carData
cook.log = cooks.distance(class.log)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/100, lty = 2, col = "steelblue")

large.cook = which(cooks.distance(class.log)>(4/100))

train.acq.cook = train.acq[-large.cook,]

vif(class.log)
##    acq_exp acq_exp_sq   industry    revenue  employees 
##  31.864631  32.170157   1.254930   1.074226   1.243289

Refitting the Model without acq_exp_sq

log.data = train.acq.cook[,c(1:2,4:6)]

class.log2 = glm(acquisition ~ ., data = log.data, family = binomial)
summary(class.log2)
## 
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = log.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3702  -0.5317   0.2700   0.6082   2.1541  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.9681158  1.0290352  -7.743 9.69e-15 ***
## acq_exp      0.0001726  0.0008618   0.200    0.841    
## industry1    1.5498798  0.3006423   5.155 2.53e-07 ***
## revenue      0.0742721  0.0156395   4.749 2.04e-06 ***
## employees    0.0079550  0.0009115   8.727  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 494.61  on 396  degrees of freedom
## Residual deviance: 312.28  on 392  degrees of freedom
## AIC: 322.28
## 
## Number of Fisher Scoring iterations: 5
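
To confirm that dropping acq_exp_sq resolved the collinearity flagged above, the VIFs of the refit model can be re-checked (not shown in the original output):

# re-check multicollinearity after dropping the squared term
vif(class.log2)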

Refitting the Model without acq_exp

class.log3 = glm(acquisition ~ industry + revenue + employees, data = log.data, family = binomial)
summary(class.log3)
## 
## Call:
## glm(formula = acquisition ~ industry + revenue + employees, family = binomial, 
##     data = log.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3906  -0.5338   0.2712   0.6127   2.1597  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.8917387  0.9539643  -8.273  < 2e-16 ***
## industry1    1.5563312  0.2991210   5.203 1.96e-07 ***
## revenue      0.0745287  0.0155960   4.779 1.76e-06 ***
## employees    0.0079479  0.0009107   8.727  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 494.61  on 396  degrees of freedom
## Residual deviance: 312.32  on 393  degrees of freedom
## AIC: 320.32
## 
## Number of Fisher Scoring iterations: 5
class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 25 11
##   1 13 51
class.log.error2 = 1 - (25+51)/100
class.log.error2
## [1] 0.24

Finding the Ideal Cutoff Point

library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.3
pred = prediction(predict(class.log3, newdata = log.data, type = "response"), log.data$acquisition)

auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")

perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

Using the Optimal Threshold

class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .6998,1,0)
table(test.acq$acquisition, class.log.PredSur)
##    class.log.PredSur
##      0  1
##   0 33  3
##   1 19 45
class.log.error3 = 1 - (33+45)/100
class.log.error3
## [1] 0.22
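
At this cutoff the errors are asymmetric (only 3 false positives but 19 false negatives), so sensitivity and specificity are worth reporting alongside the overall error rate; a small sketch computed from the confusion table above (output not shown):

# sensitivity and specificity at the .6998 cutoff
conf = table(test.acq$acquisition, class.log.PredSur)
sens = conf["1", "1"] / sum(conf["1", ])   # true positive rate
spec = conf["0", "0"] / sum(conf["0", ])   # true negative rate
c(sensitivity = sens, specificity = spec)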

Odds Ratio

OR = exp(class.log3$coefficients)
round(OR, 3)
## (Intercept)   industry1     revenue   employees 
##       0.000       4.741       1.077       1.008
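
Wald confidence intervals give a sense of the precision of these odds ratios; a hedged sketch using confint.default() (not part of the original output):

# 95% Wald confidence intervals for the odds ratios
round(exp(confint.default(class.log3)), 3)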

Checking Basic Assumptions

par(mfrow=c(2,2))
plot(class.log3, which=c(1:4))

Checking Linearity of Data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
probabilities <- predict(class.log3, type = "response")
predicted.classes <- ifelse(probabilities > 0.68044, 1, 0)

# Select only numeric predictors
mydata = dplyr::select_if(log.data[,c(3:5)], is.numeric) 
predictors <- colnames(mydata)
# Bind the logit and tidy the data for plotting
mydata = mutate(mydata, logit = log(probabilities/(1-probabilities))) 
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)

# make the plot
ggplot(mydata, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'