Loading, Cleaning and Splitting Data
library(SMCRM)
## Warning: package 'SMCRM' was built under R version 4.0.3
data("acquisitionRetention")
ret = acquisitionRetention[,c(2:15)]
ret = na.omit(ret)
ret$acquisition = as.factor(ret$acquisition)
ret$industry = as.factor(ret$industry)
str(ret)
## 'data.frame': 500 obs. of 14 variables:
## $ acquisition: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
## $ duration : num 1635 1039 1288 0 1631 ...
## $ profit : num 6134 3524 4081 -638 5446 ...
## $ acq_exp : num 694 460 249 638 589 ...
## $ ret_exp : num 972 450 805 0 920 ...
## $ acq_exp_sq : num 480998 211628 62016 407644 346897 ...
## $ ret_exp_sq : num 943929 202077 648089 0 846106 ...
## $ freq : num 6 11 21 0 2 7 15 13 0 0 ...
## $ freq_sq : num 36 121 441 0 4 49 225 169 0 0 ...
## $ crossbuy : num 5 6 6 0 9 4 5 5 0 0 ...
## $ sow : num 95 22 90 0 80 48 51 23 0 0 ...
## $ industry : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 2 1 2 ...
## $ revenue : num 47.2 45.1 29.1 40.6 48.7 ...
## $ employees : num 898 686 1423 181 631 ...
ret.dur = ret[ret$duration != 0,c(2:14)]
ret.acq = ret[,c(1,4,6,12:14)]
train.dur.samp = sample(nrow(ret.dur), .8*(nrow(ret.dur)))
train.dur = ret.dur[train.dur.samp,]
test.dur = ret.dur[-train.dur.samp,]
train.acq.samp = sample(nrow(ret.acq), .8*(nrow(ret.acq)))
train.acq = ret.acq[train.acq.samp,]
test.acq = ret.acq[-train.acq.samp,]
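Note that sample() is called here without a seed, so the exact 80/20 split (and every result below) will vary between runs. A minimal sketch of a reproducible split for the duration data; the seed value is arbitrary and not part of the original analysis:
set.seed(123)  # arbitrary seed, used only so the split can be reproduced
train.dur.samp = sample(nrow(ret.dur), .8*nrow(ret.dur))
train.dur = ret.dur[train.dur.samp,]
test.dur = ret.dur[-train.dur.samp,]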
Random Forest with Default Parameters
library(randomForest)
set.seed(42)
rf.ret = randomForest(duration ~ ., data = train.dur, importance = TRUE)
rf.pred = predict(rf.ret, newdata = test.dur)
rf.mse = mean((rf.pred - test.dur$duration)^2)
rf.mse
## [1] 1114.455
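Since the MSE is on the squared scale of duration, it can be easier to read as an RMSE in the original units; a one-line sketch using rf.mse from above:
sqrt(rf.mse)  # test RMSE, in the same units as duration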
importance(rf.ret)
## %IncMSE IncNodePurity
## profit 15.9465834 2372825.02
## acq_exp 2.9564141 139619.14
## ret_exp 27.8300568 4648114.59
## acq_exp_sq 2.7901489 138458.79
## ret_exp_sq 26.1578371 4379885.16
## freq 16.9549218 173264.10
## freq_sq 18.3525275 180103.48
## crossbuy 1.3137901 60174.71
## sow -1.1106748 69863.61
## industry 0.1399243 17126.99
## revenue 1.0837839 127661.05
## employees 0.3413243 125514.27
varImpPlot(rf.ret)

Predicted vs Actual
plot(y = test.dur$duration, x = c(1:68), xlab = "Test observation", ylab = "Duration (black = actual, red = predicted)")
points(rf.pred, col = "red")
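An alternative view, added here only as a sketch, plots predicted against actual directly so that accurate predictions fall near the 45-degree line:
plot(x = rf.pred, y = test.dur$duration, xlab = "Predicted duration", ylab = "Actual duration")
abline(0, 1, col = "red")  # perfect predictions would lie on this line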

Tuning Hyperparameters
set.seed(42)
mtry.value = seq(2,12,1)
ntree.values = seq(3e3,10e3,1e3)
# create a data frame with all combinations
hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)
# create an empty vector to store OOB error values
oob_err = c()
# loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {
  model = randomForest(duration ~ ., data = train.dur, importance = T,
                       mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
  oob_err[i] = model$mse[length(model$mse)]
}
opt_i = which.min(oob_err)
hyper_grid[opt_i,]
## mtry ntree
## 5 6 3000
which.min(oob_err)
## [1] 5
# OOB MSE of the default random forest, for comparison with the tuned grid
rf.ret$mse[length(rf.ret$mse)]
## [1] 1504.455
oob_err
## [1] 2440.944 1679.957 1449.954 1376.562 1337.684 1353.420 1372.981 1391.576
## [9] 1396.448 1416.638 1432.442 2505.179 1661.299 1445.048 1361.320 1360.374
## [17] 1371.831 1377.425 1383.343 1395.031 1406.813 1422.945 2445.977 1670.957
## [25] 1454.155 1370.990 1351.241 1359.659 1376.264 1384.049 1386.539 1401.286
## [33] 1419.747 2477.064 1677.889 1440.427 1358.358 1362.599 1366.592 1362.924
## [41] 1389.376 1392.028 1406.480 1425.267 2481.388 1662.341 1439.103 1375.266
## [49] 1356.510 1352.257 1370.049 1390.298 1391.718 1408.607 1421.691 2483.051
## [57] 1679.543 1454.202 1381.416 1351.353 1356.221 1368.804 1381.206 1397.682
## [65] 1406.357 1422.621 2465.051 1657.144 1431.988 1372.106 1354.225 1363.341
## [73] 1373.824 1388.480 1399.498 1407.759 1427.377 2443.405 1671.853 1440.828
## [81] 1367.964 1353.174 1363.885 1366.092 1380.814 1395.972 1405.757 1416.772
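To see how close the runner-up settings are, the grid and its OOB errors can be inspected together; a short sketch using hyper_grid and oob_err from above:
grid.results = cbind(hyper_grid, oob_err)
head(grid.results[order(grid.results$oob_err), ], 5)  # five best mtry/ntree combinations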
Using the Best Hyperparameters
set.seed(42)
hype.rf.ret = randomForest(duration ~ ., data = train.dur, mtry = 6, ntree = 9000, importance = T)
hype.rf.pred = predict(hype.rf.ret, newdata = test.dur)
hype.rf.mse = mean((hype.rf.pred - test.dur$duration)^2)
hype.rf.mse
## [1] 1052.546
importance(hype.rf.ret)
## %IncMSE IncNodePurity
## profit 53.934825 1605306.453
## acq_exp 8.141277 52909.817
## ret_exp 122.294222 5191409.722
## acq_exp_sq 9.946018 53345.804
## ret_exp_sq 121.662335 5139220.527
## freq 113.150314 157722.789
## freq_sq 113.347037 160052.848
## crossbuy 5.097345 26339.124
## sow 2.647907 37686.712
## industry 8.285336 8956.825
## revenue -1.420130 60268.219
## employees 6.962723 55040.843
varImpPlot(hype.rf.ret)

Predicted vs Actual
plot(y = test.dur$duration, x = c(1:68), xlab = "Test observation", ylab = "Duration (black = actual, red = predicted)")
points(hype.rf.pred, col = "red")

Random Forest for Classification
set.seed(42)
rf.class.ret = randomForest(acquisition ~., data = train.acq, importance = TRUE)
rf.class.pred = predict(rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, rf.class.pred)
## rf.class.pred
## 0 1
## 0 24 12
## 1 12 52
rf.class.err = 1 - (24+52)/100
rf.class.err
## [1] 0.24
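The error rate can also be computed directly from the predictions rather than transcribing counts from the confusion matrix; a one-line sketch using the objects above:
mean(rf.class.pred != test.acq$acquisition)  # proportion of misclassified test observations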
importance(rf.class.ret)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp 8.229173 17.063262 20.18638 31.508390
## acq_exp_sq 8.340276 17.442358 21.23271 31.283270
## industry 15.055798 14.096685 19.40672 9.902336
## revenue 11.264978 9.942957 14.90323 34.391032
## employees 41.635058 33.931811 50.88006 65.439793
varImpPlot(rf.class.ret)

Tuning Hyperparameters
set.seed(42)
mtry.value = seq(2,5,1)
ntree.values = seq(3e3,10e3,1e3)
hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)
oob_err = c()
for (i in 1:nrow(hyper_grid)) {
  model = randomForest(acquisition ~ ., data = train.acq, importance = T,
                       mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
  # err.rate is a matrix with OOB and per-class columns; take the OOB error at the final tree
  oob_err[i] = model$err.rate[nrow(model$err.rate), "OOB"]
}
opt_i = which.min(oob_err)
hyper_grid[opt_i,]
## mtry ntree
## 30 3 10000
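As an aside (not part of the original tuning), randomForest also provides tuneRF(), which searches over mtry by OOB error; a minimal sketch on the same training data, with illustrative values for ntreeTry, stepFactor and improve:
set.seed(42)
tuneRF(x = train.acq[, -1], y = train.acq$acquisition,
       ntreeTry = 3000, stepFactor = 1.5, improve = 0.01)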
Using the Best Hyperparameters
set.seed(42)
hype.rf.class.ret = randomForest(acquisition ~ ., data = train.acq, mtry = 2, ntree = 3000, importance = T)
hype.rf.class.pred = predict(hype.rf.class.ret, newdata = test.acq)
table(test.acq$acquisition, hype.rf.class.pred)
## hype.rf.class.pred
## 0 1
## 0 23 13
## 1 12 52
hype.rf.class.err = 1 - (23+52)/100
hype.rf.class.err
## [1] 0.25
importance(hype.rf.class.ret)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp 18.38338 41.67203 48.42113 31.846718
## acq_exp_sq 17.12800 42.17626 49.93758 31.376392
## industry 35.99992 35.52591 47.93580 9.792362
## revenue 26.54830 21.88429 33.12885 33.944486
## employees 101.57816 78.67935 117.87492 65.213049
varImpPlot(hype.rf.class.ret)

Logistic Model
class.log = glm(acquisition ~ ., data = train.acq, family = binomial)
summary(class.log)
##
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = train.acq)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4741 -0.3804 0.2051 0.5090 2.5403
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.545e+01 1.797e+00 -8.596 < 2e-16 ***
## acq_exp 3.326e-02 5.175e-03 6.427 1.30e-10 ***
## acq_exp_sq -3.329e-05 5.168e-06 -6.443 1.17e-10 ***
## industry1 2.101e+00 3.498e-01 6.008 1.88e-09 ***
## revenue 7.563e-02 1.685e-02 4.489 7.16e-06 ***
## employees 7.834e-03 9.526e-04 8.224 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 498.43 on 399 degrees of freedom
## Residual deviance: 272.06 on 394 degrees of freedom
## AIC: 284.06
##
## Number of Fisher Scoring iterations: 6
class.log.PredProb = predict.glm(class.log, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
## class.log.PredSur
## 0 1
## 0 25 11
## 1 14 50
class.log.error = 1 - (25+50)/100
class.log.error
## [1] 0.25
Checking Cook’s Distance and Multicollinearity
library(car)
## Loading required package: carData
cook.log = cooks.distance(class.log)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/100, lty = 2, col = "steelblue")

large.cook = which(cooks.distance(class.log)>(4/100))
train.acq.cook = train.acq[-large.cook,]
vif(class.log)
## acq_exp acq_exp_sq industry revenue employees
## 31.864631 32.170157 1.254930 1.074226 1.243289
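The very large VIFs for acq_exp and acq_exp_sq are expected: one is the square of the other, so the two columns are nearly collinear. A quick sketch to confirm (acq_exp is strictly positive, so the correlation with its square should be close to 1):
cor(train.acq$acq_exp, train.acq$acq_exp_sq)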
Refitting the Model Without acq_exp_sq and the Influential Points
log.data = train.acq.cook[,c(1:2,4:6)]
class.log2 = glm(acquisition ~ ., data = log.data, family = binomial)
summary(class.log2)
##
## Call:
## glm(formula = acquisition ~ ., family = binomial, data = log.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3702 -0.5317 0.2700 0.6082 2.1541
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.9681158 1.0290352 -7.743 9.69e-15 ***
## acq_exp 0.0001726 0.0008618 0.200 0.841
## industry1 1.5498798 0.3006423 5.155 2.53e-07 ***
## revenue 0.0742721 0.0156395 4.749 2.04e-06 ***
## employees 0.0079550 0.0009115 8.727 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 494.61 on 396 degrees of freedom
## Residual deviance: 312.28 on 392 degrees of freedom
## AIC: 322.28
##
## Number of Fisher Scoring iterations: 5
Refitting the Model Without acq_exp
class.log3 = glm(acquisition ~ industry + revenue + employees, data = log.data, family = binomial)
summary(class.log3)
##
## Call:
## glm(formula = acquisition ~ industry + revenue + employees, family = binomial,
## data = log.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3906 -0.5338 0.2712 0.6127 2.1597
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.8917387 0.9539643 -8.273 < 2e-16 ***
## industry1 1.5563312 0.2991210 5.203 1.96e-07 ***
## revenue 0.0745287 0.0155960 4.779 1.76e-06 ***
## employees 0.0079479 0.0009107 8.727 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 494.61 on 396 degrees of freedom
## Residual deviance: 312.32 on 393 degrees of freedom
## AIC: 320.32
##
## Number of Fisher Scoring iterations: 5
class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .5,1,0)
table(test.acq$acquisition, class.log.PredSur)
## class.log.PredSur
## 0 1
## 0 25 11
## 1 13 51
class.log.error2 = 1 - (25+51)/100
class.log.error2
## [1] 0.24
Finding the Ideal Cutoff Point
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.3
pred = prediction(predict(class.log3, newdata = log.data, type = "response"), log.data$acquisition)
auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")
perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
     type = "l", lwd = 2,
     ylab = "Sensitivity", xlab = "Cutoff", main = paste("Maximized Cutoff\n", "AUC: ", auc))
par(new = TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
     type = "l", lwd = 2, col = "red", ylab = "", xlab = "")
axis(4, at = seq(0, 1, 0.2))
mtext("Specificity", side = 4, padj = -2, col = "red")
min.diff = which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x = unlist(performance(pred, "sens")@x.values)[min.diff]
min.y = unlist(performance(pred, "spec")@y.values)[min.diff]
optimal = min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x, 0, paste("optimal threshold =", round(optimal, 5)), pos = 4)
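The threshold above is taken where the sensitivity and specificity curves cross. Another common choice, shown here only as a sketch and not used in this analysis, maximizes Youden's J (sensitivity + specificity - 1) over the same cutoffs:
sens = unlist(performance(pred, "sens")@y.values)
spec = unlist(performance(pred, "spec")@y.values)
cutoffs = unlist(performance(pred, "sens")@x.values)
cutoffs[which.max(sens + spec - 1)]  # cutoff maximizing Youden's J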

Using the Optimal Threshold
class.log.PredProb = predict.glm(class.log3, newdata = test.acq, type = "response")
class.log.PredSur = ifelse(class.log.PredProb >= .6998,1,0)
table(test.acq$acquisition, class.log.PredSur)
## class.log.PredSur
## 0 1
## 0 33 3
## 1 19 45
class.log.error3 = 1 - (33+45)/100
class.log.error3
## [1] 0.22
Odds Ratio
OR= exp(class.log3$coefficients)
round(OR, 3)
## (Intercept) industry1 revenue employees
## 0.000 4.741 1.077 1.008
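Approximate confidence intervals for these odds ratios can be obtained by exponentiating Wald intervals for the coefficients; a minimal sketch:
round(exp(confint.default(class.log3)), 3)  # Wald-based 95% intervals on the odds-ratio scale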
Checking Basic Assumptions
par(mfrow=c(2,2))
plot(class.log3, which=c(1:4))

Checking Linearity of Data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
probabilities <- predict(class.log3, type = "response")
predicted.classes <- ifelse(probabilities > 0.68044, 1, 0)
# Select only numeric predictors
mydata = dplyr::select_if(log.data[,c(3:5)], is.numeric)
predictors <- colnames(mydata)
# Bind the logit and reshape the data for plotting
mydata = mutate(mydata, logit = log(probabilities/(1 - probabilities)))
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)
# make the plot
ggplot(mydata, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'
