library(caret)
library(tree)
library(SMCRM)
library(PerformanceAnalytics)
library(xgboost)
library(randomForestSRC)
library(rpart)
library(car)
library(dplyr)
# theme for nice plotting
theme_nice <- theme_classic()+
theme(
axis.line.y.left = element_line(colour = "black"),
axis.line.y.right = element_line(colour = "black"),
axis.line.x.bottom = element_line(colour = "black"),
axis.line.x.top = element_line(colour = "black"),
axis.text.y = element_text(colour = "black", size = 12),
axis.text.x = element_text(color = "black", size = 12),
axis.ticks = element_line(color = "black")) +
theme(
axis.ticks.length = unit(-0.25, "cm"),
axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")),
axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))
# Load the data
data(acquisitionRetention)
#Loading dataset
data1 = data.frame(acquisitionRetention)
# Get the summary
summary(data1)
## customer acquisition duration profit
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. :-1027.0
## 1st Qu.:125.8 1st Qu.:0.000 1st Qu.: 0.0 1st Qu.: -316.3
## Median :250.5 Median :1.000 Median : 957.5 Median : 3369.9
## Mean :250.5 Mean :0.676 Mean : 742.5 Mean : 2403.8
## 3rd Qu.:375.2 3rd Qu.:1.000 3rd Qu.:1146.2 3rd Qu.: 3931.6
## Max. :500.0 Max. :1.000 Max. :1673.0 Max. : 6134.3
## acq_exp ret_exp acq_exp_sq ret_exp_sq
## Min. : 1.21 Min. : 0.0 Min. : 1.5 Min. : 0
## 1st Qu.: 384.14 1st Qu.: 0.0 1st Qu.: 147562.0 1st Qu.: 0
## Median : 491.66 Median : 398.1 Median : 241729.7 Median : 158480
## Mean : 493.35 Mean : 336.3 Mean : 271211.1 Mean : 184000
## 3rd Qu.: 600.21 3rd Qu.: 514.3 3rd Qu.: 360246.0 3rd Qu.: 264466
## Max. :1027.04 Max. :1095.0 Max. :1054811.2 Max. :1198937
## freq freq_sq crossbuy sow
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 6.00 Median : 36.00 Median : 5.000 Median : 44.00
## Mean : 6.22 Mean : 69.25 Mean : 4.052 Mean : 38.88
## 3rd Qu.:11.00 3rd Qu.:121.00 3rd Qu.: 7.000 3rd Qu.: 66.00
## Max. :21.00 Max. :441.00 Max. :11.000 Max. :116.00
## industry revenue employees
## Min. :0.000 Min. :14.49 Min. : 18.0
## 1st Qu.:0.000 1st Qu.:33.53 1st Qu.: 503.0
## Median :1.000 Median :41.43 Median : 657.5
## Mean :0.522 Mean :40.54 Mean : 671.5
## 3rd Qu.:1.000 3rd Qu.:47.52 3rd Qu.: 826.0
## Max. :1.000 Max. :65.10 Max. :1461.0
# Create a copy
data2 <- data1
str(data1)
## 'data.frame': 500 obs. of 15 variables:
## $ customer : num 1 2 3 4 5 6 7 8 9 10 ...
## $ acquisition: num 1 1 1 0 1 1 1 1 0 0 ...
## $ duration : num 1635 1039 1288 0 1631 ...
## $ profit : num 6134 3524 4081 -638 5446 ...
## $ acq_exp : num 694 460 249 638 589 ...
## $ ret_exp : num 972 450 805 0 920 ...
## $ acq_exp_sq : num 480998 211628 62016 407644 346897 ...
## $ ret_exp_sq : num 943929 202077 648089 0 846106 ...
## $ freq : num 6 11 21 0 2 7 15 13 0 0 ...
## $ freq_sq : num 36 121 441 0 4 49 225 169 0 0 ...
## $ crossbuy : num 5 6 6 0 9 4 5 5 0 0 ...
## $ sow : num 95 22 90 0 80 48 51 23 0 0 ...
## $ industry : num 1 0 0 0 0 1 0 1 0 1 ...
## $ revenue : num 47.2 45.1 29.1 40.6 48.7 ...
## $ employees : num 898 686 1423 181 631 ...
# Run Correlation chart including the yvar
chart.Correlation(data1, histogram=TRUE, pch=19)
Most of the variables are heavily correlated with response variable.
duration, profit, ret_exp, ret_exp_sq, freq, freq_sq, crossbuy and sow are heavily correlated.
Lets run some box plots to see if this makes sense.
# Create box plots to understand variable correlation with the response variable Acquistion
par(mfrow = c(2, 3))
boxplot(duration~ acquisition, data = data1, xlab = "acquisition", ylab = "Duration")
boxplot(ret_exp~ acquisition, data = data1, xlab = "acquisition", ylab = "ret_exp")
boxplot(freq~ acquisition, data = data1, xlab = "acquisition", ylab = "freq")
boxplot(crossbuy~ acquisition, data = data1, xlab = "acquisition", ylab = "crossbuy")
boxplot(sow~ acquisition, data = data1, xlab = "acquisition", ylab = "sow")
boxplot(profit~ acquisition, data = data1, xlab = "acquisition", ylab = "profit")
It is clear that whenever the fields Duration, ret_exp, freq, crossbuy, sow are “0”, acquisition is always biased as 0.
Similarly, whenever profit is negative, acquisition is always 0 indicating that marketing costs are included for acqusition.
We will exclude these variables in model.
we will also exclude the “customer” variable in the model as it is just a number and doesnt have any influence on the model.
# Convert the acquisition as a factor
data1$acquisition<- as.factor(data1$acquisition)
# See if any missing values and remove them
data1 <- na.omit(data1)
# Run a random forest model to predict acquisition using only the kept fields based on the above correlation analysis
# set the seed
set.seed(123)
# Set a tune grid to run at different mtry's
tunegrid <- expand.grid(.mtry=c(1:15))
# Set ntree to 1000 and importance to True to see the variable importance
# Run a randomforest with only the fields industry, revenue, employees and acq_exp
rf.forest1 <- rfsrc(acquisition~acq_exp+industry+revenue+employees,
data = data1,
tuneGrid=tunegrid,
importance = TRUE,
ntree = 1000)
rf.forest1
## Sample size: 500
## Frequency of class labels: 162, 338
## Number of trees: 1000
## Forest terminal node size: 1
## Average no. of terminal nodes: 76.809
## No. of variables tried at each split: 2
## Total no. of variables: 4
## Resampling used to grow trees: swor
## Resample size used to grow trees: 316
## Analysis: RF-C
## Family: class
## Splitting rule: gini *random*
## Number of random split points: 10
## Normalized brier score: 57.26
## AUC: 85.97
## Error rate: 0.21, 0.41, 0.12
##
## Confusion matrix:
##
## predicted
## observed 0 1 class.error
## 0 96 66 0.4074
## 1 40 298 0.1183
##
## Overall error rate: 21.2%
This gave an accuracy rate of 79.8%.
# create a dataframe from the prediction values
predictedAcqusition <- rf.forest1$class.oob
data2<-cbind(data1,predictedAcqusition)
# Keep only the predicted acquisition customers only in a new dataframe
data3 <- data2 %>% filter(predictedAcqusition == 1)
summary(data3)
## customer acquisition duration profit acq_exp
## Min. : 1.0 0: 66 Min. : 0.0 Min. :-1027 Min. : 43.15
## 1st Qu.:130.8 1:298 1st Qu.: 813.5 1st Qu.: 3028 1st Qu.: 407.69
## Median :258.0 Median :1020.5 Median : 3555 Median : 496.07
## Mean :256.7 Mean : 894.2 Mean : 3028 Mean : 492.25
## 3rd Qu.:386.2 3rd Qu.:1175.2 3rd Qu.: 4049 3rd Qu.: 579.63
## Max. :500.0 Max. :1673.0 Max. : 6134 Max. :1027.04
## ret_exp acq_exp_sq ret_exp_sq freq
## Min. : 0.0 Min. : 1862 Min. : 0 Min. : 0.000
## 1st Qu.: 322.1 1st Qu.: 166208 1st Qu.: 103779 1st Qu.: 3.000
## Median : 439.1 Median : 246087 Median : 192835 Median : 8.000
## Mean : 404.9 Mean : 262301 Mean : 219752 Mean : 7.527
## 3rd Qu.: 538.4 3rd Qu.: 335971 3rd Qu.: 289859 3rd Qu.:11.000
## Max. :1082.4 Max. :1054811 Max. :1171525 Max. :21.000
## freq_sq crossbuy sow industry
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. :0.0000
## 1st Qu.: 9.00 1st Qu.: 3.00 1st Qu.: 29.75 1st Qu.:0.0000
## Median : 64.00 Median : 5.00 Median : 52.00 Median :1.0000
## Mean : 84.27 Mean : 4.92 Mean : 46.78 Mean :0.5907
## 3rd Qu.:121.00 3rd Qu.: 7.00 3rd Qu.: 68.00 3rd Qu.:1.0000
## Max. :441.00 Max. :11.00 Max. :116.00 Max. :1.0000
## revenue employees predictedAcqusition
## Min. :15.86 Min. : 56.0 0: 0
## 1st Qu.:35.98 1st Qu.: 606.8 1:364
## Median :43.06 Median : 744.5
## Mean :42.21 Mean : 753.7
## 3rd Qu.:48.16 3rd Qu.: 895.2
## Max. :65.10 Max. :1461.0
# Run Correlation chart including all variables
chart.Correlation(data3[,c(3,5:6,9,11:15)], histogram=TRUE, pch=19)
# Lets run a GLM model to see the variable collinearity with all variables
set.seed(123)
glm.fit1 <- glm(duration~profit+acq_exp+freq+ret_exp+crossbuy+sow+industry+revenue+employees, data=data3)
preds.glm1 = predict(glm.fit1, newdata = data3)
glm1.mse = mean((preds.glm1 - data3$duration)^2)
glm1.mse
## [1] 2647.914
car::vif(glm.fit1)
## profit acq_exp freq ret_exp crossbuy sow industry revenue
## 18.315825 1.131901 1.737211 10.056359 2.808369 2.559769 1.079648 1.091787
## employees
## 1.265496
Here profit and ret_exp are highly collinear as indicated in the correlation chart. Lets remove ret_exp as we are dealing with acquisition and see how the model collinerity is.
# Lets run a GLM model to see the variable collinearity with all variables
set.seed(123)
glm.fit2 <- glm(duration~profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees, data=data3)
preds.glm2 = predict(glm.fit2, newdata = data3)
glm2.mse = mean((preds.glm2 - data3$duration)^2)
glm2.mse
## [1] 6219.244
car::vif(glm.fit2)
## profit acq_exp freq crossbuy sow industry revenue employees
## 4.000792 1.039238 1.736314 2.617264 2.331227 1.058091 1.087949 1.245028
Even though the MSE is high, there is no collinearity involved.
# Lets create a random forest model to predict duration only for the predicted acquisition customers
# Run a random forest REGRESSION model to predict duration with all fields except the transformed squared variables and ret_exp
# set the seed
set.seed(123)
# Set a tune grid to run at different mtry's
tunegrid <- expand.grid(.mtry=c(1:15))
# Set ntree to 1000 and importance to True to see the variable importance
# Run a randomforest with all the fields
rf.forest2 <- rfsrc(duration~profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees,
data = data3,
tuneGrid=tunegrid,
importance = TRUE,
ntree = 1000)
rf.forest2
## Sample size: 364
## Number of trees: 1000
## Forest terminal node size: 5
## Average no. of terminal nodes: 39.23
## No. of variables tried at each split: 3
## Total no. of variables: 8
## Resampling used to grow trees: swor
## Resample size used to grow trees: 230
## Analysis: RF-R
## Family: regr
## Splitting rule: mse *random*
## Number of random split points: 10
## % variance explained: 97.1
## Error rate: 6232.35
# Get the predicted duration and write into a dataframe
predictedDuration <- predict(rf.forest2,data3)$predicted
data4 <- cbind(data3,predictedDuration)
# Calculate the actual duration and the predicted duration (how long) for the customers predicted to be acquired
TotalActualDuration <- sum(data4$duration)
TotalPredictedDuration <- sum(data4$predictedDuration)
cat("TotalActualDuration = ", TotalActualDuration)
## TotalActualDuration = 325490
cat("\nTotalPredictedDuration = ", TotalPredictedDuration)
##
## TotalPredictedDuration = 325819.2
rf.forest2$importance
## profit acq_exp freq crossbuy sow industry
## 62157.56443 3547.47906 37110.98554 56139.20775 22182.75501 95.40617
## revenue employees
## 24.98874 127.09285
# Creating a plot of variable importance
rf.forest2$importance # values vary a lot
## profit acq_exp freq crossbuy sow industry
## 62157.56443 3547.47906 37110.98554 56139.20775 22182.75501 95.40617
## revenue employees
## 24.98874 127.09285
# Use this if the question is related to variable importance on a classification model
# importance = rf.forest1$importance[,c(1)] # values vary a lot
data.frame(importance = rf.forest2$importance) %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,importance), y = importance)) +
geom_bar(stat = "identity", fill = "orange", color = "black")+
coord_flip() +
labs(x = "Variables", y = "Variable importance")+
theme_nice # where's industry? difficult to see
rf.forest2$importance %>% log() # log transform
## profit acq_exp freq crossbuy sow industry revenue employees
## 11.037428 8.173993 10.521668 10.935590 10.007070 4.558143 3.218425 4.844918
data.frame(importance = rf.forest2$importance) %>% # add a large +ve constant
log() %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,importance), y = importance)) +
geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.5)+
coord_flip() +
labs(x = "Variables", y = "Log-transformed variable importance") +
theme_nice
As all the values are less than 1 and positive, we dont need any transformation such as log or adding a large constant etc.
mindepth <- max.subtree(rf.forest2,
sub.order = TRUE)
# first order depths
print(round(mindepth$order, 3)[,1])
## profit acq_exp freq crossbuy sow industry revenue employees
## 1.635 3.297 2.279 2.274 3.097 7.579 4.109 3.778
# vizualise MD
data.frame(md = round(mindepth$order, 3)[,1]) %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,desc(md)), y = md)) +
geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.2)+
coord_flip() +
labs(x = "Variables", y = "Minimal Depth")+
theme_nice
# interactions
mindepth$sub.order
## profit acq_exp freq crossbuy sow industry revenue
## profit 0.1399196 0.2264888 0.3205463 0.4089828 0.3510775 0.6802229 0.3576138
## acq_exp 0.2516335 0.2862922 0.4637345 0.5535774 0.4576455 0.7624997 0.4606001
## freq 0.3002827 0.3989477 0.2003254 0.5225195 0.4786879 0.7711915 0.4733314
## crossbuy 0.3387873 0.4275252 0.4883598 0.1999977 0.5227186 0.7921141 0.5137861
## sow 0.3905999 0.4891231 0.5645602 0.6259963 0.2717470 0.8379984 0.5698369
## industry 0.7872336 0.8231147 0.8627345 0.8951583 0.8713991 0.6518157 0.8634710
## revenue 0.4425062 0.5181028 0.6428629 0.6992784 0.6370630 0.8839927 0.3598838
## employees 0.3929604 0.4813238 0.6074370 0.6585310 0.5811056 0.8555074 0.5887954
## employees
## profit 0.3300100
## acq_exp 0.4277176
## freq 0.4782207
## crossbuy 0.5056605
## sow 0.5782995
## industry 0.8591563
## revenue 0.6262199
## employees 0.3291201
as.matrix(mindepth$sub.order) %>%
reshape2::melt() %>%
data.frame() %>%
ggplot(aes(x = Var1, y = Var2, fill = value)) +
scale_x_discrete(position = "top") +
geom_tile(color = "white") +
viridis::scale_fill_viridis("Relative min. depth") +
labs(x = "", y = "") +
theme_bw()
# cross-check with vimp
find.interaction(rf.forest2,
method = "vimp",
importance = "permute")
## Pairing profit with crossbuy
## Pairing profit with freq
## Pairing profit with sow
## Pairing profit with acq_exp
## Pairing profit with employees
## Pairing profit with industry
## Pairing profit with revenue
## Pairing crossbuy with freq
## Pairing crossbuy with sow
## Pairing crossbuy with acq_exp
## Pairing crossbuy with employees
## Pairing crossbuy with industry
## Pairing crossbuy with revenue
## Pairing freq with sow
## Pairing freq with acq_exp
## Pairing freq with employees
## Pairing freq with industry
## Pairing freq with revenue
## Pairing sow with acq_exp
## Pairing sow with employees
## Pairing sow with industry
## Pairing sow with revenue
## Pairing acq_exp with employees
## Pairing acq_exp with industry
## Pairing acq_exp with revenue
## Pairing employees with industry
## Pairing employees with revenue
## Pairing industry with revenue
##
## Method: vimp
## No. of variables: 8
## Variables sorted by VIMP?: TRUE
## No. of variables used for pairing: 8
## Total no. of paired interactions: 28
## Monte Carlo replications: 1
## Type of noising up used for VIMP: permute
##
## Var 1 Var 2 Paired Additive Difference
## profit:crossbuy 62815.0616 54534.4378 145923.3560 117349.4995 28573.8565
## profit:freq 62815.0616 37649.3683 117829.5065 100464.4299 17365.0765
## profit:sow 62815.0616 22348.0172 96452.8229 85163.0788 11289.7441
## profit:acq_exp 62815.0616 3499.5851 61310.1644 66314.6467 -5004.4823
## profit:employees 62815.0616 191.0569 62470.2628 63006.1186 -535.8558
## profit:industry 62815.0616 32.6256 62306.4309 62847.6872 -541.2564
## profit:revenue 62815.0616 -3.4741 62776.0117 62811.5875 -35.5758
## crossbuy:freq 56125.4744 36503.0245 111990.4631 92628.4988 19361.9643
## crossbuy:sow 56125.4744 22746.6868 89041.9794 78872.1612 10169.8182
## crossbuy:acq_exp 56125.4744 3508.1688 58641.0382 59633.6432 -992.6050
## crossbuy:employees 56125.4744 131.9008 55122.2762 56257.3752 -1135.0990
## crossbuy:industry 56125.4744 20.4461 56258.3166 56145.9205 112.3961
## crossbuy:revenue 56125.4744 9.3166 55820.9310 56134.7910 -313.8600
## freq:sow 37592.2795 21712.9363 63614.1120 59305.2158 4308.8962
## freq:acq_exp 37592.2795 3450.4637 40311.5799 41042.7431 -731.1632
## freq:employees 37592.2795 141.5016 36916.5016 37733.7811 -817.2795
## freq:industry 37592.2795 0.1015 36411.0921 37592.3810 -1181.2889
## freq:revenue 37592.2795 -55.3336 36559.9083 37536.9459 -977.0376
## sow:acq_exp 21492.6912 3480.3746 25194.6917 24973.0658 221.6259
## sow:employees 21492.6912 125.0988 22086.2213 21617.7900 468.4314
## sow:industry 21492.6912 37.0040 21159.8598 21529.6952 -369.8355
## sow:revenue 21492.6912 25.1279 21418.5762 21517.8191 -99.2429
## acq_exp:employees 3525.2137 92.2344 3544.1762 3617.4482 -73.2719
## acq_exp:industry 3525.2137 96.5666 3476.1859 3621.7803 -145.5944
## acq_exp:revenue 3525.2137 -29.7336 3505.6993 3495.4801 10.2192
## employees:industry -19.9197 56.0231 59.7672 36.1034 23.6638
## employees:revenue -19.9197 50.5517 2.2189 30.6320 -28.4131
## industry:revenue 51.4946 40.3536 111.8956 91.8481 20.0474
set.seed(123)
# Establish a list of possible values for hyper-parameters
mtry.values <- seq(4,6,1)
nodesize.values <- seq(4,8,2)
ntree.values <- seq(4e3,6e3,1e3)
# Create a data frame containing all combinations
hyper_grid <- expand.grid(mtry = mtry.values, nodesize = nodesize.values, ntree = ntree.values)
# Create an empty vector to store OOB error values
oob_err <- c()
# Write a loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {
# Train a Random Forest model
model <- rfsrc(duration ~ profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees,
data = data3,
mtry = hyper_grid$mtry[i],
nodesize = hyper_grid$nodesize[i],
ntree = hyper_grid$ntree[i])
# Store OOB error for the model
oob_err[i] <- model$err.rate[length(model$err.rate)]
}
# Identify optimal set of hyperparmeters based on OOB error
opt_i <- which.min(oob_err)
print(hyper_grid[opt_i,])
## mtry nodesize ntree
## 12 6 4 5000
# Run a decision tree model
dt.model1 <- rpart(acquisition~acq_exp+industry+revenue+employees,
data = data1) # simple DT model
rattle::fancyRpartPlot(dt.model1, sub = "") # vizualize the DT
t_pred = predict(dt.model1,data1,type="class")
confMat <- table(data1$acquisition,t_pred)
confMat
## t_pred
## 0 1
## 0 112 50
## 1 27 311
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.846
Decision Tree gave 80.6% accuracy.
# Run a logistic regression model
set.seed(123)
train.control <- trainControl(method = "cv")
glm.model1 <- train(acquisition~acq_exp_sq+industry+revenue+employees,
data=data1,method= 'glm', family= 'binomial',trControl = train.control)
summary(glm.model1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2982 -0.6250 0.2985 0.6434 2.3767
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.718e+00 7.927e-01 -8.475 < 2e-16 ***
## acq_exp_sq -1.331e-06 6.858e-07 -1.942 0.0522 .
## industry 1.599e+00 2.605e-01 6.138 8.35e-10 ***
## revenue 7.215e-02 1.337e-02 5.397 6.76e-08 ***
## employees 6.782e-03 7.222e-04 9.391 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 629.85 on 499 degrees of freedom
## Residual deviance: 418.36 on 495 degrees of freedom
## AIC: 428.36
##
## Number of Fisher Scoring iterations: 5
pred.glm.model1<-predict(glm.model1,data1)
confusionMatrix(pred.glm.model1,data1$acquisition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 102 37
## 1 60 301
##
## Accuracy : 0.806
## 95% CI : (0.7686, 0.8398)
## No Information Rate : 0.676
## P-Value [Acc > NIR] : 5.918e-11
##
## Kappa : 0.5401
##
## Mcnemar's Test P-Value : 0.0255
##
## Sensitivity : 0.6296
## Specificity : 0.8905
## Pos Pred Value : 0.7338
## Neg Pred Value : 0.8338
## Prevalence : 0.3240
## Detection Rate : 0.2040
## Detection Prevalence : 0.2780
## Balanced Accuracy : 0.7601
##
## 'Positive' Class : 0
##
Logistic regression gave 80.6% accuracy.