library(SMCRM)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(rpart) #decision trees
library(randomForestSRC)
##
## randomForestSRC 3.3.3
##
## Type rfsrc.news() to see new features, changes, and bug fixes.
##
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
library(corrplot)
## corrplot 0.95 loaded
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
#import data
data(acquisitionRetention)
str(acquisitionRetention)
## 'data.frame': 500 obs. of 15 variables:
## $ customer : num 1 2 3 4 5 6 7 8 9 10 ...
## $ acquisition: num 1 1 1 0 1 1 1 1 0 0 ...
## $ duration : num 1635 1039 1288 0 1631 ...
## $ profit : num 6134 3524 4081 -638 5446 ...
## $ acq_exp : num 694 460 249 638 589 ...
## $ ret_exp : num 972 450 805 0 920 ...
## $ acq_exp_sq : num 480998 211628 62016 407644 346897 ...
## $ ret_exp_sq : num 943929 202077 648089 0 846106 ...
## $ freq : num 6 11 21 0 2 7 15 13 0 0 ...
## $ freq_sq : num 36 121 441 0 4 49 225 169 0 0 ...
## $ crossbuy : num 5 6 6 0 9 4 5 5 0 0 ...
## $ sow : num 95 22 90 0 80 48 51 23 0 0 ...
## $ industry : num 1 0 0 0 0 1 0 1 0 1 ...
## $ revenue : num 47.2 45.1 29.1 40.6 48.7 ...
## $ employees : num 898 686 1423 181 631 ...
summary(acquisitionRetention)
## customer acquisition duration profit
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. :-1027.0
## 1st Qu.:125.8 1st Qu.:0.000 1st Qu.: 0.0 1st Qu.: -316.3
## Median :250.5 Median :1.000 Median : 957.5 Median : 3369.9
## Mean :250.5 Mean :0.676 Mean : 742.5 Mean : 2403.8
## 3rd Qu.:375.2 3rd Qu.:1.000 3rd Qu.:1146.2 3rd Qu.: 3931.6
## Max. :500.0 Max. :1.000 Max. :1673.0 Max. : 6134.3
## acq_exp ret_exp acq_exp_sq ret_exp_sq
## Min. : 1.21 Min. : 0.0 Min. : 1.5 Min. : 0
## 1st Qu.: 384.14 1st Qu.: 0.0 1st Qu.: 147562.0 1st Qu.: 0
## Median : 491.66 Median : 398.1 Median : 241729.7 Median : 158480
## Mean : 493.35 Mean : 336.3 Mean : 271211.1 Mean : 184000
## 3rd Qu.: 600.21 3rd Qu.: 514.3 3rd Qu.: 360246.0 3rd Qu.: 264466
## Max. :1027.04 Max. :1095.0 Max. :1054811.2 Max. :1198937
## freq freq_sq crossbuy sow
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 6.00 Median : 36.00 Median : 5.000 Median : 44.00
## Mean : 6.22 Mean : 69.25 Mean : 4.052 Mean : 38.88
## 3rd Qu.:11.00 3rd Qu.:121.00 3rd Qu.: 7.000 3rd Qu.: 66.00
## Max. :21.00 Max. :441.00 Max. :11.000 Max. :116.00
## industry revenue employees
## Min. :0.000 Min. :14.49 Min. : 18.0
## 1st Qu.:0.000 1st Qu.:33.53 1st Qu.: 503.0
## Median :1.000 Median :41.43 Median : 657.5
## Mean :0.522 Mean :40.54 Mean : 671.5
## 3rd Qu.:1.000 3rd Qu.:47.52 3rd Qu.: 826.0
## Max. :1.000 Max. :65.10 Max. :1461.0
customer: customer number (from 1 to 500)
acquisition: 1 if the prospect was acquired, 0 otherwise
duration: number of days the customer was a customer of the firm, 0 if acquisition == 0
profit: customer lifetime value (CLV) of a given customer, -acq_exp if the customer is not acquired
acq_exp: total dollars spent on trying to acquire this prospect
ret_exp: total dollars spent on trying to retain this customer
acq_exp_sq: square of the total dollars spent on trying to acquire this prospect
ret_exp_sq: square of the total dollars spent on trying to retain this customer
freq: number of purchases the customer made during that customer's lifetime with the firm, 0 if acquisition == 0
freq_sq: square of the number of purchases the customer made during that customer's lifetime with the firm
crossbuy: number of product categories the customer purchased from during that customer's lifetime with the firm, 0 if acquisition == 0
sow: Share-of-Wallet; percentage of purchases the customer makes from the given firm given the total amount of purchases across all firms in that category
industry: 1 if the customer is in the B2B industry, 0 otherwise
revenue: annual sales revenue of the prospect's firm (in millions of dollars)
employees: number of employees in the prospect's firm
#convert categorical variables
acquisitionRetention$acquisition <- as.factor(acquisitionRetention$acquisition)
acquisitionRetention$industry <- as.factor(acquisitionRetention$industry)
#check missing values
sum(is.na(acquisitionRetention))
## [1] 0
There are no missing values in our dataset.
pairs(acquisitionRetention[, sapply(acquisitionRetention, is.numeric)])
Plot to see the distribution of Yes's vs. No's for acquisition:
ggplot(acquisitionRetention, aes(x = as.factor(acquisition))) +
geom_bar(fill = "skyblue") +
labs(title = "Distribution of Yes vs No", x = "Acquired (0 = No, 1 = Yes)", y = "Count") +
theme_minimal()
Stacked bar chart of acquisition by industry:
ggplot(acquisitionRetention, aes(x = industry, fill = acquisition)) +
geom_bar()
Boxplots
ggplot(acquisitionRetention, aes(x = acquisition, y = acq_exp)) +
geom_boxplot() +
theme_minimal()
ggplot(acquisitionRetention, aes(x = acquisition, y = acq_exp_sq)) +
geom_boxplot() +
theme_minimal()
ggplot(acquisitionRetention, aes(x = acquisition, y = revenue)) +
geom_boxplot() +
theme_minimal()
ggplot(acquisitionRetention, aes(x = acquisition, y = employees)) +
geom_boxplot() +
theme_minimal()
These are the only four predictors that don't perfectly separate the data on acquisition; all other variables are identically 0 when acquisition == 0.
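As a quick sanity check (a minimal sketch; the column subset is our own choice), we can confirm that the lifetime variables are identically zero for non-acquired prospects:
#count nonzero entries of the lifetime variables among non-acquired prospects
not_acq <- subset(acquisitionRetention, acquisition == "0")
colSums(not_acq[, c("duration", "ret_exp", "freq", "crossbuy", "sow")] != 0)
#all zeros confirm that these variables perfectly separate the classes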
cor_matrix <- cor(acquisitionRetention[, sapply(acquisitionRetention, is.numeric)])
corrplot(cor_matrix, method = 'circle', type = 'lower', order = 'hclust', tl.col = 'black',
tl.srt = 45)
dt_model <- rpart(acquisition ~ acq_exp + acq_exp_sq + revenue + employees + industry,
data = acquisitionRetention)
rattle::fancyRpartPlot(dt_model, sub = " ", cex = 0.7)
dt_model$variable.importance
## employees acq_exp acq_exp_sq revenue industry
## 61.35674 19.15629 19.15629 17.89659 10.57767
Based on our decision tree, the most important of our five candidate features is employees. Second is acq_exp, followed by revenue, and lastly industry. The model did not use acq_exp_sq, likely because it explains no additional variance beyond acq_exp and doesn't improve the model's ability to split the data. Based on this, we will fit random forests using employees, acq_exp, revenue, and industry.
set.seed(42)
train_index <- createDataPartition(acquisitionRetention$acquisition, p = 0.7, list = FALSE)
train_data <- acquisitionRetention[train_index, ]
test_data <- acquisitionRetention[-train_index, ]
set.seed(42)
rf_model <- rfsrc(acquisition ~ employees + revenue + acq_exp + industry,
data = train_data, importance = TRUE, ntree = 500, mtry = 1, positive = '1')
rf_model
## Sample size: 351
## Frequency of class labels: 114, 237
## Number of trees: 500
## Forest terminal node size: 1
## Average no. of terminal nodes: 34.6
## No. of variables tried at each split: 1
## Total no. of variables: 4
## Resampling used to grow trees: swor
## Resample size used to grow trees: 222
## Analysis: RF-C
## Family: class
## Splitting rule: gini *random*
## Number of random split points: 10
## Imbalanced ratio: 2.0789
## (OOB) Brier score: 0.14491668
## (OOB) Normalized Brier score: 0.57966673
## (OOB) AUC: 0.86505293
## (OOB) Log-loss: 0.44366657
## (OOB) PR-AUC: 0.7529923
## (OOB) G-mean: 0.67192023
## (OOB) Requested performance error: 0.22792023, 0.5, 0.09704641
##
## Confusion matrix:
##
## predicted
## observed 0 1 class.error
## 0 57 57 0.500
## 1 23 214 0.097
##
## (OOB) Misclassification rate: 0.2279202
Our model's performance seems decent, with an OOB Brier score of 0.145 and a misclassification rate of 0.228. The model also has a much higher class error for the Not Acquired class (0.500) than for the Acquired class (0.097).
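To see where those class errors come from, here is a minimal sketch that recomputes the OOB confusion matrix by hand from the OOB class labels stored on the rfsrc object (class.oob):
#recompute the OOB confusion matrix and per-class error rates
oob_tab <- table(observed = train_data$acquisition, predicted = rf_model$class.oob)
oob_tab
1 - diag(oob_tab) / rowSums(oob_tab) #should match 0.500 and 0.097 above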
plot(rf_model, main = 'Variable Importance for Random Forest')
##
## all 0 1
## employees 0.2967 0.4697 0.9566
## acq_exp 0.1966 -0.6142 1.0788
## revenue 0.0902 -0.0753 0.3957
## industry 0.0350 0.1507 0.0670
Based on our plots, we can see that after the first 50 trees the error rate fluctuates only slightly, and the error rate for class '0' varies much more than the error rate for class '1'. After the first 100 trees, the curves settle into the same small fluctuation around stable values.
The importance table also confirms our random forest's variable ranking: the importance of acq_exp is below 0 when acquisition is 0, yet it is the highest of all the variables when acquisition is 1. This means acq_exp is important for the overall model and for predicting 1, but not for predicting 0.
plot.variable(rf_model, partial = TRUE, plots.per.page = 1)
Higher employees indicates a lower likelihood of NOT acquiring, i.e., a higher probability of acquiring. industry = 0 has a higher likelihood of NOT acquiring, while industry = 1 has a lower probability of NOT acquiring. revenue follows the same slope as employees. acq_exp is interesting: the partial plot shows that as acq_exp rises our chance of acquiring increases, but after about 500 the chance of acquiring becomes much lower again.
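To isolate that non-monotone effect, we can redraw the partial plot for acq_exp alone by restricting plot.variable() to one predictor:
#partial dependence for acq_exp only
plot.variable(rf_model, xvar.names = "acq_exp", partial = TRUE)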
find.interaction(rf_model, method = 'vimp', importance = 'permute')
## Pairing employees with acq_exp
## Pairing employees with revenue
## Pairing employees with industry
## Pairing acq_exp with revenue
## Pairing acq_exp with industry
## Pairing revenue with industry
##
## Method: vimp
## No. of variables: 4
## Variables sorted by VIMP?: TRUE
## No. of variables used for pairing: 4
## Total no. of paired interactions: 6
## Monte Carlo replications: 1
## Type of noising up used for VIMP: permute
##
## Var 1 Var 2 Paired Additive Difference
## employees:acq_exp 0.0818 0.0303 0.1067 0.1121 -0.0054
## employees:revenue 0.0818 0.0108 0.0939 0.0926 0.0013
## employees:industry 0.0818 0.0079 0.0912 0.0897 0.0015
## acq_exp:revenue 0.0284 0.0158 0.0451 0.0442 0.0009
## acq_exp:industry 0.0284 0.0065 0.0368 0.0348 0.0020
## revenue:industry 0.0106 0.0089 0.0221 0.0195 0.0025
No pair shows a large difference between 'Paired' and 'Additive' importance, so these interactions don't indicate an association worth pursuing.
rf_model_preds <- predict(rf_model, newdata = test_data)
confusionMatrix(rf_model_preds$class, as.factor(test_data$acquisition), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31 9
## 1 17 92
##
## Accuracy : 0.8255
## 95% CI : (0.7549, 0.8827)
## No Information Rate : 0.6779
## P-Value [Acc > NIR] : 3.729e-05
##
## Kappa : 0.5822
##
## Mcnemar's Test P-Value : 0.1698
##
## Sensitivity : 0.9109
## Specificity : 0.6458
## Pos Pred Value : 0.8440
## Neg Pred Value : 0.7750
## Prevalence : 0.6779
## Detection Rate : 0.6174
## Detection Prevalence : 0.7315
## Balanced Accuracy : 0.7784
##
## 'Positive' Class : 1
##
set.seed(42)
mtry.values <- seq(1, 3, 1)
nodesize.values <- seq(2, 8, 1)
ntree.values <- seq(100, 500, 100)
hyper_grid <- expand.grid(mtry = mtry.values,
nodesize = nodesize.values,
ntree = ntree.values)
results <- data.frame()
for(i in 1:nrow(hyper_grid)) {
params <- hyper_grid[i, ]
set.seed(42)
model <- rfsrc(acquisition ~ employees + revenue + acq_exp + industry,
data = train_data,
ntree = params$ntree,
mtry = params$mtry,
nodesize = params$nodesize,
importance = TRUE,
positive = '1')
#store results
results <- rbind(results, data.frame(
mtry = params$mtry,
nodesize = params$nodesize,
ntree = params$ntree,
oob_err = model$err.rate[model$ntree]
))
}
best_params <- results[which.min(results$oob_err), ]
print(best_params)
## mtry nodesize ntree oob_err
## 56 2 6 300 0.1965812
set.seed(42)
rf_model_hyper <- rfsrc(acquisition ~ employees + revenue + acq_exp + industry,
data = train_data,
mtry = 2,
nodesize = 6,
ntree = 300,
importance = TRUE,
positive = '1')
rf_model_hyper
## Sample size: 351
## Frequency of class labels: 114, 237
## Number of trees: 300
## Forest terminal node size: 6
## Average no. of terminal nodes: 25.9633
## No. of variables tried at each split: 2
## Total no. of variables: 4
## Resampling used to grow trees: swor
## Resample size used to grow trees: 222
## Analysis: RF-C
## Family: class
## Splitting rule: gini *random*
## Number of random split points: 10
## Imbalanced ratio: 2.0789
## (OOB) Brier score: 0.13667768
## (OOB) Normalized Brier score: 0.54671071
## (OOB) AUC: 0.86930935
## (OOB) Log-loss: 0.41236117
## (OOB) PR-AUC: 0.75751153
## (OOB) G-mean: 0.75478095
## (OOB) Requested performance error: 0.1965812, 0.35087719, 0.12236287
##
## Confusion matrix:
##
## predicted
## observed 0 1 class.error
## 0 74 40 0.3509
## 1 29 208 0.1224
##
## (OOB) Misclassification rate: 0.1965812
rf_model_hyper_preds <- predict(rf_model_hyper, newdata = test_data)
confusionMatrix(rf_model_hyper_preds$class, as.factor(test_data$acquisition), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 35 15
## 1 13 86
##
## Accuracy : 0.8121
## 95% CI : (0.74, 0.8713)
## No Information Rate : 0.6779
## P-Value [Acc > NIR] : 0.0001794
##
## Kappa : 0.5744
##
## Mcnemar's Test P-Value : 0.8501067
##
## Sensitivity : 0.8515
## Specificity : 0.7292
## Pos Pred Value : 0.8687
## Neg Pred Value : 0.7000
## Prevalence : 0.6779
## Detection Rate : 0.5772
## Detection Prevalence : 0.6644
## Balanced Accuracy : 0.7903
##
## 'Positive' Class : 1
##
We know that acq_exp and acq_exp_sq are highly correlated, so including both would likely introduce multicollinearity. For logistic regression, though, we will include industry to see whether it works better in this model, since it had very low importance in the random forest.
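As a quick sketch, we can check just how strongly the raw and squared terms are correlated before leaving acq_exp_sq out:
#correlation between the raw and squared acquisition expense
cor(acquisitionRetention$acq_exp, acquisitionRetention$acq_exp_sq)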
log_model <- glm(acquisition ~ employees + revenue + acq_exp + industry, data = train_data, family = 'binomial')
summary(log_model)
##
## Call:
## glm(formula = acquisition ~ employees + revenue + acq_exp + industry,
## family = "binomial", data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.7126171 1.0091211 -6.652 2.89e-11 ***
## employees 0.0067305 0.0008403 8.010 1.15e-15 ***
## revenue 0.0623512 0.0153739 4.056 5.00e-05 ***
## acq_exp 0.0003939 0.0008157 0.483 0.629
## industry1 1.2248144 0.2913040 4.205 2.62e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 442.56 on 350 degrees of freedom
## Residual deviance: 310.42 on 346 degrees of freedom
## AIC: 320.42
##
## Number of Fisher Scoring iterations: 5
All variables except acq_exp are statistically significant.
vif(log_model)
## employees revenue acq_exp industry
## 1.121634 1.062416 1.012116 1.063939
log_model_preds <- predict(log_model, newdata = test_data, type = 'response')
#convert probabilities to class labels
log_model_preds_class <- ifelse(log_model_preds > 0.5, 1, 0)
confusionMatrix(as.factor(log_model_preds_class), test_data$acquisition, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 35 13
## 1 13 88
##
## Accuracy : 0.8255
## 95% CI : (0.7549, 0.8827)
## No Information Rate : 0.6779
## P-Value [Acc > NIR] : 3.729e-05
##
## Kappa : 0.6005
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8713
## Specificity : 0.7292
## Pos Pred Value : 0.8713
## Neg Pred Value : 0.7292
## Prevalence : 0.6779
## Detection Rate : 0.5906
## Detection Prevalence : 0.6779
## Balanced Accuracy : 0.8002
##
## 'Positive' Class : 1
##
step_log_model <- step(log_model, direction = 'both', trace = FALSE)
summary(step_log_model)
##
## Call:
## glm(formula = acquisition ~ employees + revenue + industry, family = "binomial",
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.5168676 0.9189329 -7.092 1.32e-12 ***
## employees 0.0067063 0.0008381 8.002 1.23e-15 ***
## revenue 0.0624918 0.0153751 4.064 4.81e-05 ***
## industry1 1.2340041 0.2908785 4.242 2.21e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 442.56 on 350 degrees of freedom
## Residual deviance: 310.65 on 347 degrees of freedom
## AIC: 318.65
##
## Number of Fisher Scoring iterations: 5
Stepwise selection gets rid of acq_exp.
step_log_model_preds <- predict(step_log_model, newdata = test_data, type = 'response')
#convert probabilities to class labels
step_log_model_preds_class <- ifelse(step_log_model_preds > 0.5, 1, 0)
confusionMatrix(as.factor(step_log_model_preds_class), test_data$acquisition, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 35 11
## 1 13 90
##
## Accuracy : 0.8389
## 95% CI : (0.7699, 0.894)
## No Information Rate : 0.6779
## P-Value [Acc > NIR] : 6.493e-06
##
## Kappa : 0.6271
##
## Mcnemar's Test P-Value : 0.8383
##
## Sensitivity : 0.8911
## Specificity : 0.7292
## Pos Pred Value : 0.8738
## Neg Pred Value : 0.7609
## Prevalence : 0.6779
## Detection Rate : 0.6040
## Detection Prevalence : 0.6913
## Balanced Accuracy : 0.8101
##
## 'Positive' Class : 1
##
The stepwise model performs better; acq_exp was likely an unnecessary variable, as indicated by both its p-value and the AIC.
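We can confirm this with a direct AIC comparison (the same values, 320.42 vs. 318.65, appear in the summaries above):
#side-by-side AIC of the full and stepwise logistic models
AIC(log_model, step_log_model)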
dt_model2 <- rpart(acquisition ~ employees + revenue + acq_exp,
data = train_data)
rattle::fancyRpartPlot(dt_model2, sub = " ", cex = 0.7)
dt_model2_preds <- predict(dt_model2, newdata = test_data, type = 'class')
#confusion matrix
confusionMatrix(dt_model2_preds, test_data$acquisition, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 28 16
## 1 20 85
##
## Accuracy : 0.7584
## 95% CI : (0.6815, 0.8247)
## No Information Rate : 0.6779
## P-Value [Acc > NIR] : 0.01998
##
## Kappa : 0.4344
##
## Mcnemar's Test P-Value : 0.61708
##
## Sensitivity : 0.8416
## Specificity : 0.5833
## Pos Pred Value : 0.8095
## Neg Pred Value : 0.6364
## Prevalence : 0.6779
## Detection Rate : 0.5705
## Detection Prevalence : 0.7047
## Balanced Accuracy : 0.7125
##
## 'Positive' Class : 1
##
Based on the performance of our random forest, logistic regression, stepwise logistic regression, and decision tree models, the best performing model in terms of accuracy was the stepwise logistic regression model (0.8389). In terms of sensitivity, however, the untuned random forest was slightly better (0.9109 vs. 0.8911 for the stepwise model).
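For a quick side-by-side view, here is a small sketch tabulating the hold-out metrics transcribed from the confusion matrices above:
#hold-out metrics copied from the confusion matrix outputs above
data.frame(model = c("Random Forest", "RF Hyper-Tuned", "Logistic", "Stepwise Logistic", "Decision Tree"),
accuracy = c(0.8255, 0.8121, 0.8255, 0.8389, 0.7584),
sensitivity = c(0.9109, 0.8515, 0.8713, 0.8911, 0.8416))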
Creating predictions for the entire dataframe:
df_preds <- predict(step_log_model, newdata = acquisitionRetention, type = 'response')
df_preds_class <- ifelse(df_preds >= 0.5, 1, 0)
confusionMatrix(as.factor(df_preds_class), acquisitionRetention$acquisition, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 103 37
## 1 59 301
##
## Accuracy : 0.808
## 95% CI : (0.7707, 0.8416)
## No Information Rate : 0.676
## P-Value [Acc > NIR] : 2.928e-11
##
## Kappa : 0.5456
##
## Mcnemar's Test P-Value : 0.03209
##
## Sensitivity : 0.8905
## Specificity : 0.6358
## Pos Pred Value : 0.8361
## Neg Pred Value : 0.7357
## Prevalence : 0.6760
## Detection Rate : 0.6020
## Detection Prevalence : 0.7200
## Balanced Accuracy : 0.7632
##
## 'Positive' Class : 1
##
acquired_pred_df <- data.frame(acquisitionRetention, Predicted = df_preds_class) # swap in rf_model_preds$class to base this on the random forest instead
#keep only customers that were actually acquired AND predicted as acquired
acquired_pred_df <- acquired_pred_df[acquired_pred_df$acquisition == 1 & acquired_pred_df$Predicted == 1, ]
summary(acquired_pred_df)
## customer acquisition duration profit acq_exp
## Min. : 1 0: 0 Min. : 654 Min. :1839 Min. :151.0
## 1st Qu.:128 1:301 1st Qu.: 950 1st Qu.:3349 1st Qu.:393.9
## Median :254 Median :1069 Median :3733 Median :494.2
## Mean :252 Mean :1089 Mean :3786 Mean :491.3
## 3rd Qu.:375 3rd Qu.:1232 3rd Qu.:4152 3rd Qu.:579.5
## Max. :500 Max. :1660 Max. :6134 Max. :864.1
## ret_exp acq_exp_sq ret_exp_sq freq
## Min. :229.9 Min. : 22813 Min. : 52840 Min. : 1.000
## 1st Qu.:395.1 1st Qu.:155126 1st Qu.:156144 1st Qu.: 6.000
## Median :464.4 Median :244194 Median :215649 Median : 9.000
## Mean :491.6 Mean :260566 Mean :263604 Mean : 9.229
## 3rd Qu.:562.4 3rd Qu.:335855 3rd Qu.:316328 3rd Qu.:12.000
## Max. :996.9 Max. :746669 Max. :993730 Max. :21.000
## freq_sq crossbuy sow industry revenue
## Min. : 1.0 Min. : 1.000 Min. : 1.00 0:111 Min. :15.86
## 1st Qu.: 36.0 1st Qu.: 5.000 1st Qu.: 42.00 1:190 1st Qu.:36.85
## Median : 81.0 Median : 6.000 Median : 58.00 Median :43.45
## Mean :103.7 Mean : 6.027 Mean : 57.23 Mean :42.82
## 3rd Qu.:144.0 3rd Qu.: 7.000 3rd Qu.: 71.00 3rd Qu.:48.72
## Max. :441.0 Max. :11.000 Max. :116.00 Max. :65.10
## employees Predicted
## Min. : 343.0 Min. :1
## 1st Qu.: 641.0 1st Qu.:1
## Median : 774.0 Median :1
## Mean : 789.7 Mean :1
## 3rd Qu.: 923.0 3rd Qu.:1
## Max. :1461.0 Max. :1
We are left with 301 observations, matching the 301 true positives in the confusion matrix of the stepwise logistic model on the full dataset.
set.seed(42)
train_index_d <- createDataPartition(acquired_pred_df$acquisition, p = 0.7, list = FALSE)
## Warning in createDataPartition(acquired_pred_df$acquisition, p = 0.7, list =
## FALSE): Some classes have no records ( 0 ) and these will be ignored
train_data_d <- acquired_pred_df[train_index_d, ]
test_data_d <- acquired_pred_df[-train_index_d, ]
We are aware that several variables should exhibit high multicollinearity, based on our correlation plot. profit has the highest correlation with duration, so we won't include it; it is also too direct an indicator of duration. Likewise, every _sq variable is a deterministic transform of its untransformed counterpart, so we must choose either the squared or the raw version of each.
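A minimal sketch quantifying the collinearity behind those choices, computed on our filtered acquired-customer data:
#correlations motivating the variable exclusions
with(acquired_pred_df, c(duration_vs_profit = cor(duration, profit),
acq_exp_vs_sq = cor(acq_exp, acq_exp_sq),
ret_exp_vs_sq = cor(ret_exp, ret_exp_sq)))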
lin_model_duration <- lm(duration ~ acq_exp + ret_exp +
freq + crossbuy + sow + industry + revenue +
employees,
data = train_data_d)
summary(lin_model_duration)
##
## Call:
## lm(formula = duration ~ acq_exp + ret_exp + freq + crossbuy +
## sow + industry + revenue + employees, data = train_data_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -182.360 -23.123 9.504 30.946 84.563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 532.38769 29.35126 18.138 < 2e-16 ***
## acq_exp 0.03417 0.02276 1.501 0.13483
## ret_exp 1.34949 0.02078 64.946 < 2e-16 ***
## freq -9.02119 0.74766 -12.066 < 2e-16 ***
## crossbuy 1.12766 1.61352 0.699 0.48543
## sow 0.43159 0.14869 2.903 0.00411 **
## industry1 -13.55921 6.58363 -2.060 0.04073 *
## revenue -0.59455 0.34627 -1.717 0.08751 .
## employees -0.04835 0.01632 -2.963 0.00341 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.41 on 202 degrees of freedom
## Multiple R-squared: 0.9559, Adjusted R-squared: 0.9541
## F-statistic: 546.7 on 8 and 202 DF, p-value: < 2.2e-16
vif(lin_model_duration)
## acq_exp ret_exp freq crossbuy sow industry revenue employees
## 1.047762 1.018493 1.031301 1.042627 1.032964 1.043999 1.081956 1.100230
predictions <- predict(lin_model_duration, newdata = test_data_d) # Get predictions
actuals2 <- test_data_d$duration # Actual values
rmse <- sqrt(mean((actuals2 - predictions)^2)) # Compute RMSE
print(rmse)
## [1] 37.89923
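Since we will repeat this RMSE calculation for every model below, a small helper of our own (not part of any package) avoids retyping the formula:
#helper: root mean squared error
rmse_of <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
rmse_of(actuals2, predictions) #matches the value printed above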
step_lin_model_duration <- step(lin_model_duration, direction = 'both', trace = FALSE)
summary(step_lin_model_duration)
##
## Call:
## lm(formula = duration ~ acq_exp + ret_exp + freq + sow + industry +
## revenue + employees, data = train_data_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -181.834 -23.586 9.502 30.681 81.387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 537.31772 28.45505 18.883 < 2e-16 ***
## acq_exp 0.03635 0.02252 1.614 0.10802
## ret_exp 1.34993 0.02074 65.080 < 2e-16 ***
## freq -9.01736 0.74670 -12.076 < 2e-16 ***
## sow 0.44030 0.14798 2.976 0.00328 **
## industry1 -13.16188 6.55076 -2.009 0.04584 *
## revenue -0.60559 0.34548 -1.753 0.08113 .
## employees -0.04811 0.01629 -2.953 0.00352 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.35 on 203 degrees of freedom
## Multiple R-squared: 0.9557, Adjusted R-squared: 0.9542
## F-statistic: 626.3 on 7 and 203 DF, p-value: < 2.2e-16
We check for multicollinearity just for practice, even though we have already removed the multicollinear variables.
vif(step_lin_model_duration)
## acq_exp ret_exp freq sow industry revenue employees
## 1.028118 1.017543 1.031246 1.025697 1.036214 1.079702 1.099733
step_predictions <- predict(step_lin_model_duration, newdata = test_data_d) # Get predictions
step_rmse <- sqrt(mean((actuals2 - step_predictions)^2)) # Compute RMSE
print(step_rmse)
## [1] 38.17574
Model performance decreases slightly when using stepwise linear regression.
dt_model_duration <- rpart(duration ~ acq_exp + acq_exp_sq + ret_exp + ret_exp_sq +
freq + freq_sq + crossbuy + sow + industry + revenue +
employees,
data = train_data_d)
rattle::fancyRpartPlot(dt_model_duration, sub = " ", cex = 0.7)
dt_model_duration$variable.importance
## ret_exp ret_exp_sq freq freq_sq acq_exp revenue acq_exp_sq
## 8699813.82 8699813.82 235852.30 235852.30 206892.67 183178.55 173660.62
## employees sow crossbuy
## 102586.22 56644.78 42962.32
Our tree seems to use only ret_exp, which has a far higher variable importance than anything else.
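We can verify which variables the tree actually split on; printcp() from rpart lists them explicitly:
#list the variables actually used in tree construction
printcp(dt_model_duration)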
dt_model_duration_preds <- predict(dt_model_duration, newdata = test_data_d)
sqrt(mean((actuals2 - dt_model_duration_preds)^2))
## [1] 63.50395
The tree performs considerably worse than both the full and stepwise linear regressions (RMSE 63.50 vs. roughly 38).
set.seed(42)
rf_model_duration <- rfsrc(duration ~ acq_exp + ret_exp +
freq + crossbuy + sow + industry + revenue +
employees,
data = train_data_d,
importance = TRUE,
ntree = 400,
mtry = 6,
nodesize = 2)
rf_model_duration
## Sample size: 211
## Number of trees: 400
## Forest terminal node size: 2
## Average no. of terminal nodes: 65.485
## No. of variables tried at each split: 6
## Total no. of variables: 8
## Resampling used to grow trees: swor
## Resample size used to grow trees: 133
## Analysis: RF-R
## Family: regr
## Splitting rule: mse *random*
## Number of random split points: 10
## (OOB) R squared: 0.96783633
## (OOB) Requested performance error: 1445.08144482
find.interaction(rf_model_duration, method = 'vimp', importance = 'permute')
## Pairing ret_exp with freq
## Pairing ret_exp with crossbuy
## Pairing ret_exp with acq_exp
## Pairing ret_exp with revenue
## Pairing ret_exp with sow
## Pairing ret_exp with employees
## Pairing ret_exp with industry
## Pairing freq with crossbuy
## Pairing freq with acq_exp
## Pairing freq with revenue
## Pairing freq with sow
## Pairing freq with employees
## Pairing freq with industry
## Pairing crossbuy with acq_exp
## Pairing crossbuy with revenue
## Pairing crossbuy with sow
## Pairing crossbuy with employees
## Pairing crossbuy with industry
## Pairing acq_exp with revenue
## Pairing acq_exp with sow
## Pairing acq_exp with employees
## Pairing acq_exp with industry
## Pairing revenue with sow
## Pairing revenue with employees
## Pairing revenue with industry
## Pairing sow with employees
## Pairing sow with industry
## Pairing employees with industry
##
## Method: vimp
## No. of variables: 8
## Variables sorted by VIMP?: TRUE
## No. of variables used for pairing: 8
## Total no. of paired interactions: 28
## Monte Carlo replications: 1
## Type of noising up used for VIMP: permute
##
## Var 1 Var 2 Paired Additive Difference
## ret_exp:freq 54201.0637 1821.2717 55510.4497 56022.3354 -511.8857
## ret_exp:crossbuy 54201.0637 56.3693 54091.1071 54257.4330 -166.3259
## ret_exp:acq_exp 54201.0637 -8.7183 54869.8112 54192.3454 677.4657
## ret_exp:revenue 54201.0637 -106.9028 54136.1795 54094.1609 42.0186
## ret_exp:sow 54201.0637 -27.0525 54100.7155 54174.0112 -73.2958
## ret_exp:employees 54201.0637 38.7216 54880.8601 54239.7853 641.0748
## ret_exp:industry 54201.0637 -4.8962 54369.5000 54196.1676 173.3324
## freq:crossbuy 1914.0334 61.2733 2046.2011 1975.3067 70.8945
## freq:acq_exp 1914.0334 -4.1243 1878.8619 1909.9091 -31.0472
## freq:revenue 1914.0334 -18.3348 1797.6993 1895.6986 -97.9993
## freq:sow 1914.0334 16.6514 1793.5813 1930.6848 -137.1035
## freq:employees 1914.0334 -22.7791 1919.2035 1891.2543 27.9492
## freq:industry 1914.0334 7.2778 1840.9511 1921.3111 -80.3600
## crossbuy:acq_exp 35.0966 -14.4201 73.8426 20.6765 53.1662
## crossbuy:revenue 35.0966 -69.2062 2.9400 -34.1097 37.0497
## crossbuy:sow 35.0966 -34.7794 24.7509 0.3172 24.4337
## crossbuy:employees 35.0966 11.0629 55.8097 46.1594 9.6503
## crossbuy:industry 35.0966 0.3104 40.4869 35.4069 5.0800
## acq_exp:revenue 8.3465 -60.5209 -31.0185 -52.1744 21.1559
## acq_exp:sow 8.3465 -13.3451 -31.2697 -4.9986 -26.2711
## acq_exp:employees 8.3465 3.8804 -2.9693 12.2268 -15.1962
## acq_exp:industry 8.3465 2.4421 5.2675 10.7886 -5.5211
## revenue:sow -76.9031 -10.6030 -124.4245 -87.5061 -36.9184
## revenue:employees -76.9031 22.4242 -84.3401 -54.4789 -29.8611
## revenue:industry -76.9031 10.5161 -107.2713 -66.3870 -40.8843
## sow:employees -2.3665 5.4675 41.2102 3.1010 38.1092
## sow:industry -2.3665 -0.9354 7.4227 -3.3018 10.7246
## employees:industry 35.9397 -11.5298 -5.8940 24.4098 -30.3038
ret_exp:acq_exp shows an interaction we could explore; we might also try ret_exp:employees and ret_exp:freq.
plot(rf_model_duration, main = 'Variable Importance for Random Forest')
##
## Importance Relative Imp
## ret_exp 339911.9084 1.0000
## freq 10039.8790 0.0295
## crossbuy 1908.1819 0.0056
## acq_exp 1658.7736 0.0049
## revenue 968.8881 0.0029
## sow 669.8189 0.0020
## employees 461.9788 0.0014
## industry 10.6160 0.0000
Almost all variables have negligible importance compared to ret_exp.
min_depth <- max.subtree(rf_model_duration, max.order = 1)
print(min_depth$order)
## [,1]
## acq_exp 3.2475
## ret_exp 0.2975
## freq 1.8900
## crossbuy 4.1300
## sow 3.7025
## industry 6.3450
## revenue 3.6700
## employees 3.6425
ret_exp is still the most important variable. The lower the minimal depth, the more important the variable, because it splits closer to the top of the tree.
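A one-liner makes that ranking easier to read, sorting the order column returned by max.subtree() above:
#rank variables by minimal depth (smaller = closer to the root = more important)
sort(min_depth$order[, 1])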
plot.variable(rf_model_duration, partial = TRUE, plots.per.page = 1)
ret_exp shows a positive relationship with duration up until about 1000, after which it shows diminishing returns.
freq shows a downward slope. Interpreted, this would mean that a higher purchase frequency is associated with a shorter duration. Although it sounds counterintuitive, it may show that our more loyal customers buy from us less often, and that we have trouble retaining customers who buy very frequently.
crossbuy, sow, and acq_exp show a very slight upward slope and after that plateau. These probably won't help much with predicting duration.
rf_model_duration_preds <- predict(rf_model_duration, newdata = test_data_d)
sqrt(mean((actuals2 - rf_model_duration_preds$predicted)^2))
## [1] 31.06021
train_data_d$ret_exp_acq_exp <- train_data_d$ret_exp * train_data_d$acq_exp
test_data_d$ret_exp_acq_exp <- test_data_d$ret_exp * test_data_d$acq_exp
train_data_d$ret_exp_employees <- train_data_d$ret_exp * train_data_d$employees
test_data_d$ret_exp_employees <- test_data_d$ret_exp * test_data_d$employees
train_data_d$ret_exp_freq<- train_data_d$ret_exp * train_data_d$freq
test_data_d$ret_exp_freq <- test_data_d$ret_exp * test_data_d$freq
set.seed(42)
rf_model_duration <- rfsrc(duration ~ acq_exp + ret_exp + freq + crossbuy +
sow + industry + revenue + employees +
ret_exp_acq_exp + ret_exp_employees + ret_exp_freq, #interactions
data = train_data_d,
importance = TRUE,
ntree = 500,
mtry = 6,
nodesize = 2)
rf_model_duration
## Sample size: 211
## Number of trees: 500
## Forest terminal node size: 2
## Average no. of terminal nodes: 65.984
## No. of variables tried at each split: 6
## Total no. of variables: 11
## Resampling used to grow trees: swor
## Resample size used to grow trees: 133
## Analysis: RF-R
## Family: regr
## Splitting rule: mse *random*
## Number of random split points: 10
## (OOB) R squared: 0.95341795
## (OOB) Requested performance error: 2092.88466078
plot(rf_model_duration)
##
## Importance Relative Imp
## ret_exp 156825.9656 1.0000
## ret_exp_acq_exp 12574.6486 0.0802
## ret_exp_employees 9815.0340 0.0626
## freq 4510.4884 0.0288
## ret_exp_freq 1810.5480 0.0115
## employees 1240.9091 0.0079
## acq_exp 675.1601 0.0043
## revenue 416.7105 0.0027
## sow 317.3310 0.0020
## crossbuy 296.0826 0.0019
## industry 6.1499 0.0000
Based on variable importance, we will drop four of the weakest predictors (acq_exp, revenue, sow, and crossbuy), keeping industry alongside the interaction terms.
mtry.values <- seq(2, 8, 2)
nodesize.values <- seq(2, 8, 2)
ntree.values <- seq(100, 500, 100)
hyper_grid <- expand.grid(mtry = mtry.values,
nodesize = nodesize.values,
ntree = ntree.values)
results <- data.frame()
for(i in 1:nrow(hyper_grid)) {
params <- hyper_grid[i, ]
set.seed(42)
model <- rfsrc(duration ~ ret_exp + freq + industry + employees +
ret_exp_acq_exp + ret_exp_employees + ret_exp_freq,
data = train_data_d,
ntree = params$ntree,
mtry = params$mtry,
nodesize = params$nodesize,
importance = TRUE)
#store results
results <- rbind(results, data.frame(
mtry = params$mtry,
nodesize = params$nodesize,
ntree = params$ntree,
oob_err = model$err.rate[model$ntree]
))
}
best_params <- results[which.min(results$oob_err), ]
print(best_params)
## mtry nodesize ntree oob_err
## 68 8 2 500 1488.735
set.seed(42)
rf_model_duration <- rfsrc(duration ~ ret_exp + freq + industry + employees +
ret_exp_acq_exp + ret_exp_employees + ret_exp_freq,
data = train_data_d,
ntree = 500,
mtry = 8,
nodesize = 2,
importance = TRUE)
rf_model_duration
## Sample size: 211
## Number of trees: 500
## Forest terminal node size: 2
## Average no. of terminal nodes: 66.002
## No. of variables tried at each split: 7
## Total no. of variables: 7
## Resampling used to grow trees: swor
## Resample size used to grow trees: 133
## Analysis: RF-R
## Family: regr
## Splitting rule: mse *random*
## Number of random split points: 10
## (OOB) R squared: 0.96686472
## (OOB) Requested performance error: 1488.73491544
plot.variable(rf_model_duration, partial = TRUE, plots.per.page = 1)
rf_model_duration_preds <- predict(rf_model_duration, newdata = test_data_d)
sqrt(mean((actuals2 - rf_model_duration_preds$predicted)^2))
## [1] 29.72627
Seems to be the best performing model based on RMSE.
Linear Regression: 37.89923
Stepwise Linear Regression: 38.17574
Decision Trees: 63.50395
Random Forest: 31.06021
Random Forest Hyper Tuned: 29.72627
An RMSE of 29.72627 means that, on average, the predicted duration values differ from the actual duration values by about 29.72 days.
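One way to put that in context (a sketch; the exact ratio depends on our particular train/test split) is to express the RMSE as a fraction of the average duration in the hold-out set:
#RMSE relative to the scale of the target
best_rmse <- sqrt(mean((actuals2 - rf_model_duration_preds$predicted)^2))
best_rmse / mean(test_data_d$duration) #error as a fraction of the mean duration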