Load Packages
library(SMCRM) # CRM data
library(dplyr) # data wrangling
library(tidyr) # data wrangling
library(ggplot2) # plotting
library(survival) # survival
library(rpart) # DT
library(randomForestSRC) # RF
library(rpart.plot)
library(pROC)
library(caret)
library(randomForest)
# theme for nice plotting
theme_nice <- theme_classic()+
theme(
axis.line.y.left = element_line(colour = "black"),
axis.line.y.right = element_line(colour = "black"),
axis.line.x.bottom = element_line(colour = "black"),
axis.line.x.top = element_line(colour = "black"),
axis.text.y = element_text(colour = "black", size = 12),
axis.text.x = element_text(color = "black", size = 12),
axis.ticks = element_line(color = "black")) +
theme(
axis.ticks.length = unit(-0.25, "cm"),
axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")),
axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))
Dataset
data("acquisitionRetention")
df = acquisitionRetention
# Check for missing values
colSums(is.na(df))
## customer acquisition duration profit acq_exp ret_exp
## 0 0 0 0 0 0
## acq_exp_sq ret_exp_sq freq freq_sq crossbuy sow
## 0 0 0 0 0 0
## industry revenue employees
## 0 0 0
Data Exploration
# Look at structure of data
str(acquisitionRetention)
## 'data.frame': 500 obs. of 15 variables:
## $ customer : num 1 2 3 4 5 6 7 8 9 10 ...
## $ acquisition: num 1 1 1 0 1 1 1 1 0 0 ...
## $ duration : num 1635 1039 1288 0 1631 ...
## $ profit : num 6134 3524 4081 -638 5446 ...
## $ acq_exp : num 694 460 249 638 589 ...
## $ ret_exp : num 972 450 805 0 920 ...
## $ acq_exp_sq : num 480998 211628 62016 407644 346897 ...
## $ ret_exp_sq : num 943929 202077 648089 0 846106 ...
## $ freq : num 6 11 21 0 2 7 15 13 0 0 ...
## $ freq_sq : num 36 121 441 0 4 49 225 169 0 0 ...
## $ crossbuy : num 5 6 6 0 9 4 5 5 0 0 ...
## $ sow : num 95 22 90 0 80 48 51 23 0 0 ...
## $ industry : num 1 0 0 0 0 1 0 1 0 1 ...
## $ revenue : num 47.2 45.1 29.1 40.6 48.7 ...
## $ employees : num 898 686 1423 181 631 ...
# Summary Statistics
summary(df)
## customer acquisition duration profit
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. :-1027.0
## 1st Qu.:125.8 1st Qu.:0.000 1st Qu.: 0.0 1st Qu.: -316.3
## Median :250.5 Median :1.000 Median : 957.5 Median : 3369.9
## Mean :250.5 Mean :0.676 Mean : 742.5 Mean : 2403.8
## 3rd Qu.:375.2 3rd Qu.:1.000 3rd Qu.:1146.2 3rd Qu.: 3931.6
## Max. :500.0 Max. :1.000 Max. :1673.0 Max. : 6134.3
## acq_exp ret_exp acq_exp_sq ret_exp_sq
## Min. : 1.21 Min. : 0.0 Min. : 1.5 Min. : 0
## 1st Qu.: 384.14 1st Qu.: 0.0 1st Qu.: 147562.0 1st Qu.: 0
## Median : 491.66 Median : 398.1 Median : 241729.7 Median : 158480
## Mean : 493.35 Mean : 336.3 Mean : 271211.1 Mean : 184000
## 3rd Qu.: 600.21 3rd Qu.: 514.3 3rd Qu.: 360246.0 3rd Qu.: 264466
## Max. :1027.04 Max. :1095.0 Max. :1054811.2 Max. :1198937
## freq freq_sq crossbuy sow
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 6.00 Median : 36.00 Median : 5.000 Median : 44.00
## Mean : 6.22 Mean : 69.25 Mean : 4.052 Mean : 38.88
## 3rd Qu.:11.00 3rd Qu.:121.00 3rd Qu.: 7.000 3rd Qu.: 66.00
## Max. :21.00 Max. :441.00 Max. :11.000 Max. :116.00
## industry revenue employees
## Min. :0.000 Min. :14.49 Min. : 18.0
## 1st Qu.:0.000 1st Qu.:33.53 1st Qu.: 503.0
## Median :1.000 Median :41.43 Median : 657.5
## Mean :0.522 Mean :40.54 Mean : 671.5
## 3rd Qu.:1.000 3rd Qu.:47.52 3rd Qu.: 826.0
## Max. :1.000 Max. :65.10 Max. :1461.0
# Identify highly correlated variables with acquisition
#Correlation values
cor(df)
## customer acquisition duration profit acq_exp
## customer 1.000000000 0.054034172 0.03638573 0.04099600 -0.0344779052
## acquisition 0.054034172 1.000000000 0.94499140 0.96392988 -0.0017487494
## duration 0.036385733 0.944991403 1.00000000 0.98434462 0.0117152328
## profit 0.040996005 0.963929883 0.98434462 1.00000000 0.0395591912
## acq_exp -0.034477905 -0.001748749 0.01171523 0.03955919 1.0000000000
## ret_exp 0.022855507 0.874115734 0.97852966 0.95164652 0.0118498772
## acq_exp_sq -0.043878373 -0.077186844 -0.05987778 -0.03958462 0.9737781739
## ret_exp_sq -0.007095947 0.631302293 0.82648533 0.77709045 0.0267900795
## freq 0.037846540 0.778910016 0.70998289 0.74689819 0.0007026176
## freq_sq 0.030427659 0.567605728 0.49768232 0.53904545 -0.0050416071
## crossbuy 0.064402299 0.866154614 0.83264401 0.85553188 0.0258650343
## sow 0.008009862 0.847150795 0.80815353 0.83170352 0.0308839247
## industry 0.095383073 0.244373420 0.20789458 0.22744229 0.0145649028
## revenue 0.004210533 0.248837329 0.22598952 0.24148754 0.0643407122
## employees 0.018449766 0.477019473 0.43320548 0.47130639 -0.0419314213
## ret_exp acq_exp_sq ret_exp_sq freq freq_sq
## customer 0.02285551 -0.04387837 -0.007095947 0.0378465396 0.030427659
## acquisition 0.87411573 -0.07718684 0.631302293 0.7789100155 0.567605728
## duration 0.97852966 -0.05987778 0.826485332 0.7099828902 0.497682317
## profit 0.95164652 -0.03958462 0.777090452 0.7468981898 0.539045453
## acq_exp 0.01184988 0.97377817 0.026790080 0.0007026176 -0.005041607
## ret_exp 1.00000000 -0.05562666 0.920692412 0.6940495656 0.513877389
## acq_exp_sq -0.05562666 1.00000000 -0.023073486 -0.0604820431 -0.050203871
## ret_exp_sq 0.92069241 -0.02307349 1.000000000 0.5059909848 0.377358737
## freq 0.69404957 -0.06048204 0.505990985 1.0000000000 0.938395608
## freq_sq 0.51387739 -0.05020387 0.377358737 0.9383956081 1.000000000
## crossbuy 0.77688913 -0.04020979 0.578974344 0.6863182859 0.520562814
## sow 0.74092081 -0.02980165 0.532801273 0.6603665316 0.481760366
## industry 0.18104417 0.03250630 0.099428831 0.1604756340 0.102778207
## revenue 0.20188718 0.04431747 0.130012381 0.1545687918 0.095854901
## employees 0.40942256 -0.05879389 0.286288747 0.4292780382 0.355054413
## crossbuy sow industry revenue employees
## customer 0.06440230 0.008009862 0.095383073 0.004210533 0.018449766
## acquisition 0.86615461 0.847150795 0.244373420 0.248837329 0.477019473
## duration 0.83264401 0.808153531 0.207894584 0.225989515 0.433205481
## profit 0.85553188 0.831703516 0.227442291 0.241487536 0.471306392
## acq_exp 0.02586503 0.030883925 0.014564903 0.064340712 -0.041931421
## ret_exp 0.77688913 0.740920812 0.181044165 0.201887177 0.409422565
## acq_exp_sq -0.04020979 -0.029801655 0.032506302 0.044317467 -0.058793888
## ret_exp_sq 0.57897434 0.532801273 0.099428831 0.130012381 0.286288747
## freq 0.68631829 0.660366532 0.160475634 0.154568792 0.429278038
## freq_sq 0.52056281 0.481760366 0.102778207 0.095854901 0.355054413
## crossbuy 1.00000000 0.746630339 0.218109778 0.194752511 0.415964288
## sow 0.74663034 1.000000000 0.209726320 0.233400782 0.414154030
## industry 0.21810978 0.209726320 1.000000000 0.030086417 -0.002323206
## revenue 0.19475251 0.233400782 0.030086417 1.000000000 0.047489335
## employees 0.41596429 0.414154030 -0.002323206 0.047489335 1.000000000
# create box plots to show if we need to remove any variables
par(mfrow = c(2, 5))
boxplot(duration ~ acquisition, df, xlab = "acquisition", ylab = "duration", col = "blue")
boxplot(profit ~ acquisition, df, xlab = "acquisition", ylab = "profit", col = "red")
boxplot(ret_exp ~ acquisition, df, xlab = "acquisition", ylab = "ret_exp", col = "purple")
boxplot(acq_exp_sq ~ acquisition, df, xlab = "acquisition", ylab = "acq_exp_sq", col = "pink")
boxplot(ret_exp_sq ~ acquisition, df, xlab = "acquisition", ylab = "ret_exp_sq", col = "yellow")
boxplot(freq ~ acquisition, df, xlab = "acquisition", ylab = "freq", col = "lightblue")
boxplot(freq_sq ~ acquisition, df, xlab = "acquisition", ylab = "freq_sq", col = "green")
boxplot(crossbuy ~ acquisition, df, xlab = "acquisition", ylab = "crossbuy", col = "grey")
boxplot(sow ~ acquisition, df, xlab = "acquisition", ylab = "sow", col ="black")
Use acquisitionRetention data set to predict which customers will be
acquired
df = acquisitionRetention
df$industry = as.factor(df$industry) # Difference between Minh and Matthew
df$acquisition = factor(df$acquisition, levels = c(0,1), labels = c("No","Yes"))
df$acq_exp[is.na(df$acq_exp)] = median(df$acq_exp, na.rm = T)
# spliting training and testing data
set.seed(456)
train.ind = createDataPartition(df$acquisition, p = 0.7, list = FALSE)
trainData = df[train.ind,]
testData = df[-train.ind,]
# Create a random forest model
rf.model = randomForest(acquisition ~ acq_exp + industry + revenue + employees,
data = trainData,
ntree = 50,
mtry = 1,
importance = T)
# Predict on test set
test.predictions = predict(rf.model, testData)
test.probabilities = predict(rf.model, testData, type = "prob")
# Create a confusion matrix
confusion.matrix = confusionMatrix(test.predictions,testData$acquisition)
confusion.matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 26 14
## Yes 22 87
##
## Accuracy : 0.7584
## 95% CI : (0.6815, 0.8247)
## No Information Rate : 0.6779
## P-Value [Acc > NIR] : 0.01998
##
## Kappa : 0.4215
##
## Mcnemar's Test P-Value : 0.24335
##
## Sensitivity : 0.5417
## Specificity : 0.8614
## Pos Pred Value : 0.6500
## Neg Pred Value : 0.7982
## Prevalence : 0.3221
## Detection Rate : 0.1745
## Detection Prevalence : 0.2685
## Balanced Accuracy : 0.7015
##
## 'Positive' Class : No
##
Predict number of customers to be acquired
full.predictions = predict(rf.model,df)
full.probabilitie = predict(rf.model,df, type = "prob")
ac.results = data.frame(customer = df$customer,
predicted.acquisition = full.predictions,
prob_acquired = full.probabilitie[,"Yes"])
likely.acquired = ac.results[ac.results$predicted.acquisition == "Yes",]
cat("Number of customers predicted to be acquired", nrow(likely.acquired), "\n")
## Number of customers predicted to be acquired 368
print(head(likely.acquired))
## customer predicted.acquisition prob_acquired
## 1 1 Yes 0.94
## 2 2 Yes 1.00
## 3 3 Yes 0.82
## 5 5 Yes 0.98
## 6 6 Yes 0.90
## 7 7 Yes 1.00
rf_roc <- roc(testData$acquisition, test.probabilities[,"Yes"])
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
rf_auc <- auc(rf_roc)
print(rf_auc)
## Area under the curve: 0.8177
# 2. Decision Tree Model
dt.model <- rpart(acquisition ~ acq_exp + industry + revenue + employees,
data = trainData,
method = "class")
# Visualize the decision tree
rpart.plot(dt.model, main = "Decision Tree for Customer Acquisition")
# Predict using decision tree
dt_predictions <- predict(dt.model, testData, type = "class")
dt_probabilities <- predict(dt.model, testData, type = "prob")
# Evaluate decision tree performance
dt_confusion_matrix <- confusionMatrix(dt_predictions, testData$acquisition)
dt_roc <- roc(as.numeric(testData$acquisition), dt_probabilities[,2])
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
dt_auc <- auc(dt_roc)
dt_confusion_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 27 16
## Yes 21 85
##
## Accuracy : 0.7517
## 95% CI : (0.6743, 0.8187)
## No Information Rate : 0.6779
## P-Value [Acc > NIR] : 0.03074
##
## Kappa : 0.4154
##
## Mcnemar's Test P-Value : 0.51080
##
## Sensitivity : 0.5625
## Specificity : 0.8416
## Pos Pred Value : 0.6279
## Neg Pred Value : 0.8019
## Prevalence : 0.3221
## Detection Rate : 0.1812
## Detection Prevalence : 0.2886
## Balanced Accuracy : 0.7020
##
## 'Positive' Class : No
##
dt_roc
##
## Call:
## roc.default(response = as.numeric(testData$acquisition), predictor = dt_probabilities[, 2])
##
## Data: dt_probabilities[, 2] in 48 controls (as.numeric(testData$acquisition) 1) < 101 cases (as.numeric(testData$acquisition) 2).
## Area under the curve: 0.7328
# 3. Logistic Regression Model
logit.model <- glm(acquisition ~ acq_exp + industry + revenue + employees,
data = trainData,
family = "binomial")
# Predict using logistic regression
logit_probabilities <- predict(logit.model, testData, type = "response")
logit_predictions <- as.factor(ifelse(logit_probabilities > 0.5, "Yes", "No"))
# Evaluate logistic regression performance
logit_confusion_matrix <- confusionMatrix(logit_predictions, testData$acquisition)
logit_roc <- roc(as.numeric(testData$acquisition), logit_probabilities)
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
logit_auc <- auc(logit_roc)
logit_confusion_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 27 12
## Yes 21 89
##
## Accuracy : 0.7785
## 95% CI : (0.7033, 0.8424)
## No Information Rate : 0.6779
## P-Value [Acc > NIR] : 0.004489
##
## Kappa : 0.4666
##
## Mcnemar's Test P-Value : 0.163734
##
## Sensitivity : 0.5625
## Specificity : 0.8812
## Pos Pred Value : 0.6923
## Neg Pred Value : 0.8091
## Prevalence : 0.3221
## Detection Rate : 0.1812
## Detection Prevalence : 0.2617
## Balanced Accuracy : 0.7218
##
## 'Positive' Class : No
##
logit_roc
##
## Call:
## roc.default(response = as.numeric(testData$acquisition), predictor = logit_probabilities)
##
## Data: logit_probabilities in 48 controls (as.numeric(testData$acquisition) 1) < 101 cases (as.numeric(testData$acquisition) 2).
## Area under the curve: 0.8426
# Create a comparison table of performance metrics
model_comparison <- data.frame(
Model = c("Random Forest", "Decision Tree", "Logistic Regression"),
Accuracy = c(confusion.matrix$overall["Accuracy"],
dt_confusion_matrix$overall["Accuracy"],
logit_confusion_matrix$overall["Accuracy"]),
Sensitivity = c(confusion.matrix$byClass["Sensitivity"],
dt_confusion_matrix$byClass["Sensitivity"],
logit_confusion_matrix$byClass["Sensitivity"]),
Specificity = c(confusion.matrix$byClass["Specificity"],
dt_confusion_matrix$byClass["Specificity"],
logit_confusion_matrix$byClass["Specificity"]),
Precision = c(confusion.matrix$byClass["Pos Pred Value"],
dt_confusion_matrix$byClass["Pos Pred Value"],
logit_confusion_matrix$byClass["Pos Pred Value"]),
F1_Score = c(confusion.matrix$byClass["F1"],
dt_confusion_matrix$byClass["F1"],
logit_confusion_matrix$byClass["F1"]),
AUC = c(rf_auc, dt_auc, logit_auc)
)
# Display the comparison table
print(model_comparison)
## Model Accuracy Sensitivity Specificity Precision F1_Score
## 1 Random Forest 0.7583893 0.5416667 0.8613861 0.6500000 0.5909091
## 2 Decision Tree 0.7516779 0.5625000 0.8415842 0.6279070 0.5934066
## 3 Logistic Regression 0.7785235 0.5625000 0.8811881 0.6923077 0.6206897
## AUC
## 1 0.8176568
## 2 0.7327764
## 3 0.8426155
# Plot ROC curves for all models
plot(rf_roc, col = "blue", main = "ROC Curves Comparison")
plot(dt_roc, col = "red", add = TRUE)
plot(logit_roc, col = "green", add = TRUE)
legend("bottomright", legend = c(paste("Random Forest (AUC =", round(rf_auc, 3), ")"),
paste("Decision Tree (AUC =", round(dt_auc, 3), ")"),
paste("Logistic Regression (AUC =", round(logit_auc, 3), ")")),
col = c("blue", "red", "green"), lty = 1)
# Create a bar chart for accuracy comparison
library(ggplot2)
ggplot(model_comparison, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title = "Model Accuracy Comparison",
x = "Model Type",
y = "Accuracy") +
theme(legend.position = "none") +
geom_text(aes(label = round(Accuracy, 3)), vjust = -0.5)