Load Packages

library(SMCRM) # CRM data
library(dplyr) # data wrangling
library(tidyr) # data wrangling
library(ggplot2) # plotting
library(survival) # survival
library(rpart) # DT
library(randomForestSRC) # RF
library(rpart.plot)
library(pROC)
library(caret)
library(randomForest)


# theme for nice plotting
theme_nice <- theme_classic()+
                theme(
                  axis.line.y.left = element_line(colour = "black"),
                  axis.line.y.right = element_line(colour = "black"),
                  axis.line.x.bottom = element_line(colour = "black"),
                  axis.line.x.top = element_line(colour = "black"),
                  axis.text.y = element_text(colour = "black", size = 12),
                  axis.text.x = element_text(color = "black", size = 12),
                  axis.ticks = element_line(color = "black")) +
                theme(
                  axis.ticks.length = unit(-0.25, "cm"), 
                  axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")), 
                  axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))

Dataset

data("acquisitionRetention")
df = acquisitionRetention
# Check for missing values 
colSums(is.na(df))
##    customer acquisition    duration      profit     acq_exp     ret_exp 
##           0           0           0           0           0           0 
##  acq_exp_sq  ret_exp_sq        freq     freq_sq    crossbuy         sow 
##           0           0           0           0           0           0 
##    industry     revenue   employees 
##           0           0           0

Data Exploration

# Look at structure of data 
str(acquisitionRetention)
## 'data.frame':    500 obs. of  15 variables:
##  $ customer   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ acquisition: num  1 1 1 0 1 1 1 1 0 0 ...
##  $ duration   : num  1635 1039 1288 0 1631 ...
##  $ profit     : num  6134 3524 4081 -638 5446 ...
##  $ acq_exp    : num  694 460 249 638 589 ...
##  $ ret_exp    : num  972 450 805 0 920 ...
##  $ acq_exp_sq : num  480998 211628 62016 407644 346897 ...
##  $ ret_exp_sq : num  943929 202077 648089 0 846106 ...
##  $ freq       : num  6 11 21 0 2 7 15 13 0 0 ...
##  $ freq_sq    : num  36 121 441 0 4 49 225 169 0 0 ...
##  $ crossbuy   : num  5 6 6 0 9 4 5 5 0 0 ...
##  $ sow        : num  95 22 90 0 80 48 51 23 0 0 ...
##  $ industry   : num  1 0 0 0 0 1 0 1 0 1 ...
##  $ revenue    : num  47.2 45.1 29.1 40.6 48.7 ...
##  $ employees  : num  898 686 1423 181 631 ...
# Summary Statistics
summary(df)
##     customer      acquisition       duration          profit       
##  Min.   :  1.0   Min.   :0.000   Min.   :   0.0   Min.   :-1027.0  
##  1st Qu.:125.8   1st Qu.:0.000   1st Qu.:   0.0   1st Qu.: -316.3  
##  Median :250.5   Median :1.000   Median : 957.5   Median : 3369.9  
##  Mean   :250.5   Mean   :0.676   Mean   : 742.5   Mean   : 2403.8  
##  3rd Qu.:375.2   3rd Qu.:1.000   3rd Qu.:1146.2   3rd Qu.: 3931.6  
##  Max.   :500.0   Max.   :1.000   Max.   :1673.0   Max.   : 6134.3  
##     acq_exp           ret_exp         acq_exp_sq          ret_exp_sq     
##  Min.   :   1.21   Min.   :   0.0   Min.   :      1.5   Min.   :      0  
##  1st Qu.: 384.14   1st Qu.:   0.0   1st Qu.: 147562.0   1st Qu.:      0  
##  Median : 491.66   Median : 398.1   Median : 241729.7   Median : 158480  
##  Mean   : 493.35   Mean   : 336.3   Mean   : 271211.1   Mean   : 184000  
##  3rd Qu.: 600.21   3rd Qu.: 514.3   3rd Qu.: 360246.0   3rd Qu.: 264466  
##  Max.   :1027.04   Max.   :1095.0   Max.   :1054811.2   Max.   :1198937  
##       freq          freq_sq          crossbuy           sow        
##  Min.   : 0.00   Min.   :  0.00   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.: 0.00   1st Qu.:  0.00   1st Qu.: 0.000   1st Qu.:  0.00  
##  Median : 6.00   Median : 36.00   Median : 5.000   Median : 44.00  
##  Mean   : 6.22   Mean   : 69.25   Mean   : 4.052   Mean   : 38.88  
##  3rd Qu.:11.00   3rd Qu.:121.00   3rd Qu.: 7.000   3rd Qu.: 66.00  
##  Max.   :21.00   Max.   :441.00   Max.   :11.000   Max.   :116.00  
##     industry        revenue        employees     
##  Min.   :0.000   Min.   :14.49   Min.   :  18.0  
##  1st Qu.:0.000   1st Qu.:33.53   1st Qu.: 503.0  
##  Median :1.000   Median :41.43   Median : 657.5  
##  Mean   :0.522   Mean   :40.54   Mean   : 671.5  
##  3rd Qu.:1.000   3rd Qu.:47.52   3rd Qu.: 826.0  
##  Max.   :1.000   Max.   :65.10   Max.   :1461.0
# Identify highly correlated variables with acquisition
#Correlation values
cor(df)
##                 customer  acquisition    duration      profit       acq_exp
## customer     1.000000000  0.054034172  0.03638573  0.04099600 -0.0344779052
## acquisition  0.054034172  1.000000000  0.94499140  0.96392988 -0.0017487494
## duration     0.036385733  0.944991403  1.00000000  0.98434462  0.0117152328
## profit       0.040996005  0.963929883  0.98434462  1.00000000  0.0395591912
## acq_exp     -0.034477905 -0.001748749  0.01171523  0.03955919  1.0000000000
## ret_exp      0.022855507  0.874115734  0.97852966  0.95164652  0.0118498772
## acq_exp_sq  -0.043878373 -0.077186844 -0.05987778 -0.03958462  0.9737781739
## ret_exp_sq  -0.007095947  0.631302293  0.82648533  0.77709045  0.0267900795
## freq         0.037846540  0.778910016  0.70998289  0.74689819  0.0007026176
## freq_sq      0.030427659  0.567605728  0.49768232  0.53904545 -0.0050416071
## crossbuy     0.064402299  0.866154614  0.83264401  0.85553188  0.0258650343
## sow          0.008009862  0.847150795  0.80815353  0.83170352  0.0308839247
## industry     0.095383073  0.244373420  0.20789458  0.22744229  0.0145649028
## revenue      0.004210533  0.248837329  0.22598952  0.24148754  0.0643407122
## employees    0.018449766  0.477019473  0.43320548  0.47130639 -0.0419314213
##                 ret_exp  acq_exp_sq   ret_exp_sq          freq      freq_sq
## customer     0.02285551 -0.04387837 -0.007095947  0.0378465396  0.030427659
## acquisition  0.87411573 -0.07718684  0.631302293  0.7789100155  0.567605728
## duration     0.97852966 -0.05987778  0.826485332  0.7099828902  0.497682317
## profit       0.95164652 -0.03958462  0.777090452  0.7468981898  0.539045453
## acq_exp      0.01184988  0.97377817  0.026790080  0.0007026176 -0.005041607
## ret_exp      1.00000000 -0.05562666  0.920692412  0.6940495656  0.513877389
## acq_exp_sq  -0.05562666  1.00000000 -0.023073486 -0.0604820431 -0.050203871
## ret_exp_sq   0.92069241 -0.02307349  1.000000000  0.5059909848  0.377358737
## freq         0.69404957 -0.06048204  0.505990985  1.0000000000  0.938395608
## freq_sq      0.51387739 -0.05020387  0.377358737  0.9383956081  1.000000000
## crossbuy     0.77688913 -0.04020979  0.578974344  0.6863182859  0.520562814
## sow          0.74092081 -0.02980165  0.532801273  0.6603665316  0.481760366
## industry     0.18104417  0.03250630  0.099428831  0.1604756340  0.102778207
## revenue      0.20188718  0.04431747  0.130012381  0.1545687918  0.095854901
## employees    0.40942256 -0.05879389  0.286288747  0.4292780382  0.355054413
##                crossbuy          sow     industry     revenue    employees
## customer     0.06440230  0.008009862  0.095383073 0.004210533  0.018449766
## acquisition  0.86615461  0.847150795  0.244373420 0.248837329  0.477019473
## duration     0.83264401  0.808153531  0.207894584 0.225989515  0.433205481
## profit       0.85553188  0.831703516  0.227442291 0.241487536  0.471306392
## acq_exp      0.02586503  0.030883925  0.014564903 0.064340712 -0.041931421
## ret_exp      0.77688913  0.740920812  0.181044165 0.201887177  0.409422565
## acq_exp_sq  -0.04020979 -0.029801655  0.032506302 0.044317467 -0.058793888
## ret_exp_sq   0.57897434  0.532801273  0.099428831 0.130012381  0.286288747
## freq         0.68631829  0.660366532  0.160475634 0.154568792  0.429278038
## freq_sq      0.52056281  0.481760366  0.102778207 0.095854901  0.355054413
## crossbuy     1.00000000  0.746630339  0.218109778 0.194752511  0.415964288
## sow          0.74663034  1.000000000  0.209726320 0.233400782  0.414154030
## industry     0.21810978  0.209726320  1.000000000 0.030086417 -0.002323206
## revenue      0.19475251  0.233400782  0.030086417 1.000000000  0.047489335
## employees    0.41596429  0.414154030 -0.002323206 0.047489335  1.000000000
# create box plots to show if we need to remove any variables
par(mfrow = c(2, 5))
boxplot(duration ~ acquisition, df, xlab = "acquisition", ylab = "duration", col = "blue")
boxplot(profit ~ acquisition, df, xlab = "acquisition", ylab = "profit", col = "red")
boxplot(ret_exp ~ acquisition, df, xlab = "acquisition", ylab = "ret_exp", col = "purple")
boxplot(acq_exp_sq ~ acquisition, df, xlab = "acquisition", ylab = "acq_exp_sq", col = "pink")
boxplot(ret_exp_sq ~ acquisition, df, xlab = "acquisition", ylab = "ret_exp_sq", col = "yellow")
boxplot(freq ~ acquisition, df, xlab = "acquisition", ylab = "freq", col = "lightblue")
boxplot(freq_sq ~ acquisition, df, xlab = "acquisition", ylab = "freq_sq", col = "green")
boxplot(crossbuy ~ acquisition, df, xlab = "acquisition", ylab = "crossbuy", col = "grey")
boxplot(sow ~ acquisition, df, xlab = "acquisition", ylab = "sow", col ="black")

Use acquisitionRetention data set to predict which customers will be acquired

df = acquisitionRetention
df$industry = as.factor(df$industry) # Difference between Minh and Matthew
df$acquisition = factor(df$acquisition, levels = c(0,1), labels = c("No","Yes"))
df$acq_exp[is.na(df$acq_exp)] = median(df$acq_exp, na.rm = T)
# spliting training and testing data
set.seed(456)
train.ind = createDataPartition(df$acquisition, p = 0.7, list = FALSE)
trainData = df[train.ind,]
testData = df[-train.ind,]
# Create a random forest model
rf.model = randomForest(acquisition ~ acq_exp + industry + revenue + employees,

data = trainData,
ntree = 50,
mtry = 1,
importance = T)

# Predict on test set
test.predictions = predict(rf.model, testData)
test.probabilities = predict(rf.model, testData, type = "prob")
# Create a confusion matrix
confusion.matrix = confusionMatrix(test.predictions,testData$acquisition)
confusion.matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  26  14
##        Yes 22  87
##                                           
##                Accuracy : 0.7584          
##                  95% CI : (0.6815, 0.8247)
##     No Information Rate : 0.6779          
##     P-Value [Acc > NIR] : 0.01998         
##                                           
##                   Kappa : 0.4215          
##                                           
##  Mcnemar's Test P-Value : 0.24335         
##                                           
##             Sensitivity : 0.5417          
##             Specificity : 0.8614          
##          Pos Pred Value : 0.6500          
##          Neg Pred Value : 0.7982          
##              Prevalence : 0.3221          
##          Detection Rate : 0.1745          
##    Detection Prevalence : 0.2685          
##       Balanced Accuracy : 0.7015          
##                                           
##        'Positive' Class : No              
## 

Predict number of customers to be acquired

full.predictions = predict(rf.model,df)
full.probabilitie = predict(rf.model,df, type = "prob")
ac.results = data.frame(customer = df$customer,
                        predicted.acquisition = full.predictions,
                        prob_acquired = full.probabilitie[,"Yes"])

likely.acquired = ac.results[ac.results$predicted.acquisition == "Yes",]
cat("Number of customers predicted to be acquired", nrow(likely.acquired), "\n")
## Number of customers predicted to be acquired 368
print(head(likely.acquired))
##   customer predicted.acquisition prob_acquired
## 1        1                   Yes          0.94
## 2        2                   Yes          1.00
## 3        3                   Yes          0.82
## 5        5                   Yes          0.98
## 6        6                   Yes          0.90
## 7        7                   Yes          1.00
rf_roc <- roc(testData$acquisition, test.probabilities[,"Yes"])
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
rf_auc <- auc(rf_roc)
print(rf_auc)
## Area under the curve: 0.8177
# 2. Decision Tree Model
dt.model <- rpart(acquisition ~ acq_exp + industry + revenue + employees,
                 data = trainData,
                 method = "class")
# Visualize the decision tree
rpart.plot(dt.model, main = "Decision Tree for Customer Acquisition")

# Predict using decision tree
dt_predictions <- predict(dt.model, testData, type = "class")
dt_probabilities <- predict(dt.model, testData, type = "prob")

# Evaluate decision tree performance
dt_confusion_matrix <- confusionMatrix(dt_predictions, testData$acquisition)
dt_roc <- roc(as.numeric(testData$acquisition), dt_probabilities[,2])
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
dt_auc <- auc(dt_roc)

dt_confusion_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  27  16
##        Yes 21  85
##                                           
##                Accuracy : 0.7517          
##                  95% CI : (0.6743, 0.8187)
##     No Information Rate : 0.6779          
##     P-Value [Acc > NIR] : 0.03074         
##                                           
##                   Kappa : 0.4154          
##                                           
##  Mcnemar's Test P-Value : 0.51080         
##                                           
##             Sensitivity : 0.5625          
##             Specificity : 0.8416          
##          Pos Pred Value : 0.6279          
##          Neg Pred Value : 0.8019          
##              Prevalence : 0.3221          
##          Detection Rate : 0.1812          
##    Detection Prevalence : 0.2886          
##       Balanced Accuracy : 0.7020          
##                                           
##        'Positive' Class : No              
## 
dt_roc
## 
## Call:
## roc.default(response = as.numeric(testData$acquisition), predictor = dt_probabilities[,     2])
## 
## Data: dt_probabilities[, 2] in 48 controls (as.numeric(testData$acquisition) 1) < 101 cases (as.numeric(testData$acquisition) 2).
## Area under the curve: 0.7328
# 3. Logistic Regression Model
logit.model <- glm(acquisition ~ acq_exp + industry + revenue + employees,
                  data = trainData,
                  family = "binomial")

# Predict using logistic regression
logit_probabilities <- predict(logit.model, testData, type = "response")
logit_predictions <- as.factor(ifelse(logit_probabilities > 0.5, "Yes", "No"))
# Evaluate logistic regression performance
logit_confusion_matrix <- confusionMatrix(logit_predictions, testData$acquisition)
logit_roc <- roc(as.numeric(testData$acquisition), logit_probabilities)
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
logit_auc <- auc(logit_roc)
logit_confusion_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  27  12
##        Yes 21  89
##                                           
##                Accuracy : 0.7785          
##                  95% CI : (0.7033, 0.8424)
##     No Information Rate : 0.6779          
##     P-Value [Acc > NIR] : 0.004489        
##                                           
##                   Kappa : 0.4666          
##                                           
##  Mcnemar's Test P-Value : 0.163734        
##                                           
##             Sensitivity : 0.5625          
##             Specificity : 0.8812          
##          Pos Pred Value : 0.6923          
##          Neg Pred Value : 0.8091          
##              Prevalence : 0.3221          
##          Detection Rate : 0.1812          
##    Detection Prevalence : 0.2617          
##       Balanced Accuracy : 0.7218          
##                                           
##        'Positive' Class : No              
## 
logit_roc
## 
## Call:
## roc.default(response = as.numeric(testData$acquisition), predictor = logit_probabilities)
## 
## Data: logit_probabilities in 48 controls (as.numeric(testData$acquisition) 1) < 101 cases (as.numeric(testData$acquisition) 2).
## Area under the curve: 0.8426
# Create a comparison table of performance metrics
model_comparison <- data.frame(
  Model = c("Random Forest", "Decision Tree", "Logistic Regression"),
  Accuracy = c(confusion.matrix$overall["Accuracy"],
               dt_confusion_matrix$overall["Accuracy"],
               logit_confusion_matrix$overall["Accuracy"]),
  Sensitivity = c(confusion.matrix$byClass["Sensitivity"],
                 dt_confusion_matrix$byClass["Sensitivity"],
                 logit_confusion_matrix$byClass["Sensitivity"]),
  Specificity = c(confusion.matrix$byClass["Specificity"],
                 dt_confusion_matrix$byClass["Specificity"],
                 logit_confusion_matrix$byClass["Specificity"]),
  Precision = c(confusion.matrix$byClass["Pos Pred Value"],
               dt_confusion_matrix$byClass["Pos Pred Value"],
               logit_confusion_matrix$byClass["Pos Pred Value"]),
  F1_Score = c(confusion.matrix$byClass["F1"],
              dt_confusion_matrix$byClass["F1"],
              logit_confusion_matrix$byClass["F1"]),
  AUC = c(rf_auc, dt_auc, logit_auc)
)

# Display the comparison table
print(model_comparison)
##                 Model  Accuracy Sensitivity Specificity Precision  F1_Score
## 1       Random Forest 0.7583893   0.5416667   0.8613861 0.6500000 0.5909091
## 2       Decision Tree 0.7516779   0.5625000   0.8415842 0.6279070 0.5934066
## 3 Logistic Regression 0.7785235   0.5625000   0.8811881 0.6923077 0.6206897
##         AUC
## 1 0.8176568
## 2 0.7327764
## 3 0.8426155
# Plot ROC curves for all models
plot(rf_roc, col = "blue", main = "ROC Curves Comparison")
plot(dt_roc, col = "red", add = TRUE)
plot(logit_roc, col = "green", add = TRUE)
legend("bottomright", legend = c(paste("Random Forest (AUC =", round(rf_auc, 3), ")"),
                                paste("Decision Tree (AUC =", round(dt_auc, 3), ")"),
                                paste("Logistic Regression (AUC =", round(logit_auc, 3), ")")),
       col = c("blue", "red", "green"), lty = 1)

# Create a bar chart for accuracy comparison
library(ggplot2)
ggplot(model_comparison, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Model Accuracy Comparison",
       x = "Model Type",
       y = "Accuracy") +
  theme(legend.position = "none") +
  geom_text(aes(label = round(Accuracy, 3)), vjust = -0.5)