Load acquisitionRetention data from the SMCRM package

#To access the data in R: load the SMCRM package and execute
data(acquisitionRetention)

#Assign the data to another data frame which we are going
#to manipulate further
df_acquisitionRetention <- acquisitionRetention 

Exploratory Data Analysis

#Structure of the data
str(df_acquisitionRetention)
## 'data.frame':    500 obs. of  15 variables:
##  $ customer   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ acquisition: num  1 1 1 0 1 1 1 1 0 0 ...
##  $ duration   : num  1635 1039 1288 0 1631 ...
##  $ profit     : num  6134 3524 4081 -638 5446 ...
##  $ acq_exp    : num  694 460 249 638 589 ...
##  $ ret_exp    : num  972 450 805 0 920 ...
##  $ acq_exp_sq : num  480998 211628 62016 407644 346897 ...
##  $ ret_exp_sq : num  943929 202077 648089 0 846106 ...
##  $ freq       : num  6 11 21 0 2 7 15 13 0 0 ...
##  $ freq_sq    : num  36 121 441 0 4 49 225 169 0 0 ...
##  $ crossbuy   : num  5 6 6 0 9 4 5 5 0 0 ...
##  $ sow        : num  95 22 90 0 80 48 51 23 0 0 ...
##  $ industry   : num  1 0 0 0 0 1 0 1 0 1 ...
##  $ revenue    : num  47.2 45.1 29.1 40.6 48.7 ...
##  $ employees  : num  898 686 1423 181 631 ...
#Summary of the data
summary(df_acquisitionRetention)
##     customer      acquisition       duration          profit       
##  Min.   :  1.0   Min.   :0.000   Min.   :   0.0   Min.   :-1027.0  
##  1st Qu.:125.8   1st Qu.:0.000   1st Qu.:   0.0   1st Qu.: -316.3  
##  Median :250.5   Median :1.000   Median : 957.5   Median : 3369.9  
##  Mean   :250.5   Mean   :0.676   Mean   : 742.5   Mean   : 2403.8  
##  3rd Qu.:375.2   3rd Qu.:1.000   3rd Qu.:1146.2   3rd Qu.: 3931.6  
##  Max.   :500.0   Max.   :1.000   Max.   :1673.0   Max.   : 6134.3  
##     acq_exp           ret_exp         acq_exp_sq          ret_exp_sq     
##  Min.   :   1.21   Min.   :   0.0   Min.   :      1.5   Min.   :      0  
##  1st Qu.: 384.14   1st Qu.:   0.0   1st Qu.: 147562.0   1st Qu.:      0  
##  Median : 491.66   Median : 398.1   Median : 241729.7   Median : 158480  
##  Mean   : 493.35   Mean   : 336.3   Mean   : 271211.1   Mean   : 184000  
##  3rd Qu.: 600.21   3rd Qu.: 514.3   3rd Qu.: 360246.0   3rd Qu.: 264466  
##  Max.   :1027.04   Max.   :1095.0   Max.   :1054811.2   Max.   :1198937  
##       freq          freq_sq          crossbuy           sow        
##  Min.   : 0.00   Min.   :  0.00   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.: 0.00   1st Qu.:  0.00   1st Qu.: 0.000   1st Qu.:  0.00  
##  Median : 6.00   Median : 36.00   Median : 5.000   Median : 44.00  
##  Mean   : 6.22   Mean   : 69.25   Mean   : 4.052   Mean   : 38.88  
##  3rd Qu.:11.00   3rd Qu.:121.00   3rd Qu.: 7.000   3rd Qu.: 66.00  
##  Max.   :21.00   Max.   :441.00   Max.   :11.000   Max.   :116.00  
##     industry        revenue        employees     
##  Min.   :0.000   Min.   :14.49   Min.   :  18.0  
##  1st Qu.:0.000   1st Qu.:33.53   1st Qu.: 503.0  
##  Median :1.000   Median :41.43   Median : 657.5  
##  Mean   :0.522   Mean   :40.54   Mean   : 671.5  
##  3rd Qu.:1.000   3rd Qu.:47.52   3rd Qu.: 826.0  
##  Max.   :1.000   Max.   :65.10   Max.   :1461.0
#Correlation
chart.Correlation(df_acquisitionRetention, histogram = TRUE, pch=20)

Many variables are correlated with response variable and they are:
duration, profit, ret_exp, acq_exp_sq, ret_exp_sq, freq, freq_sq, crossbuy and sow.

Boxplot between the above correlated variables and the response variable acquisition.

par(mfrow=c(4,3))
boxplot(duration ~ acquisition, data=df_acquisitionRetention, ylab='Duration', xlab='acquisition')
boxplot(profit ~ acquisition, data=df_acquisitionRetention, ylab='profit', xlab='acquisition')
boxplot(ret_exp ~ acquisition, data=df_acquisitionRetention, ylab='ret_exp', xlab='acquisition')
boxplot(acq_exp_sq ~ acquisition, data=df_acquisitionRetention, ylab='acq_exp_sq', xlab='acquisition')
boxplot(ret_exp_sq ~ acquisition, data=df_acquisitionRetention, ylab='ret_exp_sq', xlab='acquisition')
boxplot(freq ~ acquisition, data=df_acquisitionRetention, ylab='freq', xlab='acquisition')
boxplot(freq_sq ~ acquisition, data=df_acquisitionRetention, ylab='freq_sq', xlab='acquisition')
boxplot(crossbuy ~ acquisition, data=df_acquisitionRetention, ylab='crossbuy', xlab='acquisition')
boxplot(sow ~ acquisition, data=df_acquisitionRetention, ylab='sow', xlab='acquisition')
boxplot(revenue ~ acquisition, data=df_acquisitionRetention, ylab='revenue', xlab='acquisition')
boxplot(employees ~ acquisition, data=df_acquisitionRetention, ylab='employees', xlab='acquisition')
boxplot(industry ~ acquisition, data=df_acquisitionRetention, ylab='industry', xlab='acquisition')

par(mfrow=c(1,1))

Most of the variables is either 0 or negative biased when acquistion is 0 except for act_exp_sq. Since, act_exp_sq is square of the variable act_exp, we are going to use act_exp variable in the model along with industry, revenue and employees.

#Omit any NA
df_acqReten_factored <- na.omit(df_acquisitionRetention)
#Factorize the `industry` and `acquisition` variable
df_acqReten_factored$acquisition <- as.factor(df_acqReten_factored$acquisition)
df_acqReten_factored$industry <- as.factor(df_acqReten_factored$industry)
str(df_acqReten_factored)
## 'data.frame':    500 obs. of  15 variables:
##  $ customer   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ acquisition: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
##  $ duration   : num  1635 1039 1288 0 1631 ...
##  $ profit     : num  6134 3524 4081 -638 5446 ...
##  $ acq_exp    : num  694 460 249 638 589 ...
##  $ ret_exp    : num  972 450 805 0 920 ...
##  $ acq_exp_sq : num  480998 211628 62016 407644 346897 ...
##  $ ret_exp_sq : num  943929 202077 648089 0 846106 ...
##  $ freq       : num  6 11 21 0 2 7 15 13 0 0 ...
##  $ freq_sq    : num  36 121 441 0 4 49 225 169 0 0 ...
##  $ crossbuy   : num  5 6 6 0 9 4 5 5 0 0 ...
##  $ sow        : num  95 22 90 0 80 48 51 23 0 0 ...
##  $ industry   : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 2 1 2 ...
##  $ revenue    : num  47.2 45.1 29.1 40.6 48.7 ...
##  $ employees  : num  898 686 1423 181 631 ...
#summary(df_acqReten_factored)

Split the data for training and testing with 70% and 30%.

set.seed(123)
train <- createDataPartition(y = df_acqReten_factored$acquisition, p=0.7, list=FALSE)
df_train_acq <- df_acqReten_factored[train, ]
df_test_acq <- df_acqReten_factored[-train, ]

Use acquisitionRetention data set to predict which customers will be acquired

set.seed(123)
#Define the CARET train control
train_control <- trainControl(method='repeatedcv', 
                                number=10, 
                                repeats=3)
#Tuned parameter
tunegrid <- expand.grid(.mtry=2,.ntree=1500, .nodesize=2)
#Applying the random forest model for the acquisition
rf_acquisition <- train(acquisition ~ acq_exp + industry + revenue + employees, 
                        data = df_train_acq, 
                        method = 'rf', 
                        trControl = train_control,
                        tunegrid = tunegrid,
                        importance=TRUE)

rf_acquisition_preds <- predict(rf_acquisition, df_test_acq)
confusionMatrix(rf_acquisition_preds, df_test_acq$acquisition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 32 13
##          1 16 88
##                                           
##                Accuracy : 0.8054          
##                  95% CI : (0.7326, 0.8656)
##     No Information Rate : 0.6779          
##     P-Value [Acc > NIR] : 0.0003693       
##                                           
##                   Kappa : 0.5469          
##                                           
##  Mcnemar's Test P-Value : 0.7103466       
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 0.8713          
##          Pos Pred Value : 0.7111          
##          Neg Pred Value : 0.8462          
##              Prevalence : 0.3221          
##          Detection Rate : 0.2148          
##    Detection Prevalence : 0.3020          
##       Balanced Accuracy : 0.7690          
##                                           
##        'Positive' Class : 0               
## 

The accuracy is 80.54% with un-tuned Random Forest model for the acquisition.

importance <- varImp(rf_acquisition)$importance
df_importance <- data.frame(variables = row.names(importance),importance = importance)

df_importance %>%
  # tibble::rownames_to_column(var = "variable") %>% rename(.,'0'='importance')
  ggplot(aes(x = variables, y =importance.0 )) +
    geom_bar(stat = "identity", fill = "lightgreen", color = "black")+
    coord_flip() +
     labs(x = "Variables", y = "Variable importance")+
     theme_plot

How long (duration) based on a feature set using a random forest

#Predict the acquisition on the full dataset using the above random forest model
full_acq_preds <- predict(rf_acquisition, df_acqReten_factored)
#Bind the fully predicted acquisition to the original data set
df_acqReten_binded <- cbind(df_acquisitionRetention, full_acq_preds)
#Create a data frame with the subset from the above having both actual acquisition = 1
#and predicted acquisition = 1
df_acqisition_only <- subset(df_acqReten_binded, acquisition == '1' & full_acq_preds == '1')
#Summary of the above
summary(df_acqisition_only)
##     customer      acquisition    duration        profit        acq_exp     
##  Min.   :  1.0   Min.   :1    Min.   : 654   Min.   :1839   Min.   :151.0  
##  1st Qu.:135.2   1st Qu.:1    1st Qu.: 950   1st Qu.:3365   1st Qu.:406.4  
##  Median :262.5   Median :1    Median :1071   Median :3713   Median :493.4  
##  Mean   :258.4   Mean   :1    Mean   :1096   Mean   :3795   Mean   :492.0  
##  3rd Qu.:383.2   3rd Qu.:1    3rd Qu.:1235   3rd Qu.:4155   3rd Qu.:579.2  
##  Max.   :500.0   Max.   :1    Max.   :1673   Max.   :6134   Max.   :832.2  
##     ret_exp         acq_exp_sq       ret_exp_sq           freq       
##  Min.   : 229.9   Min.   : 22813   Min.   :  52840   Min.   : 1.000  
##  1st Qu.: 395.0   1st Qu.:165175   1st Qu.: 156047   1st Qu.: 6.000  
##  Median : 465.6   Median :243424   Median : 216769   Median : 9.000  
##  Mean   : 496.9   Mean   :259776   Mean   : 272077   Mean   : 9.204  
##  3rd Qu.: 564.7   3rd Qu.:335478   3rd Qu.: 318857   3rd Qu.:12.000  
##  Max.   :1095.0   Max.   :692557   Max.   :1198937   Max.   :21.000  
##     freq_sq         crossbuy           sow            industry     
##  Min.   :  1.0   Min.   : 1.000   Min.   :  1.00   Min.   :0.0000  
##  1st Qu.: 36.0   1st Qu.: 5.000   1st Qu.: 42.00   1st Qu.:0.0000  
##  Median : 81.0   Median : 6.000   Median : 58.00   Median :1.0000  
##  Mean   :102.8   Mean   : 6.028   Mean   : 57.14   Mean   :0.6142  
##  3rd Qu.:144.0   3rd Qu.: 7.000   3rd Qu.: 71.00   3rd Qu.:1.0000  
##  Max.   :441.0   Max.   :11.000   Max.   :116.00   Max.   :1.0000  
##     revenue        employees      full_acq_preds
##  Min.   :15.86   Min.   :  24.0   0:  0         
##  1st Qu.:35.74   1st Qu.: 617.8   1:324         
##  Median :42.93   Median : 760.5                 
##  Mean   :42.12   Mean   : 762.4                 
##  3rd Qu.:48.09   3rd Qu.: 904.8                 
##  Max.   :65.10   Max.   :1461.0
df_acq_duration <- df_acqisition_only[ ,c(3,4,5,6,9,11,12,13, 14,15)]
#Leaving all the squared variables along with acquisition variables (both actual and predicted ones)
chart.Correlation(df_acq_duration, histogram = TRUE, pch=20)

set.seed(123)
glm_duration_vif1 <- glm(duration ~ profit + acq_exp + ret_exp + freq + 
                          crossbuy + sow + industry + revenue + employees, 
                          data = df_acqisition_only)
vif(glm_duration_vif1)
##    profit   acq_exp   ret_exp      freq  crossbuy       sow  industry   revenue 
## 36.103411  7.718499 28.165213  1.198511  1.125215  1.172689  1.159900  1.125594 
## employees 
##  1.262747
#Removing `profit` and running the glm and vif
set.seed(123)
glm_duration_vif2 <- glm(duration ~ acq_exp + ret_exp + freq + 
                          crossbuy + sow + industry + revenue + employees, 
                          data = df_acqisition_only)
vif(glm_duration_vif2)
##   acq_exp   ret_exp      freq  crossbuy       sow  industry   revenue employees 
##  1.020574  1.019536  1.030004  1.022120  1.017467  1.030442  1.034593  1.044688
#Using rfsrc function for better tuning
set.seed(123)
tuner_param <- expand.grid(mtry = c(2:8))
rf_duration_tuned <- rfsrc(duration ~ acq_exp + ret_exp + freq + crossbuy + 
                            sow + industry + revenue + employees, 
                            data = df_acqisition_only, 
                            tuneGrid = tuner_param, 
                           importance = TRUE, 
                           ntree = 1000)
rf_duration_preds <- predict(rf_duration_tuned, df_acqisition_only)$predicted

df_duration_both_actual_pred <- cbind(df_acqisition_only, rf_duration_preds)
mean_actual_duration <- mean(df_duration_both_actual_pred$duration)
mean_predicted_duration <- mean(df_duration_both_actual_pred$rf_duration_preds)
sprintf("Actual Mean for the duration = %.2f", mean_actual_duration)
## [1] "Actual Mean for the duration = 1096.27"
sprintf("Predicted Mean for the duration = %.2f", mean_predicted_duration)
## [1] "Predicted Mean for the duration = 1098.67"

Compute variable importance to detect interactions and optimize hyperparameters for acquired customers.

rf_tuned_var_imp <- rf_duration_tuned$importance
rf_tuned_var_imp
##      acq_exp      ret_exp         freq     crossbuy          sow     industry 
##   -23.666911 52847.995429  1638.262079    70.551577   -75.784909   -20.777611 
##      revenue    employees 
##    -7.804186   -85.055949
#Code from the instructor demonstration
data.frame(importance = rf_duration_tuned$importance) %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,importance), y = importance)) +
    geom_bar(stat = "identity", fill = "lightgreen", color = "black")+
    coord_flip() +
     labs(x = "Variables", y = "Variable importance")+
     theme_plot

#Negative values for some of the variable importance. So add a constant value
#and use log on it to plot
rf_tuned_var_imp_log <- log(rf_tuned_var_imp + 150)
rf_tuned_var_imp_log
##   acq_exp   ret_exp      freq  crossbuy       sow  industry   revenue employees 
##  4.838922 10.878009  7.489000  5.396132  4.306968  4.861535  4.957205  4.173526
data.frame(importance = rf_tuned_var_imp_log) %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,importance), y = importance)) +
    geom_bar(stat = "identity", fill = "lightgreen", color = "black")+
    coord_flip() +
     labs(x = "Variables", y = "Variable importance")+
     theme_plot

#Using instructor code
mindepth_duration <- max.subtree(rf_duration_tuned, sub.order = TRUE)
print(round(mindepth_duration$order, 3)[,1])
##   acq_exp   ret_exp      freq  crossbuy       sow  industry   revenue employees 
##     3.091     1.044     1.826     2.917     3.222     7.321     3.055     2.730
#Using instructor code
data.frame(md = round(mindepth_duration$order, 3)[,1]) %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,desc(md)), y = md)) +
    geom_bar(stat = "identity", fill = "lightgreen", color = "black", width = 0.2)+
    coord_flip() +
     labs(x = "Variables", y = "Minimal Depth")+
     theme_plot

#Using instructor code
mindepth_duration$sub.order
##             acq_exp    ret_exp      freq  crossbuy       sow  industry
## acq_exp   0.2971449 0.38088681 0.5344467 0.6741266 0.6178362 0.9024478
## ret_exp   0.3281538 0.09658864 0.2443512 0.3866639 0.3396511 0.7810064
## freq      0.4223365 0.22577801 0.1750661 0.4802671 0.4089412 0.7986187
## crossbuy  0.5711820 0.37603499 0.5099518 0.2810813 0.5806476 0.8706712
## sow       0.6149306 0.39912216 0.5826539 0.6964783 0.3091217 0.9131550
## industry  0.9011068 0.82404741 0.8900916 0.9320901 0.8989268 0.6901707
## revenue   0.5918537 0.37207552 0.5416602 0.6689008 0.6042329 0.8772073
## employees 0.5472840 0.33537281 0.4976333 0.6444821 0.5713956 0.8775078
##             revenue employees
## acq_exp   0.5982140 0.5659886
## ret_exp   0.3366113 0.3066234
## freq      0.4138096 0.4004520
## crossbuy  0.5798340 0.5552591
## sow       0.6230647 0.6236059
## industry  0.8991668 0.8906798
## revenue   0.2915024 0.5695797
## employees 0.5602777 0.2602101
as.matrix(mindepth_duration$sub.order) %>%
  reshape2::melt() %>%
  data.frame() %>%
  ggplot(aes(x = Var1, y = Var2, fill = value)) +
    scale_x_discrete(position = "top") +
    geom_tile(color = "white") +
    viridis::scale_fill_viridis("Relative min. depth") +
    labs(x = "Model Variables", y = "Model Variables") +
    theme_bw()

The industry variable has the greatest relative minimum depth value and ret_exp variable has the least relative minimum depth across all variables.

find.interaction(rf_duration_tuned, method = "vimp", importance = "permute")
## Pairing ret_exp with freq 
## Pairing ret_exp with crossbuy 
## Pairing ret_exp with revenue 
## Pairing ret_exp with industry 
## Pairing ret_exp with acq_exp 
## Pairing ret_exp with sow 
## Pairing ret_exp with employees 
## Pairing freq with crossbuy 
## Pairing freq with revenue 
## Pairing freq with industry 
## Pairing freq with acq_exp 
## Pairing freq with sow 
## Pairing freq with employees 
## Pairing crossbuy with revenue 
## Pairing crossbuy with industry 
## Pairing crossbuy with acq_exp 
## Pairing crossbuy with sow 
## Pairing crossbuy with employees 
## Pairing revenue with industry 
## Pairing revenue with acq_exp 
## Pairing revenue with sow 
## Pairing revenue with employees 
## Pairing industry with acq_exp 
## Pairing industry with sow 
## Pairing industry with employees 
## Pairing acq_exp with sow 
## Pairing acq_exp with employees 
## Pairing sow with employees 
## 
##                               Method: vimp
##                     No. of variables: 8
##            Variables sorted by VIMP?: TRUE
##    No. of variables used for pairing: 8
##     Total no. of paired interactions: 28
##             Monte Carlo replications: 1
##     Type of noising up used for VIMP: permute
## 
##                         Var 1     Var 2     Paired   Additive Difference
## ret_exp:freq       54106.8341 1474.7166 54679.3332 55581.5507  -902.2175
## ret_exp:crossbuy   54106.8341   55.9429 53871.8206 54162.7770  -290.9564
## ret_exp:revenue    54106.8341  -52.6655 53610.0227 54054.1687  -444.1460
## ret_exp:industry   54106.8341  -22.1130 53633.2526 54084.7212  -451.4686
## ret_exp:acq_exp    54106.8341  -58.6368 53557.3883 54048.1973  -490.8091
## ret_exp:sow        54106.8341  -81.0409 53341.4290 54025.7932  -684.3643
## ret_exp:employees  54106.8341  -30.8446 53460.9463 54075.9895  -615.0433
## freq:crossbuy       1570.9523   55.9429  1668.0246  1626.8952    41.1294
## freq:revenue        1570.9523  -52.6655  1613.8531  1518.2868    95.5662
## freq:industry       1570.9523  -22.1130  1545.9013  1548.8393    -2.9380
## freq:acq_exp        1570.9523  -58.6368  1492.5064  1512.3155   -19.8091
## freq:sow            1570.9523  -81.0409  1511.7167  1489.9114    21.8053
## freq:employees      1570.9523  -30.8446  1623.7912  1540.1077    83.6835
## crossbuy:revenue     125.3203  -52.6655    49.3926    72.6549   -23.2622
## crossbuy:industry    125.3203  -22.1130    -3.1237   103.2074  -106.3310
## crossbuy:acq_exp     125.3203  -58.6368  -170.3608    66.6835  -237.0444
## crossbuy:sow         125.3203  -81.0409    52.3014    44.2794     8.0220
## crossbuy:employees   125.3203  -30.8446   -54.4062    94.4757  -148.8819
## revenue:industry     -35.7425  -22.1130   -28.6166   -57.8555    29.2389
## revenue:acq_exp      -35.7425  -58.6368  -119.2871   -94.3793   -24.9078
## revenue:sow          -35.7425  -81.0409   -34.7354  -116.7834    82.0480
## revenue:employees    -35.7425  -30.8446   -62.6454   -66.5871     3.9417
## industry:acq_exp      -0.9345  -58.6368  -152.7659   -59.5713   -93.1946
## industry:sow          -0.9345  -81.0409   -89.6305   -81.9754    -7.6551
## industry:employees    -0.9345  -30.8446   -84.2773   -31.7791   -52.4982
## acq_exp:sow          -90.4718  -81.0409  -118.7171  -171.5127    52.7956
## acq_exp:employees    -90.4718  -30.8446   -57.6534  -121.3164    63.6629
## sow:employees       -114.6518  -30.8446  -146.9512  -145.4964    -1.4548

Tuning the hyper parameters for the Random Forest model.

set.seed(123)

# Hyper-parameters
hp_mtry <- seq(2,8,1)
hp_nodesize <- seq(2,10,2)
hp_ntree <- seq(1000,5000,500)

#Data frame for the tuning hyper parameters
df_hps <- expand.grid(mtry = hp_mtry, nodesize = hp_nodesize, ntree = hp_ntree)

#Vector to hold the OOB error
vc_oob_error_values <- c()

#Loop over the data frame
for (i in 1:nrow(df_hps)) {

   #Random Forest
   rf_model <- rfsrc(duration ~ acq_exp +
                    ret_exp +
                    freq +
                    crossbuy +
                    sow +
                    industry +
                    revenue +
                    employees,
                    data = df_acqisition_only,
                    mtry = df_hps$mtry[i],
                    nodesize = df_hps$nodesize[i],
                    ntree = df_hps$ntree[i])  
  
                          
    #Persist the OOB error values                      
    vc_oob_error_values[i] <- rf_model$err.rate[length(rf_model$err.rate)]
}

#Optimal set of hyper parameters based on OOB error
min_optimal_error_index <- which.min(vc_oob_error_values)
print(df_hps[min_optimal_error_index,])
##     mtry nodesize ntree
## 145    6        2  3000
set.seed(123)

rf_tuned_hp_duration <- rfsrc(duration ~ acq_exp + 
                                          ret_exp + 
                                          freq + 
                                          crossbuy + 
                                          sow + 
                                          industry + 
                                          revenue + 
                                          employees+ret_exp * freq, 
                                data = df_duration_both_actual_pred, 
                                mtry = 6, 
                                nodesize = 2, 
                                ntree = 3000, 
                              importance = TRUE)
#Predict duration with Tuned RF model
rf_tuned_duration_preds <- predict(rf_tuned_hp_duration, df_duration_both_actual_pred)$predicted

#New Data frame with tuned duration predictions
df_duration_tuned_rf <- cbind(df_duration_both_actual_pred, rf_tuned_duration_preds)

#Comparing the statistical parameters and error terms between untuned and tuned 
#Random Forest Models

#Mean
mean_actual_duration <- mean(df_duration_tuned_rf$duration)
mean_predicted_duration <- mean(df_duration_tuned_rf$rf_duration_preds)
mean_tuned_predicted_duration <- mean(df_duration_tuned_rf$rf_tuned_duration_preds)

#MSE
mse_predicted_duration <- mean((df_duration_tuned_rf$rf_duration_preds - df_duration_tuned_rf$duration)^2)
mse_tuned_predicted_duration <- mean((df_duration_tuned_rf$rf_tuned_duration_preds - df_duration_tuned_rf$duration)^2)

column_names <- c("Model", "Mean", "MSE")

actual_duration_values <- c("No Model (actual duration)", 
                            mean_actual_duration, 
                            "NA")
predicted_duration_values <- c("Random Forest Model (before tuning)", 
                                mean_predicted_duration,  
                                mse_predicted_duration)
predicted_tuned_duration_values <- c("Random Forest Model (Tuned including Interaction)", 
                                    mean_tuned_predicted_duration,  
                                    mse_tuned_predicted_duration)
df_duration_comparison <- rbind("1" = actual_duration_values, 
                                "2" = predicted_duration_values, 
                                "3" = predicted_tuned_duration_values)
colnames(df_duration_comparison) <- column_names
df_duration_comparison
##   Model                                               Mean              
## 1 "No Model (actual duration)"                        "1096.26543209877"
## 2 "Random Forest Model (before tuning)"               "1098.67392137468"
## 3 "Random Forest Model (Tuned including Interaction)" "1098.03766358025"
##   MSE               
## 1 "NA"              
## 2 "1449.86376826512"
## 3 "183.726066160776"

The MSE value of the tuned Random Forest model is significantly lower than the un-tuned model.

Compare the accuracy of model with a decision trees and logistic regression model for acquiring customers

set.seed(123)

# Hyper-parameters
hp_mtry <- seq(2,8,1)
hp_nodesize <- seq(2,10,2)
hp_ntree <- seq(1000,5000,500)

#Data frame for the tuning hyper parameters
df_hps <- expand.grid(mtry = hp_mtry, nodesize = hp_nodesize, ntree = hp_ntree)

#Vector to hold the OOB error
vc_oob_error_values <- c()

#Loop over the data frame
for (i in 1:nrow(df_hps)) {

    #Random Forest
    rf_model <- rfsrc(acquisition ~ acq_exp +
                          industry +
                          revenue +
                          employees,
                          data = df_train_acq,
                          mtry = df_hps$mtry[i],
                          nodesize = df_hps$nodesize[i],
                          ntree = df_hps$ntree[i])  
                          
    #Persist the OOB error values                      
    vc_oob_error_values[i] <- rf_model$err.rate[length(rf_model$err.rate)]
}

#Optimal set of hyper parameters based on OOB error
min_optimal_error_index <- which.min(vc_oob_error_values)
print(df_hps[min_optimal_error_index,])
##    mtry nodesize ntree
## 36    2        2  1500
set.seed(123)

rf_acquisition_tuned <- rfsrc(acquisition ~ acq_exp + 
                                industry + 
                                revenue + 
                                employees, 
                              data = df_train_acq, 
                              mtry = 2, 
                              nodesize = 2, 
                              ntree = 1500, 
                              importance = TRUE)

#Predict acquisition with Tuned RF model
rf_tuned_acquisition_preds <- predict(rf_acquisition_tuned, df_test_acq)$class

#Confusion Matrix for the untuned RF model
confusion_untuned_acq <- table(rf_acquisition_preds, df_test_acq$acquisition)

#Confusion Matrix for the tuned RF model
confusion_tuned_acq <- table(rf_tuned_acquisition_preds, df_test_acq$acquisition)
set.seed(123)

#Decision Tree
dt_acquisition <- train(acquisition ~ acq_exp + 
                          industry + 
                          revenue + 
                          employees, 
                          data = df_train_acq, 
                          method='rpart', 
                          trControl = train_control)
dt_acquisition_preds <- predict(dt_acquisition, df_test_acq)
confusion_dt_acquisition <- table(dt_acquisition_preds, df_test_acq$acquisition)

rattle::fancyRpartPlot(dt_acquisition$finalModel, sub = "Acquistion Decision Tree")

set.seed(123)
#LOGIT model
glm_acquisition <- train(acquisition ~ acq_exp + 
                           industry + 
                           revenue + 
                           employees, 
                          data = df_train_acq, 
                          method='glm', 
                          family='binomial', 
                          trControl = train_control)
glm_acquisition_preds <- predict(glm_acquisition, df_test_acq)
confusion_glm_acquisition <- table(glm_acquisition_preds, df_test_acq$acquisition)

summary(glm_acquisition)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1761  -0.6426   0.3106   0.6776   2.3133  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.5983899  0.9738471  -6.776 1.24e-11 ***
## acq_exp     -0.0003749  0.0008213  -0.456    0.648    
## industry1    1.6808428  0.3070056   5.475 4.38e-08 ***
## revenue      0.0702781  0.0159174   4.415 1.01e-05 ***
## employees    0.0063407  0.0008219   7.715 1.21e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 442.56  on 350  degrees of freedom
## Residual deviance: 300.29  on 346  degrees of freedom
## AIC: 310.29
## 
## Number of Fisher Scoring iterations: 5
#Confustion matricies of all models
rf_acquisition_tuned_accuracy <- sum(diag(confusion_tuned_acq))/sum(confusion_tuned_acq)
rf_acquisition_accuracy <- sum(diag(confusion_untuned_acq))/sum(confusion_untuned_acq)
dt_acquisition_accuracy <- sum(diag(confusion_dt_acquisition))/sum(confusion_dt_acquisition)
glm_acquisition_accuracy <- sum(diag(confusion_glm_acquisition))/sum(confusion_glm_acquisition)

#Data frame holding accuracies from all models
acq_models <- c("Random Forest (Hyper Parameters)", 
                "Random Forest (Repeated CV with tuned parameter)", 
                "Decision Tree",
                "Logistic Regression")
acq_accuracy <- c(rf_acquisition_tuned_accuracy, 
                  rf_acquisition_accuracy, 
                  dt_acquisition_accuracy,
                  glm_acquisition_accuracy)

df_acq_model_accuracy <- cbind(Model = acq_models, 
                               Accuracy = acq_accuracy)
df_acq_model_accuracy
##      Model                                              Accuracy           
## [1,] "Random Forest (Hyper Parameters)"                 "0.791946308724832"
## [2,] "Random Forest (Repeated CV with tuned parameter)" "0.805369127516778"
## [3,] "Decision Tree"                                    "0.74496644295302" 
## [4,] "Logistic Regression"                              "0.798657718120805"

PDP Plots

plot.variable(rf_acquisition_tuned, partial=TRUE)

plot.variable(rf_duration_tuned, partial = TRUE)