Load acquisitionRetention data from SMCRM Package

# Attach SMCRM, the package that ships the acquisitionRetention data set
library(SMCRM)
## Warning: package 'SMCRM' was built under R version 4.1.1
data("acquisitionRetention", package = "SMCRM")

# Keep the data under a project-specific name for the rest of the analysis
df.retention <- acquisitionRetention

Exploratory Data Analysis

# Structure of the data: dimensions, column names, column types, first values
str(df.retention)
## 'data.frame':    500 obs. of  15 variables:
##  $ customer   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ acquisition: num  1 1 1 0 1 1 1 1 0 0 ...
##  $ duration   : num  1635 1039 1288 0 1631 ...
##  $ profit     : num  6134 3524 4081 -638 5446 ...
##  $ acq_exp    : num  694 460 249 638 589 ...
##  $ ret_exp    : num  972 450 805 0 920 ...
##  $ acq_exp_sq : num  480998 211628 62016 407644 346897 ...
##  $ ret_exp_sq : num  943929 202077 648089 0 846106 ...
##  $ freq       : num  6 11 21 0 2 7 15 13 0 0 ...
##  $ freq_sq    : num  36 121 441 0 4 49 225 169 0 0 ...
##  $ crossbuy   : num  5 6 6 0 9 4 5 5 0 0 ...
##  $ sow        : num  95 22 90 0 80 48 51 23 0 0 ...
##  $ industry   : num  1 0 0 0 0 1 0 1 0 1 ...
##  $ revenue    : num  47.2 45.1 29.1 40.6 48.7 ...
##  $ employees  : num  898 686 1423 181 631 ...
# Per-column summary statistics (min, quartiles, median, mean, max)
summary(df.retention)
##     customer      acquisition       duration          profit       
##  Min.   :  1.0   Min.   :0.000   Min.   :   0.0   Min.   :-1027.0  
##  1st Qu.:125.8   1st Qu.:0.000   1st Qu.:   0.0   1st Qu.: -316.3  
##  Median :250.5   Median :1.000   Median : 957.5   Median : 3369.9  
##  Mean   :250.5   Mean   :0.676   Mean   : 742.5   Mean   : 2403.8  
##  3rd Qu.:375.2   3rd Qu.:1.000   3rd Qu.:1146.2   3rd Qu.: 3931.6  
##  Max.   :500.0   Max.   :1.000   Max.   :1673.0   Max.   : 6134.3  
##     acq_exp           ret_exp         acq_exp_sq          ret_exp_sq     
##  Min.   :   1.21   Min.   :   0.0   Min.   :      1.5   Min.   :      0  
##  1st Qu.: 384.14   1st Qu.:   0.0   1st Qu.: 147562.0   1st Qu.:      0  
##  Median : 491.66   Median : 398.1   Median : 241729.7   Median : 158480  
##  Mean   : 493.35   Mean   : 336.3   Mean   : 271211.1   Mean   : 184000  
##  3rd Qu.: 600.21   3rd Qu.: 514.3   3rd Qu.: 360246.0   3rd Qu.: 264466  
##  Max.   :1027.04   Max.   :1095.0   Max.   :1054811.2   Max.   :1198937  
##       freq          freq_sq          crossbuy           sow        
##  Min.   : 0.00   Min.   :  0.00   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.: 0.00   1st Qu.:  0.00   1st Qu.: 0.000   1st Qu.:  0.00  
##  Median : 6.00   Median : 36.00   Median : 5.000   Median : 44.00  
##  Mean   : 6.22   Mean   : 69.25   Mean   : 4.052   Mean   : 38.88  
##  3rd Qu.:11.00   3rd Qu.:121.00   3rd Qu.: 7.000   3rd Qu.: 66.00  
##  Max.   :21.00   Max.   :441.00   Max.   :11.000   Max.   :116.00  
##     industry        revenue        employees     
##  Min.   :0.000   Min.   :14.49   Min.   :  18.0  
##  1st Qu.:0.000   1st Qu.:33.53   1st Qu.: 503.0  
##  Median :1.000   Median :41.43   Median : 657.5  
##  Mean   :0.522   Mean   :40.54   Mean   : 671.5  
##  3rd Qu.:1.000   3rd Qu.:47.52   3rd Qu.: 826.0  
##  Max.   :1.000   Max.   :65.10   Max.   :1461.0
#Correlation 
#install.packages("PerformanceAnalytics")
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.1.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 4.1.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.1.1
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
# Correlation chart over every column of df.retention; histogram = TRUE and
# pch = 20 are passed straight to PerformanceAnalytics::chart.Correlation.
# NOTE(review): the customer column looks like a row identifier -- confirm
# whether it should be part of the correlation chart.
chart.Correlation(df.retention, histogram = TRUE, pch = 20)

Correlated variables:

* duration
* profit
* ret_exp
* acq_exp_sq
* ret_exp_sq
* freq
* freq_sq
* crossbuy
* sow

Below: boxplots of each correlated variable against the response variable (acquisition).

# One plot per figure
par(mfrow = c(1, 1))

# Draw one boxplot per correlated variable, split by the acquisition outcome.
# local() keeps the helper objects out of the global environment.
local({
  # Names are the columns to plot; values are the y-axis labels to show.
  y_labels <- c(
    duration = "Duration",
    profit = "profit",
    ret_exp = "ret_exp",
    acq_exp_sq = "acq_exp_sq",
    ret_exp_sq = "ret_exp_sq",
    freq = "freq",
    freq_sq = "freq_sq",
    crossbuy = "crossbuy",
    sow = "sow"
  )

  for (var_name in names(y_labels)) {
    # Builds the formula `<var_name> ~ acquisition`
    plot_formula <- reformulate("acquisition", response = var_name)
    boxplot(
      plot_formula,
      data = df.retention,
      ylab = y_labels[[var_name]],
      xlab = "acquisition"
    )
  }
})

Most of these variables are either zero or negative when acquisition is 0, so they are determined by the acquisition outcome and cannot be used to predict it. The exception is acq_exp_sq. Since acq_exp_sq is the square of acq_exp, we need to use the acq_exp variable in the model, in addition to industry, revenue, and employees.

# Drop every row that contains a missing value
df.retention.fac <- na.omit(df.retention)

# Convert the *acquisition* and *industry* columns to factors in one pass
df.retention.fac[c("acquisition", "industry")] <- lapply(
  df.retention.fac[c("acquisition", "industry")],
  as.factor
)

# Confirm the resulting column types
str(df.retention.fac)
## 'data.frame':    500 obs. of  15 variables:
##  $ customer   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ acquisition: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
##  $ duration   : num  1635 1039 1288 0 1631 ...
##  $ profit     : num  6134 3524 4081 -638 5446 ...
##  $ acq_exp    : num  694 460 249 638 589 ...
##  $ ret_exp    : num  972 450 805 0 920 ...
##  $ acq_exp_sq : num  480998 211628 62016 407644 346897 ...
##  $ ret_exp_sq : num  943929 202077 648089 0 846106 ...
##  $ freq       : num  6 11 21 0 2 7 15 13 0 0 ...
##  $ freq_sq    : num  36 121 441 0 4 49 225 169 0 0 ...
##  $ crossbuy   : num  5 6 6 0 9 4 5 5 0 0 ...
##  $ sow        : num  95 22 90 0 80 48 51 23 0 0 ...
##  $ industry   : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 2 1 2 ...
##  $ revenue    : num  47.2 45.1 29.1 40.6 48.7 ...
##  $ employees  : num  898 686 1423 181 631 ...
# Summary after the factor conversion: acquisition and industry now report
# per-level counts instead of numeric quantiles
summary(df.retention.fac)
##     customer     acquisition    duration          profit       
##  Min.   :  1.0   0:162       Min.   :   0.0   Min.   :-1027.0  
##  1st Qu.:125.8   1:338       1st Qu.:   0.0   1st Qu.: -316.3  
##  Median :250.5               Median : 957.5   Median : 3369.9  
##  Mean   :250.5               Mean   : 742.5   Mean   : 2403.8  
##  3rd Qu.:375.2               3rd Qu.:1146.2   3rd Qu.: 3931.6  
##  Max.   :500.0               Max.   :1673.0   Max.   : 6134.3  
##     acq_exp           ret_exp         acq_exp_sq          ret_exp_sq     
##  Min.   :   1.21   Min.   :   0.0   Min.   :      1.5   Min.   :      0  
##  1st Qu.: 384.14   1st Qu.:   0.0   1st Qu.: 147562.0   1st Qu.:      0  
##  Median : 491.66   Median : 398.1   Median : 241729.7   Median : 158480  
##  Mean   : 493.35   Mean   : 336.3   Mean   : 271211.1   Mean   : 184000  
##  3rd Qu.: 600.21   3rd Qu.: 514.3   3rd Qu.: 360246.0   3rd Qu.: 264466  
##  Max.   :1027.04   Max.   :1095.0   Max.   :1054811.2   Max.   :1198937  
##       freq          freq_sq          crossbuy           sow         industry
##  Min.   : 0.00   Min.   :  0.00   Min.   : 0.000   Min.   :  0.00   0:239   
##  1st Qu.: 0.00   1st Qu.:  0.00   1st Qu.: 0.000   1st Qu.:  0.00   1:261   
##  Median : 6.00   Median : 36.00   Median : 5.000   Median : 44.00           
##  Mean   : 6.22   Mean   : 69.25   Mean   : 4.052   Mean   : 38.88           
##  3rd Qu.:11.00   3rd Qu.:121.00   3rd Qu.: 7.000   3rd Qu.: 66.00           
##  Max.   :21.00   Max.   :441.00   Max.   :11.000   Max.   :116.00           
##     revenue        employees     
##  Min.   :14.49   Min.   :  18.0  
##  1st Qu.:33.53   1st Qu.: 503.0  
##  Median :41.43   Median : 657.5  
##  Mean   :40.54   Mean   : 671.5  
##  3rd Qu.:47.52   3rd Qu.: 826.0  
##  Max.   :65.10   Max.   :1461.0