Load acquisitionRetention data from SMCRM Package
library(SMCRM)
## Warning: package 'SMCRM' was built under R version 4.1.1
# Bring the packaged dataset into the workspace by name
data(list = "acquisitionRetention")
# Keep a working copy under the name used by the rest of the analysis
df.retention <- acquisitionRetention
Exploratory Data Analysis
# Structure of the data: number of observations, column names, column types,
# and the first few values of each column
str(df.retention)
## 'data.frame': 500 obs. of 15 variables:
## $ customer : num 1 2 3 4 5 6 7 8 9 10 ...
## $ acquisition: num 1 1 1 0 1 1 1 1 0 0 ...
## $ duration : num 1635 1039 1288 0 1631 ...
## $ profit : num 6134 3524 4081 -638 5446 ...
## $ acq_exp : num 694 460 249 638 589 ...
## $ ret_exp : num 972 450 805 0 920 ...
## $ acq_exp_sq : num 480998 211628 62016 407644 346897 ...
## $ ret_exp_sq : num 943929 202077 648089 0 846106 ...
## $ freq : num 6 11 21 0 2 7 15 13 0 0 ...
## $ freq_sq : num 36 121 441 0 4 49 225 169 0 0 ...
## $ crossbuy : num 5 6 6 0 9 4 5 5 0 0 ...
## $ sow : num 95 22 90 0 80 48 51 23 0 0 ...
## $ industry : num 1 0 0 0 0 1 0 1 0 1 ...
## $ revenue : num 47.2 45.1 29.1 40.6 48.7 ...
## $ employees : num 898 686 1423 181 631 ...
# Per-column summary statistics (min, quartiles, median, mean, max)
summary(df.retention)
## customer acquisition duration profit
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. :-1027.0
## 1st Qu.:125.8 1st Qu.:0.000 1st Qu.: 0.0 1st Qu.: -316.3
## Median :250.5 Median :1.000 Median : 957.5 Median : 3369.9
## Mean :250.5 Mean :0.676 Mean : 742.5 Mean : 2403.8
## 3rd Qu.:375.2 3rd Qu.:1.000 3rd Qu.:1146.2 3rd Qu.: 3931.6
## Max. :500.0 Max. :1.000 Max. :1673.0 Max. : 6134.3
## acq_exp ret_exp acq_exp_sq ret_exp_sq
## Min. : 1.21 Min. : 0.0 Min. : 1.5 Min. : 0
## 1st Qu.: 384.14 1st Qu.: 0.0 1st Qu.: 147562.0 1st Qu.: 0
## Median : 491.66 Median : 398.1 Median : 241729.7 Median : 158480
## Mean : 493.35 Mean : 336.3 Mean : 271211.1 Mean : 184000
## 3rd Qu.: 600.21 3rd Qu.: 514.3 3rd Qu.: 360246.0 3rd Qu.: 264466
## Max. :1027.04 Max. :1095.0 Max. :1054811.2 Max. :1198937
## freq freq_sq crossbuy sow
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 6.00 Median : 36.00 Median : 5.000 Median : 44.00
## Mean : 6.22 Mean : 69.25 Mean : 4.052 Mean : 38.88
## 3rd Qu.:11.00 3rd Qu.:121.00 3rd Qu.: 7.000 3rd Qu.: 66.00
## Max. :21.00 Max. :441.00 Max. :11.000 Max. :116.00
## industry revenue employees
## Min. :0.000 Min. :14.49 Min. : 18.0
## 1st Qu.:0.000 1st Qu.:33.53 1st Qu.: 503.0
## Median :1.000 Median :41.43 Median : 657.5
## Mean :0.522 Mean :40.54 Mean : 671.5
## 3rd Qu.:1.000 3rd Qu.:47.52 3rd Qu.: 826.0
## Max. :1.000 Max. :65.10 Max. :1461.0
#Correlation
#install.packages("PerformanceAnalytics")
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.1.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 4.1.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.1.1
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
# Correlation chart across every column of the data frame (histograms enabled,
# solid-dot plotting symbol); used to spot variables related to acquisition
chart.Correlation(df.retention, histogram = TRUE, pch = 20)
Correlated variables: duration, profit, ret_exp, acq_exp_sq, ret_exp_sq, freq, freq_sq, crossbuy, and sow. Below: boxplots of each correlated variable against the response variable (acquisition).
# One plot per figure
par(mfrow = c(1, 1))

# Columns to plot against acquisition, in display order.
# Names are the column names; values are the y-axis labels.
y_labels <- c(
  duration = "Duration",
  profit = "profit",
  ret_exp = "ret_exp",
  acq_exp_sq = "acq_exp_sq",
  ret_exp_sq = "ret_exp_sq",
  freq = "freq",
  freq_sq = "freq_sq",
  crossbuy = "crossbuy",
  sow = "sow"
)

# Draw `<column> ~ acquisition` for each column in turn
for (column in names(y_labels)) {
  boxplot(
    reformulate("acquisition", response = column),
    data = df.retention,
    ylab = y_labels[[column]],
    xlab = "acquisition"
  )
}
Most variables are either 0 or skewed toward negative values when acquisition is 0, except for acq_exp_sq. Since acq_exp_sq is the square of acq_exp, we need to use the acq_exp variable in the model, in addition to industry, revenue, and employees.
# Drop any rows that contain missing values
df.retention.fac <- na.omit(df.retention)
# Convert the *acquisition* and *industry* columns to factors in one pass
factor_cols <- c("acquisition", "industry")
df.retention.fac[factor_cols] <- lapply(
  df.retention.fac[factor_cols],
  as.factor
)
# Confirm the new column types
str(df.retention.fac)
## 'data.frame': 500 obs. of 15 variables:
## $ customer : num 1 2 3 4 5 6 7 8 9 10 ...
## $ acquisition: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
## $ duration : num 1635 1039 1288 0 1631 ...
## $ profit : num 6134 3524 4081 -638 5446 ...
## $ acq_exp : num 694 460 249 638 589 ...
## $ ret_exp : num 972 450 805 0 920 ...
## $ acq_exp_sq : num 480998 211628 62016 407644 346897 ...
## $ ret_exp_sq : num 943929 202077 648089 0 846106 ...
## $ freq : num 6 11 21 0 2 7 15 13 0 0 ...
## $ freq_sq : num 36 121 441 0 4 49 225 169 0 0 ...
## $ crossbuy : num 5 6 6 0 9 4 5 5 0 0 ...
## $ sow : num 95 22 90 0 80 48 51 23 0 0 ...
## $ industry : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 2 1 2 ...
## $ revenue : num 47.2 45.1 29.1 40.6 48.7 ...
## $ employees : num 898 686 1423 181 631 ...
# Summary of the cleaned data; factor columns are reported as level counts
summary(df.retention.fac)
## customer acquisition duration profit
## Min. : 1.0 0:162 Min. : 0.0 Min. :-1027.0
## 1st Qu.:125.8 1:338 1st Qu.: 0.0 1st Qu.: -316.3
## Median :250.5 Median : 957.5 Median : 3369.9
## Mean :250.5 Mean : 742.5 Mean : 2403.8
## 3rd Qu.:375.2 3rd Qu.:1146.2 3rd Qu.: 3931.6
## Max. :500.0 Max. :1673.0 Max. : 6134.3
## acq_exp ret_exp acq_exp_sq ret_exp_sq
## Min. : 1.21 Min. : 0.0 Min. : 1.5 Min. : 0
## 1st Qu.: 384.14 1st Qu.: 0.0 1st Qu.: 147562.0 1st Qu.: 0
## Median : 491.66 Median : 398.1 Median : 241729.7 Median : 158480
## Mean : 493.35 Mean : 336.3 Mean : 271211.1 Mean : 184000
## 3rd Qu.: 600.21 3rd Qu.: 514.3 3rd Qu.: 360246.0 3rd Qu.: 264466
## Max. :1027.04 Max. :1095.0 Max. :1054811.2 Max. :1198937
## freq freq_sq crossbuy sow industry
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00 0:239
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1:261
## Median : 6.00 Median : 36.00 Median : 5.000 Median : 44.00
## Mean : 6.22 Mean : 69.25 Mean : 4.052 Mean : 38.88
## 3rd Qu.:11.00 3rd Qu.:121.00 3rd Qu.: 7.000 3rd Qu.: 66.00
## Max. :21.00 Max. :441.00 Max. :11.000 Max. :116.00
## revenue employees
## Min. :14.49 Min. : 18.0
## 1st Qu.:33.53 1st Qu.: 503.0
## Median :41.43 Median : 657.5
## Mean :40.54 Mean : 671.5
## 3rd Qu.:47.52 3rd Qu.: 826.0
## Max. :65.10 Max. :1461.0