# Load required packages
library(dplyr)

data <- read.table("Customer_Transaction.txt",stringsAsFactors = FALSE)
names(data) <- c("Customer.ID","Transaction.Date","Volume","Value")

# Roll daily transactions up to the customer-month level
data$Month <- as.Date(paste(substr(data$Transaction.Date,1,4),substr(data$Transaction.Date,5,6),"01",sep = "-"))
data <- data %>%
  group_by(Customer.ID,Month) %>%
  summarise_at(c("Volume","Value"),funs(sum)) %>%
  ungroup()

Data-Mining Approach Explained

The data-mining approach is inductive by nature - it does not start from a pre-assumed hypothesis but rather induces conclusions from the observed data. The quality of the model fit on an independent hold-out sample is the most important criterion for model evaluation.

When we adopt a data-mining approach to customer value, we use past value (X) to predict future value (Y). We set time cut-off points to divide the data into a base period, a proxy future period (Y in the training set), and a real future period (Y in the test set).

Feature Extraction

1). Determine Base Time and Proxy Time.

# Time windows: base (predictors), proxy future (training response),
# real future (hold-out response), plus the "current" reference month
base.time <- c(as.Date("1997-07-01"),as.Date("1997-12-01"))
proxy.time <- c(as.Date("1998-01-01"),as.Date("1998-06-01"))
future.time <- c(as.Date("1998-07-01"),as.Date("1998-12-01"))
current.state <- as.Date("1997-12-01")

# Label a month with the time window it falls into
time.cut <- function(x) {
  if(x <= base.time[2] & x >= base.time[1]){
    return("base.time")
  } else if(x <= proxy.time[2] & x >= proxy.time[1]) {
    return("proxy.time")
  } else if(x <= future.time[2] & x >= future.time[1]) {
    return("future.time")
  } else {
    return("others")
  }
}

time.cut <- Vectorize(time.cut)

data$time.cut <- time.cut(data$Month)

# Keep only the months that fall into one of the three modeling windows
data.model <- data[data$time.cut != "others",]

2). Feature Extraction: Recency and Frequency.

# Frequency = number of purchase months in the base period
# Recency   = approximate months since the last base-period purchase
r.f <- data.model %>%
  filter(time.cut == "base.time") %>%
  group_by(Customer.ID) %>%
  summarise_at(c("Volume","Month"),funs(length,max)) %>%
  mutate(Month_max = round(as.integer(current.state - Month_max)/31,0)) %>%
  select(Customer.ID,Volume_length,Month_max)

names(r.f)[2:3] <- c("Frequency","Recency")

3). Feature Extraction: Monetary Value

# Total volume and value per customer within each time window
data.model <- data.model %>%
  group_by(Customer.ID,time.cut) %>%
  summarise_at(c("Volume","Value"),funs(sum)) %>%
  ungroup()

# Reshape to one row per customer; rename by column name rather than by
# position so the base and proxy columns cannot be mixed up
volume <- data.model %>%
  select(-Value) %>%
  tidyr::spread(time.cut,Volume) %>%
  rename(Volume.base = base.time, Volume.proxy = proxy.time)

value <- data.model %>%
  select(-Volume) %>%
  tidyr::spread(time.cut,Value) %>%
  rename(Value.base = base.time, Value.proxy = proxy.time)

all <- volume %>%
  full_join(value,by="Customer.ID")

#Checkpoint: customers missing Volume should be exactly those missing Value in each period
length(all[is.na(all$Volume.base),]$Customer.ID) == length(all[is.na(all$Value.base),]$Customer.ID)
## [1] TRUE
length(all[is.na(all$Volume.proxy),]$Customer.ID) == length(all[is.na(all$Value.proxy),]$Customer.ID)
## [1] TRUE
all <- all %>%
  #Drop customers acquired in the proxy period only (no base-period history)
  filter(!is.na(Volume.base))

# Remaining NAs are existing customers who did not buy in the proxy period
all[is.na(all)] <- 0

all <- all %>%
  full_join(r.f, by = "Customer.ID")

# Response variable for the logistic (buy / no-buy) model
all$Buy <- ifelse(all$Volume.proxy>0,1,0)

# Average base-period spend per purchase month (predictor for the spend model)
all$avg.spend <- all$Value.base/all$Frequency

Modeling

First, we fit a logistic regression to model the response (to buy or not to buy).

logistic <- glm(Buy~Recency+Frequency,data = all,family = binomial(link = "logit"))

summary(logistic)
## 
## Call:
## glm(formula = Buy ~ Recency + Frequency, family = binomial(link = "logit"), 
##     data = all)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4997  -1.0551   0.5577   1.0576   1.3918  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.81218    0.07803 -10.409  < 2e-16 ***
## Recency     -0.06552    0.01669  -3.927 8.61e-05 ***
## Frequency    0.64859    0.03337  19.434  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8861.6  on 6420  degrees of freedom
## Residual deviance: 8154.8  on 6418  degrees of freedom
## AIC: 8160.8
## 
## Number of Fisher Scoring iterations: 4

The fit is poor: the residual deviance (8154.8) is only modestly lower than the null deviance (8861.6).
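One quick way to quantify this is McFadden's pseudo-R-squared, computed from the two deviances reported above:

# McFadden's pseudo-R-squared
# (about 1 - 8154.8/8861.6 = 0.08 here, confirming a weak fit)
1 - logistic$deviance/logistic$null.deviance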

Then we use this model to predict the response probability for each customer.

all$buy.proba <- predict(logistic,type = "response")

Next, we fit a multiplicative (log-log) regression on spend.

# Log-transform (add 1 to handle zeros) before fitting the multiplicative model
all$log.avg.spend <- log(all$avg.spend+1)
all$log.value.proxy <- log(all$Value.proxy+1)

multiplicative <- lm(log.value.proxy~log.avg.spend,data = all)
summary(multiplicative)
## 
## Call:
## lm(formula = log.value.proxy ~ log.avg.spend, data = all)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5197 -2.0546  0.5446  1.9106  4.4465 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.45086    0.13055  -3.454 0.000557 ***
## log.avg.spend  0.75084    0.03583  20.953  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.114 on 6419 degrees of freedom
## Multiple R-squared:  0.06402,    Adjusted R-squared:  0.06387 
## F-statistic:   439 on 1 and 6419 DF,  p-value: < 2.2e-16

Then we use this model to predict spend. Remember to apply the mean adjustment when back-transforming a multiplicative model: if the log-scale errors are roughly normal with variance sigma^2, the expected value on the original scale is exp(prediction + sigma^2/2), so we add half the residual variance before exponentiating.

all$spend.est <- predict(multiplicative)
# Lognormal mean adjustment: add half the residual variance before exponentiating
mse <- mean(multiplicative$residuals^2)
all$spend.est <- exp(all$spend.est+mse/2)
# Expected proxy-period spend = P(buy) x conditional spend estimate
all$spend.est.final <- all$spend.est*all$buy.proba

To further validate the statistical fit, you can use a hold-out sample (test data) or cross-validation (when the data set is small).
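
A minimal sketch of a hold-out check, assuming we simply split the customers in all at random (the 70/30 split and the seed are arbitrary choices, not part of the original analysis):

set.seed(42)
train.idx <- sample(seq_len(nrow(all)), size = floor(0.7*nrow(all)))
train <- all[train.idx,]
test <- all[-train.idx,]

# Refit both models on the training customers only
logit.tr <- glm(Buy~Recency+Frequency, data = train, family = binomial(link = "logit"))
spend.tr <- lm(log.value.proxy~log.avg.spend, data = train)

# Out-of-sample checks: classification accuracy and RMSE on the log scale
mean((predict(logit.tr, newdata = test, type = "response") > 0.5) == test$Buy)
sqrt(mean((predict(spend.tr, newdata = test) - test$log.value.proxy)^2))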

Deriving Business Insight - The Gain Table

The gain table is very useful for deriving business insight. We group customers into deciles by their expected value, then look at cost and revenue per decile to decide how deep into the customer base we would like to invest.

# Decile break points on expected spend; nudge the lowest break down slightly
# so the minimum value falls inside the first interval
breaks <- quantile(all$spend.est.final, prob = seq(0, 1, length = 11), type = 5)
breaks[1] <- breaks[1]-0.1

# Decile 1 = highest expected spend, decile 10 = lowest
all$decile <- cut(all$spend.est.final, breaks, labels = 10:1)

gain.table <- all %>%
  group_by(decile) %>%
  summarise_at(c("Customer.ID","spend.est.final"),funs(length(unique(.)),sum)) %>%
  ungroup() %>%
  mutate(N = Customer.ID_length, Total = spend.est.final_sum, Mean = round(spend.est.final_sum/Customer.ID_length,2)) %>%
  select(decile,N,Total,Mean) %>%
  arrange(as.integer(as.character(decile))) %>%
  mutate(cumn = cumsum(N), cumsum = cumsum(Total), cummean = cumsum/cumn)

print(gain.table)
## # A tibble: 10 x 7
##    decile     N     Total   Mean  cumn   cumsum   cummean
##    <fctr> <int>     <dbl>  <dbl> <int>    <dbl>     <dbl>
##  1      1   642 103911.18 161.86   642 103911.2 161.85542
##  2      2   642  58287.96  90.79  1284 162199.1 126.32332
##  3      3   642  45370.73  70.67  1926 207569.9 107.77252
##  4      4   642  37460.59  58.35  2568 245030.5  95.41685
##  5      5   642  31550.85  49.14  3210 276581.3  86.16240
##  6      6   643  26354.93  40.99  3853 302936.2  78.62347
##  7      7   641  21477.95  33.51  4494 324414.2  72.18830
##  8      8   641  17483.93  27.28  5135 341898.1  66.58191
##  9      9   634  13005.14  20.51  5769 354903.3  61.51903
## 10     10   652  10467.51  16.05  6421 365370.8  56.90247
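
Reading the table: the top decile alone (642 customers, about 10% of the base) already accounts for roughly 28% of total expected spend (103,911 out of 365,371), so investment should start there and work down the deciles as long as the incremental revenue covers the cost of contact.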

What Else Can Happen?

In feature extraction - in our model, only RFM features (Recency, Frequency and Monetary value) are used. You can be more creative given your business context. For instance, will all marketing contact points (email, paid search, social, etc.) have the same impact on customer equity? More concretely, we could introduce the number of a customer's digital interactions with the brand (Facebook likes, Instagram likes) into the model, provided that data can be mapped to the customer level.
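
As an illustration only, suppose we had a hypothetical table digital with one row per Customer.ID and an interactions column counting base-period likes and comments (both names are made up for this sketch); it could be folded into the response model like this:

# 'digital' and 'interactions' are hypothetical, for illustration only
all <- all %>%
  left_join(digital, by = "Customer.ID") %>%
  mutate(interactions = ifelse(is.na(interactions), 0, interactions))

logistic.ext <- glm(Buy~Recency+Frequency+interactions,
                    data = all, family = binomial(link = "logit"))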

In model fit - in this example, neither the logistic model nor the spend regression fits particularly well. What can we do to improve the fit? Should we apply different transformations? Try other modeling options (random forests, gradient boosting, ...)? One possibility is sketched below.
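
A minimal sketch of one such alternative, assuming the randomForest package is installed (not part of the original analysis):

library(randomForest)

# Random forest classifier on the same RFM predictors; Buy coded as a factor
rf.buy <- randomForest(factor(Buy)~Recency+Frequency, data = all, ntree = 500)
rf.buy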