library(dplyr)
# Read raw transactions and assign column names
data <- read.table("Customer_Transaction.txt", stringsAsFactors = FALSE)
names(data) <- c("Customer.ID","Transaction.Date","Volume","Value")
# Transaction.Date is stored as YYYYMMDD; collapse it to the first day of its month
data$Month <- as.Date(paste(substr(data$Transaction.Date,1,4),substr(data$Transaction.Date,5,6),"01",sep = "-"))
# Aggregate to one row per customer per month
data <- data %>%
  group_by(Customer.ID, Month) %>%
  summarise_at(c("Volume","Value"), funs(sum)) %>%
  ungroup()
The data-mining approach is inductive by nature: rather than starting from a pre-assumed hypothesis, it induces conclusions from the observed data. The quality of the model fit on an independent hold-out sample is the most important criterion for model evaluation.
When we apply the data-mining approach to customer value, we use past value (X) to predict future value (Y). We set time cut points to divide the data into a base period, a proxy future period (Y in the training set), and a real future period (Y in the test set).
1). Determine Base, Proxy and Future Time Windows.
# Base period: the observed past (X); proxy period: the "future" used for training (Y);
# real future period: held out for testing. current.state marks the end of the base period.
base.time <- c(as.Date("1997-07-01"), as.Date("1997-12-01"))
proxy.time <- c(as.Date("1998-01-01"), as.Date("1998-06-01"))
future.time <- c(as.Date("1998-07-01"), as.Date("1998-12-01"))
current.state <- as.Date("1997-12-01")
# Label each monthly record with the period it falls into
time.cut <- function(x) {
  if (x >= base.time[1] & x <= base.time[2]) {
    return("base.time")
  } else if (x >= proxy.time[1] & x <= proxy.time[2]) {
    return("proxy.time")
  } else if (x >= future.time[1] & x <= future.time[2]) {
    return("future.time")
  } else {
    return("others")
  }
}
time.cut <- Vectorize(time.cut)
data$time.cut <- time.cut(data$Month)
data.model <- data[data$time.cut != "others", ]
data.model <- data[data$time.cut != "others",]
2). Feature Extraction: Recency and Frequency.
# Frequency = number of active months in the base period;
# Recency = months elapsed between the last active month and the end of the base period
r.f <- data.model %>%
  filter(time.cut == "base.time") %>%
  group_by(Customer.ID) %>%
  summarise_at(c("Volume","Month"), funs(length, max)) %>%
  mutate(Month_max = round(as.integer(current.state - Month_max)/31, 0)) %>%
  select(Customer.ID, Volume_length, Month_max)
names(r.f)[2:3] <- c("Frequency","Recency")
3). Feature Extraction: Monetary Value
# Total Volume and Value per customer within each period
data.model <- data.model %>%
  group_by(Customer.ID, time.cut) %>%
  summarise_at(c("Volume","Value"), funs(sum)) %>%
  ungroup()
# Reshape so that base-period and proxy-period totals sit in separate columns
volume <- data.model %>%
  select(-Value) %>%
  tidyr::spread(time.cut, Volume)
names(volume)[2:3] <- c("Volume.base","Volume.proxy")
value <- data.model %>%
  select(-Volume) %>%
  tidyr::spread(time.cut, Value)
names(value)[2:3] <- c("Value.base","Value.proxy")
all <- volume %>%
  full_join(value, by = "Customer.ID")
# Checkpoint: a customer missing Volume for a period should also be missing Value for that period
length(all[is.na(all$Volume.base),]$Customer.ID) == length(all[is.na(all$Value.base),]$Customer.ID)
## [1] TRUE
length(all[is.na(all$Volume.proxy),]$Customer.ID) == length(all[is.na(all$Value.proxy),]$Customer.ID)
## [1] TRUE
all <- all %>%
  # Get rid of customers newly acquired in the proxy period (no base-period history)
  filter(!is.na(Volume.base))
# Remaining NAs mean no activity in the proxy period, i.e. the customer did not buy: set them to 0
all[is.na(all)] <- 0
all <- all %>%
  full_join(r.f, by = "Customer.ID")
# Create the response variable for Model I (did the customer buy in the proxy period?)
# and the average monthly spend in the base period, used as a predictor of proxy-period spend
all$Buy <- ifelse(all$Volume.proxy > 0, 1, 0)
all$avg.spend <- all$Value.base/all$Frequency
First, we fit a logistic regression to model the response (to buy or not to buy).
logistic <- glm(Buy~Recency+Frequency,data = all,family = binomial(link = "logit"))
summary(logistic)
##
## Call:
## glm(formula = Buy ~ Recency + Frequency, family = binomial(link = "logit"),
## data = all)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4997 -1.0551 0.5577 1.0576 1.3918
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.81218 0.07803 -10.409 < 2e-16 ***
## Recency -0.06552 0.01669 -3.927 8.61e-05 ***
## Frequency 0.64859 0.03337 19.434 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8861.6 on 6420 degrees of freedom
## Residual deviance: 8154.8 on 6418 degrees of freedom
## AIC: 8160.8
##
## Number of Fisher Scoring iterations: 4
The fit is poor, judging from the modest drop from the null deviance to the residual deviance.
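To make that comparison concrete (a quick check reusing the `logistic` object fitted above), one can compute McFadden's pseudo R-squared and a likelihood-ratio test of the Recency and Frequency terms:
# McFadden's pseudo R-squared: 1 - residual deviance / null deviance
1 - logistic$deviance/logistic$null.deviance
# Likelihood-ratio (chi-squared) test of each added term against the null model
anova(logistic, test = "Chisq")
With the deviances reported above, the pseudo R-squared is only about 0.08: both predictors are highly significant, yet they explain little of the variation in purchase incidence.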
Then we use this model to predict the response probability.
all$buy.proba <- predict(logistic,type = "response")
Next, we fit a multiplicative (log-log) regression on spend.
# Model spend on the log scale; the +1 avoids taking log(0)
all$log.avg.spend <- log(all$avg.spend + 1)
all$log.value.proxy <- log(all$Value.proxy + 1)
multicative <- lm(log.value.proxy ~ log.avg.spend, data = all)
summary(multicative)
##
## Call:
## lm(formula = log.value.proxy ~ log.avg.spend, data = all)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5197 -2.0546 0.5446 1.9106 4.4465
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.45086 0.13055 -3.454 0.000557 ***
## log.avg.spend 0.75084 0.03583 20.953 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.114 on 6419 degrees of freedom
## Multiple R-squared: 0.06402, Adjusted R-squared: 0.06387
## F-statistic: 439 on 1 and 6419 DF, p-value: < 2.2e-16
Then we use this model to predict spend. Remember to incorporate the log-normal mean adjustment when back-transforming predictions from a multiplicative model: if log(Y) is normal with mean mu and variance sigma^2, then E[Y] = exp(mu + sigma^2/2), so we add half the residual variance before exponentiating.
all$spend.est <- predict(multicative)               # prediction on the log scale
mse <- mean(multicative$residuals^2)                # estimate of the residual variance sigma^2
all$spend.est <- exp(all$spend.est + mse/2)         # back-transform with the mean adjustment
all$spend.est.final <- all$spend.est*all$buy.proba  # expected value = P(buy) x expected spend
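As a quick, purely illustrative sanity check of that adjustment (the mu and sigma values below are arbitrary), the analytical mean exp(mu + sigma^2/2) of a log-normal variable should match its empirical mean:
set.seed(1)
mu <- 2; sigma <- 1.5
y <- exp(rnorm(1e6, mu, sigma))   # simulate a log-normal sample
mean(y)                           # empirical mean
exp(mu + sigma^2/2)               # analytical mean, should be close to the line above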
To further validate the statistical fit of the models, one can use a hold-out sample (test data) or cross-validation (when the data set is small).
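A minimal sketch of such a hold-out validation on the modeling table built above (the 70/30 split, seed, and the accuracy/RMSE metrics are illustrative choices, not part of the original analysis):
set.seed(123)
train.idx <- sample(seq_len(nrow(all)), size = floor(0.7*nrow(all)))
train <- all[train.idx,]
test  <- all[-train.idx,]
# Refit both models on the training split only
logit.tr <- glm(Buy ~ Recency + Frequency, data = train, family = binomial(link = "logit"))
lm.tr    <- lm(log.value.proxy ~ log.avg.spend, data = train)
# Out-of-sample performance: classification accuracy at a 0.5 cutoff, and RMSE on the log scale
mean((predict(logit.tr, newdata = test, type = "response") > 0.5) == test$Buy)
sqrt(mean((predict(lm.tr, newdata = test) - test$log.value.proxy)^2))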
A gain table is very useful for deriving business insight. In the gain table we group customers into deciles by their expected value, then look at cost and revenue per decile to decide up to which customer decile we would like to invest.
# Decile cut points on expected value (type = 5 quantiles); widen the lowest bound slightly
cut <- quantile(all$spend.est.final, prob = seq(0, 1, length = 11), type = 5)
cut[1] <- cut[1] - 0.1
# Decile 1 holds the highest-value customers, decile 10 the lowest
all$decile <- cut(all$spend.est.final, cut, labels = 10:1)
gain.table <- all %>%
  group_by(decile) %>%
  summarise_at(c("Customer.ID","spend.est.final"), funs(length(unique(.)), sum)) %>%
  ungroup() %>%
  mutate(N = Customer.ID_length, Total = spend.est.final_sum,
         Mean = round(spend.est.final_sum/Customer.ID_length, 2)) %>%
  select(decile, N, Total, Mean) %>%
  arrange(as.integer(as.character(decile))) %>%
  mutate(cumn = cumsum(N), cumsum = cumsum(Total), cummean = cumsum/cumn)
print(gain.table)
## # A tibble: 10 x 7
## decile N Total Mean cumn cumsum cummean
## <fctr> <int> <dbl> <dbl> <int> <dbl> <dbl>
## 1 1 642 103911.18 161.86 642 103911.2 161.85542
## 2 2 642 58287.96 90.79 1284 162199.1 126.32332
## 3 3 642 45370.73 70.67 1926 207569.9 107.77252
## 4 4 642 37460.59 58.35 2568 245030.5 95.41685
## 5 5 642 31550.85 49.14 3210 276581.3 86.16240
## 6 6 643 26354.93 40.99 3853 302936.2 78.62347
## 7 7 641 21477.95 33.51 4494 324414.2 72.18830
## 8 8 641 17483.93 27.28 5135 341898.1 66.58191
## 9 9 634 13005.14 20.51 5769 354903.3 61.51903
## 10 10 652 10467.51 16.05 6421 365370.8 56.90247
In feature extraction - In our model only RFM features (Recency, Frequency and Monetary value) are used. You can be more creative given your business nature. For instance, will all marketing contact points (email, paid search, social, etc.) have the same impact on customer equity? More specifically, we could introduce the amount of customer digital interaction with the brand (Facebook likes, Instagram likes) into the model, provided the data can be mapped to customers at the user level.
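Purely as an illustration (the social.data table and Social.Interactions column below are hypothetical and assume such data has already been mapped to Customer.ID), an extra feature would enter the incidence model like this:
# Hypothetical: social.data has one row per Customer.ID with a Social.Interactions count
all <- all %>%
  left_join(social.data, by = "Customer.ID") %>%
  mutate(Social.Interactions = ifelse(is.na(Social.Interactions), 0, Social.Interactions))
logistic2 <- glm(Buy ~ Recency + Frequency + Social.Interactions,
                 data = all, family = binomial(link = "logit"))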
In model fit - In this example neither the logistic model nor the regression model gives a good fit. What can we do to improve it? Should we apply other transformations? Try other modeling options (random forests, etc.)?
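As one possible direction, here is a minimal sketch using the randomForest package; the ntree value and the feature set are illustrative choices rather than part of the original analysis:
library(randomForest)
set.seed(123)
# Classification forest for the purchase decision
rf.buy <- randomForest(as.factor(Buy) ~ Recency + Frequency + avg.spend,
                       data = all, ntree = 500)
# Regression forest for proxy-period spend
rf.spend <- randomForest(Value.proxy ~ Recency + Frequency + avg.spend,
                         data = all, ntree = 500)
rf.buy      # OOB error gives an internal estimate of classification performance
rf.spend    # % variance explained is reported for the regression forest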