# Load packages
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(gmodels)  # provides CrossTable(), used below

# Load data
data <- read_delim("brand_acq_tnsactions.csv.gz", delim = "\t")
## Rows: 35551878 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): custKey, SKU, productDesc, orderType
## dbl (3): dtKey, tnsCount, gross
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Add more granular date fields
data <- data %>%
  mutate(
    date  = ymd(dtKey),
    year  = year(date),
    month = month(date),
    week  = week(date),
    ymo   = ymd(sprintf("%04d-%02d-01", year, month))
  )
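# As an aside, lubridate's floor_date() gives the same month floor as the
# sprintf() construction above; a quick check on a literal date:
floor_date(ymd("2021-07-19"), unit = "month")  # "2021-07-01"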
## Pseudo-churn modeling
# Create an indicator for when each customer entered the dataset and the
# order of their purchases; also aggregate gross to the customer-week level
resamp <- data %>%
  group_by(custKey, week) %>%
  summarize(
    gross   = sum(gross),
    num_sku = n()
  ) %>%
  arrange(custKey, week)
## `summarise()` has grouped output by 'custKey'. You can override using the `.groups` argument.
# If you want to drop refund weeks (non-positive gross):
resamp %>%
  mutate(
    refund = as.integer(gross <= 0)
  ) %>%
  filter(refund == 0)
## # A tibble: 1,812,028 × 5
## # Groups: custKey [139,298]
## custKey week gross num_sku refund
## <chr> <dbl> <dbl> <int> <int>
## 1 00001b95e102496f7f79232d8ae22ca3 3 98 18 0
## 2 0000c90cec4ed78061bdf2ed3f1850ba 40 104 20 0
## 3 0000ec4d855ae1c718ce1e645a2fc1b8 5 450. 26 0
## 4 0000ec4d855ae1c718ce1e645a2fc1b8 6 63 10 0
## 5 0000ec4d855ae1c718ce1e645a2fc1b8 7 274. 17 0
## 6 0000ec4d855ae1c718ce1e645a2fc1b8 8 119. 6 0
## 7 0000ec4d855ae1c718ce1e645a2fc1b8 9 51.6 1 0
## 8 0000ec4d855ae1c718ce1e645a2fc1b8 26 240. 32 0
## 9 0000ec4d855ae1c718ce1e645a2fc1b8 30 40.8 3 0
## 10 0000ec4d855ae1c718ce1e645a2fc1b8 31 31 1 0
## # … with 1,812,018 more rows
resamp <- resamp %>%
  group_by(custKey) %>%
  arrange(week) %>%
  mutate(
    purchase_seq   = row_number(),  # order of each customer's purchase weeks (frequency)
    customer_entry = min(week),     # when the customer entered the dataset
    lifetime_seq   = (week - customer_entry) + 1
  )
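# A minimal sketch (toy tibble, hypothetical customer "abc") of what these
# columns look like for one customer purchasing in weeks 3, 5, and 9:
tibble(custKey = "abc", week = c(3, 5, 9)) %>%
  group_by(custKey) %>%
  mutate(
    purchase_seq   = row_number(),                 # 1, 2, 3
    customer_entry = min(week),                    # 3, 3, 3
    lifetime_seq   = (week - customer_entry) + 1   # 1, 3, 7
  )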
today <- max(data$week)  # the last observed week stands in for "today"
lifetime <- resamp %>%
  distinct(custKey, week, gross) %>%
  group_by(custKey) %>%
  arrange(week) %>%
  mutate(
    lweek = lag(week),
    diff  = week - lweek  # interpurchase gap in weeks
  ) %>%
  summarise(
    # Summary lifetime info.
    firstdate     = min(week),
    lastdate      = max(week),
    lifetime      = as.integer(lastdate - firstdate) + 1,
    currentperiod = as.integer(today - lastdate),
    npurchases    = n(),
    # Interpurchase info.
    muperiod = mean(diff, na.rm = TRUE),
    muperiod = as.integer(ifelse(is.nan(muperiod), 0, muperiod)),
    sdperiod = sd(diff, na.rm = TRUE),
    sdperiod = ifelse(is.na(sdperiod), 116, sdperiod),  # impute a large SD when undefined (fewer than two gaps)
    # Monetary info.
    mupurchase = mean(gross, na.rm = TRUE),
    histvalue  = sum(gross, na.rm = TRUE)
  )
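# How the interpurchase gap works: lag() shifts week down one position, so
# the first purchase has an NA gap (toy vector, hypothetical weeks):
weeks <- c(2, 6, 7, 12)
weeks - lag(weeks)   # NA 4 1 5; mean with na.rm = TRUE is 3.33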
# Interpurchase heterogeneity
expperiod <- lifetime %>%
  filter(npurchases > 1) %>%
  group_by(npurchases) %>%
  summarize(
    mu  = mean(muperiod),
    sd  = sd(muperiod),
    lwr = mu - 1.96 * sd,
    lwr = ifelse(lwr < 0, 0, lwr),  # truncate the lower band at zero
    upr = mu + 1.96 * sd
  ) %>%
  arrange(npurchases)
# Code the churns using actual data: a customer is churned if their current
# silent period exceeds their mean interpurchase period plus 1.96 SDs
lifetime <- lifetime %>%
  left_join(expperiod) %>%
  mutate(
    churned = as.integer(currentperiod > (muperiod + 1.96 * sdperiod))
  )
## Joining, by = "npurchases"
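# Worked example of the churn rule for a hypothetical customer: with
# muperiod = 4 weeks and sdperiod = 2 weeks, the cutoff is
# 4 + 1.96 * 2 = 7.92 weeks, so a 10-week silence codes as churned:
as.integer(10 > (4 + 1.96 * 2))   # 1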
CrossTable(lifetime$churned)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 141154
##
##
## | 0 | 1 |
## |-----------|-----------|
## | 108346 | 32808 |
## | 0.768 | 0.232 |
## |-----------|-----------|
##
##
##
##
############################################################
# Examine some sample characteristics
############################################################
# Plot to see interpurchase heterogeneity (divide by 4 to convert weeks
# to approximate months)
expperiod %>%
  ggplot(aes(x = npurchases, y = mu / 4, ymin = lwr / 4, ymax = upr / 4)) +
  geom_errorbar(width = 0) +
  geom_line() +
  scale_y_continuous(breaks = pretty_breaks()) +
  labs(
    x = "Number of Weeks with Purchase",
    y = "Mean Interpurchase Period (Months)"
  )

# See number of customers by interpurchase period
lifetime %>%
  filter(npurchases > 1) %>%
  ggplot(aes(x = muperiod / 4)) +
  geom_histogram() +
  scale_y_continuous(labels = comma, breaks = pretty_breaks(10)) +
  labs(
    x = "Interpurchase Period (Months)",
    y = "Number of Customers"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Lifetime distributions
lifetime %>%
  filter(npurchases > 1) %>%
  ggplot(aes(x = lifetime / 4)) +
  geom_density(fill = "black") +
  scale_y_continuous(labels = percent) +
  labs(
    x = "Observed Lifetime (Months)",
    y = "Percent of Customers"
  )

############################################################
# Link the churns back to the week-level dataset
############################################################
resamp <- resamp %>%
  left_join(
    lifetime %>%
      select(custKey, week = lastdate, churned)
  ) %>%
  mutate(
    # only a customer's final observed week can be a churn week
    churned = ifelse(is.na(churned), 0, churned)
  )
## Joining, by = c("custKey", "week")
resamp
## # A tibble: 1,893,690 × 8
## # Groups: custKey [141,154]
## custKey week gross num_sku purchase_seq customer_entry lifetime_seq churned
## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>
## 1 0001fc3… 1 289. 39 1 1 1 0
## 2 000250c… 1 34 2 1 1 1 0
## 3 0003538… 1 438. 16 1 1 1 0
## 4 000447d… 1 30 2 1 1 1 0
## 5 00060cf… 1 299. 16 1 1 1 0
## 6 00063a9… 1 14 1 1 1 1 0
## 7 0008cd6… 1 79.0 6 1 1 1 0
## 8 000b2a7… 1 0 1 1 1 1 0
## 9 000fc7e… 1 41.0 2 1 1 1 0
## 10 00109b2… 1 38 1 1 1 1 0
## # … with 1,893,680 more rows
############################################################
# Run churn prediction
############################################################
# We still need RFM features:
#   F = purchase_seq (frequency)
#   M = gross (monetary)
#   R = recency, which we construct below
resamp <- resamp %>%
  group_by(custKey) %>%
  arrange(week) %>%
  mutate(
    # recency: weeks since the previous purchase week
    timelag = week - lag(week),
    timelag = ifelse(is.na(timelag), 0, timelag)
  )
# Set up our train-test split (consider set.seed() first for reproducibility)
train.rat <- 0.70
N <- nrow(resamp)
train <- sample(1:N, size = ceiling(train.rat * N))
test <- (1:N)[-train]
df.train <- resamp[train, ]
df.test <- resamp[test, ]
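# Note that sampling rows can place weeks from the same customer in both
# sets; a sketch of a leakage-free alternative (df.train2/df.test2 are
# hypothetical names) splits on customers instead:
cust_ids <- unique(resamp$custKey)
train_custs <- sample(cust_ids, size = ceiling(train.rat * length(cust_ids)))
df.train2 <- resamp %>% filter(custKey %in% train_custs)
df.test2  <- resamp %>% filter(!custKey %in% train_custs)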
# Our very basic model
glm.churn <- glm(churned ~ timelag + lifetime_seq + gross + week + num_sku,
                 data = df.train, family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm.churn)
##
## Call:
## glm(formula = churned ~ timelag + lifetime_seq + gross + week +
## num_sku, family = binomial(link = "logit"), data = df.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.3879 -0.2013 -0.1677 -0.1104 5.1366
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.087e+00 1.604e-02 -254.84 <2e-16 ***
## timelag 8.107e-02 1.129e-03 71.84 <2e-16 ***
## lifetime_seq -5.955e-02 5.922e-04 -100.55 <2e-16 ***
## gross 1.579e-03 4.616e-05 34.20 <2e-16 ***
## week 6.781e-02 6.385e-04 106.21 <2e-16 ***
## num_sku -8.991e-02 1.278e-03 -70.34 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 232104 on 1325582 degrees of freedom
## Residual deviance: 209778 on 1325577 degrees of freedom
## AIC: 209790
##
## Number of Fisher Scoring iterations: 8
# Make some churn predictions on our test set
df.test <- df.test %>%
  ungroup() %>%
  mutate(
    yhat = predict(glm.churn, df.test, type = "response")
  )
hist(df.test$yhat)

# Threshold: classify as churned above 0.05, a low cutoff given how rare
# churn weeks are in the week-level data
df.test <- df.test %>%
  mutate(
    churn_pred = as.integer(yhat > 0.05)
  )
# Check our confusion matrix
with(df.test, CrossTable(churned, churn_pred))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 568107
##
##
## | churn_pred
## churned | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 539522 | 18779 | 558301 |
## | 6.142 | 160.339 | |
## | 0.966 | 0.034 | 0.983 |
## | 0.986 | 0.896 | |
## | 0.950 | 0.033 | |
## -------------|-----------|-----------|-----------|
## 1 | 7627 | 2179 | 9806 |
## | 349.672 | 9128.860 | |
## | 0.778 | 0.222 | 0.017 |
## | 0.014 | 0.104 | |
## | 0.013 | 0.004 | |
## -------------|-----------|-----------|-----------|
## Column Total | 547149 | 20958 | 568107 |
## | 0.963 | 0.037 | |
## -------------|-----------|-----------|-----------|
##
##
cm <- with(df.test, table(churned, churn_pred))
acc <- (cm[1,1] + cm[2,2]) / sum(cm)   # accuracy
pre <- cm[2,2] / (cm[1,2] + cm[2,2])   # precision: TP / (TP + FP)
rec <- cm[2,2] / (cm[2,1] + cm[2,2])   # recall: TP / (TP + FN)
f1 <- 2 * (pre * rec) / (pre + rec)
# How should we interpret these?
acc
## [1] 0.9535193
f1
## [1] 0.1416591
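# A sketch for choosing the threshold by F1 rather than eyeballing it
# (f1_at and thresholds are hypothetical names; uses the identity
# F1 = 2TP / (2TP + FP + FN)):
f1_at <- function(th) {
  pred <- as.integer(df.test$yhat > th)
  tp <- sum(pred == 1 & df.test$churned == 1)
  fp <- sum(pred == 1 & df.test$churned == 0)
  fn <- sum(pred == 0 & df.test$churned == 1)
  2 * tp / (2 * tp + fp + fn)
}
thresholds <- seq(0.01, 0.50, by = 0.01)
plot(thresholds, sapply(thresholds, f1_at), type = "l")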
############################################################
# Estimate customer lifetime values
############################################################
# Make some churn predictions for the full dataset
resamp <- resamp %>%
  ungroup() %>%
  mutate(
    yhat = predict(glm.churn, resamp, type = "response")
  )
hist(resamp$yhat)

# Threshold
# Probability of being alive = projection factor
resamp <- resamp %>%
  mutate(
    churn_pred = as.integer(yhat > 0.05),
    alive_pr = 1 - yhat,
    alive_pr_rescale = rescale(alive_pr, c(0, 1))
  )
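# What rescale() does: map the observed range of the input onto [0, 1]
# (a toy illustration):
rescale(c(0.2, 0.5, 0.9), to = c(0, 1))   # 0.0000000 0.4285714 1.0000000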
hist(resamp$alive_pr)

# We only need each customer's final probabilities
final_probs <- resamp %>%
  group_by(custKey) %>%
  arrange(week) %>%
  filter(row_number() == n())   # keep only the last observed week per customer
hist(final_probs$alive_pr)

hist(final_probs$alive_pr_rescale)

final_probs
## # A tibble: 141,154 × 13
## # Groups: custKey [141,154]
## custKey week gross num_sku purchase_seq customer_entry lifetime_seq churned
## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>
## 1 0074875… 1 44 1 1 1 1 0
## 2 01aab72… 1 0 1 1 1 1 0
## 3 01fe689… 1 35 1 1 1 1 0
## 4 026a7c6… 1 0 1 1 1 1 0
## 5 032151c… 1 35 1 1 1 1 0
## 6 03a5d53… 1 112 1 1 1 1 0
## 7 0497bec… 1 103 21 1 1 1 0
## 8 0583df6… 1 48 1 1 1 1 0
## 9 05968d4… 1 0 1 1 1 1 0
## 10 060b2c5… 1 97 3 1 1 1 0
## # … with 141,144 more rows, and 5 more variables: timelag <dbl>, yhat <dbl>,
## # churn_pred <int>, alive_pr <dbl>, alive_pr_rescale <dbl>
# Map the final alive probabilities back onto the customer-level summary
lifetime <- lifetime %>%
  left_join(
    final_probs %>%
      select(-gross)   # drop week-level gross; lifetime keeps the monetary summaries
  )
## Joining, by = c("custKey", "churned")
# Project over estimated remaining lifetime per customer
clv <- lifetime %>%
  select(mupurchase, histvalue, lifetime, alive_pr_rescale, custKey) %>%
  ungroup() %>%
  mutate(
    # average observed lifetime across customers, rounded up
    est_lifetime = ceiling(mean(lifetime, na.rm = TRUE)),
    # heuristic remaining life: up to the average for short-lived customers,
    # their own lifetime plus 12 weeks for those already past the average
    remain_life = ifelse(
      lifetime <= est_lifetime,
      est_lifetime - lifetime,
      lifetime + 12
    )
  )
# Now, do the geometric summations!
clv
## # A tibble: 141,154 × 7
## mupurchase histvalue lifetime alive_pr_rescale custKey est_lifetime
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 98 98 1 0.996 00001b95e102496f… 29
## 2 104 104 1 0.956 0000c90cec4ed780… 29
## 3 159. 1269. 27 0.972 0000ec4d855ae1c7… 29
## 4 552. 8278. 41 0.969 00010ba75e2c420b… 29
## 5 114 114 1 0.938 000184addd963b59… 29
## 6 236. 4491. 36 0.976 0001a87bbecc918b… 29
## 7 272. 2451. 9 0.993 0001fc38ea3e9e70… 29
## 8 210. 2735. 46 0.978 000250c7e75b0fd0… 29
## 9 165. 1652. 30 0.959 00025358c4c30bb4… 29
## 10 122. 365 8 0.709 00027d94689e5e5e… 29
## # … with 141,144 more rows, and 1 more variable: remain_life <dbl>
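# What the next steps implement: for customer i, CLV is a geometric sum
# over the remaining life, discounting each period's margin for both the
# time value of money and churn risk:
#   CLV_i = sum_{t=1}^{T_i} m_i / ((1 + d)^t * (1 + c_i)^t)
# where m_i = mupurchase, T_i = remain_life, c_i = 1 - alive_pr_rescale,
# and d = pr_discount.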
pr_discount <- 0.05
# This function is not vectorized: it loops over each future time period
calc_future_clv <- function(money, remaining_life, pr_churn, pr_discount){
  # No money accumulated yet...
  base_clv <- 0
  # For each future period, add the marginal contribution to CLV,
  # discounting for both the time value of money and churn risk
  for (t in 1:remaining_life){
    discount_factor <- ((1 + pr_discount)^t * (1 + pr_churn)^t)
    period_value <- money / discount_factor
    base_clv <- base_clv + period_value
  }
  # Note: when remaining_life is 0, 1:remaining_life counts down (1, 0);
  # seq_len(remaining_life) would guard against that edge case
  base_clv
}
calc_future_clv(100, 12, 1 - .888, pr_discount)
## [1] 503.7199
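# Because the summand is geometric with ratio q = 1 / ((1 + d) * (1 + c)),
# the loop has a closed form: sum_{t=1}^{T} m * q^t = m * q * (1 - q^T) / (1 - q).
# A vectorized sketch (calc_future_clv_vec is a hypothetical name):
calc_future_clv_vec <- function(money, remaining_life, pr_churn, pr_discount){
  q <- 1 / ((1 + pr_discount) * (1 + pr_churn))
  money * q * (1 - q^remaining_life) / (1 - q)
}
calc_future_clv_vec(100, 12, 1 - .888, pr_discount)   # ~503.72, matches the loop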
# Iterate through the customer base (rbind-ing inside a loop is slow for
# ~141k customers, but it keeps the logic explicit)
clv_estimates <- data.frame()
for (i in 1:nrow(clv)){
  cust_i <- clv[i, ]
  m   <- cust_i$mupurchase
  rl  <- cust_i$remain_life
  prc <- 1 - cust_i$alive_pr_rescale
  clv_hat_i <- calc_future_clv(m, rl, prc, pr_discount)
  clv_estimates <- rbind(
    clv_estimates,
    data.frame(
      i = i,
      clv_hat = clv_hat_i
    )
  )
}
clv_estimates <- clv_estimates %>% as_tibble()
clv_estimates
## # A tibble: 141,154 × 2
## i clv_hat
## <int> <dbl>
## 1 1 1388.
## 2 2 995.
## 3 3 283.
## 4 4 6562.
## 5 5 942.
## 6 6 3033.
## 7 7 3186.
## 8 8 2819.
## 9 9 1725.
## 10 10 342.
## # … with 141,144 more rows
summary(clv_estimates)
## i clv_hat
## Min. : 1 Min. :-16560.3
## 1st Qu.: 35289 1st Qu.: 904.4
## Median : 70578 Median : 1814.9
## Mean : 70578 Mean : 2844.3
## 3rd Qu.:105866 3rd Qu.: 3539.1
## Max. :141154 Max. :153599.4
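# With the closed form above, the whole loop collapses into one mutate()
# (a sketch; clv_vec is a hypothetical name, and results match the loop for
# customers with remain_life >= 1 -- the closed form returns 0 at remain_life = 0):
clv_vec <- clv %>%
  mutate(
    clv_hat = calc_future_clv_vec(
      mupurchase, remain_life, 1 - alive_pr_rescale, pr_discount
    )
  )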
clv <- clv %>%
  mutate(clv_hat = clv_estimates$clv_hat)   # already a numeric vector; unlist() not needed
clv
## # A tibble: 141,154 × 8
## mupurchase histvalue lifetime alive_pr_rescale custKey est_lifetime
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 98 98 1 0.996 00001b95e102496f… 29
## 2 104 104 1 0.956 0000c90cec4ed780… 29
## 3 159. 1269. 27 0.972 0000ec4d855ae1c7… 29
## 4 552. 8278. 41 0.969 00010ba75e2c420b… 29
## 5 114 114 1 0.938 000184addd963b59… 29
## 6 236. 4491. 36 0.976 0001a87bbecc918b… 29
## 7 272. 2451. 9 0.993 0001fc38ea3e9e70… 29
## 8 210. 2735. 46 0.978 000250c7e75b0fd0… 29
## 9 165. 1652. 30 0.959 00025358c4c30bb4… 29
## 10 122. 365 8 0.709 00027d94689e5e5e… 29
## # … with 141,144 more rows, and 2 more variables: remain_life <dbl>,
## # clv_hat <dbl>
# Distribution of customer lifetime values
clv %>%
  ggplot(aes(x = clv_hat)) +
  geom_histogram(color = 'darkblue', fill = 'lightblue')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
