# Load required packages
library(lubridate)
library(tidyverse)
library(scales)
library(gmodels) # CrossTable()
# Load the transaction data
data <- read_delim("brand_acq_tnsactions.csv.gz", delim = "\t")
## Rows: 35551878 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): custKey, SKU, productDesc, orderType
## dbl (3): dtKey, tnsCount, gross
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Add more specific date fields
data <- data %>%
  mutate(
    date=ymd(dtKey),
    year=year(date),
    month=month(date),
    week=week(date),
    ymo=ymd(sprintf("%04d-%02d-01", year, month))
  )
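# Caveat: week() returns the week of the year (1-53), so week numbers would
# collide if the data spanned multiple calendar years. In that case a
# continuous week index is safer; a minimal sketch (week_cont is a
# hypothetical column, not used below):
# data <- data %>%
#   mutate(week_cont = as.integer(difftime(date, min(date), units = "weeks")) + 1)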
## Pseudo-churn modeling
# Create an indicator for when each customer entered the dataset, and the order of each purchase

# Aggregate to the customer-week level: total gross and number of line items
resamp <- data %>%
  group_by(
    custKey, week
  ) %>%
  summarize(
    gross = sum(gross),
    num_sku = n()
  ) %>%
  arrange(custKey, week)
## `summarise()` has grouped output by 'custKey'. You can override using the `.groups` argument.
# To drop refund weeks (non-positive gross), flag and filter them
# (printed here for inspection; not assigned back to resamp)
resamp %>%
  mutate(
    refund = as.integer(gross <= 0)
  ) %>% 
  filter(refund == 0)
## # A tibble: 1,812,028 × 5
## # Groups:   custKey [139,298]
##    custKey                           week gross num_sku refund
##    <chr>                            <dbl> <dbl>   <int>  <int>
##  1 00001b95e102496f7f79232d8ae22ca3     3  98        18      0
##  2 0000c90cec4ed78061bdf2ed3f1850ba    40 104        20      0
##  3 0000ec4d855ae1c718ce1e645a2fc1b8     5 450.       26      0
##  4 0000ec4d855ae1c718ce1e645a2fc1b8     6  63        10      0
##  5 0000ec4d855ae1c718ce1e645a2fc1b8     7 274.       17      0
##  6 0000ec4d855ae1c718ce1e645a2fc1b8     8 119.        6      0
##  7 0000ec4d855ae1c718ce1e645a2fc1b8     9  51.6       1      0
##  8 0000ec4d855ae1c718ce1e645a2fc1b8    26 240.       32      0
##  9 0000ec4d855ae1c718ce1e645a2fc1b8    30  40.8       3      0
## 10 0000ec4d855ae1c718ce1e645a2fc1b8    31  31         1      0
## # … with 1,812,018 more rows
resamp <- resamp %>%
  group_by(custKey) %>%
  arrange(week) %>%
  mutate(
    purchase_seq = row_number(), # which number purchase this is for the customer (frequency)
    customer_entry = min(week),  # the week the customer entered the dataset
    lifetime_seq = (week - customer_entry) + 1 # weeks since entry, starting at 1
  )
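# To make the sequences concrete: a customer first seen in week 5 who buys in
# weeks 5, 6, and 9 gets purchase_seq = 1, 2, 3 and lifetime_seq = 1, 2, 5.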
today <- max(data$week) # last observed week, treated as "now"

lifetime <- resamp %>%
  distinct(custKey, week, gross) %>%
  group_by(custKey) %>%
  arrange(week) %>%
  mutate(
    lweek=lag(week),
    diff=week-lweek
  ) %>%
  summarise(
    # Summary lifetime info.
    firstdate=min(week),
    lastdate=max(week),
    lifetime=as.integer(lastdate-firstdate)+1,
    currentperiod=as.integer(today-lastdate),
    npurchases=n(),
    
    # Interpurchase info.
    muperiod=mean(diff, na.rm=T), # mean gap between purchase weeks
    muperiod=as.integer(ifelse(is.nan(muperiod), 0, muperiod)), # single-purchase customers have no gaps
    sdperiod=sd(diff, na.rm=T),
    sdperiod=ifelse(is.na(sdperiod), 116, sdperiod), # fallback where sd is undefined (fewer than two gaps)
    
    # Monetary info.
    mupurchase = mean(gross, na.rm=T),
    histvalue = sum(gross, na.rm=T)
  )


# Interpurchase heterogeneity
expperiod <- lifetime %>%
  filter(npurchases>1) %>%
  group_by(npurchases) %>%
  summarize(
    mu=mean(muperiod),
    sd=sd(muperiod),
    lwr=mu-1.96*sd,
    lwr=ifelse(lwr<0, 0, lwr),
    upr=mu+1.96*sd
  ) %>%
  arrange(npurchases)

# Code the churns using actual data
lifetime <- lifetime %>%
  left_join(expperiod) %>%
  mutate(
    churned = as.integer(currentperiod > (muperiod+1.96*sdperiod))
  )
## Joining, by = "npurchases"
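# For intuition: a customer whose gaps average muperiod = 4 weeks with
# sdperiod = 2 is coded churned once currentperiod exceeds
# 4 + 1.96 * 2 ≈ 7.9 weeks of inactivity.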
CrossTable(lifetime$churned)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  141154 
## 
##  
##           |         0 |         1 | 
##           |-----------|-----------|
##           |    108346 |     32808 | 
##           |     0.768 |     0.232 | 
##           |-----------|-----------|
## 
## 
## 
## 
############################################################
# Examine some sample characteristics
############################################################

# Plot to see interpurchase heterogeneity
expperiod %>%
  ggplot(aes(x=npurchases, y=mu/4, ymin=lwr/4, ymax=upr/4)) + # /4 ≈ weeks to months
  geom_errorbar(width=0) +
  geom_line() +
  scale_y_continuous(breaks=pretty_breaks()) +
  labs(
    x="Number of Weeks with a Purchase",
    y="Mean Interpurchase Period (Months)"
  )

# See number of customers by interpurchase period
lifetime %>%
  filter(npurchases>1) %>%
  ggplot(aes(x=muperiod/4)) +
  geom_histogram() +
  scale_y_continuous(labels = comma, breaks=pretty_breaks(10)) +
  labs(
    x="Interpurchase Period (Months)",
    y="Number of Customers"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Lifetime distributions
lifetime %>%
  filter(npurchases>1) %>%
  ggplot(aes(x=lifetime/4)) +
  geom_density(fill="black") +
  scale_y_continuous(labels = percent) +
  labs(
    x="Observed Lifetime (Months)",
    y="Percent of Customers"
  )

############################################################
# Link the churns back to the week-level dataset
############################################################

resamp <- resamp %>%
  left_join(
    lifetime %>%
      select(custKey, week=lastdate, churned) # renaming lastdate to week attaches churned only to each customer's final week
  ) %>%
  mutate(
    churned = ifelse(is.na(churned), 0, churned)
  )
## Joining, by = c("custKey", "week")
resamp
## # A tibble: 1,893,690 × 8
## # Groups:   custKey [141,154]
##    custKey   week gross num_sku purchase_seq customer_entry lifetime_seq churned
##    <chr>    <dbl> <dbl>   <int>        <int>          <dbl>        <dbl>   <dbl>
##  1 0001fc3…     1 289.       39            1              1            1       0
##  2 000250c…     1  34         2            1              1            1       0
##  3 0003538…     1 438.       16            1              1            1       0
##  4 000447d…     1  30         2            1              1            1       0
##  5 00060cf…     1 299.       16            1              1            1       0
##  6 00063a9…     1  14         1            1              1            1       0
##  7 0008cd6…     1  79.0       6            1              1            1       0
##  8 000b2a7…     1   0         1            1              1            1       0
##  9 000fc7e…     1  41.0       2            1              1            1       0
## 10 00109b2…     1  38         1            1              1            1       0
## # … with 1,893,680 more rows
############################################################
# Run churn prediction
############################################################

# We still need the full set of RFM features:
# F (frequency) = purchase_seq
# M (monetary) = gross
# R (recency) still needs to be computed
resamp <- resamp %>%
  group_by(custKey) %>%
  arrange(week) %>%
  mutate(
    
    # recency
    timelag = week - lag(week),
    timelag = ifelse(is.na(timelag), 0, timelag)
    
  )
# Set up our train/test split (row-level; see the customer-level sketch below)
train.rat <- 0.70
N <- nrow(resamp)
# set.seed() here would make the split reproducible
train <- sample(1:N, size = ceiling(train.rat*N))
test <- (1:N)[-train]

df.train <- resamp[train,]
df.test <- resamp[test,]
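# A hedged alternative: split by customer rather than by row, so the same
# customer never lands in both train and test (reduces leakage in panel data;
# train_ids, df.train2, and df.test2 are illustrative names):
set.seed(1234) # any fixed seed makes the split reproducible
cust_ids <- unique(resamp$custKey)
train_ids <- sample(cust_ids, size = ceiling(train.rat * length(cust_ids)))
df.train2 <- resamp %>% filter(custKey %in% train_ids)
df.test2 <- resamp %>% filter(!custKey %in% train_ids)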
# Our very basic model
glm.churn <- glm(churned ~ timelag + lifetime_seq + gross + week + num_sku, 
                 data = df.train, family = binomial(link="logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
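# The warning above usually signals quasi-separation: some predictor
# combinations fit churn almost perfectly. With this much data it is typically
# harmless for prediction, though a penalized fit (e.g., glmnet) is a common
# remedy when the coefficients themselves matter.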
summary(glm.churn)
## 
## Call:
## glm(formula = churned ~ timelag + lifetime_seq + gross + week + 
##     num_sku, family = binomial(link = "logit"), data = df.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.3879  -0.2013  -0.1677  -0.1104   5.1366  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.087e+00  1.604e-02 -254.84   <2e-16 ***
## timelag       8.107e-02  1.129e-03   71.84   <2e-16 ***
## lifetime_seq -5.955e-02  5.922e-04 -100.55   <2e-16 ***
## gross         1.579e-03  4.616e-05   34.20   <2e-16 ***
## week          6.781e-02  6.385e-04  106.21   <2e-16 ***
## num_sku      -8.991e-02  1.278e-03  -70.34   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 232104  on 1325582  degrees of freedom
## Residual deviance: 209778  on 1325577  degrees of freedom
## AIC: 209790
## 
## Number of Fisher Scoring iterations: 8
# Make some churn predictions on our test set
df.test <- df.test %>%
  ungroup() %>%
  mutate(
    yhat = predict(glm.churn, df.test, type = "response")
  )
hist(df.test$yhat)

# Apply a classification threshold (see the sweep sketch below for choosing it)
df.test <- df.test %>%
  mutate(
    churn_pred = as.integer(yhat > 0.05)
  )
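# A hedged sketch for choosing the threshold from the data instead of fixing
# it at 0.05: sweep candidate cutoffs and record the per-class hit rates
# (thr_grid and thr_stats are illustrative names):
thr_grid <- seq(0.01, 0.50, by = 0.01)
thr_stats <- sapply(thr_grid, function(thr){
  pred <- as.integer(df.test$yhat > thr)
  c(thr = thr,
    recall = mean(pred[df.test$churned == 1] == 1),      # true-positive rate
    specificity = mean(pred[df.test$churned == 0] == 0)) # true-negative rate
})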
# Check our confusion matrix
with(df.test, CrossTable(churned, churn_pred))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  568107 
## 
##  
##              | churn_pred 
##      churned |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |    539522 |     18779 |    558301 | 
##              |     6.142 |   160.339 |           | 
##              |     0.966 |     0.034 |     0.983 | 
##              |     0.986 |     0.896 |           | 
##              |     0.950 |     0.033 |           | 
## -------------|-----------|-----------|-----------|
##            1 |      7627 |      2179 |      9806 | 
##              |   349.672 |  9128.860 |           | 
##              |     0.778 |     0.222 |     0.017 | 
##              |     0.014 |     0.104 |           | 
##              |     0.013 |     0.004 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |    547149 |     20958 |    568107 | 
##              |     0.963 |     0.037 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
cm <- with(df.test, table(churned, churn_pred)) # rows = actual, cols = predicted
acc <- (cm[1,1] + cm[2,2]) / sum(cm) # accuracy
pre <- (cm[2,2]) / (cm[2,1] + cm[2,2]) # note: TP / actual positives, i.e. recall, not precision
rec <- (cm[1,1]) / (cm[1,1] + cm[1,2]) # note: TN / actual negatives, i.e. specificity, not recall
f1 <- 2 * (pre * rec) / (pre + rec) # harmonic mean of the two rates above

# How should we interpret these?
acc
## [1] 0.9535193
f1
## [1] 0.3613346
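# Note: as flagged above, `pre` is really recall and `rec` is really
# specificity, so `f1` is not the standard F1. A corrected sketch with the
# usual definitions (the printed values above come from the original formulas):
precision <- cm[2,2] / (cm[1,2] + cm[2,2]) # TP / (TP + FP)
recall <- cm[2,2] / (cm[2,1] + cm[2,2])    # TP / (TP + FN)
f1_std <- 2 * precision * recall / (precision + recall)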
############################################################
# Estimate customer lifetime values
############################################################

# Make some churn predictions
resamp <- resamp %>%
  ungroup() %>%
  mutate(
    yhat = predict(glm.churn, resamp, type = "response")
  )
hist(resamp$yhat)

# Threshold again, and convert to an "alive" probability
# (the probability of being alive serves as the projection factor)
resamp <- resamp %>%
  mutate(
    churn_pred = as.integer(yhat > 0.05),
    alive_pr = 1 - yhat,
    alive_pr_rescale = rescale(alive_pr, c(0, 1)) # stretch to span [0, 1] exactly
  )
hist(resamp$alive_pr)

# We only need each customer's final-week probabilities
final_probs <- resamp %>%
  group_by(custKey) %>%
  arrange(week) %>%
  filter(row_number() == n()) # keep the last observed week (equivalently, slice_tail(n = 1))
hist(final_probs$alive_pr)

hist(final_probs$alive_pr_rescale)

final_probs
## # A tibble: 141,154 × 13
## # Groups:   custKey [141,154]
##    custKey   week gross num_sku purchase_seq customer_entry lifetime_seq churned
##    <chr>    <dbl> <dbl>   <int>        <int>          <dbl>        <dbl>   <dbl>
##  1 0074875…     1    44       1            1              1            1       0
##  2 01aab72…     1     0       1            1              1            1       0
##  3 01fe689…     1    35       1            1              1            1       0
##  4 026a7c6…     1     0       1            1              1            1       0
##  5 032151c…     1    35       1            1              1            1       0
##  6 03a5d53…     1   112       1            1              1            1       0
##  7 0497bec…     1   103      21            1              1            1       0
##  8 0583df6…     1    48       1            1              1            1       0
##  9 05968d4…     1     0       1            1              1            1       0
## 10 060b2c5…     1    97       3            1              1            1       0
## # … with 141,144 more rows, and 5 more variables: timelag <dbl>, yhat <dbl>,
## #   churn_pred <int>, alive_pr <dbl>, alive_pr_rescale <dbl>
# Map the final probabilities back onto the customer-level lifetime table, then aggregate
lifetime <- lifetime %>%
  left_join(
    final_probs %>%
      select(-gross)
    )
## Joining, by = c("custKey", "churned")
# Project over estimated remaining lifetime per customer
clv <- lifetime %>%
  select(mupurchase, histvalue, lifetime, alive_pr_rescale, custKey) %>%
  ungroup() %>%
  mutate(
    est_lifetime = ceiling(mean(lifetime, na.rm=T)), # average observed lifetime (weeks)
    remain_life = ifelse(
      lifetime <= est_lifetime,
      est_lifetime - lifetime, # not yet at the average: project up to the average
      lifetime + 12            # already past it: heuristically project their lifetime plus 12 weeks
    )
  )

# Now, do the geometric summations!
clv
## # A tibble: 141,154 × 7
##    mupurchase histvalue lifetime alive_pr_rescale custKey           est_lifetime
##         <dbl>     <dbl>    <dbl>            <dbl> <chr>                    <dbl>
##  1        98        98         1            0.996 00001b95e102496f…           29
##  2       104       104         1            0.956 0000c90cec4ed780…           29
##  3       159.     1269.       27            0.972 0000ec4d855ae1c7…           29
##  4       552.     8278.       41            0.969 00010ba75e2c420b…           29
##  5       114       114         1            0.938 000184addd963b59…           29
##  6       236.     4491.       36            0.976 0001a87bbecc918b…           29
##  7       272.     2451.        9            0.993 0001fc38ea3e9e70…           29
##  8       210.     2735.       46            0.978 000250c7e75b0fd0…           29
##  9       165.     1652.       30            0.959 00025358c4c30bb4…           29
## 10       122.      365         8            0.709 00027d94689e5e5e…           29
## # … with 141,144 more rows, and 1 more variable: remain_life <dbl>
pr_discount <- 0.05

# This is not a vectorized formula; we loop over future periods
calc_future_clv <- function(money, remaining_life, pr_churn, pr_discount){
  
  # No money accumulated yet...
  base_clv <- 0 
  
  # For each future time period, calculate the marginal addition to CLV
  # (note: if remaining_life can be 0, seq_len(remaining_life) is safer than 1:remaining_life)
  for (t in 1:remaining_life){
    # Discount jointly for the time value of money and the risk of churn
    discount_factor <- ((1 + pr_discount)^t * (1 + pr_churn)^t)
    
    period_value <- money / discount_factor
    
    base_clv <- base_clv + period_value
  }
  base_clv
}

calc_future_clv(100, 12, 1 - .888, pr_discount)
## [1] 503.7199
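# The loop is a finite geometric sum: with per-period factor
# r = 1 / ((1 + pr_discount) * (1 + pr_churn)), it equals
# money * r * (1 - r^T) / (1 - r). A quick closed-form check of the call above
# (r_chk and clv_chk are just scratch names):
r_chk <- 1 / ((1 + 0.05) * (1 + (1 - 0.888)))
clv_chk <- 100 * r_chk * (1 - r_chk^12) / (1 - r_chk) # ≈ 503.72, matching the loop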
# Iterate through the customer base
# (growing a data.frame with rbind() inside a loop is slow but transparent)

clv_estimates <- data.frame()

for(i in 1:nrow(clv)){
  
  cust_i <- clv[i,]
  
  m <- cust_i$mupurchase
  rl <- cust_i$remain_life
  prc <- 1 - cust_i$alive_pr_rescale
  
  clv_hat_i <- calc_future_clv(m,rl,prc,pr_discount)
  
  clv_estimates <- rbind(
    clv_estimates, 
    data.frame(
      i=i, 
      clv_hat = clv_hat_i
    )
  )
}
clv_estimates <- clv_estimates %>% as_tibble()
clv_estimates
## # A tibble: 141,154 × 2
##        i clv_hat
##    <int>   <dbl>
##  1     1   1388.
##  2     2    995.
##  3     3    283.
##  4     4   6562.
##  5     5    942.
##  6     6   3033.
##  7     7   3186.
##  8     8   2819.
##  9     9   1725.
## 10    10    342.
## # … with 141,144 more rows
summary(clv_estimates)
##        i             clv_hat        
##  Min.   :     1   Min.   :-16560.3  
##  1st Qu.: 35289   1st Qu.:   904.4  
##  Median : 70578   Median :  1814.9  
##  Mean   : 70578   Mean   :  2844.3  
##  3rd Qu.:105866   3rd Qu.:  3539.1  
##  Max.   :141154   Max.   :153599.4
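# A hedged, vectorized alternative: apply the same closed-form geometric sum to
# the whole table at once (clv_vec is an illustrative name; for rows with
# remain_life == 0 this gives 0, whereas the loop above returns money * (1 + r)
# because 1:0 counts down):
clv_vec <- clv %>%
  mutate(
    r = 1 / ((1 + pr_discount) * (1 + (1 - alive_pr_rescale))),
    clv_hat_closed = mupurchase * r * (1 - r^remain_life) / (1 - r)
  )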
clv <- clv %>%
  mutate(clv_hat = clv_estimates$clv_hat) # already a plain vector; unlist() not needed
clv
## # A tibble: 141,154 × 8
##    mupurchase histvalue lifetime alive_pr_rescale custKey           est_lifetime
##         <dbl>     <dbl>    <dbl>            <dbl> <chr>                    <dbl>
##  1        98        98         1            0.996 00001b95e102496f…           29
##  2       104       104         1            0.956 0000c90cec4ed780…           29
##  3       159.     1269.       27            0.972 0000ec4d855ae1c7…           29
##  4       552.     8278.       41            0.969 00010ba75e2c420b…           29
##  5       114       114         1            0.938 000184addd963b59…           29
##  6       236.     4491.       36            0.976 0001a87bbecc918b…           29
##  7       272.     2451.        9            0.993 0001fc38ea3e9e70…           29
##  8       210.     2735.       46            0.978 000250c7e75b0fd0…           29
##  9       165.     1652.       30            0.959 00025358c4c30bb4…           29
## 10       122.      365         8            0.709 00027d94689e5e5e…           29
## # … with 141,144 more rows, and 2 more variables: remain_life <dbl>,
## #   clv_hat <dbl>
# Distribution of customer lifetime values
clv %>%
  ggplot(aes(x=clv_hat)) +
  geom_histogram(color='darkblue', fill='lightblue')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.