Disclaimer: The content of this RMarkdown note came from a Coursera course called Marketing Analytics.

Customer lifetime value models (CLV) compute the value of a new customer over a given period of time and compare that figure to the cost of acquiring such a new customer to prove or disprove it was a good investment. One of the best applications of customer lifetime value is to identify, which customers or which segments of customers are strategic for the future success of the firm. Suppose you have two segments, customers in segment number one are generating and average $100 of revenue per quarter. Customers in segment two are generating $150 on average.

Segment Customers in 2014 and 2015

The customer database is segmented into eight groups:

  1. inactive: customers who made the last purchase was more than 3 years ago (recency 37+ months)
  2. cold: customers who made the last purchase was between 2 and 3 years ago (recency 25-36 months)
  3. warm high value: customers who made the last purchase was between 1 and 2 years ago; made the first purchase more than one year ago; and spent more than $100 on average (recency 13-24 months, amount >= $100)
  4. warm low value: customers who made the last purchase was between 1 and 2 years ago; made the first purchase more than one year ago; and spent less than $100 on average (recency 13-24 months, amount < $100)
  5. new warm: customers who made the last purchase was between 1 and 2 years ago, and made the first purchase less than two years ago (recency 13-24 months, first_purchase <= 24 months)
  6. active high value: customers who made the last purchase was less than 2 years ago; made the first purchase more than one year ago; and spent more than $100 on average (recency < 24 months, amount >= $100)
  7. active low value: customers who made the last purchase was less than 2 years ago; made the first purchase more than one year ago; and spent less than $100 on average (recency < 24 months, amount < $100)
  8. new active: customers who made the last purchase was less than 2 years ago, and made the first purchase less than one year ago (recency < 24 months, first_purchase <= 12 months)

The data contains transaction records over a two-year period of 2014 and 2015. It includes three variables - 1) customer id; 2) purchase amount; and 3) date of purchase.

# Load text file into local variable called 'data'
data = read.delim(file = 'purchases.txt', header = FALSE, sep = '\t', dec = '.')

# Add headers and interpret the last column as a date, extract year of purchase
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
data$days_since       = as.numeric(difftime(time1 = "2016-01-01",
                                            time2 = data$date_of_purchase,
                                            units = "days"))

# Invoke library to compute key marketing indicators using SQL language
library(sqldf)

# Segment customers in 2015
customers_2015 = sqldf("SELECT customer_id,
                               MIN(days_since) AS 'recency',
                               MAX(days_since) AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'amount'
                        FROM data GROUP BY 1")
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount < 100)] = "warm low value"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount >= 100)] = "warm high value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount < 100)] = "active low value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount >= 100)] = "active high value"
customers_2015$segment = factor(x = customers_2015$segment, levels = c("inactive", "cold",
                                                                       "warm high value", "warm low value", "new warm",
                                                                       "active high value", "active low value", "new active"))

# Segment customers in 2014
customers_2014 = sqldf("SELECT customer_id,
                               MIN(days_since) - 365 AS 'recency',
                               MAX(days_since) - 365 AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'amount'
                        FROM data
                        WHERE days_since > 365
                        GROUP BY 1")
customers_2014$segment = "NA"
customers_2014$segment[which(customers_2014$recency > 365*3)] = "inactive"
customers_2014$segment[which(customers_2014$recency <= 365*3 & customers_2014$recency > 365*2)] = "cold"
customers_2014$segment[which(customers_2014$recency <= 365*2 & customers_2014$recency > 365*1)] = "warm"
customers_2014$segment[which(customers_2014$recency <= 365)] = "active"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$first_purchase <= 365*2)] = "new warm"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount < 100)] = "warm low value"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount >= 100)] = "warm high value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$first_purchase <= 365)] = "new active"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount < 100)] = "active low value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount >= 100)] = "active high value"
customers_2014$segment = factor(x = customers_2014$segment, levels = c("inactive", "cold",
                                                                       "warm high value", "warm low value", "new warm",
                                                                       "active high value", "active low value", "new active"))

Compute Transition Matrix

A transition matrix is basically a matrix showing the likelihood of going from one segment to the next over one specific period of time.

# Compute transition matrix
new_data = merge(x = customers_2014, y = customers_2015, by = "customer_id", all.x = TRUE)
head(new_data)
##   customer_id recency.x first_purchase.x frequency.x amount.x
## 1          10 3464.2083         3464.208           1     30.0
## 2          80  302.2083         3386.208           6     70.0
## 3          90  393.2083         3418.208          10    115.8
## 4         120 1036.2083         1036.208           1     20.0
## 5         130 2605.2083         3345.208           2     50.0
## 6         160 2598.2083         3212.208           2     30.0
##          segment.x recency.y first_purchase.y frequency.y  amount.y
## 1         inactive 3829.2083         3829.208           1  30.00000
## 2 active low value  343.2083         3751.208           7  71.42857
## 3  warm high value  758.2083         3783.208          10 115.80000
## 4             cold 1401.2083         1401.208           1  20.00000
## 5         inactive 2970.2083         3710.208           2  50.00000
## 6         inactive 2963.2083         3577.208           2  30.00000
##          segment.y
## 1         inactive
## 2 active low value
## 3             cold
## 4         inactive
## 5         inactive
## 6         inactive
transition = table(new_data$segment.x, new_data$segment.y)
print(transition)
##                    
##                     inactive cold warm high value warm low value new warm
##   inactive              7227    0               0              0        0
##   cold                  1931    0               0              0        0
##   warm high value          0   75               0              0        0
##   warm low value           0  689               0              0        0
##   new warm                 0 1139               0              0        0
##   active high value        0    0             119              0        0
##   active low value         0    0               0            901        0
##   new active               0    0               0              0      938
##                    
##                     active high value active low value new active
##   inactive                         35              250          0
##   cold                             22              200          0
##   warm high value                  35                1          0
##   warm low value                    1              266          0
##   new warm                         15               96          0
##   active high value               354                2          0
##   active low value                 22             2088          0
##   new active                       89              410          0

# Divide each row by its sum
transition = transition / rowSums(transition)
print(transition)
##                    
##                        inactive        cold warm high value warm low value
##   inactive          0.962060703 0.000000000     0.000000000    0.000000000
##   cold              0.896888063 0.000000000     0.000000000    0.000000000
##   warm high value   0.000000000 0.675675676     0.000000000    0.000000000
##   warm low value    0.000000000 0.720711297     0.000000000    0.000000000
##   new warm          0.000000000 0.911200000     0.000000000    0.000000000
##   active high value 0.000000000 0.000000000     0.250526316    0.000000000
##   active low value  0.000000000 0.000000000     0.000000000    0.299236134
##   new active        0.000000000 0.000000000     0.000000000    0.000000000
##                    
##                        new warm active high value active low value
##   inactive          0.000000000       0.004659212      0.033280085
##   cold              0.000000000       0.010218300      0.092893637
##   warm high value   0.000000000       0.315315315      0.009009009
##   warm low value    0.000000000       0.001046025      0.278242678
##   new warm          0.000000000       0.012000000      0.076800000
##   active high value 0.000000000       0.745263158      0.004210526
##   active low value  0.000000000       0.007306543      0.693457323
##   new active        0.652748782       0.061934586      0.285316632
##                    
##                      new active
##   inactive          0.000000000
##   cold              0.000000000
##   warm high value   0.000000000
##   warm low value    0.000000000
##   new warm          0.000000000
##   active high value 0.000000000
##   active low value  0.000000000
##   new active        0.000000000

Use Transition Matrix to Make Predictions

# Initialize a matrix with the number of customers in each segment today and after 10 periods
segments = matrix(nrow = 8, ncol = 11)
segments[, 1] = table(customers_2015$segment)
colnames(segments) = 2015:2025
row.names(segments) = levels(customers_2015$segment)
print(segments)
##                   2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
## inactive          9158   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## cold              1903   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## warm high value    119   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## warm low value     901   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## new warm           938   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## active high value  573   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## active low value  3313   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## new active        1512   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA

# Compute for each an every period
for (i in 2:11) {
   segments[, i] = segments[, i-1] %*% transition
}

# Plot inactive, active high value customers over time
barplot(segments[1, ])

barplot(segments[2, ])


# Display how segments will evolve over time
print(round(segments))
##                   2015  2016  2017  2018  2019  2020  2021  2022  2023
## inactive          9158 10517 11539 12636 12940 13186 13386 13542 13664
## cold              1903  1584  1711   874   821   782   740   709   684
## warm high value    119   144   165   160   156   152   149   146   143
## warm low value     901   991  1058   989   938   884   844   813   789
## new warm           938   987     0     0     0     0     0     0     0
## active high value  573   657   639   624   607   593   581   571   562
## active low value  3313  3537  3305  3134  2954  2820  2717  2637  2575
## new active        1512     0     0     0     0     0     0     0     0
##                    2024  2025
## inactive          13759 13834
## cold                665   650
## warm high value     141   139
## warm low value      771   756
## new warm              0     0
## active high value   554   547
## active low value   2527  2490
## new active            0     0

Predictions

Compute the (Discounted) CLV of a Database

Now that we know how to use the transition matrix to guess how many customers will be in each segment in a year, two years or even 20 years, we need to transform these figures into Dollars. In so doing, we will assume that the average spending of each segment does not change over time. If an average customer in a high value segment generates an average $100, we’ll simply assume that this figure will not change over the years. In reality it might go up or down, but without additional information our best guess is to assume that this figure will remain stable over time. We will discount future revenues. This is because all $100 are not created equally. For instance, one customer will spend $100 today while another will spend the same amount but only five years from now. The first customer is more valuable.

# Yearly revenue per segment
# This comes directly from module 2, lines 160-161
yearly_revenue = c(0, 0, 0, 0, 0, 323.57, 52.31, 79.17)

# Compute revenue per segment
revenue_per_segment = yearly_revenue * segments
print(revenue_per_segment)
##                       2015     2016     2017     2018     2019     2020
## inactive               0.0      0.0      0.0      0.0      0.0      0.0
## cold                   0.0      0.0      0.0      0.0      0.0      0.0
## warm high value        0.0      0.0      0.0      0.0      0.0      0.0
## warm low value         0.0      0.0      0.0      0.0      0.0      0.0
## new warm               0.0      0.0      0.0      0.0      0.0      0.0
## active high value 185405.6 212495.8 206634.8 202009.1 196555.6 191967.7
## active low value  173303.0 184999.5 172904.6 163925.2 154528.3 147531.6
## new active        119705.0      0.0      0.0      0.0      0.0      0.0
##                       2021     2022     2023     2024     2025
## inactive               0.0      0.0      0.0      0.0      0.0
## cold                   0.0      0.0      0.0      0.0      0.0
## warm high value        0.0      0.0      0.0      0.0      0.0
## warm low value         0.0      0.0      0.0      0.0      0.0
## new warm               0.0      0.0      0.0      0.0      0.0
## active high value 188022.8 184627.5 181717.4 179229.1 177106.1
## active low value  142127.7 137940.6 134702.6 132199.2 130264.3
## new active             0.0      0.0      0.0      0.0      0.0

# Compute yearly revenue
yearly_revenue = colSums(revenue_per_segment)
print(round(yearly_revenue))
##   2015   2016   2017   2018   2019   2020   2021   2022   2023   2024 
## 478414 397495 379539 365934 351084 339499 330150 322568 316420 311428 
##   2025 
## 307370
barplot(yearly_revenue)

Interpretation

# Compute cumulated revenue
cumulated_revenue = cumsum(yearly_revenue)
print(round(cumulated_revenue))
##    2015    2016    2017    2018    2019    2020    2021    2022    2023 
##  478414  875909 1255448 1621383 1972467 2311966 2642116 2964685 3281105 
##    2024    2025 
## 3592533 3899903
barplot(cumulated_revenue)


# Create a discount factor
discount_rate = 0.10
discount = 1 / ((1 + discount_rate) ^ ((1:11) - 1))
print(discount)
##  [1] 1.0000000 0.9090909 0.8264463 0.7513148 0.6830135 0.6209213 0.5644739
##  [8] 0.5131581 0.4665074 0.4240976 0.3855433

# Compute discounted yearly revenue
disc_yearly_revenue = yearly_revenue * discount
print(round(disc_yearly_revenue))
##   2015   2016   2017   2018   2019   2020   2021   2022   2023   2024 
## 478414 361359 313669 274932 239795 210802 186361 165528 147612 132076 
##   2025 
## 118505
barplot(disc_yearly_revenue)
lines(yearly_revenue)

Interpretation

# Compute discounted cumulated revenue
disc_cumulated_revenue = cumsum(disc_yearly_revenue)
print(round(disc_cumulated_revenue))
##    2015    2016    2017    2018    2019    2020    2021    2022    2023 
##  478414  839773 1153442 1428374 1668169 1878971 2065333 2230861 2378473 
##    2024    2025 
## 2510549 2629054
barplot(disc_cumulated_revenue)


# What is the database worth?
print(disc_cumulated_revenue[11] - yearly_revenue[1])
##    2025 
## 2150640

Interpretation