Disclaimer: The content of this RMarkdown note came from a Coursera course called Marketing Analytics.
Customer lifetime value models (CLV) compute the value of a new customer over a given period of time and compare that figure to the cost of acquiring such a new customer to prove or disprove it was a good investment. One of the best applications of customer lifetime value is to identify, which customers or which segments of customers are strategic for the future success of the firm. Suppose you have two segments, customers in segment number one are generating and average $100 of revenue per quarter. Customers in segment two are generating $150 on average.
The customer database is segmented into eight groups:
The data contains transaction records over a two-year period of 2014 and 2015. It includes three variables - 1) customer id; 2) purchase amount; and 3) date of purchase.
# Load text file into local variable called 'data'
data = read.delim(file = 'purchases.txt', header = FALSE, sep = '\t', dec = '.')
# Add headers and interpret the last column as a date, extract year of purchase
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
data$days_since = as.numeric(difftime(time1 = "2016-01-01",
time2 = data$date_of_purchase,
units = "days"))
# Invoke library to compute key marketing indicators using SQL language
library(sqldf)
# Segment customers in 2015
customers_2015 = sqldf("SELECT customer_id,
MIN(days_since) AS 'recency',
MAX(days_since) AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data GROUP BY 1")
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount < 100)] = "warm low value"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount >= 100)] = "warm high value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount < 100)] = "active low value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount >= 100)] = "active high value"
customers_2015$segment = factor(x = customers_2015$segment, levels = c("inactive", "cold",
"warm high value", "warm low value", "new warm",
"active high value", "active low value", "new active"))
# Segment customers in 2014
customers_2014 = sqldf("SELECT customer_id,
MIN(days_since) - 365 AS 'recency',
MAX(days_since) - 365 AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data
WHERE days_since > 365
GROUP BY 1")
customers_2014$segment = "NA"
customers_2014$segment[which(customers_2014$recency > 365*3)] = "inactive"
customers_2014$segment[which(customers_2014$recency <= 365*3 & customers_2014$recency > 365*2)] = "cold"
customers_2014$segment[which(customers_2014$recency <= 365*2 & customers_2014$recency > 365*1)] = "warm"
customers_2014$segment[which(customers_2014$recency <= 365)] = "active"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$first_purchase <= 365*2)] = "new warm"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount < 100)] = "warm low value"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount >= 100)] = "warm high value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$first_purchase <= 365)] = "new active"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount < 100)] = "active low value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount >= 100)] = "active high value"
customers_2014$segment = factor(x = customers_2014$segment, levels = c("inactive", "cold",
"warm high value", "warm low value", "new warm",
"active high value", "active low value", "new active"))
A transition matrix is basically a matrix showing the likelihood of going from one segment to the next over one specific period of time.
# Compute transition matrix
new_data = merge(x = customers_2014, y = customers_2015, by = "customer_id", all.x = TRUE)
head(new_data)
## customer_id recency.x first_purchase.x frequency.x amount.x
## 1 10 3464.2083 3464.208 1 30.0
## 2 80 302.2083 3386.208 6 70.0
## 3 90 393.2083 3418.208 10 115.8
## 4 120 1036.2083 1036.208 1 20.0
## 5 130 2605.2083 3345.208 2 50.0
## 6 160 2598.2083 3212.208 2 30.0
## segment.x recency.y first_purchase.y frequency.y amount.y
## 1 inactive 3829.2083 3829.208 1 30.00000
## 2 active low value 343.2083 3751.208 7 71.42857
## 3 warm high value 758.2083 3783.208 10 115.80000
## 4 cold 1401.2083 1401.208 1 20.00000
## 5 inactive 2970.2083 3710.208 2 50.00000
## 6 inactive 2963.2083 3577.208 2 30.00000
## segment.y
## 1 inactive
## 2 active low value
## 3 cold
## 4 inactive
## 5 inactive
## 6 inactive
transition = table(new_data$segment.x, new_data$segment.y)
print(transition)
##
## inactive cold warm high value warm low value new warm
## inactive 7227 0 0 0 0
## cold 1931 0 0 0 0
## warm high value 0 75 0 0 0
## warm low value 0 689 0 0 0
## new warm 0 1139 0 0 0
## active high value 0 0 119 0 0
## active low value 0 0 0 901 0
## new active 0 0 0 0 938
##
## active high value active low value new active
## inactive 35 250 0
## cold 22 200 0
## warm high value 35 1 0
## warm low value 1 266 0
## new warm 15 96 0
## active high value 354 2 0
## active low value 22 2088 0
## new active 89 410 0
# Divide each row by its sum
transition = transition / rowSums(transition)
print(transition)
##
## inactive cold warm high value warm low value
## inactive 0.962060703 0.000000000 0.000000000 0.000000000
## cold 0.896888063 0.000000000 0.000000000 0.000000000
## warm high value 0.000000000 0.675675676 0.000000000 0.000000000
## warm low value 0.000000000 0.720711297 0.000000000 0.000000000
## new warm 0.000000000 0.911200000 0.000000000 0.000000000
## active high value 0.000000000 0.000000000 0.250526316 0.000000000
## active low value 0.000000000 0.000000000 0.000000000 0.299236134
## new active 0.000000000 0.000000000 0.000000000 0.000000000
##
## new warm active high value active low value
## inactive 0.000000000 0.004659212 0.033280085
## cold 0.000000000 0.010218300 0.092893637
## warm high value 0.000000000 0.315315315 0.009009009
## warm low value 0.000000000 0.001046025 0.278242678
## new warm 0.000000000 0.012000000 0.076800000
## active high value 0.000000000 0.745263158 0.004210526
## active low value 0.000000000 0.007306543 0.693457323
## new active 0.652748782 0.061934586 0.285316632
##
## new active
## inactive 0.000000000
## cold 0.000000000
## warm high value 0.000000000
## warm low value 0.000000000
## new warm 0.000000000
## active high value 0.000000000
## active low value 0.000000000
## new active 0.000000000
# Initialize a matrix with the number of customers in each segment today and after 10 periods
segments = matrix(nrow = 8, ncol = 11)
segments[, 1] = table(customers_2015$segment)
colnames(segments) = 2015:2025
row.names(segments) = levels(customers_2015$segment)
print(segments)
## 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
## inactive 9158 NA NA NA NA NA NA NA NA NA NA
## cold 1903 NA NA NA NA NA NA NA NA NA NA
## warm high value 119 NA NA NA NA NA NA NA NA NA NA
## warm low value 901 NA NA NA NA NA NA NA NA NA NA
## new warm 938 NA NA NA NA NA NA NA NA NA NA
## active high value 573 NA NA NA NA NA NA NA NA NA NA
## active low value 3313 NA NA NA NA NA NA NA NA NA NA
## new active 1512 NA NA NA NA NA NA NA NA NA NA
# Compute for each an every period
for (i in 2:11) {
segments[, i] = segments[, i-1] %*% transition
}
# Plot inactive, active high value customers over time
barplot(segments[1, ])
barplot(segments[2, ])
# Display how segments will evolve over time
print(round(segments))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023
## inactive 9158 10517 11539 12636 12940 13186 13386 13542 13664
## cold 1903 1584 1711 874 821 782 740 709 684
## warm high value 119 144 165 160 156 152 149 146 143
## warm low value 901 991 1058 989 938 884 844 813 789
## new warm 938 987 0 0 0 0 0 0 0
## active high value 573 657 639 624 607 593 581 571 562
## active low value 3313 3537 3305 3134 2954 2820 2717 2637 2575
## new active 1512 0 0 0 0 0 0 0 0
## 2024 2025
## inactive 13759 13834
## cold 665 650
## warm high value 141 139
## warm low value 771 756
## new warm 0 0
## active high value 554 547
## active low value 2527 2490
## new active 0 0
Predictions
Now that we know how to use the transition matrix to guess how many customers will be in each segment in a year, two years or even 20 years, we need to transform these figures into Dollars. In so doing, we will assume that the average spending of each segment does not change over time. If an average customer in a high value segment generates an average $100, we’ll simply assume that this figure will not change over the years. In reality it might go up or down, but without additional information our best guess is to assume that this figure will remain stable over time. We will discount future revenues. This is because all $100 are not created equally. For instance, one customer will spend $100 today while another will spend the same amount but only five years from now. The first customer is more valuable.
# Yearly revenue per segment
# This comes directly from module 2, lines 160-161
yearly_revenue = c(0, 0, 0, 0, 0, 323.57, 52.31, 79.17)
# Compute revenue per segment
revenue_per_segment = yearly_revenue * segments
print(revenue_per_segment)
## 2015 2016 2017 2018 2019 2020
## inactive 0.0 0.0 0.0 0.0 0.0 0.0
## cold 0.0 0.0 0.0 0.0 0.0 0.0
## warm high value 0.0 0.0 0.0 0.0 0.0 0.0
## warm low value 0.0 0.0 0.0 0.0 0.0 0.0
## new warm 0.0 0.0 0.0 0.0 0.0 0.0
## active high value 185405.6 212495.8 206634.8 202009.1 196555.6 191967.7
## active low value 173303.0 184999.5 172904.6 163925.2 154528.3 147531.6
## new active 119705.0 0.0 0.0 0.0 0.0 0.0
## 2021 2022 2023 2024 2025
## inactive 0.0 0.0 0.0 0.0 0.0
## cold 0.0 0.0 0.0 0.0 0.0
## warm high value 0.0 0.0 0.0 0.0 0.0
## warm low value 0.0 0.0 0.0 0.0 0.0
## new warm 0.0 0.0 0.0 0.0 0.0
## active high value 188022.8 184627.5 181717.4 179229.1 177106.1
## active low value 142127.7 137940.6 134702.6 132199.2 130264.3
## new active 0.0 0.0 0.0 0.0 0.0
# Compute yearly revenue
yearly_revenue = colSums(revenue_per_segment)
print(round(yearly_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
## 478414 397495 379539 365934 351084 339499 330150 322568 316420 311428
## 2025
## 307370
barplot(yearly_revenue)
Interpretation
# Compute cumulated revenue
cumulated_revenue = cumsum(yearly_revenue)
print(round(cumulated_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023
## 478414 875909 1255448 1621383 1972467 2311966 2642116 2964685 3281105
## 2024 2025
## 3592533 3899903
barplot(cumulated_revenue)
# Create a discount factor
discount_rate = 0.10
discount = 1 / ((1 + discount_rate) ^ ((1:11) - 1))
print(discount)
## [1] 1.0000000 0.9090909 0.8264463 0.7513148 0.6830135 0.6209213 0.5644739
## [8] 0.5131581 0.4665074 0.4240976 0.3855433
# Compute discounted yearly revenue
disc_yearly_revenue = yearly_revenue * discount
print(round(disc_yearly_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
## 478414 361359 313669 274932 239795 210802 186361 165528 147612 132076
## 2025
## 118505
barplot(disc_yearly_revenue)
lines(yearly_revenue)
Interpretation
# Compute discounted cumulated revenue
disc_cumulated_revenue = cumsum(disc_yearly_revenue)
print(round(disc_cumulated_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023
## 478414 839773 1153442 1428374 1668169 1878971 2065333 2230861 2378473
## 2024 2025
## 2510549 2629054
barplot(disc_cumulated_revenue)
# What is the database worth?
print(disc_cumulated_revenue[11] - yearly_revenue[1])
## 2025
## 2150640
Interpretation