# --- SEGMENT CUSTOMERS IN 2014 AND 2015 -------------------
# Load text file into local variable called 'data'
data = read.delim(file = "C:\\Users\\Srirama Bonam\\Documents\\Mktg Data Analytics.txt", header = FALSE, sep = '\t', dec = '.')
# Add headers and interpret the last column as a date, extract year of purchase
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
data$days_since = as.numeric(difftime(time1 = "2016-01-01",
time2 = data$date_of_purchase,
units = "days"))
# Invoke library to compute key marketing indicators using SQL language
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Segment customers in 2015
customers_2015 = sqldf("SELECT customer_id,
MIN(days_since) AS 'recency',
MAX(days_since) AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data GROUP BY 1")
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount < 100)] = "warm low value"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount >= 100)] = "warm high value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount < 100)] = "active low value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount >= 100)] = "active high value"
customers_2015$segment = factor(x = customers_2015$segment, levels = c("inactive", "cold",
"warm high value", "warm low value", "new warm",
"active high value", "active low value", "new active"))
# Segment customers in 2014
customers_2014 = sqldf("SELECT customer_id,
MIN(days_since) - 365 AS 'recency',
MAX(days_since) - 365 AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data
WHERE days_since > 365
GROUP BY 1")
customers_2014$segment = "NA"
customers_2014$segment[which(customers_2014$recency > 365*3)] = "inactive"
customers_2014$segment[which(customers_2014$recency <= 365*3 & customers_2014$recency > 365*2)] = "cold"
customers_2014$segment[which(customers_2014$recency <= 365*2 & customers_2014$recency > 365*1)] = "warm"
customers_2014$segment[which(customers_2014$recency <= 365)] = "active"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$first_purchase <= 365*2)] = "new warm"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount < 100)] = "warm low value"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount >= 100)] = "warm high value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$first_purchase <= 365)] = "new active"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount < 100)] = "active low value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount >= 100)] = "active high value"
customers_2014$segment = factor(x = customers_2014$segment, levels = c("inactive", "cold",
"warm high value", "warm low value", "new warm",
"active high value", "active low value", "new active"))
# --- COMPUTE TRANSITION MATRIX ----------------------------
# Compute transition matrix
new_data = merge(x = customers_2014, y = customers_2015, by = "customer_id", all.x = TRUE)
head(new_data)
## customer_id recency.x first_purchase.x frequency.x amount.x
## 1 10 3463.7708 3463.771 1 30.0
## 2 80 301.7708 3385.771 6 70.0
## 3 90 392.7708 3417.771 10 115.8
## 4 120 1035.7708 1035.771 1 20.0
## 5 130 2604.7708 3344.771 2 50.0
## 6 160 2597.7708 3211.771 2 30.0
## segment.x recency.y first_purchase.y frequency.y amount.y
## 1 inactive 3828.7708 3828.771 1 30.00000
## 2 active low value 342.7708 3750.771 7 71.42857
## 3 warm high value 757.7708 3782.771 10 115.80000
## 4 cold 1400.7708 1400.771 1 20.00000
## 5 inactive 2969.7708 3709.771 2 50.00000
## 6 inactive 2962.7708 3576.771 2 30.00000
## segment.y
## 1 inactive
## 2 active low value
## 3 cold
## 4 inactive
## 5 inactive
## 6 inactive
transition = table(new_data$segment.x, new_data$segment.y)
print(transition)
##
## inactive cold warm high value warm low value new warm
## inactive 7227 0 0 0 0
## cold 1931 0 0 0 0
## warm high value 0 75 0 0 0
## warm low value 0 689 0 0 0
## new warm 0 1139 0 0 0
## active high value 0 0 119 0 0
## active low value 0 0 0 901 0
## new active 0 0 0 0 938
##
## active high value active low value new active
## inactive 35 250 0
## cold 22 200 0
## warm high value 35 1 0
## warm low value 1 266 0
## new warm 15 96 0
## active high value 354 2 0
## active low value 22 2088 0
## new active 89 410 0
# Divide each row by its sum
transition = transition / rowSums(transition)
print(transition)
##
## inactive cold warm high value warm low value
## inactive 0.962060703 0.000000000 0.000000000 0.000000000
## cold 0.896888063 0.000000000 0.000000000 0.000000000
## warm high value 0.000000000 0.675675676 0.000000000 0.000000000
## warm low value 0.000000000 0.720711297 0.000000000 0.000000000
## new warm 0.000000000 0.911200000 0.000000000 0.000000000
## active high value 0.000000000 0.000000000 0.250526316 0.000000000
## active low value 0.000000000 0.000000000 0.000000000 0.299236134
## new active 0.000000000 0.000000000 0.000000000 0.000000000
##
## new warm active high value active low value
## inactive 0.000000000 0.004659212 0.033280085
## cold 0.000000000 0.010218300 0.092893637
## warm high value 0.000000000 0.315315315 0.009009009
## warm low value 0.000000000 0.001046025 0.278242678
## new warm 0.000000000 0.012000000 0.076800000
## active high value 0.000000000 0.745263158 0.004210526
## active low value 0.000000000 0.007306543 0.693457323
## new active 0.652748782 0.061934586 0.285316632
##
## new active
## inactive 0.000000000
## cold 0.000000000
## warm high value 0.000000000
## warm low value 0.000000000
## new warm 0.000000000
## active high value 0.000000000
## active low value 0.000000000
## new active 0.000000000
# --- USE TRANSITION MATRIX TO MAKE PREDICTIONS ------------
# Initialize a matrix with the number of customers in each segment today and after 10 periods
segments = matrix(nrow = 8, ncol = 11)
segments[, 1] = table(customers_2015$segment)
colnames(segments) = 2015:2025
row.names(segments) = levels(customers_2015$segment)
print(segments)
## 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
## inactive 9158 NA NA NA NA NA NA NA NA NA NA
## cold 1903 NA NA NA NA NA NA NA NA NA NA
## warm high value 119 NA NA NA NA NA NA NA NA NA NA
## warm low value 901 NA NA NA NA NA NA NA NA NA NA
## new warm 938 NA NA NA NA NA NA NA NA NA NA
## active high value 573 NA NA NA NA NA NA NA NA NA NA
## active low value 3313 NA NA NA NA NA NA NA NA NA NA
## new active 1512 NA NA NA NA NA NA NA NA NA NA
# Compute for each an every period
for (i in 2:11) {
segments[, i] = segments[, i-1] %*% transition
}
# Plot inactive, active high value customers over time
barplot(segments[1, ])

barplot(segments[2, ])

# Display how segments will evolve over time
print(round(segments))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023
## inactive 9158 10517 11539 12636 12940 13186 13386 13542 13664
## cold 1903 1584 1711 874 821 782 740 709 684
## warm high value 119 144 165 160 156 152 149 146 143
## warm low value 901 991 1058 989 938 884 844 813 789
## new warm 938 987 0 0 0 0 0 0 0
## active high value 573 657 639 624 607 593 581 571 562
## active low value 3313 3537 3305 3134 2954 2820 2717 2637 2575
## new active 1512 0 0 0 0 0 0 0 0
## 2024 2025
## inactive 13759 13834
## cold 665 650
## warm high value 141 139
## warm low value 771 756
## new warm 0 0
## active high value 554 547
## active low value 2527 2490
## new active 0 0
# --- COMPUTE THE (DISCOUNTED) CLV OF A DATABASE -----------
# Yearly revenue per segment
# This comes directly from module 2, lines 160-161
yearly_revenue = c(0, 0, 0, 0, 0, 323.57, 52.31, 79.17)
# Compute revenue per segment
revenue_per_segment = yearly_revenue * segments
print(revenue_per_segment)
## 2015 2016 2017 2018 2019 2020
## inactive 0.0 0.0 0.0 0.0 0.0 0.0
## cold 0.0 0.0 0.0 0.0 0.0 0.0
## warm high value 0.0 0.0 0.0 0.0 0.0 0.0
## warm low value 0.0 0.0 0.0 0.0 0.0 0.0
## new warm 0.0 0.0 0.0 0.0 0.0 0.0
## active high value 185405.6 212495.8 206634.8 202009.1 196555.6 191967.7
## active low value 173303.0 184999.5 172904.6 163925.2 154528.3 147531.6
## new active 119705.0 0.0 0.0 0.0 0.0 0.0
## 2021 2022 2023 2024 2025
## inactive 0.0 0.0 0.0 0.0 0.0
## cold 0.0 0.0 0.0 0.0 0.0
## warm high value 0.0 0.0 0.0 0.0 0.0
## warm low value 0.0 0.0 0.0 0.0 0.0
## new warm 0.0 0.0 0.0 0.0 0.0
## active high value 188022.8 184627.5 181717.4 179229.1 177106.1
## active low value 142127.7 137940.6 134702.6 132199.2 130264.3
## new active 0.0 0.0 0.0 0.0 0.0
# Compute yearly revenue
yearly_revenue = colSums(revenue_per_segment)
print(round(yearly_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
## 478414 397495 379539 365934 351084 339499 330150 322568 316420 311428
## 2025
## 307370
barplot(yearly_revenue)

# Compute cumulated revenue
cumulated_revenue = cumsum(yearly_revenue)
print(round(cumulated_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023
## 478414 875909 1255448 1621383 1972467 2311966 2642116 2964685 3281105
## 2024 2025
## 3592533 3899903
barplot(cumulated_revenue)

# Create a discount factor
discount_rate = 0.10
discount = 1 / ((1 + discount_rate) ^ ((1:11) - 1))
print(discount)
## [1] 1.0000000 0.9090909 0.8264463 0.7513148 0.6830135 0.6209213 0.5644739
## [8] 0.5131581 0.4665074 0.4240976 0.3855433
# Compute discounted yearly revenue
disc_yearly_revenue = yearly_revenue * discount
print(round(disc_yearly_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
## 478414 361359 313669 274932 239795 210802 186361 165528 147612 132076
## 2025
## 118505
barplot(disc_yearly_revenue)
lines(yearly_revenue)

# Compute discounted cumulated revenue
disc_cumulated_revenue = cumsum(disc_yearly_revenue)
print(round(disc_cumulated_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023
## 478414 839773 1153442 1428374 1668169 1878971 2065333 2230861 2378473
## 2024 2025
## 2510549 2629054
barplot(disc_cumulated_revenue)

# What is the database worth?
print(disc_cumulated_revenue[11] - yearly_revenue[1])
## 2025
## 2150640