# Load the text file into a data frame called 'data'
data = read.delim(file = 'purchases.txt', header = FALSE, sep = '\t', dec = '.')
# Add headers, parse the last column as a date, and compute the number of days since each purchase (relative to 2016-01-01)
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$days_since = as.numeric(difftime(time1 = "2016-01-01",
time2 = data$date_of_purchase,
units = "days"))
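# Side note (optional, not used below): subtracting Date objects directly yields
# whole days; the fractional .3333 in days_since below most likely comes from
# difftime() converting the character date to POSIXct in the local time zone.
# days_since_whole is just an illustrative scratch variable.
days_since_whole = as.numeric(as.Date("2016-01-01") - data$date_of_purchase)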
# Display the data after transformation
head(data)
## customer_id purchase_amount date_of_purchase days_since
## 1 760 25 2009-11-06 2247.3333
## 2 860 50 2012-09-28 1190.3333
## 3 1200 100 2005-10-25 3720.3333
## 4 1420 50 2009-07-09 2367.3333
## 5 1940 70 2013-01-25 1071.3333
## 6 1960 40 2013-10-29 794.3333
summary(data)
## customer_id purchase_amount date_of_purchase
## Min. : 10 Min. : 5.00 Min. :2005-01-02
## 1st Qu.: 57720 1st Qu.: 25.00 1st Qu.:2009-01-17
## Median :102440 Median : 30.00 Median :2011-11-23
## Mean :108935 Mean : 62.34 Mean :2011-07-14
## 3rd Qu.:160525 3rd Qu.: 60.00 3rd Qu.:2013-12-29
## Max. :264200 Max. :4500.00 Max. :2015-12-31
## days_since
## Min. : 1.333
## 1st Qu.: 733.333
## Median :1500.333
## Mean :1632.273
## 3rd Qu.:2540.333
## Max. :4016.333
# Compute key marketing indicators (RFM) using SQL, via the sqldf package
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
# Compute recency (days since last purchase), frequency (number of purchases), and average purchase amount per customer
customers = sqldf("SELECT customer_id,
MIN(days_since) AS 'recency',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data GROUP BY 1")
## Loading required package: tcltk
# Explore the data
head(customers)
## customer_id recency frequency amount
## 1 10 3829.3333 1 30.00000
## 2 80 343.3333 7 71.42857
## 3 90 758.3333 10 115.80000
## 4 120 1401.3333 1 20.00000
## 5 130 2970.3333 2 50.00000
## 6 160 2963.3333 2 30.00000
summary(customers)
## customer_id recency frequency amount
## Min. : 10 Min. : 1.333 Min. : 1.000 Min. : 5.00
## 1st Qu.: 81990 1st Qu.: 244.333 1st Qu.: 1.000 1st Qu.: 21.67
## Median :136430 Median :1070.333 Median : 2.000 Median : 30.00
## Mean :137574 Mean :1253.371 Mean : 2.782 Mean : 57.79
## 3rd Qu.:195100 3rd Qu.:2130.333 3rd Qu.: 3.000 3rd Qu.: 50.00
## Max. :264200 Max. :4014.333 Max. :45.000 Max. :4500.00
hist(customers$recency)
hist(customers$frequency)
hist(customers$amount)
hist(customers$amount, breaks = 100)
# Copy the customer data into a new data frame
new_data = customers
# Inspect the copy, then remove customer_id as a variable and store it as row names
head(new_data)
## customer_id recency frequency amount
## 1 10 3829.3333 1 30.00000
## 2 80 343.3333 7 71.42857
## 3 90 758.3333 10 115.80000
## 4 120 1401.3333 1 20.00000
## 5 130 2970.3333 2 50.00000
## 6 160 2963.3333 2 30.00000
row.names(new_data) = new_data$customer_id
new_data$customer_id = NULL
head(new_data)
## recency frequency amount
## 10 3829.3333 1 30.00000
## 80 343.3333 7 71.42857
## 90 758.3333 10 115.80000
## 120 1401.3333 1 20.00000
## 130 2970.3333 2 50.00000
## 160 2963.3333 2 30.00000
# Take the log-transform of the amount, and plot
new_data$amount = log(new_data$amount)
hist(new_data$amount)
# Standardize variables
new_data = scale(new_data)
head(new_data)
## recency frequency amount
## 10 2.3819788 -0.6068923 -0.2357955
## 80 -0.8415073 1.4360863 0.8943622
## 90 -0.4577590 2.4575756 1.5238194
## 120 0.1368198 -0.6068923 -0.7640251
## 130 1.5876660 -0.2663959 0.4296952
## 160 1.5811931 -0.2663959 -0.2357955
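# Optional peek: scale() stores the centering and scaling constants as attributes,
# which would allow new customers to be standardized with the same parameters
attr(new_data, "scaled:center")
attr(new_data, "scaled:scale")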
# Compute the distance matrix on the standardized data
# On the full data set this will most likely exhaust memory on most machines,
# so the call is left commented out (a rough size estimate follows)
# d = dist(new_data)
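# Rough size estimate (optional sketch; n_cust is just a scratch variable):
# dist() stores n*(n-1)/2 pairwise distances as 8-byte doubles
n_cust = nrow(new_data)
print(n_cust * (n_cust - 1) / 2 * 8 / 1024^3)   # approximate size in GB, about 1.3 here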
# Take a 10% systematic sample (every 10th customer; the data contains 18,417 customers)
sample = seq(1, nrow(customers), by = 10)
head(sample)
## [1] 1 11 21 31 41 51
customers_sample = customers[sample, ]
new_data_sample = new_data[sample, ]
# Compute the distance matrix on the standardized sample
d = dist(new_data_sample)
# Perform hierarchical clustering on the distance matrix
c = hclust(d, method = "ward.D2")
# Plot the dendrogram
plot(c)
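# Optional sketch: outline where a 9-cluster cut falls on the dendrogram just plotted
rect.hclust(c, k = 9, border = "red")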
# Cut at 9 segments
members = cutree(c, k = 9)
# Show the segment of the first 30 customers, then a frequency table of segment sizes
members[1:30]
## 10 260 510 850 1040 1430 1860 2160 2380 2700 3000 3140 3650 3920 4240
## 1 2 1 3 4 1 4 5 6 7 7 5 3 1 2
## 4470 4710 4910 5230 5520 5710 5920 6080 6240 6410 6600 6750 6940 7100 7330
## 3 6 7 4 5 1 5 8 5 6 1 3 3 7 2
table(members)
## members
## 1 2 3 4 5 6 7 8 9
## 309 212 130 306 49 83 236 317 200
# Show profile of each segment
aggdata = data.frame(aggregate(customers_sample[, 2:4], by = list(members), mean))
aggdata
## Group.1 recency frequency amount
## 1 1 2563.5696 1.284790 37.78474
## 2 2 2684.6258 1.311321 16.26336
## 3 3 193.9795 10.615385 42.01521
## 4 4 162.4379 2.398693 41.10641
## 5 5 2567.3741 1.857143 214.86197
## 6 6 448.9116 6.578313 261.21914
## 7 7 1144.0282 4.466102 29.85421
## 8 8 923.1630 1.280757 22.43357
## 9 9 799.6733 1.400000 87.42917
symbols(aggdata$frequency, aggdata$recency, circles = aggdata$amount,
        inches = 0.2, fg = "white", bg = "red",
        main = "Customer segmentation", xlab = "Frequency", ylab = "Recency")
# Cut at 4 segments
members = cutree(c, k = 4)
# Show profile of each segment
aggdata2 = data.frame(aggregate(customers_sample[, 2:4], by = list(members), mean))
aggdata2
## Group.1 recency frequency amount
## 1 1 2612.8285 1.295585 29.02748
## 2 2 193.9795 10.615385 42.01521
## 3 3 712.8514 2.554133 31.12409
## 4 4 972.8785 2.762048 149.68451
symbols(aggdata2$frequency, aggdata2$recency, circles = aggdata2$amount,
        inches = 0.2, fg = "white", bg = "red",
        main = "Customer segmentation", xlab = "Frequency", ylab = "Recency")
# Compute recency, date of first purchase, frequency, and average purchase amount
customers_2015 = sqldf("SELECT customer_id,
MIN(days_since) AS 'recency',
MAX(days_since) AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data GROUP BY 1")
head(customers_2015)
## customer_id recency first_purchase frequency amount
## 1 10 3829.3333 3829.333 1 30.00000
## 2 80 343.3333 3751.333 7 71.42857
## 3 90 758.3333 3783.333 10 115.80000
## 4 120 1401.3333 1401.333 1 20.00000
## 5 130 2970.3333 3710.333 2 50.00000
## 6 160 2963.3333 3577.333 2 30.00000
# Explore the data
summary(customers_2015)
## customer_id recency first_purchase frequency
## Min. : 10 Min. : 1.333 Min. : 1.333 Min. : 1.000
## 1st Qu.: 81990 1st Qu.: 244.333 1st Qu.: 988.333 1st Qu.: 1.000
## Median :136430 Median :1070.333 Median :2087.333 Median : 2.000
## Mean :137574 Mean :1253.371 Mean :1984.343 Mean : 2.782
## 3rd Qu.:195100 3rd Qu.:2130.333 3rd Qu.:2992.333 3rd Qu.: 3.000
## Max. :264200 Max. :4014.333 Max. :4016.333 Max. :45.000
## amount
## Min. : 5.00
## 1st Qu.: 21.67
## Median : 30.00
## Mean : 57.79
## 3rd Qu.: 50.00
## Max. :4500.00
hist(customers_2015$recency)
hist(customers_2015$frequency)
hist(customers_2015$amount)
hist(customers_2015$amount, breaks = 100)
# Simple 2-segment solution based on recency alone (the string "NA" serves as a placeholder label, not a missing value)
customers_2015$segment = ifelse(test = customers_2015$recency > 365*3, yes = "inactive", no = "NA")
table(customers_2015$segment)
##
## inactive NA
## 9158 9259
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 inactive 2178.4442 2546.502 1.814479 48.11277
## 2 NA 338.3893 1428.317 3.739713 67.36760
# A more complex 3-segment solution based on recency alone
customers_2015$segment = ifelse(test = customers_2015$recency > 365*3,
yes = "inactive",
no = ifelse(test = customers_2015$recency > 365*2,
yes = "cold",
no = "NA"))
table(customers_2015$segment)
##
## cold inactive NA
## 1903 9158 7356
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 cold 858.1147 1432.451 2.303205 51.73989
## 2 inactive 2178.4442 2546.502 1.814479 48.11277
## 3 NA 203.9361 1427.248 4.111338 71.41050
# The same 2-segment solution, implemented with the which() function
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
table(customers_2015$segment)
##
## inactive NA
## 9158 9259
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 inactive 2178.4442 2546.502 1.814479 48.11277
## 2 NA 338.3893 1428.317 3.739713 67.36760
# More complex 4-segment solution using which()
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
table(customers_2015$segment)
##
## active cold inactive warm
## 5398 1903 9158 1958
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 active 100.0740 1466.177 4.560763 72.08094
## 2 cold 858.1147 1432.451 2.303205 51.73989
## 3 inactive 2178.4442 2546.502 1.814479 48.11277
## 4 warm 490.2731 1319.924 2.872319 69.56215
# Complete segment solution using which(), where later tests refine the segments assigned above
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount < 100)] = "warm low value"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount >= 100)] = "warm high value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount < 100)] = "active low value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount >= 100)] = "active high value"
table(customers_2015$segment)
##
## active high value active low value cold inactive
## 573 3313 1903 9158
## new active new warm warm high value warm low value
## 1512 938 119 901
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 active high value 89.15358 1986.24258 5.888307 240.04574
## 2 active low value 108.69434 2004.13533 5.935406 40.72452
## 3 cold 858.11473 1432.45052 2.303205 51.73989
## 4 inactive 2178.44417 2546.50171 1.814479 48.11277
## 5 new active 85.32407 90.34722 1.045635 77.13385
## 6 new warm 509.63824 516.95593 1.044776 66.59903
## 7 warm high value 455.45938 2015.68627 4.714286 327.40746
## 8 warm low value 474.71069 2063.97262 4.531632 38.59193
# Re-order factor in a way that makes sense
customers_2015$segment = factor(x = customers_2015$segment, levels = c("inactive", "cold", "warm high value", "warm low value", "new warm", "active high value", "active low value", "new active"))
table(customers_2015$segment)
##
## inactive cold warm high value warm low value
## 9158 1903 119 901
## new warm active high value active low value new active
## 938 573 3313 1512
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 inactive 2178.44417 2546.50171 1.814479 48.11277
## 2 cold 858.11473 1432.45052 2.303205 51.73989
## 3 warm high value 455.45938 2015.68627 4.714286 327.40746
## 4 warm low value 474.71069 2063.97262 4.531632 38.59193
## 5 new warm 509.63824 516.95593 1.044776 66.59903
## 6 active high value 89.15358 1986.24258 5.888307 240.04574
## 7 active low value 108.69434 2004.13533 5.935406 40.72452
## 8 new active 85.32407 90.34722 1.045635 77.13385
# Compute recency, first purchase, frequency, and average purchase amount as they stood one year ago
# (shift days_since by 365 and exclude purchases made during the most recent year)
customers_2014 = sqldf("SELECT customer_id,
MIN(days_since) - 365 AS 'recency',
MAX(days_since) - 365 AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data
WHERE days_since > 365
GROUP BY 1")
# Complete segment solution using which(), where later tests refine the segments assigned above
customers_2014$segment = "NA"
customers_2014$segment[which(customers_2014$recency > 365*3)] = "inactive"
customers_2014$segment[which(customers_2014$recency <= 365*3 & customers_2014$recency > 365*2)] = "cold"
customers_2014$segment[which(customers_2014$recency <= 365*2 & customers_2014$recency > 365*1)] = "warm"
customers_2014$segment[which(customers_2014$recency <= 365)] = "active"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$first_purchase <= 365*2)] = "new warm"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount < 100)] = "warm low value"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount >= 100)] = "warm high value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$first_purchase <= 365)] = "new active"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount < 100)] = "active low value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount >= 100)] = "active high value"
# Re-order factor in a way that makes sense
customers_2014$segment = factor(x = customers_2014$segment, levels = c("inactive", "cold", "warm high value", "warm low value", "new warm", "active high value", "active low value", "new active"))
# Show segmentation results
table(customers_2014$segment)
##
## inactive cold warm high value warm low value
## 7512 2153 111 956
## new warm active high value active low value new active
## 1250 475 3011 1437
pie(table(customers_2014$segment), col = rainbow(24))
aggregate(x = customers_2014[, 2:5], by = list(customers_2014$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 inactive 2058.77463 2353.3702 1.730964 48.11120
## 2 cold 866.95015 1565.7774 2.254064 51.11460
## 3 warm high value 461.53153 1879.1441 4.414414 187.84911
## 4 warm low value 470.99442 1945.7601 4.361925 37.38206
## 5 new warm 497.65093 505.2133 1.057600 51.36765
## 6 active high value 85.67228 1838.3839 5.696842 261.90216
## 7 active low value 98.42433 1796.6847 5.633677 40.45917
## 8 new active 132.42032 143.2046 1.070981 69.72516
# Compute how much revenue each segment generated in 2015
# Note that customers who made no purchase in 2015 do NOT appear in this table
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
revenue_2015 = sqldf("SELECT customer_id, SUM(purchase_amount) AS 'revenue_2015'
FROM data
WHERE year_of_purchase = 2015
GROUP BY 1")
revenue_2015$revenue_2015 = as.numeric(revenue_2015$revenue_2015)
summary(revenue_2015)
## customer_id revenue_2015
## Min. : 80 Min. : 5.00
## 1st Qu.:104105 1st Qu.: 30.00
## Median :185495 Median : 50.00
## Mean :167782 Mean : 88.62
## 3rd Qu.:246058 3rd Qu.: 85.00
## Max. :264200 Max. :4500.00
# Merge 2015 customers and 2015 revenue; all.x = TRUE keeps customers with no 2015 revenue, whose revenue is then set to 0
actual = merge(customers_2015, revenue_2015, all.x = TRUE)
actual$revenue_2015[is.na(actual$revenue_2015)] = 0
# Show average 2015 revenue per customer, by segment (actual and customers_2015 are both ordered by customer_id, so the grouping lines up)
aggregate(x = actual$revenue_2015, by = list(customers_2015$segment), mean)
## Group.1 x
## 1 inactive 0.00000
## 2 cold 0.00000
## 3 warm high value 0.00000
## 4 warm low value 0.00000
## 5 new warm 0.00000
## 6 active high value 323.56894
## 7 active low value 52.30604
## 8 new active 79.16614
# Merge 2014 customers with 2015 revenue: a forward-looking (predictive) view of how much each 2014 segment generated the following year
forward = merge(customers_2014, revenue_2015, all.x = TRUE)
forward$revenue_2015[is.na(forward$revenue_2015)] = 0
# Show average revenue per customer and per segment
r = aggregate(x = forward$revenue_2015, by = list(customers_2014$segment), mean)
print(r)
## Group.1 x
## 1 inactive 2.949466
## 2 cold 6.108221
## 3 warm high value 114.459459
## 4 warm low value 13.494770
## 5 new warm 5.064000
## 6 active high value 254.077895
## 7 active low value 41.896556
## 8 new active 31.046625
# Re-order and display results
r = r[order(r$x, decreasing = TRUE), ]
print(r)
## Group.1 x
## 6 active high value 254.077895
## 3 warm high value 114.459459
## 7 active low value 41.896556
## 8 new active 31.046625
## 4 warm low value 13.494770
## 2 cold 6.108221
## 5 new warm 5.064000
## 1 inactive 2.949466
barplot(r$x, names.arg = r$Group.1)
Losing one "active high value" customer is therefore roughly equivalent to losing eight "new active" customers: retaining high-value customers should be the priority.
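# A quick check of that eight-to-one ratio, using the per-segment averages in r
print(r$x[r$Group.1 == "active high value"] / r$x[r$Group.1 == "new active"])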
# Compute RFM variables as of a year ago
customers_2014 = sqldf("SELECT customer_id,
MIN(days_since) - 365 AS 'recency',
MAX(days_since) - 365 AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'avg_amount',
MAX(purchase_amount) AS 'max_amount'
FROM data
WHERE days_since > 365
GROUP BY 1")
# Compute revenues generated by customers in 2015
revenue_2015 = sqldf("SELECT customer_id, SUM(purchase_amount) AS 'revenue_2015'
FROM data
WHERE year_of_purchase = 2015
GROUP BY 1")
# Merge 2015 customers and 2015 revenue
in_sample = merge(customers_2014, revenue_2015, all.x = TRUE)
in_sample$revenue_2015[is.na(in_sample$revenue_2015)] = 0
in_sample$active_2015 = as.numeric(in_sample$revenue_2015 > 0)
# Display calibration (in-sample) data
head(in_sample)
## customer_id recency first_purchase frequency avg_amount max_amount
## 1 10 3464.3333 3464.333 1 30.0 30
## 2 80 302.3333 3386.333 6 70.0 80
## 3 90 393.3333 3418.333 10 115.8 153
## 4 120 1036.3333 1036.333 1 20.0 20
## 5 130 2605.3333 3345.333 2 50.0 60
## 6 160 2598.3333 3212.333 2 30.0 30
## revenue_2015 active_2015
## 1 0 0
## 2 80 1
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
summary(in_sample)
## customer_id recency first_purchase frequency
## Min. : 10 Min. : 1.333 Min. : 1.333 Min. : 1.000
## 1st Qu.: 77710 1st Qu.: 258.333 1st Qu.: 796.333 1st Qu.: 1.000
## Median :127140 Median : 890.333 Median :1891.333 Median : 2.000
## Mean :127315 Mean :1122.920 Mean :1788.744 Mean : 2.665
## 3rd Qu.:181800 3rd Qu.:1868.333 3rd Qu.:2695.333 3rd Qu.: 3.000
## Max. :245840 Max. :3649.333 Max. :3651.333 Max. :40.000
## avg_amount max_amount revenue_2015 active_2015
## Min. : 5.00 Min. : 5.00 Min. : 0.00 Min. :0.0000
## 1st Qu.: 21.25 1st Qu.: 25.00 1st Qu.: 0.00 1st Qu.:0.0000
## Median : 30.00 Median : 30.00 Median : 0.00 Median :0.0000
## Mean : 55.53 Mean : 65.72 Mean : 21.22 Mean :0.2299
## 3rd Qu.: 50.00 3rd Qu.: 60.00 3rd Qu.: 0.00 3rd Qu.:0.0000
## Max. :4500.00 Max. :4500.00 Max. :4500.00 Max. :1.0000
# Calibrate the probability model: which 2014 customers made a purchase in 2015
library(nnet)
prob.model = multinom(formula = active_2015 ~ recency + first_purchase + frequency + avg_amount + max_amount, data = in_sample)
## # weights: 7 (6 variable)
## initial value 11717.653087
## iter 10 value 6233.650622
## final value 6184.465956
## converged
coef = summary(prob.model)$coefficients
std = summary(prob.model)$standard.errors
print(coef)
## (Intercept) recency first_purchase frequency avg_amount
## -0.5324757179 -0.0019651140 -0.0000116331 0.2195100049 0.0004159610
## max_amount
## -0.0001558093
print(std)
## (Intercept) recency first_purchase frequency avg_amount
## 4.411568e-02 6.000518e-05 3.925582e-05 1.479178e-02 3.634840e-04
## max_amount
## 2.713806e-04
print(coef / std)
## (Intercept) recency first_purchase frequency avg_amount
## -12.0699884 -32.7490735 -0.2963409 14.8399966 1.1443722
## max_amount
## -0.5741357
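# Optional: two-sided p-values for the z-ratios printed above, using the usual
# normal approximation
print(2 * pnorm(abs(coef / std), lower.tail = FALSE))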
# For the monetary model, select only those who made a purchase
z = which(in_sample$active_2015 == 1)
head(in_sample[z, ])
## customer_id recency first_purchase frequency avg_amount max_amount
## 2 80 302.33333 3386.333 6 70.00000 80
## 18 480 16.33333 3313.333 11 62.27273 235
## 30 830 267.33333 3374.333 6 48.33333 60
## 31 850 62.33333 3051.333 8 28.12500 30
## 32 860 267.33333 3643.333 9 53.33333 60
## 39 1020 1462.33333 1827.333 3 73.33333 100
## revenue_2015 active_2015
## 2 80 1
## 18 45 1
## 30 50 1
## 31 60 1
## 32 60 1
## 39 120 1
summary(in_sample[z, ])
## customer_id recency first_purchase frequency
## Min. : 80 Min. : 1.333 Min. : 1.333 Min. : 1.000
## 1st Qu.: 78590 1st Qu.: 23.333 1st Qu.: 650.083 1st Qu.: 2.000
## Median :143550 Median : 97.333 Median :1604.333 Median : 4.000
## Mean :134907 Mean : 306.680 Mean :1636.497 Mean : 4.741
## 3rd Qu.:194362 3rd Qu.: 328.333 3rd Qu.:2666.333 3rd Qu.: 7.000
## Max. :236660 Max. :3544.333 Max. :3647.333 Max. :40.000
## avg_amount max_amount revenue_2015 active_2015
## Min. : 5.00 Min. : 5.00 Min. : 5.0 Min. :1
## 1st Qu.: 30.00 1st Qu.: 30.00 1st Qu.: 30.0 1st Qu.:1
## Median : 40.00 Median : 50.00 Median : 50.0 Median :1
## Mean : 67.78 Mean : 88.33 Mean : 92.3 Mean :1
## 3rd Qu.: 60.00 3rd Qu.: 80.00 3rd Qu.: 100.0 3rd Qu.:1
## Max. :4500.00 Max. :4500.00 Max. :4500.0 Max. :1
# Calibrate the monetary model (version 1)
amount.model = lm(formula = revenue_2015 ~ avg_amount + max_amount, data = in_sample[z, ])
summary(amount.model)
##
## Call:
## lm(formula = revenue_2015 ~ avg_amount + max_amount, data = in_sample[z,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2138.5 -20.1 -16.6 0.1 3361.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.74709 2.38112 8.713 <2e-16 ***
## avg_amount 0.67486 0.03280 20.575 <2e-16 ***
## max_amount 0.29225 0.02363 12.367 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 136.6 on 3883 degrees of freedom
## Multiple R-squared: 0.6054, Adjusted R-squared: 0.6052
## F-statistic: 2979 on 2 and 3883 DF, p-value: < 2.2e-16
# Plot the results of the monetary model
plot(x = in_sample[z, ]$revenue_2015, y = amount.model$fitted.values)
# Re-calibrate the monetary model, using a log-transform (version 2)
amount.model = lm(formula = log(revenue_2015) ~ log(avg_amount) + log(max_amount), data = in_sample[z, ])
summary(amount.model)
##
## Call:
## lm(formula = log(revenue_2015) ~ log(avg_amount) + log(max_amount),
## data = in_sample[z, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7866 -0.1811 -0.0770 0.1852 3.5656
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.37000 0.04003 9.242 <2e-16 ***
## log(avg_amount) 0.54881 0.04167 13.171 <2e-16 ***
## log(max_amount) 0.38813 0.03796 10.224 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4781 on 3883 degrees of freedom
## Multiple R-squared: 0.6927, Adjusted R-squared: 0.6926
## F-statistic: 4377 on 2 and 3883 DF, p-value: < 2.2e-16
# Plot the results of this new monetary model
plot(x = log(in_sample[z, ]$revenue_2015), y = amount.model$fitted.values)
# Compute RFM variables as of today
library(sqldf)
customers_2015 = sqldf("SELECT customer_id,
MIN(days_since) AS 'recency',
MAX(days_since) AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'avg_amount',
MAX(purchase_amount) AS 'max_amount'
FROM data GROUP BY 1")
# Predict the target variables based on today's data
customers_2015$prob_predicted = predict(object = prob.model, newdata = customers_2015, type = "probs")
customers_2015$revenue_predicted = exp(predict(object = amount.model, newdata = customers_2015))
customers_2015$score_predicted = customers_2015$prob_predicted * customers_2015$revenue_predicted
summary(customers_2015$prob_predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000263 0.012630 0.106200 0.225000 0.397800 0.999900
summary(customers_2015$revenue_predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.54 29.00 35.05 65.63 57.30 3833.00
summary(customers_2015$score_predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0033 0.4557 4.5560 18.8300 17.9600 2854.0000
hist(customers_2015$score_predicted)
hist(log(customers_2015$score_predicted))
# How many customers have an expected revenue of more than $50
z = which(customers_2015$score_predicted > 50)
print(length(z))
## [1] 1323
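# Optional sketch: rank customers by predicted score (probability of being active
# times predicted revenue) and inspect the top of the list; top_scores is just a
# scratch data frame used for this illustration
top_scores = customers_2015[order(customers_2015$score_predicted, decreasing = TRUE), ]
head(top_scores[, c("customer_id", "prob_predicted", "revenue_predicted", "score_predicted")])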
# Segment customers in 2015
customers_2015 = sqldf("SELECT customer_id,
MIN(days_since) AS 'recency',
MAX(days_since) AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data GROUP BY 1")
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount < 100)] = "warm low value"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount >= 100)] = "warm high value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount < 100)] = "active low value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount >= 100)] = "active high value"
customers_2015$segment = factor(x = customers_2015$segment, levels = c("inactive", "cold", "warm high value", "warm low value", "new warm", "active high value", "active low value", "new active"))
# Segment customers in 2014
customers_2014 = sqldf("SELECT customer_id,
MIN(days_since) - 365 AS 'recency',
MAX(days_since) - 365 AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data
WHERE days_since > 365
GROUP BY 1")
customers_2014$segment = "NA"
customers_2014$segment[which(customers_2014$recency > 365*3)] = "inactive"
customers_2014$segment[which(customers_2014$recency <= 365*3 & customers_2014$recency > 365*2)] = "cold"
customers_2014$segment[which(customers_2014$recency <= 365*2 & customers_2014$recency > 365*1)] = "warm"
customers_2014$segment[which(customers_2014$recency <= 365)] = "active"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$first_purchase <= 365*2)] = "new warm"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount < 100)] = "warm low value"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount >= 100)] = "warm high value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$first_purchase <= 365)] = "new active"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount < 100)] = "active low value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount >= 100)] = "active high value"
customers_2014$segment = factor(x = customers_2014$segment, levels = c("inactive", "cold", "warm high value", "warm low value", "new warm", "active high value", "active low value", "new active"))
# Compute transition matrix
new_data = merge(x = customers_2014, y = customers_2015, by = "customer_id", all.x = TRUE)
head(new_data)
## customer_id recency.x first_purchase.x frequency.x amount.x
## 1 10 3464.3333 3464.333 1 30.0
## 2 80 302.3333 3386.333 6 70.0
## 3 90 393.3333 3418.333 10 115.8
## 4 120 1036.3333 1036.333 1 20.0
## 5 130 2605.3333 3345.333 2 50.0
## 6 160 2598.3333 3212.333 2 30.0
## segment.x recency.y first_purchase.y frequency.y amount.y
## 1 inactive 3829.3333 3829.333 1 30.00000
## 2 active low value 343.3333 3751.333 7 71.42857
## 3 warm high value 758.3333 3783.333 10 115.80000
## 4 cold 1401.3333 1401.333 1 20.00000
## 5 inactive 2970.3333 3710.333 2 50.00000
## 6 inactive 2963.3333 3577.333 2 30.00000
## segment.y
## 1 inactive
## 2 active low value
## 3 cold
## 4 inactive
## 5 inactive
## 6 inactive
transition = table(new_data$segment.x, new_data$segment.y)
print(transition)
##
## inactive cold warm high value warm low value new warm
## inactive 7227 0 0 0 0
## cold 1931 0 0 0 0
## warm high value 0 75 0 0 0
## warm low value 0 689 0 0 0
## new warm 0 1139 0 0 0
## active high value 0 0 119 0 0
## active low value 0 0 0 901 0
## new active 0 0 0 0 938
##
## active high value active low value new active
## inactive 35 250 0
## cold 22 200 0
## warm high value 35 1 0
## warm low value 1 266 0
## new warm 15 96 0
## active high value 354 2 0
## active low value 22 2088 0
## new active 89 410 0
# Divide each row by its sum, turning counts into transition probabilities
transition = transition / rowSums(transition)
print(transition)
##
## inactive cold warm high value warm low value
## inactive 0.962060703 0.000000000 0.000000000 0.000000000
## cold 0.896888063 0.000000000 0.000000000 0.000000000
## warm high value 0.000000000 0.675675676 0.000000000 0.000000000
## warm low value 0.000000000 0.720711297 0.000000000 0.000000000
## new warm 0.000000000 0.911200000 0.000000000 0.000000000
## active high value 0.000000000 0.000000000 0.250526316 0.000000000
## active low value 0.000000000 0.000000000 0.000000000 0.299236134
## new active 0.000000000 0.000000000 0.000000000 0.000000000
##
## new warm active high value active low value
## inactive 0.000000000 0.004659212 0.033280085
## cold 0.000000000 0.010218300 0.092893637
## warm high value 0.000000000 0.315315315 0.009009009
## warm low value 0.000000000 0.001046025 0.278242678
## new warm 0.000000000 0.012000000 0.076800000
## active high value 0.000000000 0.745263158 0.004210526
## active low value 0.000000000 0.007306543 0.693457323
## new active 0.652748782 0.061934586 0.285316632
##
## new active
## inactive 0.000000000
## cold 0.000000000
## warm high value 0.000000000
## warm low value 0.000000000
## new warm 0.000000000
## active high value 0.000000000
## active low value 0.000000000
## new active 0.000000000
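# Sanity check (optional): after the division, every row should sum to 1
print(rowSums(transition))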
# Initialize a matrix to hold the number of customers in each segment, today (2015) and for each of the next 10 periods (2016-2025)
segments = matrix(nrow = 8, ncol = 11)
segments[, 1] = table(customers_2015$segment)
colnames(segments) = 2015:2025
row.names(segments) = levels(customers_2015$segment)
print(segments)
## 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
## inactive 9158 NA NA NA NA NA NA NA NA NA NA
## cold 1903 NA NA NA NA NA NA NA NA NA NA
## warm high value 119 NA NA NA NA NA NA NA NA NA NA
## warm low value 901 NA NA NA NA NA NA NA NA NA NA
## new warm 938 NA NA NA NA NA NA NA NA NA NA
## active high value 573 NA NA NA NA NA NA NA NA NA NA
## active low value 3313 NA NA NA NA NA NA NA NA NA NA
## new active 1512 NA NA NA NA NA NA NA NA NA NA
# Project the segment sizes forward, one period at a time
for (i in 2:11) {
segments[, i] = segments[, i-1] %*% transition
}
segments
## 2015 2016 2017 2018 2019
## inactive 9158 10517.3299 11539.4037 12636.0028 12940.3737
## cold 1903 1584.4719 1710.7998 873.8794 820.9486
## warm high value 119 143.5516 164.5264 159.9884 156.4070
## warm low value 901 991.3693 1058.2780 989.0900 937.7240
## new warm 938 986.9562 0.0000 0.0000 0.0000
## active high value 573 656.7229 638.6093 624.3136 607.4594
## active low value 3313 3536.5982 3305.3828 3133.7259 2954.0873
## new active 1512 0.0000 0.0000 0.0000 0.0000
## 2020 2021 2022 2023 2024
## inactive 13185.7240 13386.3927 13542.1427 13663.9595 13759.3216
## cold 781.5087 739.9143 708.6683 684.3252 665.2866
## warm high value 152.1846 148.6323 145.5779 142.9492 140.6959
## warm low value 883.9697 843.9455 813.0326 789.0806 770.5582
## new warm 0.0000 0.0000 0.0000 0.0000 0.0000
## active high value 593.2803 581.0884 570.5954 561.6014 553.9112
## active low value 2820.3328 2717.0267 2636.9831 2575.0841 2527.2265
## new active 0.0000 0.0000 0.0000 0.0000 0.0000
## 2025
## inactive 13833.9902
## cold 650.4148
## warm high value 138.7693
## warm low value 756.2375
## new warm 0.0000
## active high value 547.3502
## active low value 2490.2379
## new active 0.0000
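# Optional check: the 2025 column is simply the 2015 segment counts pushed through
# the transition matrix ten times (a plain Markov-chain projection); proj is a
# scratch variable used only for this check
proj = segments[, 1]
for (k in 1:10) proj = proj %*% transition
print(round(proj))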
# Plot how the inactive and active high value segments evolve over time
barplot(segments["inactive", ])
barplot(segments["active high value", ])
# Display how segments will evolve over time
print(round(segments))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023
## inactive 9158 10517 11539 12636 12940 13186 13386 13542 13664
## cold 1903 1584 1711 874 821 782 740 709 684
## warm high value 119 144 165 160 156 152 149 146 143
## warm low value 901 991 1058 989 938 884 844 813 789
## new warm 938 987 0 0 0 0 0 0 0
## active high value 573 657 639 624 607 593 581 571 562
## active low value 3313 3537 3305 3134 2954 2820 2717 2637 2575
## new active 1512 0 0 0 0 0 0 0 0
## 2024 2025
## inactive 13759 13834
## cold 665 650
## warm high value 141 139
## warm low value 771 756
## new warm 0 0
## active high value 554 547
## active low value 2527 2490
## new active 0 0
# Average yearly revenue per customer for each segment
# These values come directly from module 2, lines 160-161 (the average 2015 revenue per segment computed above)
yearly_revenue = c(0, 0, 0, 0, 0, 323.57, 52.31, 79.17)
# Compute projected revenue per segment (per-customer revenue times projected segment size)
revenue_per_segment = yearly_revenue * segments
print(revenue_per_segment)
## 2015 2016 2017 2018 2019 2020
## inactive 0.0 0.0 0.0 0.0 0.0 0.0
## cold 0.0 0.0 0.0 0.0 0.0 0.0
## warm high value 0.0 0.0 0.0 0.0 0.0 0.0
## warm low value 0.0 0.0 0.0 0.0 0.0 0.0
## new warm 0.0 0.0 0.0 0.0 0.0 0.0
## active high value 185405.6 212495.8 206634.8 202009.1 196555.6 191967.7
## active low value 173303.0 184999.5 172904.6 163925.2 154528.3 147531.6
## new active 119705.0 0.0 0.0 0.0 0.0 0.0
## 2021 2022 2023 2024 2025
## inactive 0.0 0.0 0.0 0.0 0.0
## cold 0.0 0.0 0.0 0.0 0.0
## warm high value 0.0 0.0 0.0 0.0 0.0
## warm low value 0.0 0.0 0.0 0.0 0.0
## new warm 0.0 0.0 0.0 0.0 0.0
## active high value 188022.8 184627.5 181717.4 179229.1 177106.1
## active low value 142127.7 137940.6 134702.6 132199.2 130264.3
## new active 0.0 0.0 0.0 0.0 0.0
# Compute total projected yearly revenue (note: this overwrites the per-customer yearly_revenue vector above)
yearly_revenue = colSums(revenue_per_segment)
print(round(yearly_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
## 478414 397495 379539 365934 351084 339499 330150 322568 316420 311428
## 2025
## 307370
barplot(yearly_revenue)
# Compute cumulative revenue
cumulated_revenue = cumsum(yearly_revenue)
print(round(cumulated_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023
## 478414 875909 1255448 1621383 1972467 2311966 2642116 2964685 3281105
## 2024 2025
## 3592533 3899903
barplot(cumulated_revenue)
# Create the vector of discount factors (10% annual rate, one factor per year)
discount_rate = 0.10
discount = 1 / ((1 + discount_rate) ^ ((1:11) - 1))
print(discount)
## [1] 1.0000000 0.9090909 0.8264463 0.7513148 0.6830135 0.6209213 0.5644739
## [8] 0.5131581 0.4665074 0.4240976 0.3855433
# Compute discounted yearly revenue
disc_yearly_revenue = yearly_revenue * discount
print(round(disc_yearly_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
## 478414 361359 313669 274932 239795 210802 186361 165528 147612 132076
## 2025
## 118505
barplot(disc_yearly_revenue)
lines(yearly_revenue)
# Compute discounted cumulative revenue
disc_cumulated_revenue = cumsum(disc_yearly_revenue)
print(round(disc_cumulated_revenue))
## 2015 2016 2017 2018 2019 2020 2021 2022 2023
## 478414 839773 1153442 1428374 1668169 1878971 2065333 2230861 2378473
## 2024 2025
## 2510549 2629054
barplot(disc_cumulated_revenue)
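# Optional check: the final cumulative figure above is the net present value of the
# projected revenue stream, i.e. the sum of yearly revenue times the discount factors
print(sum(yearly_revenue * discount))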
# What is the customer database worth? Discounted future revenue, excluding the 2015 revenue that has already been earned
print(disc_cumulated_revenue[11] - yearly_revenue[1])
## 2025
## 2150640