MODULE 1 - STATISTICAL SEGMENTATION

COMPUTING RECENCY, FREQUENCY, MONETARY VALUE

# Load text file into local variable called 'data'
data = read.delim(file = 'purchases.txt', header = FALSE, sep = '\t', dec = '.')

# Add headers and interpret the last column as a date, extract year of purchase
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$days_since       = as.numeric(difftime(time1 = "2016-01-01",
                                            time2 = data$date_of_purchase,
                                            units = "days"))
# Display the data after transformation
head(data)
##   customer_id purchase_amount date_of_purchase days_since
## 1         760              25       2009-11-06  2247.3333
## 2         860              50       2012-09-28  1190.3333
## 3        1200             100       2005-10-25  3720.3333
## 4        1420              50       2009-07-09  2367.3333
## 5        1940              70       2013-01-25  1071.3333
## 6        1960              40       2013-10-29   794.3333
summary(data)
##   customer_id     purchase_amount   date_of_purchase    
##  Min.   :    10   Min.   :   5.00   Min.   :2005-01-02  
##  1st Qu.: 57720   1st Qu.:  25.00   1st Qu.:2009-01-17  
##  Median :102440   Median :  30.00   Median :2011-11-23  
##  Mean   :108935   Mean   :  62.34   Mean   :2011-07-14  
##  3rd Qu.:160525   3rd Qu.:  60.00   3rd Qu.:2013-12-29  
##  Max.   :264200   Max.   :4500.00   Max.   :2015-12-31  
##    days_since      
##  Min.   :   1.333  
##  1st Qu.: 733.333  
##  Median :1500.333  
##  Mean   :1632.273  
##  3rd Qu.:2540.333  
##  Max.   :4016.333
# Compute key marketing indicators using SQL language
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
# Compute recency, frequency, and average purchase amount
customers = sqldf("SELECT customer_id,
                          MIN(days_since) AS 'recency',
                          COUNT(*) AS 'frequency',
                          AVG(purchase_amount) AS 'amount'
                   FROM data GROUP BY 1")
## Loading required package: tcltk
# Explore the data
head(customers)
##   customer_id   recency frequency    amount
## 1          10 3829.3333         1  30.00000
## 2          80  343.3333         7  71.42857
## 3          90  758.3333        10 115.80000
## 4         120 1401.3333         1  20.00000
## 5         130 2970.3333         2  50.00000
## 6         160 2963.3333         2  30.00000
summary(customers)
##   customer_id        recency           frequency          amount       
##  Min.   :    10   Min.   :   1.333   Min.   : 1.000   Min.   :   5.00  
##  1st Qu.: 81990   1st Qu.: 244.333   1st Qu.: 1.000   1st Qu.:  21.67  
##  Median :136430   Median :1070.333   Median : 2.000   Median :  30.00  
##  Mean   :137574   Mean   :1253.371   Mean   : 2.782   Mean   :  57.79  
##  3rd Qu.:195100   3rd Qu.:2130.333   3rd Qu.: 3.000   3rd Qu.:  50.00  
##  Max.   :264200   Max.   :4014.333   Max.   :45.000   Max.   :4500.00
hist(customers$recency)

hist(customers$frequency)

hist(customers$amount)

hist(customers$amount, breaks = 100)

PREPARING AND TRANSFORMING DATA

# Copy customer data into new data frame
new_data = customers

# Remove customer id as a variable, store it as row names
head(new_data)
##   customer_id   recency frequency    amount
## 1          10 3829.3333         1  30.00000
## 2          80  343.3333         7  71.42857
## 3          90  758.3333        10 115.80000
## 4         120 1401.3333         1  20.00000
## 5         130 2970.3333         2  50.00000
## 6         160 2963.3333         2  30.00000
row.names(new_data) = new_data$customer_id
new_data$customer_id = NULL
head(new_data)
##       recency frequency    amount
## 10  3829.3333         1  30.00000
## 80   343.3333         7  71.42857
## 90   758.3333        10 115.80000
## 120 1401.3333         1  20.00000
## 130 2970.3333         2  50.00000
## 160 2963.3333         2  30.00000
# Take the log-transform of the amount, and plot
new_data$amount = log(new_data$amount)
hist(new_data$amount)

# Standardize variables
new_data = scale(new_data)
head(new_data)
##        recency  frequency     amount
## 10   2.3819788 -0.6068923 -0.2357955
## 80  -0.8415073  1.4360863  0.8943622
## 90  -0.4577590  2.4575756  1.5238194
## 120  0.1368198 -0.6068923 -0.7640251
## 130  1.5876660 -0.2663959  0.4296952
## 160  1.5811931 -0.2663959 -0.2357955

RUNNING A HIERARCHICAL SEGMENTATION

# Compute distance metrics on standardized data
# This will likely generate an error on most machines
# d = dist(new_data)

# Take a 10% sample
sample = seq(1, 18417, by = 10)
head(sample)
## [1]  1 11 21 31 41 51
customers_sample = customers[sample, ]
new_data_sample  = new_data[sample, ]

# Compute distance metrics on standardized data
d = dist(new_data_sample)

# Perform hierarchical clustering on distance metrics
c = hclust(d, method="ward.D2")

# Plot de dendogram
plot(c)

# Cut at 9 segments
members = cutree(c, k = 9)

# Show 30 first customers, frequency table
members[1:30]
##   10  260  510  850 1040 1430 1860 2160 2380 2700 3000 3140 3650 3920 4240 
##    1    2    1    3    4    1    4    5    6    7    7    5    3    1    2 
## 4470 4710 4910 5230 5520 5710 5920 6080 6240 6410 6600 6750 6940 7100 7330 
##    3    6    7    4    5    1    5    8    5    6    1    3    3    7    2
table(members)
## members
##   1   2   3   4   5   6   7   8   9 
## 309 212 130 306  49  83 236 317 200
# Show profile of each segment
aggdata = data.frame(aggregate(customers_sample[, 2:4], by = list(members), mean))
aggdata
##   Group.1   recency frequency    amount
## 1       1 2563.5696  1.284790  37.78474
## 2       2 2684.6258  1.311321  16.26336
## 3       3  193.9795 10.615385  42.01521
## 4       4  162.4379  2.398693  41.10641
## 5       5 2567.3741  1.857143 214.86197
## 6       6  448.9116  6.578313 261.21914
## 7       7 1144.0282  4.466102  29.85421
## 8       8  923.1630  1.280757  22.43357
## 9       9  799.6733  1.400000  87.42917
symbols(aggdata$frequency,aggdata$recency,circles=aggdata$amount, inches=.2, fg="white", bg="red",main= "Customer segmentation", xlab="Frequency", ylab= "Recency")

# Cut at 4 segments
members = cutree(c, k = 4)

# Show profile of each segment
aggdata2 = data.frame(aggregate(customers_sample[, 2:4], by = list(members), mean))
aggdata2
##   Group.1   recency frequency    amount
## 1       1 2612.8285  1.295585  29.02748
## 2       2  193.9795 10.615385  42.01521
## 3       3  712.8514  2.554133  31.12409
## 4       4  972.8785  2.762048 149.68451
symbols(aggdata2$frequency,aggdata2$recency,circles=aggdata2$amount, inches=.2, fg="white", bg="red",main= "Customer segmentation", xlab="Frequency", ylab= "Recency")

MODULE 2 - MANAGERIAL SEGMENTATION

COMPUTING RECENCY, FREQUENCY, MONETARY VALUE

# Compute recency, frequency, and average purchase amount
customers_2015 = sqldf("SELECT customer_id,
                               MIN(days_since) AS 'recency',
                               MAX(days_since) AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'amount'
                        FROM data GROUP BY 1")
head(customers_2015)
##   customer_id   recency first_purchase frequency    amount
## 1          10 3829.3333       3829.333         1  30.00000
## 2          80  343.3333       3751.333         7  71.42857
## 3          90  758.3333       3783.333        10 115.80000
## 4         120 1401.3333       1401.333         1  20.00000
## 5         130 2970.3333       3710.333         2  50.00000
## 6         160 2963.3333       3577.333         2  30.00000
# Explore the data
summary(customers_2015)
##   customer_id        recency         first_purchase       frequency     
##  Min.   :    10   Min.   :   1.333   Min.   :   1.333   Min.   : 1.000  
##  1st Qu.: 81990   1st Qu.: 244.333   1st Qu.: 988.333   1st Qu.: 1.000  
##  Median :136430   Median :1070.333   Median :2087.333   Median : 2.000  
##  Mean   :137574   Mean   :1253.371   Mean   :1984.343   Mean   : 2.782  
##  3rd Qu.:195100   3rd Qu.:2130.333   3rd Qu.:2992.333   3rd Qu.: 3.000  
##  Max.   :264200   Max.   :4014.333   Max.   :4016.333   Max.   :45.000  
##      amount       
##  Min.   :   5.00  
##  1st Qu.:  21.67  
##  Median :  30.00  
##  Mean   :  57.79  
##  3rd Qu.:  50.00  
##  Max.   :4500.00
hist(customers_2015$recency)

hist(customers_2015$frequency)

hist(customers_2015$amount)

hist(customers_2015$amount, breaks = 100)

CODING A MANAGERIAL SEGMENTATION

# Simple 2-segment solution based on recency alone
customers_2015$segment = ifelse(test = customers_2015$recency > 365*3, yes = "inactive", no = "NA")
table(customers_2015$segment)
## 
## inactive       NA 
##     9158     9259
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##    Group.1   recency first_purchase frequency   amount
## 1 inactive 2178.4442       2546.502  1.814479 48.11277
## 2       NA  338.3893       1428.317  3.739713 67.36760
# A more complex 3-segment solution based on recency alone
customers_2015$segment = ifelse(test = customers_2015$recency > 365*3,
                           yes = "inactive",
                           no = ifelse(test = customers_2015$recency > 365*2,
                                       yes = "cold",
                                       no = "NA"))
table(customers_2015$segment)
## 
##     cold inactive       NA 
##     1903     9158     7356
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##    Group.1   recency first_purchase frequency   amount
## 1     cold  858.1147       1432.451  2.303205 51.73989
## 2 inactive 2178.4442       2546.502  1.814479 48.11277
## 3       NA  203.9361       1427.248  4.111338 71.41050
# Simple 2-segment solution using the which statement
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
table(customers_2015$segment)
## 
## inactive       NA 
##     9158     9259
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##    Group.1   recency first_purchase frequency   amount
## 1 inactive 2178.4442       2546.502  1.814479 48.11277
## 2       NA  338.3893       1428.317  3.739713 67.36760
# More complex 4-segment solution using which
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
table(customers_2015$segment)
## 
##   active     cold inactive     warm 
##     5398     1903     9158     1958
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##    Group.1   recency first_purchase frequency   amount
## 1   active  100.0740       1466.177  4.560763 72.08094
## 2     cold  858.1147       1432.451  2.303205 51.73989
## 3 inactive 2178.4442       2546.502  1.814479 48.11277
## 4     warm  490.2731       1319.924  2.872319 69.56215
# Complete segment solution using which, and exploiting previous test as input
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount < 100)] = "warm low value"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount >= 100)] = "warm high value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount < 100)] = "active low value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount >= 100)] = "active high value"
table(customers_2015$segment)
## 
## active high value  active low value              cold          inactive 
##               573              3313              1903              9158 
##        new active          new warm   warm high value    warm low value 
##              1512               938               119               901
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##             Group.1    recency first_purchase frequency    amount
## 1 active high value   89.15358     1986.24258  5.888307 240.04574
## 2  active low value  108.69434     2004.13533  5.935406  40.72452
## 3              cold  858.11473     1432.45052  2.303205  51.73989
## 4          inactive 2178.44417     2546.50171  1.814479  48.11277
## 5        new active   85.32407       90.34722  1.045635  77.13385
## 6          new warm  509.63824      516.95593  1.044776  66.59903
## 7   warm high value  455.45938     2015.68627  4.714286 327.40746
## 8    warm low value  474.71069     2063.97262  4.531632  38.59193
# Re-order factor in a way that makes sense
customers_2015$segment = factor(x = customers_2015$segment, levels = c("inactive", "cold", "warm high value", "warm low value", "new warm", "active high value", "active low value", "new active"))
table(customers_2015$segment)
## 
##          inactive              cold   warm high value    warm low value 
##              9158              1903               119               901 
##          new warm active high value  active low value        new active 
##               938               573              3313              1512
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##             Group.1    recency first_purchase frequency    amount
## 1          inactive 2178.44417     2546.50171  1.814479  48.11277
## 2              cold  858.11473     1432.45052  2.303205  51.73989
## 3   warm high value  455.45938     2015.68627  4.714286 327.40746
## 4    warm low value  474.71069     2063.97262  4.531632  38.59193
## 5          new warm  509.63824      516.95593  1.044776  66.59903
## 6 active high value   89.15358     1986.24258  5.888307 240.04574
## 7  active low value  108.69434     2004.13533  5.935406  40.72452
## 8        new active   85.32407       90.34722  1.045635  77.13385

SEGMENTING A DATABASE RETROSPECTIVELY

# Compute recency, frequency, and average purchase amount
customers_2014 = sqldf("SELECT customer_id,
                               MIN(days_since) - 365 AS 'recency',
                               MAX(days_since) - 365 AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'amount'
                        FROM data
                        WHERE days_since > 365
                        GROUP BY 1")
# Complete segment solution using which, and exploiting previous test as input
customers_2014$segment = "NA"
customers_2014$segment[which(customers_2014$recency > 365*3)] = "inactive"
customers_2014$segment[which(customers_2014$recency <= 365*3 & customers_2014$recency > 365*2)] = "cold"
customers_2014$segment[which(customers_2014$recency <= 365*2 & customers_2014$recency > 365*1)] = "warm"
customers_2014$segment[which(customers_2014$recency <= 365)] = "active"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$first_purchase <= 365*2)] = "new warm"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount < 100)] = "warm low value"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount >= 100)] = "warm high value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$first_purchase <= 365)] = "new active"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount < 100)] = "active low value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount >= 100)] = "active high value"
# Re-order factor in a way that makes sense
customers_2014$segment = factor(x = customers_2014$segment, levels = c("inactive", "cold", "warm high value", "warm low value", "new warm", "active high value", "active low value", "new active"))

# Show segmentation results
table(customers_2014$segment)
## 
##          inactive              cold   warm high value    warm low value 
##              7512              2153               111               956 
##          new warm active high value  active low value        new active 
##              1250               475              3011              1437
pie(table(customers_2014$segment), col = rainbow(24))

aggregate(x = customers_2014[, 2:5], by = list(customers_2014$segment), mean)
##             Group.1    recency first_purchase frequency    amount
## 1          inactive 2058.77463      2353.3702  1.730964  48.11120
## 2              cold  866.95015      1565.7774  2.254064  51.11460
## 3   warm high value  461.53153      1879.1441  4.414414 187.84911
## 4    warm low value  470.99442      1945.7601  4.361925  37.38206
## 5          new warm  497.65093       505.2133  1.057600  51.36765
## 6 active high value   85.67228      1838.3839  5.696842 261.90216
## 7  active low value   98.42433      1796.6847  5.633677  40.45917
## 8        new active  132.42032       143.2046  1.070981  69.72516

COMPUTING REVENUE GENERATION PER SEGMENT

# Compute how much revenue is generated by segments
# Notice that people with no revenue in 2015 do NOT appear
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
revenue_2015 = sqldf("SELECT customer_id, SUM(purchase_amount) AS 'revenue_2015'
                      FROM data
                      WHERE year_of_purchase = 2015
                      GROUP BY 1")
revenue_2015$revenue_2015 = as.numeric(revenue_2015$revenue_2015)
summary(revenue_2015)
##   customer_id      revenue_2015    
##  Min.   :    80   Min.   :   5.00  
##  1st Qu.:104105   1st Qu.:  30.00  
##  Median :185495   Median :  50.00  
##  Mean   :167782   Mean   :  88.62  
##  3rd Qu.:246058   3rd Qu.:  85.00  
##  Max.   :264200   Max.   :4500.00
# Merge 2015 customers and 2015 revenue (correct)
actual = merge(customers_2015, revenue_2015, all.x = TRUE)
actual$revenue_2015[is.na(actual$revenue_2015)] = 0
# Show average revenue per customer and per segment
aggregate(x = actual$revenue_2015, by = list(customers_2015$segment), mean)
##             Group.1         x
## 1          inactive   0.00000
## 2              cold   0.00000
## 3   warm high value   0.00000
## 4    warm low value   0.00000
## 5          new warm   0.00000
## 6 active high value 323.56894
## 7  active low value  52.30604
## 8        new active  79.16614
# Merge 2014 customers and 2015 revenue (correct) => Predictive analysis
forward = merge(customers_2014, revenue_2015, all.x = TRUE)
forward$revenue_2015[is.na(forward$revenue_2015)] = 0

# Show average revenue per customer and per segment
r = aggregate(x = forward$revenue_2015, by = list(customers_2014$segment), mean)
print(r)
##             Group.1          x
## 1          inactive   2.949466
## 2              cold   6.108221
## 3   warm high value 114.459459
## 4    warm low value  13.494770
## 5          new warm   5.064000
## 6 active high value 254.077895
## 7  active low value  41.896556
## 8        new active  31.046625
# Re-order and display results
r = r[order(r$x, decreasing = TRUE), ]
print(r)
##             Group.1          x
## 6 active high value 254.077895
## 3   warm high value 114.459459
## 7  active low value  41.896556
## 8        new active  31.046625
## 4    warm low value  13.494770
## 2              cold   6.108221
## 5          new warm   5.064000
## 1          inactive   2.949466
barplot(r$x, names.arg = r$Group.1)

Losing one active high value customer is equal to losing eight new active customers! => Retain high value customers

MODULE 3 - SCORING

COMPUTING PREDICTORS AND TARGET VARIABLES

# Compute RFM variables as of a year ago
customers_2014 = sqldf("SELECT customer_id,
                               MIN(days_since) - 365 AS 'recency',
                               MAX(days_since) - 365 AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'avg_amount',
                               MAX(purchase_amount) AS 'max_amount'
                        FROM data
                        WHERE days_since > 365
                        GROUP BY 1")

# Compute revenues generated by customers in 2015
revenue_2015 = sqldf("SELECT customer_id, SUM(purchase_amount) AS 'revenue_2015'
                      FROM data
                      WHERE year_of_purchase = 2015
                      GROUP BY 1")
# Merge 2015 customers and 2015 revenue
in_sample = merge(customers_2014, revenue_2015, all.x = TRUE)
in_sample$revenue_2015[is.na(in_sample$revenue_2015)] = 0
in_sample$active_2015 = as.numeric(in_sample$revenue_2015 > 0)

# Display calibration (in-sample) data
head(in_sample)
##   customer_id   recency first_purchase frequency avg_amount max_amount
## 1          10 3464.3333       3464.333         1       30.0         30
## 2          80  302.3333       3386.333         6       70.0         80
## 3          90  393.3333       3418.333        10      115.8        153
## 4         120 1036.3333       1036.333         1       20.0         20
## 5         130 2605.3333       3345.333         2       50.0         60
## 6         160 2598.3333       3212.333         2       30.0         30
##   revenue_2015 active_2015
## 1            0           0
## 2           80           1
## 3            0           0
## 4            0           0
## 5            0           0
## 6            0           0
summary(in_sample)
##   customer_id        recency         first_purchase       frequency     
##  Min.   :    10   Min.   :   1.333   Min.   :   1.333   Min.   : 1.000  
##  1st Qu.: 77710   1st Qu.: 258.333   1st Qu.: 796.333   1st Qu.: 1.000  
##  Median :127140   Median : 890.333   Median :1891.333   Median : 2.000  
##  Mean   :127315   Mean   :1122.920   Mean   :1788.744   Mean   : 2.665  
##  3rd Qu.:181800   3rd Qu.:1868.333   3rd Qu.:2695.333   3rd Qu.: 3.000  
##  Max.   :245840   Max.   :3649.333   Max.   :3651.333   Max.   :40.000  
##    avg_amount        max_amount       revenue_2015      active_2015    
##  Min.   :   5.00   Min.   :   5.00   Min.   :   0.00   Min.   :0.0000  
##  1st Qu.:  21.25   1st Qu.:  25.00   1st Qu.:   0.00   1st Qu.:0.0000  
##  Median :  30.00   Median :  30.00   Median :   0.00   Median :0.0000  
##  Mean   :  55.53   Mean   :  65.72   Mean   :  21.22   Mean   :0.2299  
##  3rd Qu.:  50.00   3rd Qu.:  60.00   3rd Qu.:   0.00   3rd Qu.:0.0000  
##  Max.   :4500.00   Max.   :4500.00   Max.   :4500.00   Max.   :1.0000

CALIBRATE THE MODELS

# Calibrate probability model
library(nnet)
prob.model = multinom(formula = active_2015 ~ recency + first_purchase + frequency + avg_amount + max_amount, data = in_sample)
## # weights:  7 (6 variable)
## initial  value 11717.653087 
## iter  10 value 6233.650622
## final  value 6184.465956 
## converged
coef = summary(prob.model)$coefficients
std  = summary(prob.model)$standard.errors
print(coef)
##    (Intercept)        recency first_purchase      frequency     avg_amount 
##  -0.5324757179  -0.0019651140  -0.0000116331   0.2195100049   0.0004159610 
##     max_amount 
##  -0.0001558093
print(std)
##    (Intercept)        recency first_purchase      frequency     avg_amount 
##   4.411568e-02   6.000518e-05   3.925582e-05   1.479178e-02   3.634840e-04 
##     max_amount 
##   2.713806e-04
print(coef / std)
##    (Intercept)        recency first_purchase      frequency     avg_amount 
##    -12.0699884    -32.7490735     -0.2963409     14.8399966      1.1443722 
##     max_amount 
##     -0.5741357
# For the monetary model, select only those who made a purchase
z = which(in_sample$active_2015 == 1)
head(in_sample[z, ])
##    customer_id    recency first_purchase frequency avg_amount max_amount
## 2           80  302.33333       3386.333         6   70.00000         80
## 18         480   16.33333       3313.333        11   62.27273        235
## 30         830  267.33333       3374.333         6   48.33333         60
## 31         850   62.33333       3051.333         8   28.12500         30
## 32         860  267.33333       3643.333         9   53.33333         60
## 39        1020 1462.33333       1827.333         3   73.33333        100
##    revenue_2015 active_2015
## 2            80           1
## 18           45           1
## 30           50           1
## 31           60           1
## 32           60           1
## 39          120           1
summary(in_sample[z, ])
##   customer_id        recency         first_purchase       frequency     
##  Min.   :    80   Min.   :   1.333   Min.   :   1.333   Min.   : 1.000  
##  1st Qu.: 78590   1st Qu.:  23.333   1st Qu.: 650.083   1st Qu.: 2.000  
##  Median :143550   Median :  97.333   Median :1604.333   Median : 4.000  
##  Mean   :134907   Mean   : 306.680   Mean   :1636.497   Mean   : 4.741  
##  3rd Qu.:194362   3rd Qu.: 328.333   3rd Qu.:2666.333   3rd Qu.: 7.000  
##  Max.   :236660   Max.   :3544.333   Max.   :3647.333   Max.   :40.000  
##    avg_amount        max_amount       revenue_2015     active_2015
##  Min.   :   5.00   Min.   :   5.00   Min.   :   5.0   Min.   :1   
##  1st Qu.:  30.00   1st Qu.:  30.00   1st Qu.:  30.0   1st Qu.:1   
##  Median :  40.00   Median :  50.00   Median :  50.0   Median :1   
##  Mean   :  67.78   Mean   :  88.33   Mean   :  92.3   Mean   :1   
##  3rd Qu.:  60.00   3rd Qu.:  80.00   3rd Qu.: 100.0   3rd Qu.:1   
##  Max.   :4500.00   Max.   :4500.00   Max.   :4500.0   Max.   :1
# Calibrate the monetary model (version 1)
amount.model = lm(formula = revenue_2015 ~ avg_amount + max_amount, data = in_sample[z, ])
summary(amount.model)
## 
## Call:
## lm(formula = revenue_2015 ~ avg_amount + max_amount, data = in_sample[z, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2138.5   -20.1   -16.6     0.1  3361.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 20.74709    2.38112   8.713   <2e-16 ***
## avg_amount   0.67486    0.03280  20.575   <2e-16 ***
## max_amount   0.29225    0.02363  12.367   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 136.6 on 3883 degrees of freedom
## Multiple R-squared:  0.6054, Adjusted R-squared:  0.6052 
## F-statistic:  2979 on 2 and 3883 DF,  p-value: < 2.2e-16
# Plot the results of the monetary model
plot(x = in_sample[z, ]$revenue_2015, y = amount.model$fitted.values)

# Re-calibrate the monetary model, using a log-transform (version 2)
amount.model = lm(formula = log(revenue_2015) ~ log(avg_amount) + log(max_amount), data = in_sample[z, ])
summary(amount.model)
## 
## Call:
## lm(formula = log(revenue_2015) ~ log(avg_amount) + log(max_amount), 
##     data = in_sample[z, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7866 -0.1811 -0.0770  0.1852  3.5656 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.37000    0.04003   9.242   <2e-16 ***
## log(avg_amount)  0.54881    0.04167  13.171   <2e-16 ***
## log(max_amount)  0.38813    0.03796  10.224   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4781 on 3883 degrees of freedom
## Multiple R-squared:  0.6927, Adjusted R-squared:  0.6926 
## F-statistic:  4377 on 2 and 3883 DF,  p-value: < 2.2e-16
# Plot the results of this new monetary model
plot(x = log(in_sample[z, ]$revenue_2015), y = amount.model$fitted.values)

APPLY THE MODELS TO TODAY’S DATA

# Compute RFM variables as of today
library(sqldf)
customers_2015 = sqldf("SELECT customer_id,
                               MIN(days_since) AS 'recency',
                               MAX(days_since) AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'avg_amount',
                               MAX(purchase_amount) AS 'max_amount'
                        FROM data GROUP BY 1")
# Predict the target variables based on today's data
customers_2015$prob_predicted    = predict(object = prob.model, newdata = customers_2015, type = "probs")
customers_2015$revenue_predicted = exp(predict(object = amount.model, newdata = customers_2015))
customers_2015$score_predicted   = customers_2015$prob_predicted * customers_2015$revenue_predicted
summary(customers_2015$prob_predicted)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000263 0.012630 0.106200 0.225000 0.397800 0.999900
summary(customers_2015$revenue_predicted)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.54   29.00   35.05   65.63   57.30 3833.00
summary(customers_2015$score_predicted)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    0.0033    0.4557    4.5560   18.8300   17.9600 2854.0000
hist(customers_2015$score_predicted)

hist(log(customers_2015$score_predicted))

# How many customers have an expected revenue of more than $50
z = which(customers_2015$score_predicted > 50)
print(length(z))
## [1] 1323

MODULE 4 - CUSTOMER LIFETIME VALUE

SEGMENT CUSTOMERS IN 2014 AND 2015

# Segment customers in 2015
customers_2015 = sqldf("SELECT customer_id,
                               MIN(days_since) AS 'recency',
                               MAX(days_since) AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'amount'
                        FROM data GROUP BY 1")
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount < 100)] = "warm low value"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount >= 100)] = "warm high value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount < 100)] = "active low value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount >= 100)] = "active high value"
customers_2015$segment = factor(x = customers_2015$segment, levels = c("inactive", "cold", "warm high value", "warm low value", "new warm", "active high value", "active low value", "new active"))

# Segment customers in 2014
customers_2014 = sqldf("SELECT customer_id,
                               MIN(days_since) - 365 AS 'recency',
                               MAX(days_since) - 365 AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'amount'
                        FROM data
                        WHERE days_since > 365
                        GROUP BY 1")
customers_2014$segment = "NA"
customers_2014$segment[which(customers_2014$recency > 365*3)] = "inactive"
customers_2014$segment[which(customers_2014$recency <= 365*3 & customers_2014$recency > 365*2)] = "cold"
customers_2014$segment[which(customers_2014$recency <= 365*2 & customers_2014$recency > 365*1)] = "warm"
customers_2014$segment[which(customers_2014$recency <= 365)] = "active"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$first_purchase <= 365*2)] = "new warm"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount < 100)] = "warm low value"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount >= 100)] = "warm high value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$first_purchase <= 365)] = "new active"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount < 100)] = "active low value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount >= 100)] = "active high value"
customers_2014$segment = factor(x = customers_2014$segment, levels = c("inactive", "cold", "warm high value", "warm low value", "new warm", "active high value", "active low value", "new active"))

COMPUTE TRANSITION MATRIX

# Compute transition matrix
new_data = merge(x = customers_2014, y = customers_2015, by = "customer_id", all.x = TRUE)
head(new_data)
##   customer_id recency.x first_purchase.x frequency.x amount.x
## 1          10 3464.3333         3464.333           1     30.0
## 2          80  302.3333         3386.333           6     70.0
## 3          90  393.3333         3418.333          10    115.8
## 4         120 1036.3333         1036.333           1     20.0
## 5         130 2605.3333         3345.333           2     50.0
## 6         160 2598.3333         3212.333           2     30.0
##          segment.x recency.y first_purchase.y frequency.y  amount.y
## 1         inactive 3829.3333         3829.333           1  30.00000
## 2 active low value  343.3333         3751.333           7  71.42857
## 3  warm high value  758.3333         3783.333          10 115.80000
## 4             cold 1401.3333         1401.333           1  20.00000
## 5         inactive 2970.3333         3710.333           2  50.00000
## 6         inactive 2963.3333         3577.333           2  30.00000
##          segment.y
## 1         inactive
## 2 active low value
## 3             cold
## 4         inactive
## 5         inactive
## 6         inactive
transition = table(new_data$segment.x, new_data$segment.y)
print(transition)
##                    
##                     inactive cold warm high value warm low value new warm
##   inactive              7227    0               0              0        0
##   cold                  1931    0               0              0        0
##   warm high value          0   75               0              0        0
##   warm low value           0  689               0              0        0
##   new warm                 0 1139               0              0        0
##   active high value        0    0             119              0        0
##   active low value         0    0               0            901        0
##   new active               0    0               0              0      938
##                    
##                     active high value active low value new active
##   inactive                         35              250          0
##   cold                             22              200          0
##   warm high value                  35                1          0
##   warm low value                    1              266          0
##   new warm                         15               96          0
##   active high value               354                2          0
##   active low value                 22             2088          0
##   new active                       89              410          0
# Divide each row by its sum
transition = transition / rowSums(transition)
print(transition)
##                    
##                        inactive        cold warm high value warm low value
##   inactive          0.962060703 0.000000000     0.000000000    0.000000000
##   cold              0.896888063 0.000000000     0.000000000    0.000000000
##   warm high value   0.000000000 0.675675676     0.000000000    0.000000000
##   warm low value    0.000000000 0.720711297     0.000000000    0.000000000
##   new warm          0.000000000 0.911200000     0.000000000    0.000000000
##   active high value 0.000000000 0.000000000     0.250526316    0.000000000
##   active low value  0.000000000 0.000000000     0.000000000    0.299236134
##   new active        0.000000000 0.000000000     0.000000000    0.000000000
##                    
##                        new warm active high value active low value
##   inactive          0.000000000       0.004659212      0.033280085
##   cold              0.000000000       0.010218300      0.092893637
##   warm high value   0.000000000       0.315315315      0.009009009
##   warm low value    0.000000000       0.001046025      0.278242678
##   new warm          0.000000000       0.012000000      0.076800000
##   active high value 0.000000000       0.745263158      0.004210526
##   active low value  0.000000000       0.007306543      0.693457323
##   new active        0.652748782       0.061934586      0.285316632
##                    
##                      new active
##   inactive          0.000000000
##   cold              0.000000000
##   warm high value   0.000000000
##   warm low value    0.000000000
##   new warm          0.000000000
##   active high value 0.000000000
##   active low value  0.000000000
##   new active        0.000000000

USE TRANSITION MATRIX TO MAKE PREDICTIONS

# Initialize a matrix with the number of customers in each segment today and after 10 periods
segments = matrix(nrow = 8, ncol = 11)
segments[, 1] = table(customers_2015$segment)
colnames(segments) = 2015:2025
row.names(segments) = levels(customers_2015$segment)
print(segments)
##                   2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
## inactive          9158   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## cold              1903   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## warm high value    119   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## warm low value     901   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## new warm           938   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## active high value  573   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## active low value  3313   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## new active        1512   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
# Compute for each an every period
for (i in 2:11) {
   segments[, i] = segments[, i-1] %*% transition
}
segments
##                   2015       2016       2017       2018       2019
## inactive          9158 10517.3299 11539.4037 12636.0028 12940.3737
## cold              1903  1584.4719  1710.7998   873.8794   820.9486
## warm high value    119   143.5516   164.5264   159.9884   156.4070
## warm low value     901   991.3693  1058.2780   989.0900   937.7240
## new warm           938   986.9562     0.0000     0.0000     0.0000
## active high value  573   656.7229   638.6093   624.3136   607.4594
## active low value  3313  3536.5982  3305.3828  3133.7259  2954.0873
## new active        1512     0.0000     0.0000     0.0000     0.0000
##                         2020       2021       2022       2023       2024
## inactive          13185.7240 13386.3927 13542.1427 13663.9595 13759.3216
## cold                781.5087   739.9143   708.6683   684.3252   665.2866
## warm high value     152.1846   148.6323   145.5779   142.9492   140.6959
## warm low value      883.9697   843.9455   813.0326   789.0806   770.5582
## new warm              0.0000     0.0000     0.0000     0.0000     0.0000
## active high value   593.2803   581.0884   570.5954   561.6014   553.9112
## active low value   2820.3328  2717.0267  2636.9831  2575.0841  2527.2265
## new active            0.0000     0.0000     0.0000     0.0000     0.0000
##                         2025
## inactive          13833.9902
## cold                650.4148
## warm high value     138.7693
## warm low value      756.2375
## new warm              0.0000
## active high value   547.3502
## active low value   2490.2379
## new active            0.0000
# Plot inactive, active high value customers over time
barplot(segments[1, ])

barplot(segments[2, ])

# Display how segments will evolve over time
print(round(segments))
##                   2015  2016  2017  2018  2019  2020  2021  2022  2023
## inactive          9158 10517 11539 12636 12940 13186 13386 13542 13664
## cold              1903  1584  1711   874   821   782   740   709   684
## warm high value    119   144   165   160   156   152   149   146   143
## warm low value     901   991  1058   989   938   884   844   813   789
## new warm           938   987     0     0     0     0     0     0     0
## active high value  573   657   639   624   607   593   581   571   562
## active low value  3313  3537  3305  3134  2954  2820  2717  2637  2575
## new active        1512     0     0     0     0     0     0     0     0
##                    2024  2025
## inactive          13759 13834
## cold                665   650
## warm high value     141   139
## warm low value      771   756
## new warm              0     0
## active high value   554   547
## active low value   2527  2490
## new active            0     0

COMPUTE THE (DISCOUNTED) CLV OF A DATABASE

# Yearly revenue per segment
# This comes directly from module 2, lines 160-161
yearly_revenue = c(0, 0, 0, 0, 0, 323.57, 52.31, 79.17)

# Compute revenue per segment
revenue_per_segment = yearly_revenue * segments
print(revenue_per_segment)
##                       2015     2016     2017     2018     2019     2020
## inactive               0.0      0.0      0.0      0.0      0.0      0.0
## cold                   0.0      0.0      0.0      0.0      0.0      0.0
## warm high value        0.0      0.0      0.0      0.0      0.0      0.0
## warm low value         0.0      0.0      0.0      0.0      0.0      0.0
## new warm               0.0      0.0      0.0      0.0      0.0      0.0
## active high value 185405.6 212495.8 206634.8 202009.1 196555.6 191967.7
## active low value  173303.0 184999.5 172904.6 163925.2 154528.3 147531.6
## new active        119705.0      0.0      0.0      0.0      0.0      0.0
##                       2021     2022     2023     2024     2025
## inactive               0.0      0.0      0.0      0.0      0.0
## cold                   0.0      0.0      0.0      0.0      0.0
## warm high value        0.0      0.0      0.0      0.0      0.0
## warm low value         0.0      0.0      0.0      0.0      0.0
## new warm               0.0      0.0      0.0      0.0      0.0
## active high value 188022.8 184627.5 181717.4 179229.1 177106.1
## active low value  142127.7 137940.6 134702.6 132199.2 130264.3
## new active             0.0      0.0      0.0      0.0      0.0
# Compute yearly revenue
yearly_revenue = colSums(revenue_per_segment)
print(round(yearly_revenue))
##   2015   2016   2017   2018   2019   2020   2021   2022   2023   2024 
## 478414 397495 379539 365934 351084 339499 330150 322568 316420 311428 
##   2025 
## 307370
barplot(yearly_revenue)

# Compute cumulated revenue
cumulated_revenue = cumsum(yearly_revenue)
print(round(cumulated_revenue))
##    2015    2016    2017    2018    2019    2020    2021    2022    2023 
##  478414  875909 1255448 1621383 1972467 2311966 2642116 2964685 3281105 
##    2024    2025 
## 3592533 3899903
barplot(cumulated_revenue)

# Create a discount factor
discount_rate = 0.10
discount = 1 / ((1 + discount_rate) ^ ((1:11) - 1))
print(discount)
##  [1] 1.0000000 0.9090909 0.8264463 0.7513148 0.6830135 0.6209213 0.5644739
##  [8] 0.5131581 0.4665074 0.4240976 0.3855433
# Compute discounted yearly revenue
disc_yearly_revenue = yearly_revenue * discount
print(round(disc_yearly_revenue))
##   2015   2016   2017   2018   2019   2020   2021   2022   2023   2024 
## 478414 361359 313669 274932 239795 210802 186361 165528 147612 132076 
##   2025 
## 118505
barplot(disc_yearly_revenue)
lines(yearly_revenue)

# Compute discounted cumulated revenue
disc_cumulated_revenue = cumsum(disc_yearly_revenue)
print(round(disc_cumulated_revenue))
##    2015    2016    2017    2018    2019    2020    2021    2022    2023 
##  478414  839773 1153442 1428374 1668169 1878971 2065333 2230861 2378473 
##    2024    2025 
## 2510549 2629054
barplot(disc_cumulated_revenue)

# What is the database worth?
print(disc_cumulated_revenue[11] - yearly_revenue[1])
##    2025 
## 2150640