First, load packages:

#library(knitr)
#library(psych)
#library(statar)
library(tidyverse)

About this file

This allows you to replicate the analysis in the class “Predicting Response with Recency, Frequency, and Monetary Value (RFM) Analysis” using the Bookbinders data. You do not have to go through this when you prepare the Bookbinders case; running and analyzing this is optional. However, this file will be important because it contains everything you will need to do for the “Tuango” assignment.

Data and Preliminaries

First, we read in the data and take a look at the variables in the dataset.

rm(list=ls())
load("Tuango_rfm.Rdata") 
str(tuango) # srt() shows the "structure" of the dataset
Classes 'tbl_df', 'tbl' and 'data.frame':   13939 obs. of  9 variables:
 $ userid   : int  63775658 64880613 65051746 66689882 68839217 70630920 73773882 74279814 74461297 74951919 ...
 $ recency  : int  309 297 295 277 259 243 216 213 212 208 ...
 $ frequency: int  7 8 1 1 1 1 1 3 1 1 ...
 $ monetary : num  39.8 39.8 72.9 40 21 ...
 $ buyer    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ ordersize: num  0 0 0 0 0 0 0 0 0 0 ...
 $ platform : Factor w/ 2 levels "App","Browser": 1 2 2 2 1 1 2 2 1 1 ...
 $ category : int  3 3 3 3 3 3 3 3 3 3 ...
 $ mobile_os: Factor w/ 4 levels "android","ipad",..: 1 1 1 1 1 1 1 1 1 1 ...
 - attr(*, "datalabel")= chr ""
 - attr(*, "time.stamp")= chr "26 Sep 2014 08:38"
 - attr(*, "formats")= chr  "%12.0g" "%8.0g" "%8.0g" "%9.0g" ...
 - attr(*, "types")= int  253 252 251 254 254 254 251 254 253 251 ...
 - attr(*, "val.labels")= Named chr  "" "" "" "" ...
  ..- attr(*, "names")= chr  "" "" "" "" ...
 - attr(*, "var.labels")= chr  "User unique ID" "Days since last purchase (before the push message)" "Number of orders of one year" "Average order size of one year" ...
 - attr(*, "version")= int 115
 - attr(*, "label.table")=List of 2
  ..$ temp1: Named int  1 2
  .. ..- attr(*, "names")= chr  "App" "Browser"
  ..$ temp2: Named int  1 2 3 4
  .. ..- attr(*, "names")= chr  "android" "ipad" "iphone" "others"
 - attr(*, "expansion.fields")= list()
 - attr(*, "byteorder")= int 2
 - attr(*, "orig.dim")= int  13939 11

1 - What percentage of customers responded (i.e. bought anything) after the push message?

perc <- tuango %>% summarize(perc = sum(buyer)/n()) 
perc

2 - Of those who bought, what was the average spending?

tuango %>% filter(buyer==1) %>% summarize(average_spend = mean(ordersize)) 

3 - Create (independent) quintile variables for recency, frequency and monetary. Display a table with the first six observations of the quintile variables only.

tuango %>% 
    group_by(ntile(recency, 20)) %>%
    summarize(avg_recency = mean(recency), perc = mean(buyer)) %>%
    ggplot(mapping = aes(x = avg_recency,y = perc)) + geom_point()

tuango %>% 
    group_by(ntile(frequency, 20)) %>%
    summarize(avg_frequency = mean(frequency), perc = mean(buyer)) %>%
    ggplot(mapping = aes(x = avg_frequency,y = perc)) + geom_point()

tuango %>% 
    group_by(ntile(monetary, 20)) %>%
    summarize(avg_monetary = mean(monetary), perc = mean(buyer)) %>%
    ggplot(mapping = aes(x = avg_monetary,y = perc)) + geom_point()

tuango_iq <- tuango %>% 
  mutate(rec_quin_i = ntile(recency, 5), 
         freq_quin_i = ntile(-frequency, 5), 
         mon_quin_i = ntile(-monetary, 5))

tuango_iq %>% 
  select(buyer, recency, frequency, monetary, rec_quin_i) %>% slice(1:6)
tuango_iq %>% 
  select(buyer, recency, frequency, monetary, freq_quin_i) %>% slice(1:6)
tuango_iq %>% 
  select(buyer, recency, frequency, monetary, mon_quin_i) %>% slice(1:6)

4 - Create bar chart showing the response rate (i.e., the proportion of customers who bought something) to this deal by . recency quintile . frequency quintile . monetary quintile

tuango_iq %>%
  group_by(rec_quin_i) %>%
  summarize(perc = mean(buyer)) %>%
  ggplot() + geom_col(aes(x = rec_quin_i, y = perc))

tuango_iq %>%
  group_by(freq_quin_i) %>%
  summarize(perc = mean(buyer)) %>%
  ggplot() + geom_col(aes(x = freq_quin_i, y = perc))

tuango_iq %>%
  group_by(mon_quin_i) %>%
  summarize(perc = mean(buyer)) %>%
  ggplot() + geom_col(aes(x = mon_quin_i, y = perc))

5 - Repeat questions 4 using only those customers who placed an order after the push message, i.e. create bar charts showing the average spending (in RMB) by recency, frequency and monetary quintile.

tuango_push = tuango_iq %>% filter(buyer==1)

tuango_push %>%
  group_by(rec_quin_i) %>%
  summarize(avg_spent = mean(ordersize)) %>%
  ggplot() + geom_col(aes(x = rec_quin_i, y = avg_spent))

tuango_push %>%
  group_by(freq_quin_i) %>%
  summarize(avg_spent = mean(ordersize)) %>%
  ggplot() + geom_col(aes(x = freq_quin_i, y = avg_spent))

tuango_push %>%
  group_by(mon_quin_i) %>%
  summarize(avg_spent = mean(ordersize)) %>%
  ggplot() + geom_col(aes(x = mon_quin_i, y = avg_spent))

6 - What do the above bar charts reveal about the likelihood of response and the size of the order across the different recency, frequency, and monetary quintiles?

7 - Create the independent RFM index in R.

tuango_iq <- tuango_iq %>%
  mutate(rfmindex_iq = paste(rec_quin_i, freq_quin_i, mon_quin_i, sep = '')) 

tuango_iq %>% 
  select(rec_quin_i, freq_quin_i, mon_quin_i, rfmindex_iq) %>% 
  head()

8 - Create the sequential RFM index in R.

tuango_iq_sq <- tuango_iq %>% 
  mutate(rec_quin_s = ntile(recency, 5)) %>%
  group_by(rec_quin_s) %>% 
  mutate(freq_quin_s = ntile(-frequency, 5)) %>%
  group_by(rec_quin_s, freq_quin_s) %>% 
  mutate(mon_quin_s = ntile(-monetary, 5)) %>% 
  ungroup()
tuango_iq_sq <- tuango_iq_sq %>%
  mutate(rfmindex_sq = paste(rec_quin_s, freq_quin_s, mon_quin_s, sep = ''))  

tuango_iq_sq %>% 
  select(rec_quin_s, freq_quin_s, mon_quin_s,rfmindex_sq) %>% 
  head()

9 - Visually examine the observations in the database. What do you notice about the independent and sequential index values? That is - do the two approaches generally yield the same RFM index for any given customer? What do you see as the pros and cons of the two approaches (from a statistical as well as logical perspective) and why?

tuango_iq %>% 
  group_by(rfmindex_iq) %>% 
  summarise(perc=mean(buyer)) %>% 
  ggplot() + geom_col(aes(x = rfmindex_iq, y = perc))

tuango_iq_sq %>% 
  group_by(rfmindex_sq) %>% 
  summarise(perc=mean(buyer)) %>% 
  ggplot() + geom_col(aes(x = rfmindex_sq, y = perc))

10 - What is the breakeven response rate?

spend <- tuango %>% filter(buyer==1) %>% summarize(average_spend = mean(ordersize)) 
breakeven <- 2.5/(0.5*spend)
breakeven

11 - What is the projected profit in RMB if you offer the deal to all remaining 264,841 customers.

264841*(perc*0.5*spend-2.5)

12 - Consider offering the deal only to those of the 264,841 customers in RFM cells (using the sequential n-tiles approach) with a response rate that is equal to or greater than the breakeven response rate. . Calculate the response rate for the sequential quintiles approach.

resp_rate_sq_df <- tuango_iq_sq %>%
  group_by(rfmindex_sq) %>%
  summarise(resp_rate_sq=mean(buyer)) %>% 
  ungroup()

resp_rate_sq_df %>% 
  head()

. Merge the response rate back into main data.

tuango_final <- tuango_iq_sq %>%
  left_join(resp_rate_sq_df)

tuango_final %>% 
  select(userid,buyer,rfmindex_iq,rfmindex_sq,resp_rate_sq) %>% 
  head()

. Calculate who you should have mailed to in your data.

tuango_final <- tuango_final %>%
  mutate(mailto_sq=1*(resp_rate_sq>0.024709))
  
tuango_final %>% 
  select(rfmindex_sq,resp_rate_sq,mailto_sq) %>%
  head()

. Calculate the projected profit in RMB. What is it?

tuango_final %>%
  summarise(mean(buyer))
tuango_final %>%
  filter(mailto_sq==1) %>%
  summarise(frac_mailed=n()/nrow(tuango_final), resp_rate=mean(buyer))
264841*0.4808092*(0.0561*0.5*202.3565-2.5)
[1] 404438.2