Tuango RFM Analysis

Data and Preliminaries

First, we read in the data and take a look at the variables in the dataset.

rm(list=ls())
load("Tuango_rfm.Rdata") 
str(tuango) # srt() shows the "structure" of the dataset

Classes 'tbl_df', 'tbl' and 'data.frame':   13939 obs. of  9 variables:
 $ userid   : int  63775658 64880613 65051746 66689882 68839217 70630920 73773882 74279814 74461297 74951919 ...
 $ recency  : int  309 297 295 277 259 243 216 213 212 208 ...
 $ frequency: int  7 8 1 1 1 1 1 3 1 1 ...
 $ monetary : num  39.8 39.8 72.9 40 21 ...
 $ buyer    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ ordersize: num  0 0 0 0 0 0 0 0 0 0 ...
 $ platform : Factor w/ 2 levels "App","Browser": 1 2 2 2 1 1 2 2 1 1 ...
 $ category : int  3 3 3 3 3 3 3 3 3 3 ...
 $ mobile_os: Factor w/ 4 levels "android","ipad",..: 1 1 1 1 1 1 1 1 1 1 ...
 - attr(*, "datalabel")= chr ""
 - attr(*, "time.stamp")= chr "26 Sep 2014 08:38"
 - attr(*, "formats")= chr  "%12.0g" "%8.0g" "%8.0g" "%9.0g" ...
 - attr(*, "types")= int  253 252 251 254 254 254 251 254 253 251 ...
 - attr(*, "val.labels")= Named chr  "" "" "" "" ...
  ..- attr(*, "names")= chr  "" "" "" "" ...
 - attr(*, "var.labels")= chr  "User unique ID" "Days since last purchase (before the push message)" "Number of orders of one year" "Average order size of one year" ...
 - attr(*, "version")= int 115
 - attr(*, "label.table")=List of 2
  ..$ temp1: Named int  1 2
  .. ..- attr(*, "names")= chr  "App" "Browser"
  ..$ temp2: Named int  1 2 3 4
  .. ..- attr(*, "names")= chr  "android" "ipad" "iphone" "others"
 - attr(*, "expansion.fields")= list()
 - attr(*, "byteorder")= int 2
 - attr(*, "orig.dim")= int  13939 11

1 - What percentage of customers responded (i.e. bought anything) after the push message?

perc <- tuango %>% summarize(perc = sum(buyer)/n()) 
perc

2 - Of those who bought, what was the average spending?

tuango %>% filter(buyer==1) %>% summarize(average_spend = mean(ordersize))

3 - Create (independent) quintile variables for recency, frequency and monetary. Display a table with the first six observations of the quintile variables only.

tuango %>% 
    group_by(ntile(recency, 20)) %>%
    summarize(avg_recency = mean(recency), perc = mean(buyer)) %>%
    ggplot(mapping = aes(x = avg_recency,y = perc)) + geom_point()

tuango %>% 
    group_by(ntile(frequency, 20)) %>%
    summarize(avg_frequency = mean(frequency), perc = mean(buyer)) %>%
    ggplot(mapping = aes(x = avg_frequency,y = perc)) + geom_point()

tuango %>% 
    group_by(ntile(monetary, 20)) %>%
    summarize(avg_monetary = mean(monetary), perc = mean(buyer)) %>%
    ggplot(mapping = aes(x = avg_monetary,y = perc)) + geom_point()

tuango_iq <- tuango %>% 
  mutate(rec_quin_i = ntile(recency, 5), 
         freq_quin_i = ntile(-frequency, 5), 
         mon_quin_i = ntile(-monetary, 5))

tuango_iq %>% 
  select(buyer, recency, frequency, monetary, rec_quin_i) %>% slice(1:6)

tuango_iq %>% 
  select(buyer, recency, frequency, monetary, freq_quin_i) %>% slice(1:6)

tuango_iq %>% 
  select(buyer, recency, frequency, monetary, mon_quin_i) %>% slice(1:6)

4 - Create bar chart showing the response rate (i.e., the proportion of customers who bought something) to this deal by . recency quintile . frequency quintile . monetary quintile

tuango_iq %>%
  group_by(rec_quin_i) %>%
  summarize(perc = mean(buyer)) %>%
  ggplot() + geom_col(aes(x = rec_quin_i, y = perc))

tuango_iq %>%
  group_by(freq_quin_i) %>%
  summarize(perc = mean(buyer)) %>%
  ggplot() + geom_col(aes(x = freq_quin_i, y = perc))

tuango_iq %>%
  group_by(mon_quin_i) %>%
  summarize(perc = mean(buyer)) %>%
  ggplot() + geom_col(aes(x = mon_quin_i, y = perc))

5 - Repeat questions 4 using only those customers who placed an order after the push message, i.e. create bar charts showing the average spending (in RMB) by recency, frequency and monetary quintile.

tuango_push = tuango_iq %>% filter(buyer==1)

tuango_push %>%
  group_by(rec_quin_i) %>%
  summarize(avg_spent = mean(ordersize)) %>%
  ggplot() + geom_col(aes(x = rec_quin_i, y = avg_spent))

tuango_push %>%
  group_by(freq_quin_i) %>%
  summarize(avg_spent = mean(ordersize)) %>%
  ggplot() + geom_col(aes(x = freq_quin_i, y = avg_spent))

tuango_push %>%
  group_by(mon_quin_i) %>%
  summarize(avg_spent = mean(ordersize)) %>%
  ggplot() + geom_col(aes(x = mon_quin_i, y = avg_spent))

6 - What do the above bar charts reveal about the likelihood of response and the size of the order across the different recency, frequency, and monetary quintiles?

7 - Create the independent RFM index in R.

tuango_iq <- tuango_iq %>%
  mutate(rfmindex_iq = paste(rec_quin_i, freq_quin_i, mon_quin_i, sep = '')) 

tuango_iq %>% 
  select(rec_quin_i, freq_quin_i, mon_quin_i, rfmindex_iq) %>% 
  head()

8 - Create the sequential RFM index in R.

tuango_iq_sq <- tuango_iq %>% 
  mutate(rec_quin_s = ntile(recency, 5)) %>%
  group_by(rec_quin_s) %>% 
  mutate(freq_quin_s = ntile(-frequency, 5)) %>%
  group_by(rec_quin_s, freq_quin_s) %>% 
  mutate(mon_quin_s = ntile(-monetary, 5)) %>% 
  ungroup()

tuango_iq_sq <- tuango_iq_sq %>%
  mutate(rfmindex_sq = paste(rec_quin_s, freq_quin_s, mon_quin_s, sep = ''))  

tuango_iq_sq %>% 
  select(rec_quin_s, freq_quin_s, mon_quin_s,rfmindex_sq) %>% 
  head()

9 - Visually examine the observations in the database. What do you notice about the independent and sequential index values? That is - do the two approaches generally yield the same RFM index for any given customer? What do you see as the pros and cons of the two approaches (from a statistical as well as logical perspective) and why?

tuango_iq %>% 
  group_by(rfmindex_iq) %>% 
  summarise(perc=mean(buyer)) %>% 
  ggplot() + geom_col(aes(x = rfmindex_iq, y = perc))

tuango_iq_sq %>% 
  group_by(rfmindex_sq) %>% 
  summarise(perc=mean(buyer)) %>% 
  ggplot() + geom_col(aes(x = rfmindex_sq, y = perc))

10 - What is the breakeven response rate?

spend <- tuango %>% filter(buyer==1) %>% summarize(average_spend = mean(ordersize)) 
breakeven <- 2.5/(0.5*spend)
breakeven

11 - What is the projected profit in RMB if you offer the deal to all remaining 264,841 customers.

264841*(perc*0.5*spend-2.5)

12 - Consider offering the deal only to those of the 264,841 customers in RFM cells (using the sequential n-tiles approach) with a response rate that is equal to or greater than the breakeven response rate. . Calculate the response rate for the sequential quintiles approach.

resp_rate_sq_df <- tuango_iq_sq %>%
  group_by(rfmindex_sq) %>%
  summarise(resp_rate_sq=mean(buyer)) %>% 
  ungroup()

resp_rate_sq_df %>% 
  head()

. Merge the response rate back into main data.

tuango_final <- tuango_iq_sq %>%
  left_join(resp_rate_sq_df)

tuango_final %>% 
  select(userid,buyer,rfmindex_iq,rfmindex_sq,resp_rate_sq) %>% 
  head()

. Calculate who you should have mailed to in your data.

tuango_final <- tuango_final %>%
  mutate(mailto_sq=1*(resp_rate_sq>0.024709))
  
tuango_final %>% 
  select(rfmindex_sq,resp_rate_sq,mailto_sq) %>%
  head()

. Calculate the projected profit in RMB. What is it?

tuango_final %>%
  summarise(mean(buyer))

tuango_final %>%
  filter(mailto_sq==1) %>%
  summarise(frac_mailed=n()/nrow(tuango_final), resp_rate=mean(buyer))

264841*0.4808092*(0.0561*0.5*202.3565-2.5)

[1] 404438.2

Tuango RFM Analysis

About this file

Data and Preliminaries