1 Rationale

Based on the patterns obtained in the EDA of eCommerce Data, a Recency,Frequency, Monetary (RFM) analysis was performed to obtain specific customer segments based on the RFM values

The following web resources were used for the analysis 1
2

1.0.1 Observing the data to be used for calculating the RFM scores


SELECT
    "CustomerID",
    max("InvoiceDate")::Date as latest_date,
    count(distinct "InvoiceDate") as counts_freq,
    sum("Quantity" * "UnitPrice")::numeric as monetary
  FROM ecommerce_data
  where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
  GROUP BY "CustomerID"
  order by monetary desc
  limit 10
  ;
Displaying records 1 - 10
CustomerID latest_date counts_freq monetary
14646 2011-12-08 45 270897.14
18102 2011-12-09 25 228603.88
17450 2011-12-01 32 185453.33
14911 2011-12-08 135 125815.49
12415 2011-11-15 18 123725.45
14156 2011-11-30 49 113069.44
17511 2011-12-07 30 81490.62
16684 2011-12-05 15 62778.68
13694 2011-12-06 38 59768.06
14096 2011-12-05 17 57120.91

The latest date is obtained for its use in the RFM analysis


select
 max("InvoiceDate")::Date as latest_date
from
ecommerce_data;
1 records
latest_date
2011-12-09

1.0.2 Generating RFM scores

Step1: Obtaining the base data for frequency and monetary analysis along with the date of most recent purchase

Step2: Using the above value, days since the last purchase for obtaining recency score is calculated

Step3: Combining the two tables to get a combined data to calculate the recency, frequency and monetary values

Step4: Calculate RFM scores


with rfm_base_data as ( --Step1
 SELECT
    "CustomerID" as customer_id,
    max("InvoiceDate")::Date as most_recent_date,
    count(distinct "InvoiceDate") as counts_freq,
    sum("Quantity" * "UnitPrice")::numeric as monetary
  FROM ecommerce_data
  where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
  GROUP BY "CustomerID"
  ),  
rfm_date_diff as (   -- Step 2
 select
customer_id,
date_part('day',date_diff_age) as date_diff
from(
select
customer_id,
 age('2011-12-09'::Date, most_recent_date) as date_diff_age
  from
 rfm_base_data)subq
 ),
 rfm_comb_base_data as (   -- Step 3
 select
 r.customer_id,
 r.counts_freq,
 r.monetary,
 rd.date_diff
 from
 rfm_base_data as r
 join
 rfm_date_diff as rd
  on r.customer_id=rd.customer_id)
  select
  customer_id, 
  rfm_recency,
  rfm_freq,
  rfm_monetary,
  date_diff,
  counts_freq,
  monetary,
  rfm_recency*100 + rfm_freq*10 + rfm_monetary as rfm_combined
  from(
  select  -- Step 4
  customer_id,
  date_diff,
  counts_freq,
  monetary,
  ntile(5) over (order by date_diff desc) as rfm_recency,
  ntile(5) over (order by counts_freq) as rfm_freq,
  ntile(5) over (order by monetary) as rfm_monetary
  from
  rfm_comb_base_data)as subq
  order by rfm_combined desc
  ;   
  
DT::datatable(rfm_scores,
              class = 'cell-border stripe',
              rownames = FALSE,
              caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    htmltools::em('RFM scores dataset')))

1.0.3 Obtaining Customer segements (11 segments) based on RFM scores


select
customer_id,
rfm_recency*100 + rfm_freq*10 + rfm_monetary as rfm_combined,
case 
when (rfm_recency between 4 and 5) and (rfm_freq between 4 and 5 and rfm_monetary between 4 and 5) then 'Champions'
when (rfm_recency between 2 and 5) and (rfm_freq between 2 and 5 and rfm_monetary between 2 and 5) then 'Loyal Customers'
when (rfm_recency between 3 and 5) and (rfm_freq between 1 and 3 and rfm_monetary between 1 and 3) then 'Potential Loyalist'
when (rfm_recency between 4 and 5) and (rfm_freq between 0 and 1 and rfm_monetary between 0 and 1) then 'Recent Customers'
when (rfm_recency between 3 and 4) and (rfm_freq between 0 and 1 and rfm_monetary between 0 and 1) then 'Promising'
when (rfm_recency between 2 and 3) and (rfm_freq between 2 and 3 and rfm_monetary between 2 and 3) then 'Customers Needing Attention'
when (rfm_recency between 2 and 3) and (rfm_freq between 0 and 2 and rfm_monetary between 0 and 2) then 'About To Sleep'
when (rfm_recency between 0 and 2) and (rfm_freq between 2 and 5 and rfm_monetary between 2 and 5) then 'At Risk'
when (rfm_recency between 0 and 1) and (rfm_freq between 4 and 5 and rfm_monetary between 4 and 5) then 'Cant Lose Them'
when (rfm_recency between 1 and 2) and (rfm_freq between 1 and 2 and rfm_monetary between 1 and 2) then 'Hibernating'
when (rfm_recency between 0 and 2) and (rfm_freq between 0 and 2 and rfm_monetary between 0 and 2) then 'Lost'
else 'other' end as customer_segments
from( --- this is the same query used above for rfm scores calculation
 with rfm_base_data as ( 
 SELECT
    "CustomerID" as customer_id,
    max("InvoiceDate")::Date as most_recent_date,
    count(distinct "InvoiceDate") as counts_freq,
    sum("Quantity" * "UnitPrice")::numeric as monetary
  FROM ecommerce_data
  where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
  GROUP BY "CustomerID"
  ),  
rfm_date_diff as (
 select
customer_id,
date_part('day',date_diff_age) as date_diff
from(
select
customer_id,
 age('2011-12-09'::Date, most_recent_date) as date_diff_age
  from
 rfm_base_data)subq
 ),
 rfm_comb_base_data as (
 select
 r.customer_id,
 r.counts_freq,
 r.monetary,
 rd.date_diff
 from
 rfm_base_data as r
 join
 rfm_date_diff as rd
  on r.customer_id=rd.customer_id)
  select
  customer_id, 
  rfm_recency,
  rfm_freq,
  rfm_monetary,
  date_diff,
  counts_freq,
  monetary,
  rfm_recency*100 + rfm_freq*10 + rfm_monetary as rfm_combined
  from(
  select
  customer_id,
  date_diff,
  counts_freq,
  monetary,
  ntile(5) over (order by date_diff desc) as rfm_recency,
  ntile(5) over (order by counts_freq) as rfm_freq,
  ntile(5) over (order by monetary) as rfm_monetary
  from
  rfm_comb_base_data)as subq
  order by rfm_combined desc
)as subq
  order by rfm_combined desc
;
DT::datatable(customer_segments,
              class = 'cell-border stripe',
              rownames = FALSE,
              caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    htmltools::em('Customer segments dataset')))

1.0.4 Visualizing the Customer segments and its respective proportions

treemap_data<-customer_segments%>%
    group_by(customer_segments)%>%
    summarise(segment_counts=n(),
              segment_prop=round(n()/nrow(.)*100,2))

treemap_data<-treemap_data%>%
  dplyr::mutate(category_details=paste0(treemap_data$customer_segments,
                                        ": ",treemap_data$segment_prop,
                                        "%"))

require(treemap)
## Loading required package: treemap
treemap_data%>%
    treemap(.,
            index=c("category_details"),
            vSize = "segment_prop",  
            type="index", 
            fontsize.labels = 13,
            fontcolor.labels=c("white"),
                      palette = "Set2",  
                      title="Customer segments", 
                      fontsize.title = 14)