Based on the patterns obtained in the EDA of eCommerce Data, a Recency,Frequency, Monetary (RFM) analysis was performed to obtain specific customer segments based on the RFM values
The following web resources were used for the analysis 1
2
CustomerID | latest_date | counts_freq | monetary |
---|---|---|---|
14646 | 2011-12-08 | 45 | 270897.14 |
18102 | 2011-12-09 | 25 | 228603.88 |
17450 | 2011-12-01 | 32 | 185453.33 |
14911 | 2011-12-08 | 135 | 125815.49 |
12415 | 2011-11-15 | 18 | 123725.45 |
14156 | 2011-11-30 | 49 | 113069.44 |
17511 | 2011-12-07 | 30 | 81490.62 |
16684 | 2011-12-05 | 15 | 62778.68 |
13694 | 2011-12-06 | 38 | 59768.06 |
14096 | 2011-12-05 | 17 | 57120.91 |
The latest date is obtained for its use in the RFM analysis
latest_date |
---|
2011-12-09 |
Step1: Obtaining the base data for frequency and monetary analysis along with the date of most recent purchase
Step2: Using the above value, days since the last purchase for obtaining recency score is calculated
Step3: Combining the two tables to get a combined data to calculate the recency, frequency and monetary values
Step4: Calculate RFM scores
with rfm_base_data as ( --Step1
SELECT
"CustomerID" as customer_id,
max("InvoiceDate")::Date as most_recent_date,
count(distinct "InvoiceDate") as counts_freq,
sum("Quantity" * "UnitPrice")::numeric as monetary
FROM ecommerce_data
where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
GROUP BY "CustomerID"
),
rfm_date_diff as ( -- Step 2
select
customer_id,
date_part('day',date_diff_age) as date_diff
from(
select
customer_id,
age('2011-12-09'::Date, most_recent_date) as date_diff_age
from
rfm_base_data)subq
),
rfm_comb_base_data as ( -- Step 3
select
r.customer_id,
r.counts_freq,
r.monetary,
rd.date_diff
from
rfm_base_data as r
join
rfm_date_diff as rd
on r.customer_id=rd.customer_id)
select
customer_id,
rfm_recency,
rfm_freq,
rfm_monetary,
date_diff,
counts_freq,
monetary,
rfm_recency*100 + rfm_freq*10 + rfm_monetary as rfm_combined
from(
select -- Step 4
customer_id,
date_diff,
counts_freq,
monetary,
ntile(5) over (order by date_diff desc) as rfm_recency,
ntile(5) over (order by counts_freq) as rfm_freq,
ntile(5) over (order by monetary) as rfm_monetary
from
rfm_comb_base_data)as subq
order by rfm_combined desc
;
select
customer_id,
rfm_recency*100 + rfm_freq*10 + rfm_monetary as rfm_combined,
case
when (rfm_recency between 4 and 5) and (rfm_freq between 4 and 5 and rfm_monetary between 4 and 5) then 'Champions'
when (rfm_recency between 2 and 5) and (rfm_freq between 2 and 5 and rfm_monetary between 2 and 5) then 'Loyal Customers'
when (rfm_recency between 3 and 5) and (rfm_freq between 1 and 3 and rfm_monetary between 1 and 3) then 'Potential Loyalist'
when (rfm_recency between 4 and 5) and (rfm_freq between 0 and 1 and rfm_monetary between 0 and 1) then 'Recent Customers'
when (rfm_recency between 3 and 4) and (rfm_freq between 0 and 1 and rfm_monetary between 0 and 1) then 'Promising'
when (rfm_recency between 2 and 3) and (rfm_freq between 2 and 3 and rfm_monetary between 2 and 3) then 'Customers Needing Attention'
when (rfm_recency between 2 and 3) and (rfm_freq between 0 and 2 and rfm_monetary between 0 and 2) then 'About To Sleep'
when (rfm_recency between 0 and 2) and (rfm_freq between 2 and 5 and rfm_monetary between 2 and 5) then 'At Risk'
when (rfm_recency between 0 and 1) and (rfm_freq between 4 and 5 and rfm_monetary between 4 and 5) then 'Cant Lose Them'
when (rfm_recency between 1 and 2) and (rfm_freq between 1 and 2 and rfm_monetary between 1 and 2) then 'Hibernating'
when (rfm_recency between 0 and 2) and (rfm_freq between 0 and 2 and rfm_monetary between 0 and 2) then 'Lost'
else 'other' end as customer_segments
from( --- this is the same query used above for rfm scores calculation
with rfm_base_data as (
SELECT
"CustomerID" as customer_id,
max("InvoiceDate")::Date as most_recent_date,
count(distinct "InvoiceDate") as counts_freq,
sum("Quantity" * "UnitPrice")::numeric as monetary
FROM ecommerce_data
where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
GROUP BY "CustomerID"
),
rfm_date_diff as (
select
customer_id,
date_part('day',date_diff_age) as date_diff
from(
select
customer_id,
age('2011-12-09'::Date, most_recent_date) as date_diff_age
from
rfm_base_data)subq
),
rfm_comb_base_data as (
select
r.customer_id,
r.counts_freq,
r.monetary,
rd.date_diff
from
rfm_base_data as r
join
rfm_date_diff as rd
on r.customer_id=rd.customer_id)
select
customer_id,
rfm_recency,
rfm_freq,
rfm_monetary,
date_diff,
counts_freq,
monetary,
rfm_recency*100 + rfm_freq*10 + rfm_monetary as rfm_combined
from(
select
customer_id,
date_diff,
counts_freq,
monetary,
ntile(5) over (order by date_diff desc) as rfm_recency,
ntile(5) over (order by counts_freq) as rfm_freq,
ntile(5) over (order by monetary) as rfm_monetary
from
rfm_comb_base_data)as subq
order by rfm_combined desc
)as subq
order by rfm_combined desc
;
## Loading required package: treemap
Customer numbers in each customer segments
customer_segments%>%
group_by(customer_segments)%>%
summarise(segment_counts=n())%>%
plot_ly() %>%
add_bars(x = ~forcats::fct_reorder(customer_segments,
segment_counts),
y = ~segment_counts,
color = ~I("forestgreen")) %>%
layout(title = 'Customer numbers in segments',
xaxis = list(title = "Segments"),
yaxis = list(title = "Customer number"))
Average Recency in Days for the different types of Customers
customer_segments%>%
dplyr::left_join(rfm_scores,
by=c("customer_id"))%>%
dplyr::group_by(customer_segments)%>%
summarise(avg_days_diff=mean(date_diff,
na.rm = TRUE))%>%
dplyr::mutate(avg_days_rounded=round(avg_days_diff,2))%>%
plot_ly(x = ~avg_days_rounded,
y=~forcats::fct_reorder(customer_segments,
desc(avg_days_rounded)),
type = "bar",
color = ~customer_segments,
colors = "Dark2",
text = ~avg_days_rounded,
textposition = 'auto',
textfont = list(color = '#000000', size = 16))%>%
layout(title = 'Average Recency \n (Days since last purchase)',
xaxis = list(title = "Days"),
yaxis = list(title = "Customers"))%>%
layout(showlegend=FALSE)
Customer numbers with corresponding number of orders
rfm_scores%>%
dplyr::group_by(counts_freq)%>%
summarise(customer_number=n())%>%
plot_ly(x = ~counts_freq,
y=~customer_number,
type = "bar",
color=I("grey50")
)%>%
layout(title = 'Customer counts corresponding to number of orders',
xaxis = list(title = "Number of orders"),
yaxis = list(title = "Customer counts"))
Recency X Frequency Heatmap showing respective revenue in each cell
customer_segments%>%
dplyr::left_join(rfm_scores,
by=c("customer_id"))%>%
dplyr::select(-rfm_combined.y)%>%
plot_ly(x=~rfm_freq,
y=~rfm_recency,
z=~monetary,
type = "heatmap",
colors = "Blues")%>%
layout(title = 'Recency X Frequency',
xaxis = list(title = "Frequency"),
yaxis = list(title = "Recency"))