This document uses dataset named “Online Retail.csv” which contains transaction level data (has unique row against a combination of InvoiceNo, StockCode and CustomerID) to perform step by step RFM analysis.
Setting up working directory
Reading the given data
Viewing summary of the data
Analyzing the Observations and Attributes
Cleaning the data wherever needed
Adding derived attributes wherever needed
Perform RFM Analysis, bins creation
Customer Segmentation and Categorization
Write output to File
Analysis via various Plots
Overall RFM Analysis on the Aggregated Data and Tree Map / Heat Map
setwd(dir = "C:\\Users\\pahar\\OneDrive\\IIM-K\\Classes\\2022-10-15\\Data")
olr_data<-read.csv("Online Retail.csv")
library(janitor)
olr_data<-clean_names(olr_data)
summary(olr_data)
## invoice_no stock_code description quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 1.00
## Mode :character Mode :character Mode :character Median : 3.00
## Mean : 9.55
## 3rd Qu.: 10.00
## Max. : 80995.00
##
## invoice_date unit_price customer_id country
## Length:541909 Min. :-11062.06 Min. :12346 Length:541909
## Class :character 1st Qu.: 1.25 1st Qu.:13953 Class :character
## Mode :character Median : 2.08 Median :15152 Mode :character
## Mean : 4.61 Mean :15288
## 3rd Qu.: 4.13 3rd Qu.:16791
## Max. : 38970.00 Max. :18287
## NA's :135080
str(olr_data)
## 'data.frame': 541909 obs. of 8 variables:
## $ invoice_no : chr "536365" "536365" "536365" "536365" ...
## $ stock_code : chr "85123A" "71053" "84406B" "84029G" ...
## $ description : chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ invoice_date: chr "12-01-2010" "12-01-2010" "12-01-2010" "12-01-2010" ...
## $ unit_price : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ customer_id : int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
## $ country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
There are negative values of quantity, these rows should be removed
invoice_date is character, this should be converted to date
There are NA records, these should be removed
library(dplyr)
olr_data<-filter(olr_data,quantity > 0, unit_price > 0)
olr_data<-subset(olr_data,!is.na(customer_id))
library(lubridate)
attach(olr_data)
olr_data$invoice_date<-mdy(olr_data$invoice_date)
olr_data<-mutate(olr_data,revenue=unit_price*quantity)
str(olr_data$invoice_date)
## Date[1:397884], format: "2010-12-01" "2010-12-01" "2010-12-01" "2010-12-01" "2010-12-01" ...
max(olr_data$invoice_date)
## [1] "2011-12-09"
analysis_date <- as.Date("2011-12-10")
library(rfm)
rfm_analysis <- rfm_table_order(data = olr_data, customer_id = customer_id, order_date = invoice_date,revenue = revenue, analysis_date = analysis_date,recency_bins = 10,frequency_bins = 10, monetary_bins = 10)
rfm_analysis
## # A tibble: 4,338 × 9
## customer_id date_most_recent recency_days transaction_count amount
## <int> <date> <dbl> <dbl> <dbl>
## 1 12346 2011-01-18 326 1 77184.
## 2 12347 2011-12-07 3 182 4310
## 3 12348 2011-09-25 76 31 1797.
## 4 12349 2011-11-21 19 73 1758.
## 5 12350 2011-02-02 311 17 334.
## 6 12352 2011-11-03 37 85 2506.
## 7 12353 2011-05-19 205 4 89
## 8 12354 2011-04-21 233 58 1079.
## 9 12355 2011-05-09 215 13 459.
## 10 12356 2011-11-17 23 59 2811.
## # … with 4,328 more rows, and 4 more variables: recency_score <int>,
## # frequency_score <int>, monetary_score <int>, rfm_score <dbl>
write.csv(rfm_analysis$rfm,"OLR_DATA_RFM_RESULT.csv")
customer_categories <- c("First Grade", "Loyal", "Likely to be Loyal","New Ones", "Could be Promising", "Require Assistance", "Getting Less Frequent","Almost Out", "Can't Lose Them", "Don't Show Up at All")
recency_lower <- c(8, 7, 5, 6, 3, 1, 1, 1, 2, 1)
recency_higher <- c(10, 10, 9, 10, 7, 6, 4, 4, 6, 3)
frequncy_lower <- c(8, 7, 5, 1, 4, 2, 1, 1, 3, 1)
frequency_higher <- c(10, 10, 9, 6, 8, 6, 6, 5, 7, 3)
monetary_lower <- c(8, 7, 5, 1, 4, 3, 2, 1, 6, 1)
monetary_higher <- c(10, 10, 9, 8, 8, 6, 7, 6, 9, 3)
rfm_segments<-rfm_segment(rfm_analysis, customer_categories, recency_lower, recency_higher,frequncy_lower, frequency_higher, monetary_lower, monetary_higher)
write.csv(rfm_segments, "OnlineRetailDataRFMCategories.csv")
Above .csv data set can be used for Ad-hoc analysis in Tableau, Power BI and other tools.
However, I am using treemap in R itself in subsequent sections to create treemap/heatmap on this
library(plotly)
library(treemap)
library(rfm)
library(dplyr)
rfm_plot_median_recency(rfm_segments,print_plot = TRUE)
rfm_plot_median_frequency(rfm_segments,print_plot = TRUE)
rfm_plot_median_monetary(rfm_segments,print_plot = TRUE)
rfm_histograms(rfm_analysis)
rfm_order_dist(rfm_analysis)
rfm_bar_chart(rfm_analysis)
We can conclude that higher the frequency gives higher monetary value and lowering it lowers the the monetary value. So Frequency and Monetary values are directly correlated.
Higher Monetary and Frequency has higher Recency. But Recency is not as important for monetary value as Frequency is when we compare all the three.
rfm_segments_overall <- rfm_segments %>% count(segment) %>% arrange(desc(n)) %>% rename(Count = n) %>% mutate(Percentage = (Count/ sum(Count))*100)
df<-data.frame(rfm_segments_overall)
treemap(df,
index="segment",
vSize = "Count",
type="index",
palette = "RdYlBu",
title="Overall RFM Categorization for Online Retail Data",
fontsize.title = 20
)
Note that the warning=FALSE, message=FALSE, echo = TRUE parameters were added to the code chunk to avoid warnings, messages and at the same time to show code used.