Introduction

This is the part 1 coding for project 2, where we will be conducting market segmentation analysis using RFM analysis.

A personal home goods retailer is running a marketing campaign, targeting customers with promotional elements encouraging purchase of seasonal offerings, and this RFM analysis will help them optimize their campaigns and increase the ROI, by finding those who are more likely to purchase.

Load Required Libraries

library(tidyverse)   # Data manipulation

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)   # Date format
library(rfm)         # RFM analysis
library(ggplot2)     # Data visualization

Load e-commerce data from the working directory

retail_data <-read.csv("retail_rfm.csv")

Cleaning the Data

Checking for any missing values (removing them if there are any).

Summary of statistics and check if there are any missing values in the data

summary(retail_data)

##   customer_id       revenue       number_of_orders   recency_days   
##  Min.   :    1   Min.   :   0.0   Min.   :  1.000   Min.   :  1.00  
##  1st Qu.: 2501   1st Qu.:  69.0   1st Qu.:  2.000   1st Qu.: 90.75  
##  Median : 5000   Median : 170.0   Median :  4.000   Median :262.00  
##  Mean   : 5000   Mean   : 358.8   Mean   :  8.397   Mean   :302.25  
##  3rd Qu.: 7500   3rd Qu.: 425.0   3rd Qu.: 10.000   3rd Qu.:446.00  
##  Max.   :10000   Max.   :8342.9   Max.   :257.000   Max.   :973.00  
##     purchase         zip_code    
##  Min.   :0.0000   Min.   : 7726  
##  1st Qu.:0.0000   1st Qu.:19046  
##  Median :0.0000   Median :20854  
##  Mean   :0.1779   Mean   :18489  
##  3rd Qu.:0.0000   3rd Qu.:21771  
##  Max.   :1.0000   Max.   :24060

There are no missing values to remove.

RFM Analytics

First computing and Looking at the RFM metrics per customer:

rfm_data <- retail_data %>% 
  group_by(customer_id) %>%
  summarise(
    Recency =  min(recency_days),
    Frequency = sum(number_of_orders),
    Monetary = sum(revenue))

Then, assigning RFM scores on a 1 to 5 scale (1 = worse and 5 = best)

rfm_data <- rfm_data %>%
  mutate(
    R_Score = ntile(-Recency, 5),    
    F_Score = ntile(Frequency, 5),
    M_Score = ntile(Monetary, 5))

Segmenting customers based on their RFM scores

rfm_data <- rfm_data %>%
  mutate(
    Segment = case_when(R_Score == 5 & F_Score ==5 & M_Score ==5 ~ "Best Customers",
                        R_Score >= 4 & F_Score >=4 & M_Score >=4 ~ "Loyal Customers",
                        R_Score >= 3 & F_Score >=3 & M_Score >=3 ~ "Potential Loyalists",
                        R_Score == 1 & F_Score ==1 & M_Score <1 ~ "Lost Customers",
                        TRUE ~ "Other"))

Looking at the Recency Results

hist(rfm_data$Recency, 
     breaks = 30, 
     main = "Histogram of Recency", 
     xlab = "Recency", 
     col = "pink", 
     border = "black")

Looking at the Frequency Results

We removed the outliers in order to see the frequency distribution better.

hist(rfm_data$Frequency)

hist(rfm_data$Frequency[rfm_data$Frequency <= 100], 
     breaks = 30, 
     main = "Histogram of Frequency (0-100 Range)", 
     xlab = "Frequency Data", 
     col = "pink", 
     border = "black")

Looking at the Monetary Results

We removed the outliers in order to see the monetary distribution better.

hist(rfm_data$Monetary)

hist(rfm_data$Monetary[rfm_data$Monetary <= 2500], 
     breaks = 30, 
     main = "Histogram of Monetary (0-2500 Range)", 
     xlab = "Monetary", 
     col = "pink", 
     border = "black")

Visualizations of RFM Segments

ggplot(rfm_data, aes(x=Segment, fill=Segment)) +
  geom_bar() +
  theme_bw() +
  geom_text(stat="count", aes(label=..count..), vjust=-0) +
  labs(title ="RFM Customer Segment", x="Segment", y="Count") +
  scale_fill_manual(values = c("Best Customers" = "pink", 
                               "Loyal Customers" = "deepskyblue1", 
                               "Potential Loyalists" = "aquamarine1", 
                               "Lost Customers" = "darkorchid1",
                               "Other" ="orchid1"))

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Identifying recent and frequent customers

ggplot(rfm_data, aes(x=Recency, y=Frequency, color=Segment))+
  geom_point(size=3)+
  theme_bw()+
  labs(title="Recency vs Frequency by Segment", 
       x="Recency (Days Since Last Purchase)", 
       y="Frequency (Total Transactions)") +
  scale_color_manual(values = c("Best Customers" = "pink", 
                                    "Loyal Customers" = "deepskyblue1", 
                                    "Potential Loyalists" = "aquamarine1", 
                                    "Lost Customers" = "darkorchid1",
                                    "Other" ="orchid1"))

Identifying the most recent and biggest spenders

ggplot(rfm_data, aes(x=Recency, y=Monetary, color=Segment))+
  geom_point(size=3)+
  theme_bw()+
  labs(title="Recency vs Monetary by Segment", 
       x="Recency (Days Since Last Purchase)", 
       y="Monetary (Total Expenditure)") +
  scale_color_manual(values = c("Best Customers" = "pink", 
                                "Loyal Customers" = "deepskyblue1", 
                                "Potential Loyalists" = "aquamarine1", 
                                "Lost Customers" = "darkorchid1",
                                "Other" ="orchid1"))

##### This graph looks the practically the same as the previous Recency vs Frequency graph.

Looking at the Monetary Value per Segment Boxplot

First, we are removing outliers from the data to be able to see the boxplot better.

Out first boxplot shown below is hard to read, so we created rfm_clean without most outliers in order to better see and understand the boxplot.

ggplot(rfm_data, aes(x=Segment, y=Monetary, fill=Segment))+
  geom_boxplot()+
  theme_bw()+
  labs(title="Monetary Value by Customer Segment", x="Segment", 
       y="Total Monetary Value")+
  theme(axis.text.x = element_text(angle = 45,hjust=1))+
  scale_fill_manual(values = c("Best Customers" = "pink", 
                               "Loyal Customers" = "deepskyblue1", 
                               "Potential Loyalists" = "aquamarine1", 
                               "Lost Customers" = "darkorchid1",
                               "Other" ="orchid1"))

rfm_clean <- rfm_data %>%
  group_by(Segment) %>%
  mutate(
    Q1 = quantile(Monetary, 0.25, na.rm = TRUE),
    Q3 = quantile(Monetary, 0.75, na.rm = TRUE),
    IQR = Q3 - Q1
  ) %>%
  filter(Monetary >= (Q1 - 1.5 * IQR) & Monetary <= (Q3 + 1.5 * IQR)) %>%
  ungroup()

ggplot(rfm_clean, aes(x=Segment, y=Monetary, fill=Segment))+
  geom_boxplot()+
  theme_bw()+
  labs(title="Monetary Value by Customer Segment", x="Segment", 
       y="Total Monetary Value")+
  theme(axis.text.x = element_text(angle = 45,hjust=1))+
  scale_fill_manual(values = c("Best Customers" = "pink", 
                               "Loyal Customers" = "deepskyblue1", 
                               "Potential Loyalists" = "aquamarine1", 
                               "Lost Customers" = "darkorchid1",
                               "Other" ="orchid1"))

Looking at the high-value customers through a heatmap based on RFM scores

The lightest pink we consider the best RFM scoring customers. The very top right quadrant is mostly light pink, showing the highest recency, frequency, and monetary customers.

ggplot(rfm_data, aes(x=F_Score, y=R_Score, fill=M_Score))+
  geom_tile()+
  scale_fill_gradient(low = "darkorchid1",high = "pink")+
  labs(title="RFM Score Headmap", x="Frequency Score", y="Recency Score",
       fill="Monetary Score")

Extracting customer lists in each segment

Best Customers

best_customers <- rfm_data %>% filter(R_Score ==5, F_Score ==5, M_Score ==5)
print(best_customers)

## # A tibble: 762 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1          29      24        14     861.       5       5       5 Best Customers
##  2          54      45        61    2650.       5       5       5 Best Customers
##  3          61      42        13     628.       5       5       5 Best Customers
##  4          62      18        97    3518.       5       5       5 Best Customers
##  5          80      28        29    1472.       5       5       5 Best Customers
##  6          86      18        62    2019.       5       5       5 Best Customers
##  7         126       5        42    1885.       5       5       5 Best Customers
##  8         129      29        61    3647.       5       5       5 Best Customers
##  9         152       4        25     751.       5       5       5 Best Customers
## 10         153      39        24    1047.       5       5       5 Best Customers
## # ℹ 752 more rows

Lost Customers

lost_customers <- rfm_data %>% filter(R_Score ==1, F_Score ==1, M_Score ==1)
print(lost_customers)

## # A tibble: 576 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>  
##  1          14     913         1     45         1       1       1 Other  
##  2          33     704         1     44         1       1       1 Other  
##  3          35     965         1     20         1       1       1 Other  
##  4          57     680         1     29         1       1       1 Other  
##  5          58     688         1     36         1       1       1 Other  
##  6          60     757         1     40         1       1       1 Other  
##  7          65     658         1     49.5       1       1       1 Other  
##  8          93     689         1     44         1       1       1 Other  
##  9         142     878         1     57         1       1       1 Other  
## 10         190     970         1      0         1       1       1 Other  
## # ℹ 566 more rows

Potential Loyalists

potential_loyalists <-rfm_data %>% filter(R_Score >=3, F_Score >=3)
print(potential_loyalists)

## # A tibble: 4,365 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1           1      93        19    738.        4       5       5 Loyal Custome…
##  2           5      68        15    552.        4       5       5 Loyal Custome…
##  3           9      19         5    250.        5       3       4 Potential Loy…
##  4          10     133        10    238.        4       4       3 Potential Loy…
##  5          11      28        13    371.        5       5       4 Loyal Custome…
##  6          13      51        15    510         5       5       4 Loyal Custome…
##  7          20      20         4    290         5       3       4 Potential Loy…
##  8          22     258         7    188         3       4       3 Potential Loy…
##  9          23      50         4     98.8       5       3       2 Other         
## 10          27     212         5    267.        3       3       4 Potential Loy…
## # ℹ 4,355 more rows

Inactive Customers

inactive_customers <- rfm_data %>% filter(R_Score <=2, F_Score <=2)
print(inactive_customers)

## # A tibble: 2,365 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>  
##  1           3     724         1     74         1       1       2 Other  
##  2           4     793         2    178         1       2       3 Other  
##  3           7     892         3    139.        1       2       3 Other  
##  4          12     317         1     65         2       1       2 Other  
##  5          14     913         1     45         1       1       1 Other  
##  6          15     502         1     12.5       2       1       1 Other  
##  7          17     320         2     88         2       2       2 Other  
##  8          21     611         1     95         1       1       2 Other  
##  9          33     704         1     44         1       1       1 Other  
## 10          35     965         1     20         1       1       1 Other  
## # ℹ 2,355 more rows

Looking at the Pie Chart for RFM Customer Segments

First, we counted the customers in each segment of the dataset

rfm_segments <- rfm_data %>% 
  group_by(Segment) %>% 
  summarise(count=n()) %>%
  mutate(percentage =count/sum(count) * 100)
print(rfm_segments)

## # A tibble: 4 × 3
##   Segment             count percentage
##   <chr>               <int>      <dbl>
## 1 Best Customers        762       7.62
## 2 Loyal Customers      1524      15.2 
## 3 Other                5956      59.6 
## 4 Potential Loyalists  1758      17.6

Then, we created the pie chart

ggplot(rfm_segments, aes(x="", y=percentage, fill=Segment))+
  geom_bar(stat="identity", width=1)+
  coord_polar(theta="y")+
  theme_void()+
  labs(title="RFM Customer Segments")+
  geom_text(aes(label=paste0(percentage, "%")),
            position=position_stack(vjust=0.5))+
  scale_fill_manual(values = c("Best Customers" = "pink", 
                               "Loyal Customers" = "deepskyblue1", 
                               "Potential Loyalists" = "aquamarine1", 
                               "Lost Customers" = "darkorchid1",
                               "Other" ="orchid1"))

##### This pie chart showed unecessarily long numbers in each segment, so we calculated the percentage and rounded it to 1 decimal place for each segment, and re-did the pie chart to get simpler results.

rfm_segments$percentage <- round(rfm_segments$count/sum(rfm_segments$count)*100, 1)
print(rfm_segments)

## # A tibble: 4 × 3
##   Segment             count percentage
##   <chr>               <int>      <dbl>
## 1 Best Customers        762        7.6
## 2 Loyal Customers      1524       15.2
## 3 Other                5956       59.6
## 4 Potential Loyalists  1758       17.6

ggplot(rfm_segments, aes(x="", y=percentage, fill=Segment))+
  geom_bar(stat="identity", width=1)+
  coord_polar(theta="y")+
  theme_void()+
  labs(title="RFM Customer Segments")+
  geom_text(aes(label=paste0(percentage, "%")),
            position=position_stack(vjust=0.5))+
  scale_fill_manual(values = c("Best Customers" = "pink", 
                               "Loyal Customers" = "deepskyblue1", 
                               "Potential Loyalists" = "aquamarine1", 
                               "Lost Customers" = "darkorchid1",
                               "Other" ="orchid1"))

Project 2 Part 1

Elina

2025-03-03