library(tidyverse) # Data manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # Date format
library(rfm) # RFM analysis
library(ggplot2) # Data visualization
# Load the retail data set from disk and print a per-column summary
# as a quick sanity check before building the RFM table.
retail_data <- read.csv(file = "retail_rfm.csv")
summary(object = retail_data)
## customer_id revenue number_of_orders recency_days
## Min. : 1 Min. : 0.0 Min. : 1.000 Min. : 1.00
## 1st Qu.: 2501 1st Qu.: 69.0 1st Qu.: 2.000 1st Qu.: 90.75
## Median : 5000 Median : 170.0 Median : 4.000 Median :262.00
## Mean : 5000 Mean : 358.8 Mean : 8.397 Mean :302.25
## 3rd Qu.: 7500 3rd Qu.: 425.0 3rd Qu.: 10.000 3rd Qu.:446.00
## Max. :10000 Max. :8342.9 Max. :257.000 Max. :973.00
## purchase zip_code
## Min. :0.0000 Min. : 7726
## 1st Qu.:0.0000 1st Qu.:19046
## Median :0.0000 Median :20854
## Mean :0.1779 Mean :18489
## 3rd Qu.:0.0000 3rd Qu.:21771
## Max. :1.0000 Max. :24060
# Collapse the raw data to one row per customer_id holding the three RFM
# inputs: the smallest recency_days, the total number_of_orders and the
# total revenue.
rfm_data <- summarise(
  group_by(retail_data, customer_id),
  Recency = min(recency_days),
  Frequency = sum(number_of_orders),
  Monetary = sum(revenue)
)
# Bucket every customer into quintiles (scores 1-5) on each RFM dimension.
# Recency is negated before ranking so that a smaller Recency value ends up
# in a higher R_Score bucket; Frequency and Monetary are ranked as-is.
rfm_data <- mutate(
  rfm_data,
  R_Score = ntile(-Recency, n = 5),
  F_Score = ntile(Frequency, n = 5),
  M_Score = ntile(Monetary, n = 5)
)
# Label each customer with a named segment derived from the R/F/M scores.
# case_when() uses the first rule that matches, so the rules are listed from
# the strictest to the loosest, with "Other" as the catch-all.
# The scores come from ntile(x, 5) and therefore lie in 1..5: the bottom tier
# has to be tested with `== 1`. A `< 1` comparison can never be TRUE and
# would leave the "Lost Customers" segment permanently empty.
rfm_data <- rfm_data %>%
  mutate(
    Segment = case_when(
      R_Score == 5 & F_Score == 5 & M_Score == 5 ~ "Best Customers",
      R_Score >= 4 & F_Score >= 4 & M_Score >= 4 ~ "Loyal Customers",
      R_Score >= 3 & F_Score >= 3 & M_Score >= 3 ~ "Potential Loyalists",
      R_Score == 1 & F_Score == 1 & M_Score == 1 ~ "Lost Customers",
      TRUE ~ "Other"
    )
  )
# Distribution of each RFM input.
# Frequency and Monetary are drawn twice: once over the full range and once
# restricted to the lower part of the range (<= 100 orders, <= 2500 revenue).
hist(
  rfm_data$Recency,
  main = "Histogram of Recency",
  xlab = "Recency",
  breaks = 30,
  border = "black",
  col = "pink"
)

# Full-range view; the title and axis label are left at hist()'s defaults.
hist(rfm_data$Frequency)
hist(
  with(rfm_data, Frequency[Frequency <= 100]),
  main = "Histogram of Frequency (0-100 Range)",
  xlab = "Frequency Data",
  breaks = 30,
  border = "black",
  col = "pink"
)

# Full-range view; the title and axis label are left at hist()'s defaults.
hist(rfm_data$Monetary)
hist(
  with(rfm_data, Monetary[Monetary <= 2500]),
  main = "Histogram of Monetary (0-2500 Range)",
  xlab = "Monetary",
  breaks = 30,
  border = "black",
  col = "pink"
)
# Bar chart of the number of customers in each segment, with the count
# printed at the top of every bar.
# after_stat(count) replaces the `..count..` dot-dot notation, which is
# deprecated since ggplot2 3.4.0 and emits a lifecycle warning.
ggplot(rfm_data, aes(x = Segment, fill = Segment)) +
  geom_bar() +
  theme_bw() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = 0) +
  labs(title = "RFM Customer Segment", x = "Segment", y = "Count") +
  scale_fill_manual(
    values = c(
      "Best Customers" = "pink",
      "Loyal Customers" = "deepskyblue1",
      "Potential Loyalists" = "aquamarine1",
      "Lost Customers" = "darkorchid1",
      "Other" = "orchid1"
    )
  )
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Scatter plot of Recency against Frequency, one point per customer,
# coloured by the customer's segment.
ggplot(
  data = rfm_data,
  mapping = aes(x = Recency, y = Frequency, colour = Segment)
) +
  geom_point(size = 3) +
  scale_colour_manual(
    values = c(
      "Best Customers" = "pink",
      "Loyal Customers" = "deepskyblue1",
      "Potential Loyalists" = "aquamarine1",
      "Lost Customers" = "darkorchid1",
      "Other" = "orchid1"
    )
  ) +
  labs(
    title = "Recency vs Frequency by Segment",
    x = "Recency (Days Since Last Purchase)",
    y = "Frequency (Total Transactions)"
  ) +
  theme_bw()
# Scatter plot of Recency against Monetary, one point per customer,
# coloured by the customer's segment.
ggplot(
  data = rfm_data,
  mapping = aes(x = Recency, y = Monetary, colour = Segment)
) +
  geom_point(size = 3) +
  scale_colour_manual(
    values = c(
      "Best Customers" = "pink",
      "Loyal Customers" = "deepskyblue1",
      "Potential Loyalists" = "aquamarine1",
      "Lost Customers" = "darkorchid1",
      "Other" = "orchid1"
    )
  ) +
  labs(
    title = "Recency vs Monetary by Segment",
    x = "Recency (Days Since Last Purchase)",
    y = "Monetary (Total Expenditure)"
  ) +
  theme_bw()
##### This graph looks practically the same as the previous Recency
vs Frequency graph.
# Box plot of Monetary per segment on the full data.
# The x-axis labels are rotated 45 degrees; theme_bw() is applied before the
# theme() tweak so that the rotation is not overwritten.
ggplot(
  data = rfm_data,
  mapping = aes(x = Segment, y = Monetary, fill = Segment)
) +
  geom_boxplot() +
  scale_fill_manual(
    values = c(
      "Best Customers" = "pink",
      "Loyal Customers" = "deepskyblue1",
      "Potential Loyalists" = "aquamarine1",
      "Lost Customers" = "darkorchid1",
      "Other" = "orchid1"
    )
  ) +
  labs(
    title = "Monetary Value by Customer Segment",
    x = "Segment",
    y = "Total Monetary Value"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Drop Monetary outliers separately within each segment using the 1.5 * IQR
# rule: keep rows whose Monetary lies inside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# The per-segment quartiles are attached with a per-call `.by`, so the result
# is ungrouped; the helper columns Q1, Q3 and IQR stay in rfm_clean.
rfm_clean <- rfm_data %>%
  mutate(
    Q1 = quantile(Monetary, probs = 0.25, na.rm = TRUE),
    Q3 = quantile(Monetary, probs = 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    .by = Segment
  ) %>%
  filter(
    Monetary >= (Q1 - 1.5 * IQR),
    Monetary <= (Q3 + 1.5 * IQR)
  )
# Same box plot of Monetary per segment, drawn from rfm_clean instead of
# rfm_data. theme_bw() is applied before the theme() tweak so that the
# 45-degree label rotation is not overwritten.
ggplot(
  data = rfm_clean,
  mapping = aes(x = Segment, y = Monetary, fill = Segment)
) +
  geom_boxplot() +
  scale_fill_manual(
    values = c(
      "Best Customers" = "pink",
      "Loyal Customers" = "deepskyblue1",
      "Potential Loyalists" = "aquamarine1",
      "Lost Customers" = "darkorchid1",
      "Other" = "orchid1"
    )
  ) +
  labs(
    title = "Monetary Value by Customer Segment",
    x = "Segment",
    y = "Total Monetary Value"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Heatmap of the monetary score over the Frequency x Recency score grid.
# rfm_data has many customers per (F_Score, R_Score) cell; passing it to
# geom_tile() directly stacks one tile per customer, so each cell would only
# show the M_Score of whichever row is drawn last. The data is therefore
# reduced to one row per cell (the mean M_Score) before plotting.
# The title typo "Headmap" is corrected to "Heatmap".
rfm_data %>%
  group_by(F_Score, R_Score) %>%
  summarise(M_Score = mean(M_Score), .groups = "drop") %>%
  ggplot(aes(x = F_Score, y = R_Score, fill = M_Score)) +
  geom_tile() +
  scale_fill_gradient(low = "darkorchid1", high = "pink") +
  labs(
    title = "RFM Score Heatmap",
    x = "Frequency Score",
    y = "Recency Score",
    fill = "Mean Monetary Score"
  )
# Customers in the top quintile on all three scores.
best_customers <- filter(
  rfm_data,
  R_Score == 5 & F_Score == 5 & M_Score == 5
)
print(best_customers)
## # A tibble: 762 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 29 24 14 861. 5 5 5 Best Customers
## 2 54 45 61 2650. 5 5 5 Best Customers
## 3 61 42 13 628. 5 5 5 Best Customers
## 4 62 18 97 3518. 5 5 5 Best Customers
## 5 80 28 29 1472. 5 5 5 Best Customers
## 6 86 18 62 2019. 5 5 5 Best Customers
## 7 126 5 42 1885. 5 5 5 Best Customers
## 8 129 29 61 3647. 5 5 5 Best Customers
## 9 152 4 25 751. 5 5 5 Best Customers
## 10 153 39 24 1047. 5 5 5 Best Customers
## # ℹ 752 more rows
# Customers in the bottom quintile on all three scores.
lost_customers <- filter(
  rfm_data,
  R_Score == 1 & F_Score == 1 & M_Score == 1
)
print(lost_customers)
## # A tibble: 576 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 14 913 1 45 1 1 1 Other
## 2 33 704 1 44 1 1 1 Other
## 3 35 965 1 20 1 1 1 Other
## 4 57 680 1 29 1 1 1 Other
## 5 58 688 1 36 1 1 1 Other
## 6 60 757 1 40 1 1 1 Other
## 7 65 658 1 49.5 1 1 1 Other
## 8 93 689 1 44 1 1 1 Other
## 9 142 878 1 57 1 1 1 Other
## 10 190 970 1 0 1 1 1 Other
## # ℹ 566 more rows
# Customers scoring 3 or higher on both recency and frequency.
# M_Score is not part of this filter, so the rows returned are not limited
# to a single value of the Segment column.
potential_loyalists <- filter(rfm_data, R_Score >= 3 & F_Score >= 3)
print(potential_loyalists)
## # A tibble: 4,365 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 1 93 19 738. 4 5 5 Loyal Custome…
## 2 5 68 15 552. 4 5 5 Loyal Custome…
## 3 9 19 5 250. 5 3 4 Potential Loy…
## 4 10 133 10 238. 4 4 3 Potential Loy…
## 5 11 28 13 371. 5 5 4 Loyal Custome…
## 6 13 51 15 510 5 5 4 Loyal Custome…
## 7 20 20 4 290 5 3 4 Potential Loy…
## 8 22 258 7 188 3 4 3 Potential Loy…
## 9 23 50 4 98.8 5 3 2 Other
## 10 27 212 5 267. 3 3 4 Potential Loy…
## # ℹ 4,355 more rows
# Customers scoring 2 or lower on both recency and frequency
# (M_Score is not considered).
inactive_customers <- filter(rfm_data, R_Score <= 2 & F_Score <= 2)
print(inactive_customers)
## # A tibble: 2,365 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 3 724 1 74 1 1 2 Other
## 2 4 793 2 178 1 2 3 Other
## 3 7 892 3 139. 1 2 3 Other
## 4 12 317 1 65 2 1 2 Other
## 5 14 913 1 45 1 1 1 Other
## 6 15 502 1 12.5 2 1 1 Other
## 7 17 320 2 88 2 2 2 Other
## 8 21 611 1 95 1 1 2 Other
## 9 33 704 1 44 1 1 1 Other
## 10 35 965 1 20 1 1 1 Other
## # ℹ 2,355 more rows
# Size of each segment: number of customers and (unrounded) share of the
# total in percent.
rfm_segments <- rfm_data %>%
  count(Segment, name = "count") %>%
  mutate(percentage = count / sum(count) * 100)
print(rfm_segments)
## # A tibble: 4 × 3
## Segment count percentage
## <chr> <int> <dbl>
## 1 Best Customers 762 7.62
## 2 Loyal Customers 1524 15.2
## 3 Other 5956 59.6
## 4 Potential Loyalists 1758 17.6
# Pie chart of segment shares: a single stacked bar of full width wrapped
# into polar coordinates. Each slice is labelled at its midpoint with the
# percentage value exactly as stored in rfm_segments.
ggplot(
  data = rfm_segments,
  mapping = aes(x = "", y = percentage, fill = Segment)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  geom_text(
    mapping = aes(label = paste0(percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  scale_fill_manual(
    values = c(
      "Best Customers" = "pink",
      "Loyal Customers" = "deepskyblue1",
      "Potential Loyalists" = "aquamarine1",
      "Lost Customers" = "darkorchid1",
      "Other" = "orchid1"
    )
  ) +
  labs(title = "RFM Customer Segments") +
  theme_void()
##### This pie chart showed unnecessarily long numbers in each segment,
so we calculated the percentage and rounded it to 1 decimal place for
each segment, and redid the pie chart to get simpler results.
# Recompute each segment's share from the counts and round it to one
# decimal place so the pie-chart labels stay short.
rfm_segments <- mutate(
  rfm_segments,
  percentage = round(count / sum(count) * 100, digits = 1)
)
print(rfm_segments)
## # A tibble: 4 × 3
## Segment count percentage
## <chr> <int> <dbl>
## 1 Best Customers 762 7.6
## 2 Loyal Customers 1524 15.2
## 3 Other 5956 59.6
## 4 Potential Loyalists 1758 17.6
# Pie chart of segment shares: a single stacked bar of full width wrapped
# into polar coordinates. Each slice is labelled at its midpoint with the
# percentage value stored in rfm_segments.
ggplot(
  data = rfm_segments,
  mapping = aes(x = "", y = percentage, fill = Segment)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  geom_text(
    mapping = aes(label = paste0(percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  scale_fill_manual(
    values = c(
      "Best Customers" = "pink",
      "Loyal Customers" = "deepskyblue1",
      "Potential Loyalists" = "aquamarine1",
      "Lost Customers" = "darkorchid1",
      "Other" = "orchid1"
    )
  ) +
  labs(title = "RFM Customer Segments") +
  theme_void()