chooseCRANmirror(graphics = FALSE, ind = 1)
install.packages("tidyverse")
##
## The downloaded binary packages are in
## /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
# Load Required Libraries
install.packages ("tidyverse")
##
## The downloaded binary packages are in
## /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
install.packages ("lubridate")
##
## The downloaded binary packages are in
## /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
install.packages ("rfm")
##
## The downloaded binary packages are in
## /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
install.packages ("ggplot2")
##
## The downloaded binary packages are in
## /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
install.packages ("dplyr")
##
## The downloaded binary packages are in
## /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
install.packages ("tidyr")
##
## The downloaded binary packages are in
## /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(rfm)
library(ggplot2)
library(dplyr)
library(tidyr)
# Load Retail Data from the working directory
retail_data <-read.csv("retail_rfm.csv")
# Summary statistics & check for missing values
summary(retail_data)
## customer_id revenue number_of_orders recency_days
## Min. : 1 Min. : 0.0 Min. : 1.000 Min. : 1.00
## 1st Qu.: 2501 1st Qu.: 69.0 1st Qu.: 2.000 1st Qu.: 90.75
## Median : 5000 Median : 170.0 Median : 4.000 Median :262.00
## Mean : 5000 Mean : 358.8 Mean : 8.397 Mean :302.25
## 3rd Qu.: 7500 3rd Qu.: 425.0 3rd Qu.: 10.000 3rd Qu.:446.00
## Max. :10000 Max. :8342.9 Max. :257.000 Max. :973.00
## purchase zip_code
## Min. :0.0000 Min. : 7726
## 1st Qu.:0.0000 1st Qu.:19046
## Median :0.0000 Median :20854
## Mean :0.1779 Mean :18489
## 3rd Qu.:0.0000 3rd Qu.:21771
## Max. :1.0000 Max. :24060
# Data Cleaning
## Remove missing values
retail_data <- retail_data %>% drop_na()
summary(retail_data)
## customer_id revenue number_of_orders recency_days
## Min. : 1 Min. : 0.0 Min. : 1.000 Min. : 1.00
## 1st Qu.: 2501 1st Qu.: 69.0 1st Qu.: 2.000 1st Qu.: 90.75
## Median : 5000 Median : 170.0 Median : 4.000 Median :262.00
## Mean : 5000 Mean : 358.8 Mean : 8.397 Mean :302.25
## 3rd Qu.: 7500 3rd Qu.: 425.0 3rd Qu.: 10.000 3rd Qu.:446.00
## Max. :10000 Max. :8342.9 Max. :257.000 Max. :973.00
## purchase zip_code
## Min. :0.0000 Min. : 7726
## 1st Qu.:0.0000 1st Qu.:19046
## Median :0.0000 Median :20854
## Mean :0.1779 Mean :18489
## 3rd Qu.:0.0000 3rd Qu.:21771
## Max. :1.0000 Max. :24060
## Compute RFM metrics per customer
rfm_data <- retail_data %>%
group_by(customer_id) %>%
summarise(
Recency = min(recency_days), # Use minimum recency_days for each customer
Frequency = sum(number_of_orders), # Sum total orders
Monetary = sum(revenue) # Sum total purchase
)
# Assign RFM scores (1-5 scale; 1 = worst, 5 = best)
rfm_data <- rfm_data %>%
mutate(
R_Score = ntile(-Recency, 5),
F_Score = ntile(Frequency, 5),
M_Score = ntile(Monetary, 5)
)
# Segment Customers Based on RFM Scores
rfm_data <- rfm_data %>%
mutate(
Segment = case_when(
R_Score == 5 & F_Score == 5 & M_Score == 5 ~ "Best Customers",
R_Score >= 4 & F_Score >= 4 & M_Score >= 4 ~ "Loyal Customers",
R_Score >= 3 & F_Score >= 3 & M_Score >= 3 ~ "Potential Loyalists",
R_Score == 1 & F_Score == 1 & M_Score == 1 ~ "Lost Customers",
TRUE ~ "Other"))
# View RFM results
print(rfm_data)
## # A tibble: 10,000 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 1 93 19 738. 4 5 5 Loyal Custome…
## 2 2 419 10 300. 2 4 4 Other
## 3 3 724 1 74 1 1 2 Other
## 4 4 793 2 178 1 2 3 Other
## 5 5 68 15 552. 4 5 5 Loyal Custome…
## 6 6 120 3 137 4 2 3 Other
## 7 7 892 3 139. 1 2 3 Other
## 8 8 22 1 39 5 1 1 Other
## 9 9 19 5 250. 5 3 4 Potential Loy…
## 10 10 133 10 238. 4 4 3 Potential Loy…
## # ℹ 9,990 more rows
# Visualize RFM Segments
# Visualizations
## Distribution Histograms
hist(rfm_data$Recency, main="Recency Distribution", col="blue")

hist(rfm_data$Frequency, main="Frequency Distribution", col="green")

hist(rfm_data$Monetary, main="Monetary Distribution", col="red")

## Bar Plot
ggplot(rfm_data, aes(x = Segment, fill = Segment)) +
geom_bar() +
theme_bw() +
geom_text(stat = "count", aes(label = ..count..), vjust = 0) +
labs(title = "RFM Customer Segments", x = "Segment", y = "Count")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Scatter Plot (Recency vs Frequency)
ggplot(rfm_data, aes(x = Recency, y = Frequency, color = Segment)) +
geom_point(size = 3) +
theme_bw() +
labs(title = "Recency vs Frequency by Segment",
x = "Recency (Days Since Last Purchase)",
y = "Frequency (Total Transactions)")

## Monetary Value per Segment
ggplot(rfm_data, aes(x=Segment, y=Monetary, fill=Segment)) +
geom_boxplot() +
theme_bw() +
labs(title="Monetary Value by Customer Segment", x="Segment", y="Total Monetary Value") +
theme(axis.text.x = element_text(angle = 45, hjust=1))

## Heatmap of RFM Scores
ggplot(rfm_data, aes(x=F_Score, y=R_Score, fill=M_Score)) +
geom_tile() +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title="RFM Score Heatmap", x="Frequency Score", y="Recency Score", fill="Monetary Score")

# Extract customer lists
## Best Customers
best_customers <- rfm_data %>% filter(R_Score == 5, F_Score == 5, M_Score == 5)
print(best_customers)
## # A tibble: 762 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 29 24 14 861. 5 5 5 Best Customers
## 2 54 45 61 2650. 5 5 5 Best Customers
## 3 61 42 13 628. 5 5 5 Best Customers
## 4 62 18 97 3518. 5 5 5 Best Customers
## 5 80 28 29 1472. 5 5 5 Best Customers
## 6 86 18 62 2019. 5 5 5 Best Customers
## 7 126 5 42 1885. 5 5 5 Best Customers
## 8 129 29 61 3647. 5 5 5 Best Customers
## 9 152 4 25 751. 5 5 5 Best Customers
## 10 153 39 24 1047. 5 5 5 Best Customers
## # ℹ 752 more rows
## Lost Customers
lost_customers <- rfm_data %>% filter(R_Score == 1, F_Score == 1, M_Score == 1)
print(lost_customers)
## # A tibble: 576 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 14 913 1 45 1 1 1 Lost Customers
## 2 33 704 1 44 1 1 1 Lost Customers
## 3 35 965 1 20 1 1 1 Lost Customers
## 4 57 680 1 29 1 1 1 Lost Customers
## 5 58 688 1 36 1 1 1 Lost Customers
## 6 60 757 1 40 1 1 1 Lost Customers
## 7 65 658 1 49.5 1 1 1 Lost Customers
## 8 93 689 1 44 1 1 1 Lost Customers
## 9 142 878 1 57 1 1 1 Lost Customers
## 10 190 970 1 0 1 1 1 Lost Customers
## # ℹ 566 more rows
## Potential Loyalists
potential_loyalists <- rfm_data %>% filter(R_Score >= 3, F_Score >= 3)
print(potential_loyalists)
## # A tibble: 4,365 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 1 93 19 738. 4 5 5 Loyal Custome…
## 2 5 68 15 552. 4 5 5 Loyal Custome…
## 3 9 19 5 250. 5 3 4 Potential Loy…
## 4 10 133 10 238. 4 4 3 Potential Loy…
## 5 11 28 13 371. 5 5 4 Loyal Custome…
## 6 13 51 15 510 5 5 4 Loyal Custome…
## 7 20 20 4 290 5 3 4 Potential Loy…
## 8 22 258 7 188 3 4 3 Potential Loy…
## 9 23 50 4 98.8 5 3 2 Other
## 10 27 212 5 267. 3 3 4 Potential Loy…
## # ℹ 4,355 more rows
## Inactive Customers
inactive_customers <- rfm_data %>% filter(R_Score <= 2, F_Score <= 2)
print(inactive_customers)
## # A tibble: 2,365 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 3 724 1 74 1 1 2 Other
## 2 4 793 2 178 1 2 3 Other
## 3 7 892 3 139. 1 2 3 Other
## 4 12 317 1 65 2 1 2 Other
## 5 14 913 1 45 1 1 1 Lost Customers
## 6 15 502 1 12.5 2 1 1 Other
## 7 17 320 2 88 2 2 2 Other
## 8 21 611 1 95 1 1 2 Other
## 9 33 704 1 44 1 1 1 Lost Customers
## 10 35 965 1 20 1 1 1 Lost Customers
## # ℹ 2,355 more rows
## Boxplot: Monetary Value per Segment
ggplot(rfm_data, aes(x = Segment, y = Monetary, fill = Segment)) +
geom_boxplot() +
theme_bw() +
labs(title = "Monetary Value by Customer Segment", x = "Segment",
y = "Total Monetary Value") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Pie Chart of RFM Customer Segments
rfm_segments <- rfm_data %>%
group_by(Segment) %>%
summarise(count = n()) %>%
mutate(percentage = round(count / sum(count) * 100, 1))
### Create a Pie Chart
ggplot(rfm_segments, aes(x = "", y = percentage, fill = Segment)) +
geom_bar(stat = "identity", width = 1) +
coord_polar(theta = "y") +
theme_void() +
labs(title = "RFM Customer Segments") +
geom_text(aes(label = paste0(percentage, "%")),
position = position_stack(vjust = 0.5))

# Print customer lists per segment
print(rfm_segments)
## # A tibble: 5 × 3
## Segment count percentage
## <chr> <int> <dbl>
## 1 Best Customers 762 7.6
## 2 Lost Customers 576 5.8
## 3 Loyal Customers 1524 15.2
## 4 Other 5380 53.8
## 5 Potential Loyalists 1758 17.6