chooseCRANmirror(graphics = FALSE, ind = 1)
install.packages("tidyverse")
## 
## The downloaded binary packages are in
##  /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
# Load Required Libraries

install.packages ("tidyverse")
## 
## The downloaded binary packages are in
##  /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
install.packages ("lubridate")
## 
## The downloaded binary packages are in
##  /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
install.packages ("rfm")
## 
## The downloaded binary packages are in
##  /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
install.packages ("ggplot2")
## 
## The downloaded binary packages are in
##  /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
install.packages ("dplyr")
## 
## The downloaded binary packages are in
##  /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
install.packages ("tidyr")
## 
## The downloaded binary packages are in
##  /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmplQr2qo/downloaded_packages
library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(rfm)
library(ggplot2)
library(dplyr)
library(tidyr)

# Load Retail Data from the working directory
retail_data <-read.csv("retail_rfm.csv")

# Summary statistics & check for missing values
summary(retail_data)
##   customer_id       revenue       number_of_orders   recency_days   
##  Min.   :    1   Min.   :   0.0   Min.   :  1.000   Min.   :  1.00  
##  1st Qu.: 2501   1st Qu.:  69.0   1st Qu.:  2.000   1st Qu.: 90.75  
##  Median : 5000   Median : 170.0   Median :  4.000   Median :262.00  
##  Mean   : 5000   Mean   : 358.8   Mean   :  8.397   Mean   :302.25  
##  3rd Qu.: 7500   3rd Qu.: 425.0   3rd Qu.: 10.000   3rd Qu.:446.00  
##  Max.   :10000   Max.   :8342.9   Max.   :257.000   Max.   :973.00  
##     purchase         zip_code    
##  Min.   :0.0000   Min.   : 7726  
##  1st Qu.:0.0000   1st Qu.:19046  
##  Median :0.0000   Median :20854  
##  Mean   :0.1779   Mean   :18489  
##  3rd Qu.:0.0000   3rd Qu.:21771  
##  Max.   :1.0000   Max.   :24060
# Data Cleaning
## Remove missing values
retail_data <- retail_data %>% drop_na()
summary(retail_data)
##   customer_id       revenue       number_of_orders   recency_days   
##  Min.   :    1   Min.   :   0.0   Min.   :  1.000   Min.   :  1.00  
##  1st Qu.: 2501   1st Qu.:  69.0   1st Qu.:  2.000   1st Qu.: 90.75  
##  Median : 5000   Median : 170.0   Median :  4.000   Median :262.00  
##  Mean   : 5000   Mean   : 358.8   Mean   :  8.397   Mean   :302.25  
##  3rd Qu.: 7500   3rd Qu.: 425.0   3rd Qu.: 10.000   3rd Qu.:446.00  
##  Max.   :10000   Max.   :8342.9   Max.   :257.000   Max.   :973.00  
##     purchase         zip_code    
##  Min.   :0.0000   Min.   : 7726  
##  1st Qu.:0.0000   1st Qu.:19046  
##  Median :0.0000   Median :20854  
##  Mean   :0.1779   Mean   :18489  
##  3rd Qu.:0.0000   3rd Qu.:21771  
##  Max.   :1.0000   Max.   :24060
## Compute RFM metrics per customer
rfm_data <- retail_data %>%
  group_by(customer_id) %>%
  summarise(
    Recency = min(recency_days),  # Use minimum recency_days for each customer
    Frequency = sum(number_of_orders), # Sum total orders
    Monetary = sum(revenue) # Sum total purchase
  )

# Assign RFM scores (1-5 scale; 1 = worst, 5 = best)
rfm_data <- rfm_data %>%
  mutate(
    R_Score = ntile(-Recency, 5),
    F_Score = ntile(Frequency, 5),
    M_Score = ntile(Monetary, 5)
  )

# Segment Customers Based on RFM Scores
rfm_data <- rfm_data %>%
  mutate(
    Segment = case_when(
      R_Score == 5 & F_Score == 5 & M_Score == 5 ~ "Best Customers",
      R_Score >= 4 & F_Score >= 4 & M_Score >= 4 ~ "Loyal Customers",
      R_Score >= 3 & F_Score >= 3 & M_Score >= 3 ~ "Potential Loyalists",
      R_Score == 1 & F_Score == 1 & M_Score == 1 ~ "Lost Customers",
      TRUE ~ "Other"))

# View RFM results
print(rfm_data)  
## # A tibble: 10,000 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1           1      93        19     738.       4       5       5 Loyal Custome…
##  2           2     419        10     300.       2       4       4 Other         
##  3           3     724         1      74        1       1       2 Other         
##  4           4     793         2     178        1       2       3 Other         
##  5           5      68        15     552.       4       5       5 Loyal Custome…
##  6           6     120         3     137        4       2       3 Other         
##  7           7     892         3     139.       1       2       3 Other         
##  8           8      22         1      39        5       1       1 Other         
##  9           9      19         5     250.       5       3       4 Potential Loy…
## 10          10     133        10     238.       4       4       3 Potential Loy…
## # ℹ 9,990 more rows
# Visualize RFM Segments
# Visualizations
## Distribution Histograms
hist(rfm_data$Recency, main="Recency Distribution", col="blue")

hist(rfm_data$Frequency, main="Frequency Distribution", col="green")

hist(rfm_data$Monetary, main="Monetary Distribution", col="red")

## Bar Plot
ggplot(rfm_data, aes(x = Segment, fill = Segment)) +
  geom_bar() +
  theme_bw() +
  geom_text(stat = "count", aes(label = ..count..), vjust = 0) +
  labs(title = "RFM Customer Segments", x = "Segment", y = "Count")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Scatter Plot (Recency vs Frequency)
ggplot(rfm_data, aes(x = Recency, y = Frequency, color = Segment)) +
  geom_point(size = 3) +
  theme_bw() +
  labs(title = "Recency vs Frequency by Segment", 
       x = "Recency (Days Since Last Purchase)", 
       y = "Frequency (Total Transactions)")

## Monetary Value per Segment
ggplot(rfm_data, aes(x=Segment, y=Monetary, fill=Segment)) +
  geom_boxplot() +
  theme_bw() +
  labs(title="Monetary Value by Customer Segment", x="Segment", y="Total Monetary Value") +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

## Heatmap of RFM Scores
ggplot(rfm_data, aes(x=F_Score, y=R_Score, fill=M_Score)) +
  geom_tile() +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title="RFM Score Heatmap", x="Frequency Score", y="Recency Score", fill="Monetary Score")

# Extract customer lists
## Best Customers
best_customers <- rfm_data %>% filter(R_Score == 5, F_Score == 5, M_Score == 5)
print(best_customers)
## # A tibble: 762 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1          29      24        14     861.       5       5       5 Best Customers
##  2          54      45        61    2650.       5       5       5 Best Customers
##  3          61      42        13     628.       5       5       5 Best Customers
##  4          62      18        97    3518.       5       5       5 Best Customers
##  5          80      28        29    1472.       5       5       5 Best Customers
##  6          86      18        62    2019.       5       5       5 Best Customers
##  7         126       5        42    1885.       5       5       5 Best Customers
##  8         129      29        61    3647.       5       5       5 Best Customers
##  9         152       4        25     751.       5       5       5 Best Customers
## 10         153      39        24    1047.       5       5       5 Best Customers
## # ℹ 752 more rows
## Lost Customers
lost_customers <- rfm_data %>% filter(R_Score == 1, F_Score == 1, M_Score == 1)
print(lost_customers)
## # A tibble: 576 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1          14     913         1     45         1       1       1 Lost Customers
##  2          33     704         1     44         1       1       1 Lost Customers
##  3          35     965         1     20         1       1       1 Lost Customers
##  4          57     680         1     29         1       1       1 Lost Customers
##  5          58     688         1     36         1       1       1 Lost Customers
##  6          60     757         1     40         1       1       1 Lost Customers
##  7          65     658         1     49.5       1       1       1 Lost Customers
##  8          93     689         1     44         1       1       1 Lost Customers
##  9         142     878         1     57         1       1       1 Lost Customers
## 10         190     970         1      0         1       1       1 Lost Customers
## # ℹ 566 more rows
## Potential Loyalists
potential_loyalists <- rfm_data %>% filter(R_Score >= 3, F_Score >= 3)
print(potential_loyalists)
## # A tibble: 4,365 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1           1      93        19    738.        4       5       5 Loyal Custome…
##  2           5      68        15    552.        4       5       5 Loyal Custome…
##  3           9      19         5    250.        5       3       4 Potential Loy…
##  4          10     133        10    238.        4       4       3 Potential Loy…
##  5          11      28        13    371.        5       5       4 Loyal Custome…
##  6          13      51        15    510         5       5       4 Loyal Custome…
##  7          20      20         4    290         5       3       4 Potential Loy…
##  8          22     258         7    188         3       4       3 Potential Loy…
##  9          23      50         4     98.8       5       3       2 Other         
## 10          27     212         5    267.        3       3       4 Potential Loy…
## # ℹ 4,355 more rows
## Inactive Customers
inactive_customers <- rfm_data %>% filter(R_Score <= 2, F_Score <= 2)
print(inactive_customers)
## # A tibble: 2,365 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score Segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1           3     724         1     74         1       1       2 Other         
##  2           4     793         2    178         1       2       3 Other         
##  3           7     892         3    139.        1       2       3 Other         
##  4          12     317         1     65         2       1       2 Other         
##  5          14     913         1     45         1       1       1 Lost Customers
##  6          15     502         1     12.5       2       1       1 Other         
##  7          17     320         2     88         2       2       2 Other         
##  8          21     611         1     95         1       1       2 Other         
##  9          33     704         1     44         1       1       1 Lost Customers
## 10          35     965         1     20         1       1       1 Lost Customers
## # ℹ 2,355 more rows
## Boxplot: Monetary Value per Segment
ggplot(rfm_data, aes(x = Segment, y = Monetary, fill = Segment)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Monetary Value by Customer Segment", x = "Segment", 
       y = "Total Monetary Value") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Pie Chart of RFM Customer Segments
rfm_segments <- rfm_data %>% 
  group_by(Segment) %>% 
  summarise(count = n()) %>%
  mutate(percentage = round(count / sum(count) * 100, 1))

### Create a Pie Chart
ggplot(rfm_segments, aes(x = "", y = percentage, fill = Segment)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  theme_void() +
  labs(title = "RFM Customer Segments") +
  geom_text(aes(label = paste0(percentage, "%")),
            position = position_stack(vjust = 0.5))

# Print customer lists per segment
print(rfm_segments)
## # A tibble: 5 × 3
##   Segment             count percentage
##   <chr>               <int>      <dbl>
## 1 Best Customers        762        7.6
## 2 Lost Customers        576        5.8
## 3 Loyal Customers      1524       15.2
## 4 Other                5380       53.8
## 5 Potential Loyalists  1758       17.6