options(repos = c(CRAN = "https://cloud.r-project.org"))  # Set a CRAN mirror

# Setting Working Directory
getwd()
## [1] "C:/RUBAB/Studies/Winter 2025/MKTG 3P98/Project 2"
setwd("C:/RUBAB/Studies/Winter 2025/MKTG 3P98/Project 2")

#Install required packages
install.packages("tidyverse")
## Installing package into 'C:/Users/rubab/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\rubab\AppData\Local\Temp\RtmpeQWOV0\downloaded_packages
install.packages("lubridate")
## Installing package into 'C:/Users/rubab/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'lubridate' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'lubridate'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\rubab\AppData\Local\R\win-library\4.4\00LOCK\lubridate\libs\x64\lubridate.dll
## to
## C:\Users\rubab\AppData\Local\R\win-library\4.4\lubridate\libs\x64\lubridate.dll:
## Permission denied
## Warning: restored 'lubridate'
## 
## The downloaded binary packages are in
##  C:\Users\rubab\AppData\Local\Temp\RtmpeQWOV0\downloaded_packages
install.packages("rfm")
## Installing package into 'C:/Users/rubab/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'rfm' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\rubab\AppData\Local\Temp\RtmpeQWOV0\downloaded_packages
install.packages("ggplot2")
## Installing package into 'C:/Users/rubab/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\rubab\AppData\Local\Temp\RtmpeQWOV0\downloaded_packages
install.packages("dplyr")
## Installing package into 'C:/Users/rubab/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'dplyr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\rubab\AppData\Local\R\win-library\4.4\00LOCK\dplyr\libs\x64\dplyr.dll
## to C:\Users\rubab\AppData\Local\R\win-library\4.4\dplyr\libs\x64\dplyr.dll:
## Permission denied
## Warning: restored 'dplyr'
## 
## The downloaded binary packages are in
##  C:\Users\rubab\AppData\Local\Temp\RtmpeQWOV0\downloaded_packages
library(tidyverse)  
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)   
library(rfm)        
## Warning: package 'rfm' was built under R version 4.4.3
library(ggplot2)     
library(dplyr)

# Open our data file file in R Studio
RFM_Data <- read.csv("retail_rfm.csv")
View(RFM_Data)

#Check for missing values in our data
summary(RFM_Data)
##   customer_id       revenue       number_of_orders   recency_days   
##  Min.   :    1   Min.   :   0.0   Min.   :  1.000   Min.   :  1.00  
##  1st Qu.: 2501   1st Qu.:  69.0   1st Qu.:  2.000   1st Qu.: 90.75  
##  Median : 5000   Median : 170.0   Median :  4.000   Median :262.00  
##  Mean   : 5000   Mean   : 358.8   Mean   :  8.397   Mean   :302.25  
##  3rd Qu.: 7500   3rd Qu.: 425.0   3rd Qu.: 10.000   3rd Qu.:446.00  
##  Max.   :10000   Max.   :8342.9   Max.   :257.000   Max.   :973.00  
##     purchase         zip_code    
##  Min.   :0.0000   Min.   : 7726  
##  1st Qu.:0.0000   1st Qu.:19046  
##  Median :0.0000   Median :20854  
##  Mean   :0.1779   Mean   :18489  
##  3rd Qu.:0.0000   3rd Qu.:21771  
##  Max.   :1.0000   Max.   :24060
#Compute RFM Metrics and View RFM Scores
rfm_analysis_data <- RFM_Data %>%
  group_by(customer_id) %>%
  summarise(
    Recency = recency_days,
    Frequency = number_of_orders,
    Monetary = revenue
  )

head(rfm_analysis_data)
## # A tibble: 6 × 4
##   customer_id Recency Frequency Monetary
##         <int>   <int>     <int>    <dbl>
## 1           1      93        19     738.
## 2           2     419        10     300.
## 3           3     724         1      74 
## 4           4     793         2     178 
## 5           5      68        15     552.
## 6           6     120         3     137
# Assign RFM Scores (1-5 scale; 1 = worse, 5 = best) and print the values
rfm_analysis_data <- rfm_analysis_data %>%
  mutate(
    R_Score = ntile(-Recency,5),
    F_Score = ntile(Frequency,5),
    M_Score = ntile(Monetary,5)
  )

print(rfm_analysis_data)
## # A tibble: 10,000 × 7
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int>
##  1           1      93        19     738.       4       5       5
##  2           2     419        10     300.       2       4       4
##  3           3     724         1      74        1       1       2
##  4           4     793         2     178        1       2       3
##  5           5      68        15     552.       4       5       5
##  6           6     120         3     137        4       2       3
##  7           7     892         3     139.       1       2       3
##  8           8      22         1      39        5       1       1
##  9           9      19         5     250.       5       3       4
## 10          10     133        10     238.       4       4       3
## # ℹ 9,990 more rows
# Segment customers based on RFM Scores and view Results 
rfm_analysis_data <- rfm_analysis_data %>%
  mutate(
    segment = case_when(
      R_Score == 5 & F_Score == 5 & M_Score == 5 ~ "Best Customers",
      R_Score >= 4 & F_Score >= 4 & M_Score >= 4 ~ "Loyal Customers",
      R_Score >= 3 & F_Score >= 3 & M_Score >= 3 ~ "Potential Loyalists",
      R_Score <= 2 & F_Score <= 3 & M_Score >= 3 ~ "At Risk Customers",
      R_Score == 1 & F_Score == 1 & M_Score < 1 ~ "Lost Customers",
      TRUE ~ "Other"
    )
  )

print(rfm_analysis_data)
## # A tibble: 10,000 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1           1      93        19     738.       4       5       5 Loyal Custome…
##  2           2     419        10     300.       2       4       4 Other         
##  3           3     724         1      74        1       1       2 Other         
##  4           4     793         2     178        1       2       3 At Risk Custo…
##  5           5      68        15     552.       4       5       5 Loyal Custome…
##  6           6     120         3     137        4       2       3 Other         
##  7           7     892         3     139.       1       2       3 At Risk Custo…
##  8           8      22         1      39        5       1       1 Other         
##  9           9      19         5     250.       5       3       4 Potential Loy…
## 10          10     133        10     238.       4       4       3 Potential Loy…
## # ℹ 9,990 more rows
# Visualization of our RFM Data 

## Extract customer lists in each segment
### Best Customers
best_customers <- rfm_analysis_data %>% filter(R_Score ==5, F_Score ==5, M_Score ==5)
print(best_customers)
## # A tibble: 762 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1          29      24        14     861.       5       5       5 Best Customers
##  2          54      45        61    2650.       5       5       5 Best Customers
##  3          61      42        13     628.       5       5       5 Best Customers
##  4          62      18        97    3518.       5       5       5 Best Customers
##  5          80      28        29    1472.       5       5       5 Best Customers
##  6          86      18        62    2019.       5       5       5 Best Customers
##  7         126       5        42    1885.       5       5       5 Best Customers
##  8         129      29        61    3647.       5       5       5 Best Customers
##  9         152       4        25     751.       5       5       5 Best Customers
## 10         153      39        24    1047.       5       5       5 Best Customers
## # ℹ 752 more rows
### Extract At-Risk Customers
at_risk_customers <- rfm_analysis_data %>% filter(R_Score <= 2, F_Score <= 3, M_Score >= 3)
print(at_risk_customers)
## # A tibble: 946 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1           4     793         2     178        1       2       3 At Risk Custo…
##  2           7     892         3     139.       1       2       3 At Risk Custo…
##  3          37     933         4     194        1       3       3 At Risk Custo…
##  4          46     282         4     128        2       3       3 At Risk Custo…
##  5          64     464         3     156.       2       2       3 At Risk Custo…
##  6          70     457         4     293        2       3       4 At Risk Custo…
##  7          84     590         2     133        1       2       3 At Risk Custo…
##  8          85     323         6    1560        2       3       5 At Risk Custo…
##  9          88     319         2     212.       2       2       3 At Risk Custo…
## 10          92     292         3     204        2       2       3 At Risk Custo…
## # ℹ 936 more rows
### Lost Customers
lost_customers <- rfm_analysis_data %>% filter(R_Score ==1, F_Score ==1, M_Score ==1)
print(lost_customers)
## # A tibble: 576 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score segment
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>  
##  1          14     913         1     45         1       1       1 Other  
##  2          33     704         1     44         1       1       1 Other  
##  3          35     965         1     20         1       1       1 Other  
##  4          57     680         1     29         1       1       1 Other  
##  5          58     688         1     36         1       1       1 Other  
##  6          60     757         1     40         1       1       1 Other  
##  7          65     658         1     49.5       1       1       1 Other  
##  8          93     689         1     44         1       1       1 Other  
##  9         142     878         1     57         1       1       1 Other  
## 10         190     970         1      0         1       1       1 Other  
## # ℹ 566 more rows
### Potential Loyalists
potential_loyalists <-rfm_analysis_data %>% filter(R_Score >=3, F_Score >=3)
print(potential_loyalists)
## # A tibble: 4,365 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1           1      93        19    738.        4       5       5 Loyal Custome…
##  2           5      68        15    552.        4       5       5 Loyal Custome…
##  3           9      19         5    250.        5       3       4 Potential Loy…
##  4          10     133        10    238.        4       4       3 Potential Loy…
##  5          11      28        13    371.        5       5       4 Loyal Custome…
##  6          13      51        15    510         5       5       4 Loyal Custome…
##  7          20      20         4    290         5       3       4 Potential Loy…
##  8          22     258         7    188         3       4       3 Potential Loy…
##  9          23      50         4     98.8       5       3       2 Other         
## 10          27     212         5    267.        3       3       4 Potential Loy…
## # ℹ 4,355 more rows
### Inactive Customers
inactive_customers <- rfm_analysis_data %>% filter(R_Score <=2, F_Score <=2)
print(inactive_customers)
## # A tibble: 2,365 × 8
##    customer_id Recency Frequency Monetary R_Score F_Score M_Score segment       
##          <int>   <int>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1           3     724         1     74         1       1       2 Other         
##  2           4     793         2    178         1       2       3 At Risk Custo…
##  3           7     892         3    139.        1       2       3 At Risk Custo…
##  4          12     317         1     65         2       1       2 Other         
##  5          14     913         1     45         1       1       1 Other         
##  6          15     502         1     12.5       2       1       1 Other         
##  7          17     320         2     88         2       2       2 Other         
##  8          21     611         1     95         1       1       2 Other         
##  9          33     704         1     44         1       1       1 Other         
## 10          35     965         1     20         1       1       1 Other         
## # ℹ 2,355 more rows
# Segmentation of Customers Using a pie chart
### Count customers in each Segment of the dataset
rfm_segments <- rfm_analysis_data %>% 
  group_by(segment) %>% 
  summarise(count=n()) %>%
  mutate(percentage =count/sum(count) * 100)
print(rfm_segments) 
## # A tibble: 5 × 3
##   segment             count percentage
##   <chr>               <int>      <dbl>
## 1 At Risk Customers     946       9.46
## 2 Best Customers        762       7.62
## 3 Loyal Customers      1524      15.2 
## 4 Other                5010      50.1 
## 5 Potential Loyalists  1758      17.6
## Calculate percentage and round to 1 decimal place
rfm_segments$percentage <- round(rfm_segments$count/sum(rfm_segments$count)*100, 1)
print(rfm_segments)
## # A tibble: 5 × 3
##   segment             count percentage
##   <chr>               <int>      <dbl>
## 1 At Risk Customers     946        9.5
## 2 Best Customers        762        7.6
## 3 Loyal Customers      1524       15.2
## 4 Other                5010       50.1
## 5 Potential Loyalists  1758       17.6
## Actually creating the pie chart
ggplot(rfm_segments, aes(x="", y=percentage, fill=segment))+
  geom_bar(stat="identity", width=1)+
  coord_polar(theta="y")+
  theme_void()+
  labs(title="RFM Customer Segments")+
  geom_text(aes(label=paste0(percentage, "%")),
            position=position_stack(vjust=0.5))

## Identity recent and frequent custmers
ggplot(rfm_analysis_data, aes(x=Recency, y=Frequency, color=segment))+
  geom_point(size=3)+
  theme_bw()+
  labs(title="Recency vs Frequency by Segment", 
       x="Recency (Days Since Last Purchase)", 
       y="Frequency (Total Transactions)")

## Heatmap of RFM Scores: Highlight high-value customers based on RFM scores
ggplot(rfm_analysis_data, aes(x=F_Score, y=R_Score, fill=M_Score))+
  geom_tile()+
  scale_fill_gradient(low = "lightblue",high = "darkblue")+
  labs(title="RFM Score Headmap", x="Frequency Score", y="Recency Score",
       fill="Monetary Score")