options(repos = c(CRAN = "https://cloud.r-project.org")) # Set a CRAN mirror
# Setting Working Directory
getwd()
## [1] "C:/RUBAB/Studies/Winter 2025/MKTG 3P98/Project 2"
setwd("C:/RUBAB/Studies/Winter 2025/MKTG 3P98/Project 2")
#Install required packages
install.packages("tidyverse")
## Installing package into 'C:/Users/rubab/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\rubab\AppData\Local\Temp\RtmpeQWOV0\downloaded_packages
install.packages("lubridate")
## Installing package into 'C:/Users/rubab/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'lubridate' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'lubridate'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\rubab\AppData\Local\R\win-library\4.4\00LOCK\lubridate\libs\x64\lubridate.dll
## to
## C:\Users\rubab\AppData\Local\R\win-library\4.4\lubridate\libs\x64\lubridate.dll:
## Permission denied
## Warning: restored 'lubridate'
##
## The downloaded binary packages are in
## C:\Users\rubab\AppData\Local\Temp\RtmpeQWOV0\downloaded_packages
install.packages("rfm")
## Installing package into 'C:/Users/rubab/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'rfm' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\rubab\AppData\Local\Temp\RtmpeQWOV0\downloaded_packages
install.packages("ggplot2")
## Installing package into 'C:/Users/rubab/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\rubab\AppData\Local\Temp\RtmpeQWOV0\downloaded_packages
install.packages("dplyr")
## Installing package into 'C:/Users/rubab/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'dplyr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\rubab\AppData\Local\R\win-library\4.4\00LOCK\dplyr\libs\x64\dplyr.dll
## to C:\Users\rubab\AppData\Local\R\win-library\4.4\dplyr\libs\x64\dplyr.dll:
## Permission denied
## Warning: restored 'dplyr'
##
## The downloaded binary packages are in
## C:\Users\rubab\AppData\Local\Temp\RtmpeQWOV0\downloaded_packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(rfm)
## Warning: package 'rfm' was built under R version 4.4.3
library(ggplot2)
library(dplyr)
# Open our data file file in R Studio
RFM_Data <- read.csv("retail_rfm.csv")
View(RFM_Data)
#Check for missing values in our data
summary(RFM_Data)
## customer_id revenue number_of_orders recency_days
## Min. : 1 Min. : 0.0 Min. : 1.000 Min. : 1.00
## 1st Qu.: 2501 1st Qu.: 69.0 1st Qu.: 2.000 1st Qu.: 90.75
## Median : 5000 Median : 170.0 Median : 4.000 Median :262.00
## Mean : 5000 Mean : 358.8 Mean : 8.397 Mean :302.25
## 3rd Qu.: 7500 3rd Qu.: 425.0 3rd Qu.: 10.000 3rd Qu.:446.00
## Max. :10000 Max. :8342.9 Max. :257.000 Max. :973.00
## purchase zip_code
## Min. :0.0000 Min. : 7726
## 1st Qu.:0.0000 1st Qu.:19046
## Median :0.0000 Median :20854
## Mean :0.1779 Mean :18489
## 3rd Qu.:0.0000 3rd Qu.:21771
## Max. :1.0000 Max. :24060
#Compute RFM Metrics and View RFM Scores
rfm_analysis_data <- RFM_Data %>%
group_by(customer_id) %>%
summarise(
Recency = recency_days,
Frequency = number_of_orders,
Monetary = revenue
)
head(rfm_analysis_data)
## # A tibble: 6 × 4
## customer_id Recency Frequency Monetary
## <int> <int> <int> <dbl>
## 1 1 93 19 738.
## 2 2 419 10 300.
## 3 3 724 1 74
## 4 4 793 2 178
## 5 5 68 15 552.
## 6 6 120 3 137
# Assign RFM Scores (1-5 scale; 1 = worse, 5 = best) and print the values
rfm_analysis_data <- rfm_analysis_data %>%
mutate(
R_Score = ntile(-Recency,5),
F_Score = ntile(Frequency,5),
M_Score = ntile(Monetary,5)
)
print(rfm_analysis_data)
## # A tibble: 10,000 × 7
## customer_id Recency Frequency Monetary R_Score F_Score M_Score
## <int> <int> <int> <dbl> <int> <int> <int>
## 1 1 93 19 738. 4 5 5
## 2 2 419 10 300. 2 4 4
## 3 3 724 1 74 1 1 2
## 4 4 793 2 178 1 2 3
## 5 5 68 15 552. 4 5 5
## 6 6 120 3 137 4 2 3
## 7 7 892 3 139. 1 2 3
## 8 8 22 1 39 5 1 1
## 9 9 19 5 250. 5 3 4
## 10 10 133 10 238. 4 4 3
## # ℹ 9,990 more rows
# Segment customers based on RFM Scores and view Results
rfm_analysis_data <- rfm_analysis_data %>%
mutate(
segment = case_when(
R_Score == 5 & F_Score == 5 & M_Score == 5 ~ "Best Customers",
R_Score >= 4 & F_Score >= 4 & M_Score >= 4 ~ "Loyal Customers",
R_Score >= 3 & F_Score >= 3 & M_Score >= 3 ~ "Potential Loyalists",
R_Score <= 2 & F_Score <= 3 & M_Score >= 3 ~ "At Risk Customers",
R_Score == 1 & F_Score == 1 & M_Score < 1 ~ "Lost Customers",
TRUE ~ "Other"
)
)
print(rfm_analysis_data)
## # A tibble: 10,000 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 1 93 19 738. 4 5 5 Loyal Custome…
## 2 2 419 10 300. 2 4 4 Other
## 3 3 724 1 74 1 1 2 Other
## 4 4 793 2 178 1 2 3 At Risk Custo…
## 5 5 68 15 552. 4 5 5 Loyal Custome…
## 6 6 120 3 137 4 2 3 Other
## 7 7 892 3 139. 1 2 3 At Risk Custo…
## 8 8 22 1 39 5 1 1 Other
## 9 9 19 5 250. 5 3 4 Potential Loy…
## 10 10 133 10 238. 4 4 3 Potential Loy…
## # ℹ 9,990 more rows
# Visualization of our RFM Data
## Extract customer lists in each segment
### Best Customers
best_customers <- rfm_analysis_data %>% filter(R_Score ==5, F_Score ==5, M_Score ==5)
print(best_customers)
## # A tibble: 762 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 29 24 14 861. 5 5 5 Best Customers
## 2 54 45 61 2650. 5 5 5 Best Customers
## 3 61 42 13 628. 5 5 5 Best Customers
## 4 62 18 97 3518. 5 5 5 Best Customers
## 5 80 28 29 1472. 5 5 5 Best Customers
## 6 86 18 62 2019. 5 5 5 Best Customers
## 7 126 5 42 1885. 5 5 5 Best Customers
## 8 129 29 61 3647. 5 5 5 Best Customers
## 9 152 4 25 751. 5 5 5 Best Customers
## 10 153 39 24 1047. 5 5 5 Best Customers
## # ℹ 752 more rows
### Extract At-Risk Customers
at_risk_customers <- rfm_analysis_data %>% filter(R_Score <= 2, F_Score <= 3, M_Score >= 3)
print(at_risk_customers)
## # A tibble: 946 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 4 793 2 178 1 2 3 At Risk Custo…
## 2 7 892 3 139. 1 2 3 At Risk Custo…
## 3 37 933 4 194 1 3 3 At Risk Custo…
## 4 46 282 4 128 2 3 3 At Risk Custo…
## 5 64 464 3 156. 2 2 3 At Risk Custo…
## 6 70 457 4 293 2 3 4 At Risk Custo…
## 7 84 590 2 133 1 2 3 At Risk Custo…
## 8 85 323 6 1560 2 3 5 At Risk Custo…
## 9 88 319 2 212. 2 2 3 At Risk Custo…
## 10 92 292 3 204 2 2 3 At Risk Custo…
## # ℹ 936 more rows
### Lost Customers
lost_customers <- rfm_analysis_data %>% filter(R_Score ==1, F_Score ==1, M_Score ==1)
print(lost_customers)
## # A tibble: 576 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 14 913 1 45 1 1 1 Other
## 2 33 704 1 44 1 1 1 Other
## 3 35 965 1 20 1 1 1 Other
## 4 57 680 1 29 1 1 1 Other
## 5 58 688 1 36 1 1 1 Other
## 6 60 757 1 40 1 1 1 Other
## 7 65 658 1 49.5 1 1 1 Other
## 8 93 689 1 44 1 1 1 Other
## 9 142 878 1 57 1 1 1 Other
## 10 190 970 1 0 1 1 1 Other
## # ℹ 566 more rows
### Potential Loyalists
potential_loyalists <-rfm_analysis_data %>% filter(R_Score >=3, F_Score >=3)
print(potential_loyalists)
## # A tibble: 4,365 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 1 93 19 738. 4 5 5 Loyal Custome…
## 2 5 68 15 552. 4 5 5 Loyal Custome…
## 3 9 19 5 250. 5 3 4 Potential Loy…
## 4 10 133 10 238. 4 4 3 Potential Loy…
## 5 11 28 13 371. 5 5 4 Loyal Custome…
## 6 13 51 15 510 5 5 4 Loyal Custome…
## 7 20 20 4 290 5 3 4 Potential Loy…
## 8 22 258 7 188 3 4 3 Potential Loy…
## 9 23 50 4 98.8 5 3 2 Other
## 10 27 212 5 267. 3 3 4 Potential Loy…
## # ℹ 4,355 more rows
### Inactive Customers
inactive_customers <- rfm_analysis_data %>% filter(R_Score <=2, F_Score <=2)
print(inactive_customers)
## # A tibble: 2,365 × 8
## customer_id Recency Frequency Monetary R_Score F_Score M_Score segment
## <int> <int> <int> <dbl> <int> <int> <int> <chr>
## 1 3 724 1 74 1 1 2 Other
## 2 4 793 2 178 1 2 3 At Risk Custo…
## 3 7 892 3 139. 1 2 3 At Risk Custo…
## 4 12 317 1 65 2 1 2 Other
## 5 14 913 1 45 1 1 1 Other
## 6 15 502 1 12.5 2 1 1 Other
## 7 17 320 2 88 2 2 2 Other
## 8 21 611 1 95 1 1 2 Other
## 9 33 704 1 44 1 1 1 Other
## 10 35 965 1 20 1 1 1 Other
## # ℹ 2,355 more rows
# Segmentation of Customers Using a pie chart
### Count customers in each Segment of the dataset
rfm_segments <- rfm_analysis_data %>%
group_by(segment) %>%
summarise(count=n()) %>%
mutate(percentage =count/sum(count) * 100)
print(rfm_segments)
## # A tibble: 5 × 3
## segment count percentage
## <chr> <int> <dbl>
## 1 At Risk Customers 946 9.46
## 2 Best Customers 762 7.62
## 3 Loyal Customers 1524 15.2
## 4 Other 5010 50.1
## 5 Potential Loyalists 1758 17.6
## Calculate percentage and round to 1 decimal place
rfm_segments$percentage <- round(rfm_segments$count/sum(rfm_segments$count)*100, 1)
print(rfm_segments)
## # A tibble: 5 × 3
## segment count percentage
## <chr> <int> <dbl>
## 1 At Risk Customers 946 9.5
## 2 Best Customers 762 7.6
## 3 Loyal Customers 1524 15.2
## 4 Other 5010 50.1
## 5 Potential Loyalists 1758 17.6
## Actually creating the pie chart
ggplot(rfm_segments, aes(x="", y=percentage, fill=segment))+
geom_bar(stat="identity", width=1)+
coord_polar(theta="y")+
theme_void()+
labs(title="RFM Customer Segments")+
geom_text(aes(label=paste0(percentage, "%")),
position=position_stack(vjust=0.5))

## Identity recent and frequent custmers
ggplot(rfm_analysis_data, aes(x=Recency, y=Frequency, color=segment))+
geom_point(size=3)+
theme_bw()+
labs(title="Recency vs Frequency by Segment",
x="Recency (Days Since Last Purchase)",
y="Frequency (Total Transactions)")

## Heatmap of RFM Scores: Highlight high-value customers based on RFM scores
ggplot(rfm_analysis_data, aes(x=F_Score, y=R_Score, fill=M_Score))+
geom_tile()+
scale_fill_gradient(low = "lightblue",high = "darkblue")+
labs(title="RFM Score Headmap", x="Frequency Score", y="Recency Score",
fill="Monetary Score")
