getwd()
## [1] "/Users/jakedudar/Desktop/MKTG3P98/project 1"
setwd("/Users/jakedudar/Desktop/MKTG3P98/project 1")
options(repos = c(CRAN = "https://cloud.r-project.org/"))
install.packages("tidyverse")
##
## The downloaded binary packages are in
## /var/folders/b0/jlkfwcg920s7r19b0x0prj6h0000gn/T//RtmpF1wSn9/downloaded_packages
install.packages("lubridate")
##
## The downloaded binary packages are in
## /var/folders/b0/jlkfwcg920s7r19b0x0prj6h0000gn/T//RtmpF1wSn9/downloaded_packages
install.packages("rfm")
##
## The downloaded binary packages are in
## /var/folders/b0/jlkfwcg920s7r19b0x0prj6h0000gn/T//RtmpF1wSn9/downloaded_packages
install.packages("ggplot2")
##
## The downloaded binary packages are in
## /var/folders/b0/jlkfwcg920s7r19b0x0prj6h0000gn/T//RtmpF1wSn9/downloaded_packages
library(tidyverse) # Data manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # Date format
library(rfm) # RFM analysis
library(ggplot2) # Data visualization
# Load e-commerce Data from the working directory
ecom_data <-read.csv("e-commerce data.csv")
# Summary statistics and check if there are any missing values in the dataset
summary(ecom_data)
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 1.00
## Mode :character Mode :character Mode :character Median : 3.00
## Mean : 9.55
## 3rd Qu.: 10.00
## Max. : 80995.00
##
## InvoiceDate UnitPrice CustomerID Country
## Length:541909 Min. :-11062.06 Min. :12346 Length:541909
## Class :character 1st Qu.: 1.25 1st Qu.:13953 Class :character
## Mode :character Median : 2.08 Median :15152 Mode :character
## Mean : 4.61 Mean :15288
## 3rd Qu.: 4.13 3rd Qu.:16791
## Max. : 38970.00 Max. :18287
## NA's :135080
# Data Cleaning
## Remove missing values
ecom_data <- ecom_data %>% drop_na(CustomerID)
summary(ecom_data)
## InvoiceNo StockCode Description Quantity
## Length:406829 Length:406829 Length:406829 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 2.00
## Mode :character Mode :character Mode :character Median : 5.00
## Mean : 12.06
## 3rd Qu.: 12.00
## Max. : 80995.00
## InvoiceDate UnitPrice CustomerID Country
## Length:406829 Min. : 0.00 Min. :12346 Length:406829
## Class :character 1st Qu.: 1.25 1st Qu.:13953 Class :character
## Mode :character Median : 1.95 Median :15152 Mode :character
## Mean : 3.46 Mean :15288
## 3rd Qu.: 3.75 3rd Qu.:16791
## Max. :38970.00 Max. :18287
## Convert InvoiceDate to Date Format
ecom_data$InvoiceDate <- as.Date(ecom_data$InvoiceDate, format = "%m/%d/%y")
head(ecom_data) # display the first few rows of data
## InvoiceNo StockCode Description Quantity InvoiceDate
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2020-12-01
## 2 536365 71053 WHITE METAL LANTERN 6 2020-12-01
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2020-12-01
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 2020-12-01
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2020-12-01
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2 2020-12-01
## UnitPrice CustomerID Country
## 1 2.55 17850 United Kingdom
## 2 3.39 17850 United Kingdom
## 3 2.75 17850 United Kingdom
## 4 3.39 17850 United Kingdom
## 5 3.39 17850 United Kingdom
## 6 7.65 17850 United Kingdom
## Create TotalPrice colum (Quantity x UnitPrice)
ecom_data <- ecom_data %>%
mutate(TotalPrice = Quantity * UnitPrice)
## View summary
summary(ecom_data)
## InvoiceNo StockCode Description Quantity
## Length:406829 Length:406829 Length:406829 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 2.00
## Mode :character Mode :character Mode :character Median : 5.00
## Mean : 12.06
## 3rd Qu.: 12.00
## Max. : 80995.00
## InvoiceDate UnitPrice CustomerID Country
## Min. :2020-01-04 Min. : 0.00 Min. :12346 Length:406829
## 1st Qu.:2020-05-10 1st Qu.: 1.25 1st Qu.:13953 Class :character
## Median :2020-08-28 Median : 1.95 Median :15152 Mode :character
## Mean :2020-08-03 Mean : 3.46 Mean :15288
## 3rd Qu.:2020-11-06 3rd Qu.: 3.75 3rd Qu.:16791
## Max. :2020-12-23 Max. :38970.00 Max. :18287
## TotalPrice
## Min. :-168469.6
## 1st Qu.: 4.2
## Median : 11.1
## Mean : 20.4
## 3rd Qu.: 19.5
## Max. : 168469.6
View(ecom_data)
# Compute RFM Metrix
## Set analysis date (e.g., max transaction date)
analysis_date <- max(ecom_data$InvoiceDate) + 1 # set as the day after the last recorded trnsaction.
## Compute RFM metrics per customer
rfm_data <- ecom_data %>%
group_by(CustomerID) %>%
summarise(
Recency = as.numeric(analysis_date - max(InvoiceDate)),
Frequency = n_distinct(InvoiceNo),
Monetary = sum(TotalPrice))
## View RFM scores
head(rfm_data)
## # A tibble: 6 × 4
## CustomerID Recency Frequency Monetary
## <int> <dbl> <int> <dbl>
## 1 12346 341 2 0
## 2 12347 17 7 4310.
## 3 12348 8 4 1797.
## 4 12349 33 1 1758.
## 5 12350 326 1 334.
## 6 12352 51 11 1545.
## Assign RFM Scores (1-5 scale; 1 = worse, 5 = best)
rfm_data <- rfm_data %>%
mutate(
R_Score = ntile(-Recency, 5),
F_Score = ntile(Frequency, 5),
M_Score = ntile(Monetary, 5))
## View the final RFM table with scores
print(rfm_data)
## # A tibble: 4,372 × 7
## CustomerID Recency Frequency Monetary R_Score F_Score M_Score
## <int> <dbl> <int> <dbl> <int> <int> <int>
## 1 12346 341 2 0 1 2 1
## 2 12347 17 7 4310. 5 4 5
## 3 12348 8 4 1797. 5 3 4
## 4 12349 33 1 1758. 3 1 4
## 5 12350 326 1 334. 1 1 2
## 6 12352 51 11 1545. 3 5 4
## 7 12353 219 1 89 1 1 1
## 8 12354 247 1 1079. 1 1 4
## 9 12355 229 1 459. 1 1 2
## 10 12356 37 3 2811. 3 3 5
## # ℹ 4,362 more rows
# Segment Customers Based on RFM Scores
rfm_data <- rfm_data %>%
mutate(
Segment = case_when(R_Score == 5 & F_Score ==5 & M_Score ==5 ~ "Best Customers",
R_Score >= 4 & F_Score >=4 & M_Score >=4 ~ "Loyal Customers",
R_Score >= 3 & F_Score >=3 & M_Score >=3 ~ "Potential Loyalists",
R_Score == 1 & F_Score ==1 & M_Score <1 ~ "Lost Customers",
TRUE ~ "Other"))
## View RFM results
print(rfm_data)
## # A tibble: 4,372 × 8
## CustomerID Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <dbl> <int> <dbl> <int> <int> <int> <chr>
## 1 12346 341 2 0 1 2 1 Other
## 2 12347 17 7 4310. 5 4 5 Loyal Customers
## 3 12348 8 4 1797. 5 3 4 Potential Loya…
## 4 12349 33 1 1758. 3 1 4 Other
## 5 12350 326 1 334. 1 1 2 Other
## 6 12352 51 11 1545. 3 5 4 Potential Loya…
## 7 12353 219 1 89 1 1 1 Other
## 8 12354 247 1 1079. 1 1 4 Other
## 9 12355 229 1 459. 1 1 2 Other
## 10 12356 37 3 2811. 3 3 5 Potential Loya…
## # ℹ 4,362 more rows
hist(rfm_data$Recency)
hist(rfm_data$Frequency)
hist(rfm_data$Monetary)
# Visualize RFM Segments
## Bar plot of RFM segments
ggplot(rfm_data, aes(x=Segment, fill=Segment))+
geom_bar()+
theme_bw() +
geom_text(stat="count", aes(label=..count..), vjust=0) +
labs(title ="RFM Customer Segment", x="Segment", y="Count")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Identity recent and frequent custmers
ggplot(rfm_data, aes(x=Recency, y=Frequency, color=Segment))+
geom_point(size=3)+
theme_bw()+
labs(title="Recency vs Frequency by Segment",
x="Recency (Days Since Last Purchase)",
y="Frequency (Total Transactions)")
## Boxplot: Monetary Value per Segment
ggplot(rfm_data, aes(x=Segment, y=Monetary, fill=Segment))+
geom_boxplot()+
theme_bw()+
labs(title="Monetary Value by Customer Segment", x="Segment",
y="Total Monetary Value")+
theme(axis.text.x = element_text(angle = 45,hjust=1))
## Heatmap of RFM Scores: Highlight high-value customers based on RFM scores
ggplot(rfm_data, aes(x=F_Score, y=R_Score, fill=M_Score))+
geom_tile()+
scale_fill_gradient(low = "lightblue",high = "darkblue")+
labs(title="RFM Score Headmap", x="Frequency Score", y="Recency Score",
fill="Monetary Score")
## Extract customer lists in each segment
### Best Customers
best_customers <- rfm_data %>% filter(R_Score ==5, F_Score ==5, M_Score ==5)
print(best_customers)
## # A tibble: 353 × 8
## CustomerID Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <dbl> <int> <dbl> <int> <int> <int> <chr>
## 1 12395 7 15 2998. 5 5 5 Best Customers
## 2 12417 7 12 3579. 5 5 5 Best Customers
## 3 12431 7 18 6416. 5 5 5 Best Customers
## 4 12437 16 19 4897. 5 5 5 Best Customers
## 5 12471 4 49 18741. 5 5 5 Best Customers
## 6 12476 11 20 6547. 5 5 5 Best Customers
## 7 12481 4 11 5591. 5 5 5 Best Customers
## 8 12540 11 19 13309. 5 5 5 Best Customers
## 9 12567 17 11 9115. 5 5 5 Best Customers
## 10 12569 17 35 4065. 5 5 5 Best Customers
## # ℹ 343 more rows
### Lost Customers
lost_customers <- rfm_data %>% filter(R_Score ==1, F_Score ==1, M_Score ==1)
print(lost_customers)
## # A tibble: 188 × 8
## CustomerID Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <dbl> <int> <dbl> <int> <int> <int> <chr>
## 1 12353 219 1 89 1 1 1 Other
## 2 12361 303 1 190. 1 1 1 Other
## 3 12401 319 1 84.3 1 1 1 Other
## 4 12402 339 1 226. 1 1 1 Other
## 5 12503 353 1 -1126 1 1 1 Other
## 6 12505 317 1 -4.5 1 1 1 Other
## 7 12506 247 1 73.5 1 1 1 Other
## 8 12509 300 1 176. 1 1 1 Other
## 9 12548 181 1 95.2 1 1 1 Other
## 10 12573 242 1 161. 1 1 1 Other
## # ℹ 178 more rows
### Potential Loyalists
potential_loyalists <-rfm_data %>% filter(R_Score >=3, F_Score >=3)
print(potential_loyalists)
## # A tibble: 1,971 × 8
## CustomerID Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <dbl> <int> <dbl> <int> <int> <int> <chr>
## 1 12347 17 7 4310. 5 4 5 Loyal Customers
## 2 12348 8 4 1797. 5 3 4 Potential Loya…
## 3 12352 51 11 1545. 3 5 4 Potential Loya…
## 4 12356 37 3 2811. 3 3 5 Potential Loya…
## 5 12359 22 6 6246. 4 4 5 Loyal Customers
## 6 12362 18 13 5155. 4 5 5 Loyal Customers
## 7 12364 22 4 1313. 4 3 4 Potential Loya…
## 8 12370 7 4 3546. 5 3 5 Potential Loya…
## 9 12375 17 3 455. 5 3 2 Other
## 10 12380 36 5 2721. 3 4 5 Potential Loya…
## # ℹ 1,961 more rows
### Inactive Customers
inactive_customers <- rfm_data %>% filter(R_Score <=2, F_Score <=2)
print(inactive_customers)
## # A tibble: 1,099 × 8
## CustomerID Recency Frequency Monetary R_Score F_Score M_Score Segment
## <int> <dbl> <int> <dbl> <int> <int> <int> <chr>
## 1 12346 341 2 0 1 2 1 Other
## 2 12350 326 1 334. 1 1 2 Other
## 3 12353 219 1 89 1 1 1 Other
## 4 12354 247 1 1079. 1 1 4 Other
## 5 12355 229 1 459. 1 1 2 Other
## 6 12361 303 1 190. 1 1 1 Other
## 7 12363 124 2 552 2 2 3 Other
## 8 12373 327 1 365. 1 1 2 Other
## 9 12378 144 1 4009. 2 1 5 Other
## 10 12390 94 1 550. 2 1 3 Other
## # ℹ 1,089 more rows
## Create a Pie Chart of RFM Customer Segments
### Count customers in each Segment of the dataset
rfm_segments <- rfm_data %>%
group_by(Segment) %>%
summarise(count=n()) %>%
mutate(percentage =count/sum(count) * 100)
print(rfm_segments)
## # A tibble: 4 × 3
## Segment count percentage
## <chr> <int> <dbl>
## 1 Best Customers 353 8.07
## 2 Loyal Customers 595 13.6
## 3 Other 2654 60.7
## 4 Potential Loyalists 770 17.6
### Create a Pie Chart
ggplot(rfm_segments, aes(x="", y=percentage, fill=Segment))+
geom_bar(stat="identity", width=1)+
coord_polar(theta="y")+
theme_void()+
labs(title="RFM Customer Segments")+
geom_text(aes(label=paste0(percentage, "%")),
position=position_stack(vjust=0.5))
# Calculate percentage and round to 1 decimal place
rfm_segments$percentage <- round(rfm_segments$count/sum(rfm_segments$count)*100, 1)
print(rfm_segments)
## # A tibble: 4 × 3
## Segment count percentage
## <chr> <int> <dbl>
## 1 Best Customers 353 8.1
## 2 Loyal Customers 595 13.6
## 3 Other 2654 60.7
## 4 Potential Loyalists 770 17.6
### Create a Pie Chart
ggplot(rfm_segments, aes(x="", y=percentage, fill=Segment))+
geom_bar(stat="identity", width=1)+
coord_polar(theta="y")+
theme_void()+
labs(title="RFM Customer Segments")+
geom_text(aes(label=paste0(percentage, "%")),
position=position_stack(vjust=0.5))
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.