getwd()
## [1] "/Users/jakedudar/Desktop/MKTG3P98/project 1"
setwd("/Users/jakedudar/Desktop/MKTG3P98/project 1")

options(repos = c(CRAN = "https://cloud.r-project.org/"))


install.packages("tidyverse")
## 
## The downloaded binary packages are in
##  /var/folders/b0/jlkfwcg920s7r19b0x0prj6h0000gn/T//RtmpF1wSn9/downloaded_packages
install.packages("lubridate")
## 
## The downloaded binary packages are in
##  /var/folders/b0/jlkfwcg920s7r19b0x0prj6h0000gn/T//RtmpF1wSn9/downloaded_packages
install.packages("rfm")
## 
## The downloaded binary packages are in
##  /var/folders/b0/jlkfwcg920s7r19b0x0prj6h0000gn/T//RtmpF1wSn9/downloaded_packages
install.packages("ggplot2")
## 
## The downloaded binary packages are in
##  /var/folders/b0/jlkfwcg920s7r19b0x0prj6h0000gn/T//RtmpF1wSn9/downloaded_packages
library(tidyverse)   # Data manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)   # Date format
library(rfm)         # RFM analysis
library(ggplot2)     # Data visualization


# Load e-commerce Data from the working directory
ecom_data <-read.csv("e-commerce data.csv")

# Summary statistics and check if there are any missing values in the dataset
summary(ecom_data)
##   InvoiceNo          StockCode         Description           Quantity        
##  Length:541909      Length:541909      Length:541909      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     1.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3.00  
##                                                           Mean   :     9.55  
##                                                           3rd Qu.:    10.00  
##                                                           Max.   : 80995.00  
##                                                                              
##  InvoiceDate          UnitPrice           CustomerID       Country         
##  Length:541909      Min.   :-11062.06   Min.   :12346    Length:541909     
##  Class :character   1st Qu.:     1.25   1st Qu.:13953    Class :character  
##  Mode  :character   Median :     2.08   Median :15152    Mode  :character  
##                     Mean   :     4.61   Mean   :15288                      
##                     3rd Qu.:     4.13   3rd Qu.:16791                      
##                     Max.   : 38970.00   Max.   :18287                      
##                                         NA's   :135080
# Data Cleaning

## Remove missing values
ecom_data <- ecom_data %>% drop_na(CustomerID) 
summary(ecom_data)
##   InvoiceNo          StockCode         Description           Quantity        
##  Length:406829      Length:406829      Length:406829      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     2.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     5.00  
##                                                           Mean   :    12.06  
##                                                           3rd Qu.:    12.00  
##                                                           Max.   : 80995.00  
##  InvoiceDate          UnitPrice          CustomerID      Country         
##  Length:406829      Min.   :    0.00   Min.   :12346   Length:406829     
##  Class :character   1st Qu.:    1.25   1st Qu.:13953   Class :character  
##  Mode  :character   Median :    1.95   Median :15152   Mode  :character  
##                     Mean   :    3.46   Mean   :15288                     
##                     3rd Qu.:    3.75   3rd Qu.:16791                     
##                     Max.   :38970.00   Max.   :18287
## Convert InvoiceDate to Date Format
ecom_data$InvoiceDate <- as.Date(ecom_data$InvoiceDate, format = "%m/%d/%y")
head(ecom_data)  # display the first few rows of data
##   InvoiceNo StockCode                         Description Quantity InvoiceDate
## 1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6  2020-12-01
## 2    536365     71053                 WHITE METAL LANTERN        6  2020-12-01
## 3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8  2020-12-01
## 4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6  2020-12-01
## 5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6  2020-12-01
## 6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2  2020-12-01
##   UnitPrice CustomerID        Country
## 1      2.55      17850 United Kingdom
## 2      3.39      17850 United Kingdom
## 3      2.75      17850 United Kingdom
## 4      3.39      17850 United Kingdom
## 5      3.39      17850 United Kingdom
## 6      7.65      17850 United Kingdom
## Create TotalPrice colum (Quantity x UnitPrice)
ecom_data <- ecom_data %>% 
  mutate(TotalPrice = Quantity * UnitPrice)

## View summary
summary(ecom_data)
##   InvoiceNo          StockCode         Description           Quantity        
##  Length:406829      Length:406829      Length:406829      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     2.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     5.00  
##                                                           Mean   :    12.06  
##                                                           3rd Qu.:    12.00  
##                                                           Max.   : 80995.00  
##   InvoiceDate           UnitPrice          CustomerID      Country         
##  Min.   :2020-01-04   Min.   :    0.00   Min.   :12346   Length:406829     
##  1st Qu.:2020-05-10   1st Qu.:    1.25   1st Qu.:13953   Class :character  
##  Median :2020-08-28   Median :    1.95   Median :15152   Mode  :character  
##  Mean   :2020-08-03   Mean   :    3.46   Mean   :15288                     
##  3rd Qu.:2020-11-06   3rd Qu.:    3.75   3rd Qu.:16791                     
##  Max.   :2020-12-23   Max.   :38970.00   Max.   :18287                     
##    TotalPrice       
##  Min.   :-168469.6  
##  1st Qu.:      4.2  
##  Median :     11.1  
##  Mean   :     20.4  
##  3rd Qu.:     19.5  
##  Max.   : 168469.6
View(ecom_data)

# Compute RFM Metrix

## Set analysis date (e.g., max transaction date)
analysis_date <- max(ecom_data$InvoiceDate) + 1   # set as the day after the last recorded trnsaction.

## Compute RFM metrics per customer
rfm_data <- ecom_data %>% 
  group_by(CustomerID) %>%
  summarise(
    Recency = as.numeric(analysis_date - max(InvoiceDate)),
    Frequency = n_distinct(InvoiceNo),
    Monetary = sum(TotalPrice))

## View RFM scores
head(rfm_data)
## # A tibble: 6 × 4
##   CustomerID Recency Frequency Monetary
##        <int>   <dbl>     <int>    <dbl>
## 1      12346     341         2       0 
## 2      12347      17         7    4310.
## 3      12348       8         4    1797.
## 4      12349      33         1    1758.
## 5      12350     326         1     334.
## 6      12352      51        11    1545.
## Assign RFM Scores (1-5 scale; 1 = worse, 5 = best)

rfm_data <- rfm_data %>%
  mutate(
    R_Score = ntile(-Recency, 5),    
    F_Score = ntile(Frequency, 5),
    M_Score = ntile(Monetary, 5))

## View the final RFM table with scores
print(rfm_data)
## # A tibble: 4,372 × 7
##    CustomerID Recency Frequency Monetary R_Score F_Score M_Score
##         <int>   <dbl>     <int>    <dbl>   <int>   <int>   <int>
##  1      12346     341         2       0        1       2       1
##  2      12347      17         7    4310.       5       4       5
##  3      12348       8         4    1797.       5       3       4
##  4      12349      33         1    1758.       3       1       4
##  5      12350     326         1     334.       1       1       2
##  6      12352      51        11    1545.       3       5       4
##  7      12353     219         1      89        1       1       1
##  8      12354     247         1    1079.       1       1       4
##  9      12355     229         1     459.       1       1       2
## 10      12356      37         3    2811.       3       3       5
## # ℹ 4,362 more rows
# Segment Customers Based on RFM Scores
rfm_data <- rfm_data %>%
  mutate(
    Segment = case_when(R_Score == 5 & F_Score ==5 & M_Score ==5 ~ "Best Customers",
                        R_Score >= 4 & F_Score >=4 & M_Score >=4 ~ "Loyal Customers",
                        R_Score >= 3 & F_Score >=3 & M_Score >=3 ~ "Potential Loyalists",
                        R_Score == 1 & F_Score ==1 & M_Score <1 ~ "Lost Customers",
                        TRUE ~ "Other"))
## View RFM results
print(rfm_data)
## # A tibble: 4,372 × 8
##    CustomerID Recency Frequency Monetary R_Score F_Score M_Score Segment        
##         <int>   <dbl>     <int>    <dbl>   <int>   <int>   <int> <chr>          
##  1      12346     341         2       0        1       2       1 Other          
##  2      12347      17         7    4310.       5       4       5 Loyal Customers
##  3      12348       8         4    1797.       5       3       4 Potential Loya…
##  4      12349      33         1    1758.       3       1       4 Other          
##  5      12350     326         1     334.       1       1       2 Other          
##  6      12352      51        11    1545.       3       5       4 Potential Loya…
##  7      12353     219         1      89        1       1       1 Other          
##  8      12354     247         1    1079.       1       1       4 Other          
##  9      12355     229         1     459.       1       1       2 Other          
## 10      12356      37         3    2811.       3       3       5 Potential Loya…
## # ℹ 4,362 more rows
hist(rfm_data$Recency)

hist(rfm_data$Frequency)

hist(rfm_data$Monetary)

# Visualize RFM Segments

## Bar plot of RFM segments
ggplot(rfm_data, aes(x=Segment, fill=Segment))+
  geom_bar()+
  theme_bw() +
  geom_text(stat="count", aes(label=..count..), vjust=0) +
  labs(title ="RFM Customer Segment", x="Segment", y="Count")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Identity recent and frequent custmers
ggplot(rfm_data, aes(x=Recency, y=Frequency, color=Segment))+
  geom_point(size=3)+
  theme_bw()+
  labs(title="Recency vs Frequency by Segment", 
       x="Recency (Days Since Last Purchase)", 
       y="Frequency (Total Transactions)")

## Boxplot: Monetary Value per Segment 
ggplot(rfm_data, aes(x=Segment, y=Monetary, fill=Segment))+
  geom_boxplot()+
  theme_bw()+
  labs(title="Monetary Value by Customer Segment", x="Segment", 
       y="Total Monetary Value")+
  theme(axis.text.x = element_text(angle = 45,hjust=1))

## Heatmap of RFM Scores: Highlight high-value customers based on RFM scores
ggplot(rfm_data, aes(x=F_Score, y=R_Score, fill=M_Score))+
  geom_tile()+
  scale_fill_gradient(low = "lightblue",high = "darkblue")+
  labs(title="RFM Score Headmap", x="Frequency Score", y="Recency Score",
       fill="Monetary Score")

## Extract customer lists in each segment
### Best Customers
best_customers <- rfm_data %>% filter(R_Score ==5, F_Score ==5, M_Score ==5)
print(best_customers)
## # A tibble: 353 × 8
##    CustomerID Recency Frequency Monetary R_Score F_Score M_Score Segment       
##         <int>   <dbl>     <int>    <dbl>   <int>   <int>   <int> <chr>         
##  1      12395       7        15    2998.       5       5       5 Best Customers
##  2      12417       7        12    3579.       5       5       5 Best Customers
##  3      12431       7        18    6416.       5       5       5 Best Customers
##  4      12437      16        19    4897.       5       5       5 Best Customers
##  5      12471       4        49   18741.       5       5       5 Best Customers
##  6      12476      11        20    6547.       5       5       5 Best Customers
##  7      12481       4        11    5591.       5       5       5 Best Customers
##  8      12540      11        19   13309.       5       5       5 Best Customers
##  9      12567      17        11    9115.       5       5       5 Best Customers
## 10      12569      17        35    4065.       5       5       5 Best Customers
## # ℹ 343 more rows
### Lost Customers
lost_customers <- rfm_data %>% filter(R_Score ==1, F_Score ==1, M_Score ==1)
print(lost_customers)
## # A tibble: 188 × 8
##    CustomerID Recency Frequency Monetary R_Score F_Score M_Score Segment
##         <int>   <dbl>     <int>    <dbl>   <int>   <int>   <int> <chr>  
##  1      12353     219         1     89         1       1       1 Other  
##  2      12361     303         1    190.        1       1       1 Other  
##  3      12401     319         1     84.3       1       1       1 Other  
##  4      12402     339         1    226.        1       1       1 Other  
##  5      12503     353         1  -1126         1       1       1 Other  
##  6      12505     317         1     -4.5       1       1       1 Other  
##  7      12506     247         1     73.5       1       1       1 Other  
##  8      12509     300         1    176.        1       1       1 Other  
##  9      12548     181         1     95.2       1       1       1 Other  
## 10      12573     242         1    161.        1       1       1 Other  
## # ℹ 178 more rows
### Potential Loyalists
potential_loyalists <-rfm_data %>% filter(R_Score >=3, F_Score >=3)
print(potential_loyalists)
## # A tibble: 1,971 × 8
##    CustomerID Recency Frequency Monetary R_Score F_Score M_Score Segment        
##         <int>   <dbl>     <int>    <dbl>   <int>   <int>   <int> <chr>          
##  1      12347      17         7    4310.       5       4       5 Loyal Customers
##  2      12348       8         4    1797.       5       3       4 Potential Loya…
##  3      12352      51        11    1545.       3       5       4 Potential Loya…
##  4      12356      37         3    2811.       3       3       5 Potential Loya…
##  5      12359      22         6    6246.       4       4       5 Loyal Customers
##  6      12362      18        13    5155.       4       5       5 Loyal Customers
##  7      12364      22         4    1313.       4       3       4 Potential Loya…
##  8      12370       7         4    3546.       5       3       5 Potential Loya…
##  9      12375      17         3     455.       5       3       2 Other          
## 10      12380      36         5    2721.       3       4       5 Potential Loya…
## # ℹ 1,961 more rows
### Inactive Customers
inactive_customers <- rfm_data %>% filter(R_Score <=2, F_Score <=2)
print(inactive_customers)
## # A tibble: 1,099 × 8
##    CustomerID Recency Frequency Monetary R_Score F_Score M_Score Segment
##         <int>   <dbl>     <int>    <dbl>   <int>   <int>   <int> <chr>  
##  1      12346     341         2       0        1       2       1 Other  
##  2      12350     326         1     334.       1       1       2 Other  
##  3      12353     219         1      89        1       1       1 Other  
##  4      12354     247         1    1079.       1       1       4 Other  
##  5      12355     229         1     459.       1       1       2 Other  
##  6      12361     303         1     190.       1       1       1 Other  
##  7      12363     124         2     552        2       2       3 Other  
##  8      12373     327         1     365.       1       1       2 Other  
##  9      12378     144         1    4009.       2       1       5 Other  
## 10      12390      94         1     550.       2       1       3 Other  
## # ℹ 1,089 more rows
## Create a Pie Chart of RFM Customer Segments

### Count customers in each Segment of the dataset
rfm_segments <- rfm_data %>% 
  group_by(Segment) %>% 
  summarise(count=n()) %>%
  mutate(percentage =count/sum(count) * 100)
print(rfm_segments)
## # A tibble: 4 × 3
##   Segment             count percentage
##   <chr>               <int>      <dbl>
## 1 Best Customers        353       8.07
## 2 Loyal Customers       595      13.6 
## 3 Other                2654      60.7 
## 4 Potential Loyalists   770      17.6
### Create a Pie Chart
ggplot(rfm_segments, aes(x="", y=percentage, fill=Segment))+
  geom_bar(stat="identity", width=1)+
  coord_polar(theta="y")+
  theme_void()+
  labs(title="RFM Customer Segments")+
  geom_text(aes(label=paste0(percentage, "%")),
            position=position_stack(vjust=0.5))

# Calculate percentage and round to 1 decimal place
rfm_segments$percentage <- round(rfm_segments$count/sum(rfm_segments$count)*100, 1)
print(rfm_segments)
## # A tibble: 4 × 3
##   Segment             count percentage
##   <chr>               <int>      <dbl>
## 1 Best Customers        353        8.1
## 2 Loyal Customers       595       13.6
## 3 Other                2654       60.7
## 4 Potential Loyalists   770       17.6
### Create a Pie Chart
ggplot(rfm_segments, aes(x="", y=percentage, fill=Segment))+
  geom_bar(stat="identity", width=1)+
  coord_polar(theta="y")+
  theme_void()+
  labs(title="RFM Customer Segments")+
  geom_text(aes(label=paste0(percentage, "%")),
            position=position_stack(vjust=0.5))

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.