This is a transactional data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers. Dataset https://www.kaggle.com/mysarahmadbhat/customersegmentation
Segment Customers based on Recency , Frequency and Monetary (Based on RFM score),so companies can market to each group effectively and appropriately .To know more about RFM analysis visit https://www.putler.com/rfm-analysis/
install.packages("tidyverse")
library(tidyverse)
library(dplyr)
library(ggplot2)
library(readr)
Online_Retail <- read_csv("Online Retail.csv")
head(Online_Retail)
## # A tibble: 6 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID
## <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 536365 85123A WHITE HANGING H… 6 12/1/2010 … 2.55 17850
## 2 536365 71053 WHITE METAL LAN… 6 12/1/2010 … 3.39 17850
## 3 536365 84406B CREAM CUPID HEA… 8 12/1/2010 … 2.75 17850
## 4 536365 84029G KNITTED UNION F… 6 12/1/2010 … 3.39 17850
## 5 536365 84029E RED WOOLLY HOTT… 6 12/1/2010 … 3.39 17850
## 6 536365 22752 SET 7 BABUSHKA … 2 12/1/2010 … 7.65 17850
## # … with 1 more variable: Country <chr>
summary(Online_Retail)
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 1.00
## Mode :character Mode :character Mode :character Median : 3.00
## Mean : 9.55
## 3rd Qu.: 10.00
## Max. : 80995.00
##
## InvoiceDate UnitPrice CustomerID Country
## Length:541909 Min. :-11062.06 Min. :12346 Length:541909
## Class :character 1st Qu.: 1.25 1st Qu.:13953 Class :character
## Mode :character Median : 2.08 Median :15152 Mode :character
## Mean : 4.61 Mean :15288
## 3rd Qu.: 4.13 3rd Qu.:16791
## Max. : 38970.00 Max. :18287
## NA's :135080
Online_Retail_sub <- subset(Online_Retail,!is.na(CustomerID))
summary(Online_Retail_sub)
## InvoiceNo StockCode Description Quantity
## Length:406829 Length:406829 Length:406829 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 2.00
## Mode :character Mode :character Mode :character Median : 5.00
## Mean : 12.06
## 3rd Qu.: 12.00
## Max. : 80995.00
## InvoiceDate UnitPrice CustomerID Country
## Length:406829 Min. : 0.00 Min. :12346 Length:406829
## Class :character 1st Qu.: 1.25 1st Qu.:13953 Class :character
## Mode :character Median : 1.95 Median :15152 Mode :character
## Mean : 3.46 Mean :15288
## 3rd Qu.: 3.75 3rd Qu.:16791
## Max. :38970.00 Max. :18287
any(is.na(Online_Retail_sub$CustomerID))
## [1] FALSE
Online_Retail_sub <- Online_Retail_sub %>% distinct()
Online_Retail_sub$InvoiceNo <- as.character(Online_Retail_sub$InvoiceNo)
Online_Retail_sub$StockCode <- as.character(Online_Retail_sub$StockCode)
Online_Retail_sub$InvoiceDate <- as.Date(Online_Retail_sub$InvoiceDate,format = "%m/%d/%Y")
Online_Retail_sub$CustomerID <- as.character(Online_Retail_sub$CustomerID)
summary(Online_Retail_sub)
## InvoiceNo StockCode Description Quantity
## Length:401604 Length:401604 Length:401604 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 2.00
## Mode :character Mode :character Mode :character Median : 5.00
## Mean : 12.18
## 3rd Qu.: 12.00
## Max. : 80995.00
## InvoiceDate UnitPrice CustomerID Country
## Min. :2010-12-01 Min. : 0.00 Length:401604 Length:401604
## 1st Qu.:2011-04-06 1st Qu.: 1.25 Class :character Class :character
## Median :2011-07-29 Median : 1.95 Mode :character Mode :character
## Mean :2011-07-09 Mean : 3.47
## 3rd Qu.:2011-10-20 3rd Qu.: 3.75
## Max. :2011-12-09 Max. :38970.00
plt <- ggplot(Online_Retail_sub) +
geom_bar(aes(x =Country),width = 1,fill = "purple",color = "black") +
theme(axis.text.x = element_text(angle = 90)) + labs(x = "Country")
plt
Online_Retail_sub <- filter(
Online_Retail_sub,Country == "United Kingdom")
Online_Retail_sub <- Online_Retail_sub %>% filter(substr(Online_Retail_sub$InvoiceNo, 1, 1) != 'C')
Online_Retail_sub <- Online_Retail_sub %>% mutate(amount = Quantity*UnitPrice)
head(Online_Retail_sub)
## # A tibble: 6 × 9
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID
## <chr> <chr> <chr> <dbl> <date> <dbl> <chr>
## 1 536365 85123A WHITE HANGING H… 6 2010-12-01 2.55 17850
## 2 536365 71053 WHITE METAL LAN… 6 2010-12-01 3.39 17850
## 3 536365 84406B CREAM CUPID HEA… 8 2010-12-01 2.75 17850
## 4 536365 84029G KNITTED UNION F… 6 2010-12-01 3.39 17850
## 5 536365 84029E RED WOOLLY HOTT… 6 2010-12-01 3.39 17850
## 6 536365 22752 SET 7 BABUSHKA … 2 2010-12-01 7.65 17850
## # … with 2 more variables: Country <chr>, amount <dbl>
summary(Online_Retail_sub)
## InvoiceNo StockCode Description Quantity
## Length:349227 Length:349227 Length:349227 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.: 2.00
## Mode :character Mode :character Mode :character Median : 4.00
## Mean : 12.18
## 3rd Qu.: 12.00
## Max. :80995.00
## InvoiceDate UnitPrice CustomerID Country
## Min. :2010-12-01 Min. : 0.000 Length:349227 Length:349227
## 1st Qu.:2011-04-07 1st Qu.: 1.250 Class :character Class :character
## Median :2011-07-31 Median : 1.950 Mode :character Mode :character
## Mean :2011-07-10 Mean : 2.972
## 3rd Qu.:2011-10-23 3rd Qu.: 3.750
## Max. :2011-12-09 Max. :8142.750
## amount
## Min. : 0.00
## 1st Qu.: 4.20
## Median : 10.20
## Mean : 20.86
## 3rd Qu.: 17.85
## Max. :168469.60
snap_date <- as.Date("15/12/2011",format = "%d/%m/%Y")
Online_Retail_sub$date_diff <- snap_date - Online_Retail_sub$InvoiceDate
#View(Online_Retail_sub)
Recency_data <- aggregate(Online_Retail_sub$date_diff, by = list(Online_Retail_sub$CustomerID),
FUN = min)
colnames(Recency_data) <- c("CustomerID","Recency")
Recency_df <- select(Recency_data,CustomerID,Recency)
head(Recency_df)
## CustomerID Recency
## 1 12346 331 days
## 2 12747 8 days
## 3 12748 6 days
## 4 12749 9 days
## 5 12820 9 days
## 6 12821 220 days
freqdata <- aggregate(Online_Retail_sub$InvoiceNo, by = list(Online_Retail_sub$CustomerID),
FUN = length)
colnames(freqdata) <- c("CustomerID","Frequency")
freq_df <- select(freqdata,CustomerID,Frequency)
head(freq_df)
## CustomerID Frequency
## 1 12346 1
## 2 12747 103
## 3 12748 4413
## 4 12749 199
## 5 12820 59
## 6 12821 6
Monetary_data <- aggregate(Online_Retail_sub$amount, by = list(Online_Retail_sub$CustomerID),
FUN = sum)
colnames(Monetary_data) <- c("CustomerID","Monetary")
monetary_df <- select(Monetary_data,CustomerID,Monetary)
head(monetary_df)
## CustomerID Monetary
## 1 12346 77183.60
## 2 12747 4196.01
## 3 12748 33053.19
## 4 12749 4090.88
## 5 12820 942.34
## 6 12821 92.72
RFM_data <- Recency_df %>% inner_join(freq_df,by = "CustomerID" ) %>%
inner_join(monetary_df , by = "CustomerID")
RFM_data <- RFM_data %>% mutate(R = ntile(desc(Recency),5)) %>%
mutate(f = ntile(Frequency,5)) %>%
mutate(M = ntile(Monetary,5))
head(RFM_data)
## CustomerID Recency Frequency Monetary R f M
## 1 12346 331 days 1 77183.60 1 1 5
## 2 12747 8 days 103 4196.01 5 4 5
## 3 12748 6 days 4413 33053.19 5 5 5
## 4 12749 9 days 199 4090.88 5 5 5
## 5 12820 9 days 59 942.34 5 4 4
## 6 12821 220 days 6 92.72 1 1 1
RFM_analysis <- RFM_data %>% mutate(RFM_Segment = as.numeric( paste(paste(as.character(RFM_data $R),as.character(RFM_data $f),sep = "") , as.character(RFM_data $M),sep = "" )))
head(RFM_analysis)
## CustomerID Recency Frequency Monetary R f M RFM_Segment
## 1 12346 331 days 1 77183.60 1 1 5 115
## 2 12747 8 days 103 4196.01 5 4 5 545
## 3 12748 6 days 4413 33053.19 5 5 5 555
## 4 12749 9 days 199 4090.88 5 5 5 555
## 5 12820 9 days 59 942.34 5 4 4 544
## 6 12821 220 days 6 92.72 1 1 1 111
RFM_analysis_final <- RFM_analysis %>%
mutate(category = case_when(RFM_Segment %in% list(555, 554, 544, 545, 454, 455, 445) ~ "Champions",
RFM_Segment %in% list(543, 444, 435, 355, 354, 345, 344, 335) ~ "Loyal Customers",
RFM_Segment %in% list(553, 551,552, 541, 542, 533, 532, 531, 452, 451, 442, 441, 431, 453, 433, 432, 423, 353, 352, 351, 342, 341, 333, 323) ~ "Potential Loyalist",
RFM_Segment %in% list(512,511, 422, 421, 412, 411, 311) ~ "Recent Customers",
RFM_Segment %in% list(525, 524, 523, 522, 521, 515, 514, 513, 425, 424, 413,414, 415, 315, 314, 313) ~ "Promising",
RFM_Segment %in% list(535, 534, 443, 434, 343, 334, 325, 324) ~ "Customers Needing Attention",
RFM_Segment %in% list(331, 321, 312, 221, 213) ~ "About To Sleep",
RFM_Segment %in% list(255, 254, 245, 244, 253, 252, 243, 242, 235, 234, 225, 224, 153, 152, 145, 143, 142, 135, 134, 133, 125, 124) ~ "At Risk",
RFM_Segment %in% list(155, 154, 144, 214,215,115, 114, 113) ~ "Can’t Lose Them",
RFM_Segment %in% list(332, 322, 231, 241, 251, 233, 232, 223, 222, 132, 123, 122, 212, 211) ~ "Hibernating",
RFM_Segment %in% list(111, 112, 121, 131,141,151) ~ "Lost") )
head(RFM_analysis_final)
## CustomerID Recency Frequency Monetary R f M RFM_Segment category
## 1 12346 331 days 1 77183.60 1 1 5 115 Can’t Lose Them
## 2 12747 8 days 103 4196.01 5 4 5 545 Champions
## 3 12748 6 days 4413 33053.19 5 5 5 555 Champions
## 4 12749 9 days 199 4090.88 5 5 5 555 Champions
## 5 12820 9 days 59 942.34 5 4 4 544 Champions
## 6 12821 220 days 6 92.72 1 1 1 111 Lost
write_csv(RFM_analysis_final,"RFM_analysis_final.csv")