let’s see what we can find in it!
Import and Examine the Data
require(data.table)
## 載入需要的套件:data.table
data1 <- data.table::fread("C:/R-language/PBA/onlineRetail.csv")
require(tidyverse)
## 載入需要的套件:tidyverse
## Warning: 套件 'tidyverse' 是用 R 版本 4.2.2 來建造的
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## Warning: 套件 'ggplot2' 是用 R 版本 4.2.2 來建造的
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
glimpse(data1)
## Rows: 541,909
## Columns: 8
## $ InvoiceNo <chr> "536365", "536365", "536365", "536365", "536365", "536365"…
## $ StockCode <chr> "85123A", "71053", "84406B", "84029G", "84029E", "22752", …
## $ Description <chr> "WHITE HANGING HEART T-LIGHT HOLDER", "WHITE METAL LANTERN…
## $ Quantity <int> 6, 6, 8, 6, 6, 2, 6, 6, 6, 32, 6, 6, 8, 6, 6, 3, 2, 3, 3, …
## $ InvoiceDate <chr> "12/1/10 8:26", "12/1/10 8:26", "12/1/10 8:26", "12/1/10 8…
## $ UnitPrice <dbl> 2.55, 3.39, 2.75, 3.39, 3.39, 7.65, 4.25, 1.85, 1.85, 1.69…
## $ CustomerID <int> 17850, 17850, 17850, 17850, 17850, 17850, 17850, 17850, 17…
## $ Country <chr> "United Kingdom", "United Kingdom", "United Kingdom", "Uni…
summary(data1)
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 1.00
## Mode :character Mode :character Mode :character Median : 3.00
## Mean : 9.55
## 3rd Qu.: 10.00
## Max. : 80995.00
##
## InvoiceDate UnitPrice CustomerID Country
## Length:541909 Min. :-11062.06 Min. :12346 Length:541909
## Class :character 1st Qu.: 1.25 1st Qu.:13953 Class :character
## Mode :character Median : 2.08 Median :15152 Mode :character
## Mean : 4.61 Mean :15288
## 3rd Qu.: 4.13 3rd Qu.:16791
## Max. : 38970.00 Max. :18287
## NA's :135080
head(data1,10)
## InvoiceNo StockCode Description Quantity
## 1: 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
## 2: 536365 71053 WHITE METAL LANTERN 6
## 3: 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
## 4: 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
## 5: 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
## 6: 536365 22752 SET 7 BABUSHKA NESTING BOXES 2
## 7: 536365 21730 GLASS STAR FROSTED T-LIGHT HOLDER 6
## 8: 536366 22633 HAND WARMER UNION JACK 6
## 9: 536366 22632 HAND WARMER RED POLKA DOT 6
## 10: 536367 84879 ASSORTED COLOUR BIRD ORNAMENT 32
## InvoiceDate UnitPrice CustomerID Country
## 1: 12/1/10 8:26 2.55 17850 United Kingdom
## 2: 12/1/10 8:26 3.39 17850 United Kingdom
## 3: 12/1/10 8:26 2.75 17850 United Kingdom
## 4: 12/1/10 8:26 3.39 17850 United Kingdom
## 5: 12/1/10 8:26 3.39 17850 United Kingdom
## 6: 12/1/10 8:26 7.65 17850 United Kingdom
## 7: 12/1/10 8:26 4.25 17850 United Kingdom
## 8: 12/1/10 8:28 1.85 17850 United Kingdom
## 9: 12/1/10 8:28 1.85 17850 United Kingdom
## 10: 12/1/10 8:34 1.69 13047 United Kingdom
cat("the unique number of customers:",length(unique(data1$CustomerID)))
## the unique number of customers: 4373
cat("\nthe unique number of products purchased:",length(unique(data1$StockCode)))
##
## the unique number of products purchased: 4070
cat("\nthe unique number of transactions:",length(unique(data1$InvoiceNo)))
##
## the unique number of transactions: 25900
#Drop the value that Quantity or Unit Price are lower than 0.
data1 <- data1[data1$Quantity>=0];data1 <- data1[data1$UnitPrice>=0]
data1$CustomerID <- as.character(data1$CustomerID)
#Drop the InvoiceNo. which contains the letter "C"(but including NA value).
data2 <- subset(data1,grepl("C",data1$InvoiceNo) != TRUE)
RFM Variables
Then I create a variable called Frequency and Monetary for each
customer in the data.
#frequency
data2 <- data2[order(data2$CustomerID)]
dataT <- cbind(dataT, Frequency = with(data2,
as.numeric(by(InvoiceNo, CustomerID, function(x) length(unique(x))))))
#Monetary value
dataT <- cbind(dataT, Monetary = with(data2,
as.numeric(by(Amount, CustomerID, function(x) sum(x)))))
head(dataT,5)
## InvoiceNo StockCode Description Quantity InvoiceDate
## 1: 541431 23166 MEDIUM CERAMIC TOP STORAGE JAR 74215 2011-01-18
## 2: 581180 23497 CLASSIC CHROME BICYCLE BELL 12 2011-12-07
## 3: 568172 23077 DOUGHNUT LIP GLOSS 120 2011-09-25
## 4: 577609 23112 PARISIENNE CURIO CABINET 2 2011-11-21
## 5: 543037 21908 CHOCOLATE THIS WAY METAL SIGN 12 2011-02-02
## UnitPrice CustomerID Country Amount Recency Frequency Monetary
## 1: 1.04 12346 United Kingdom 77183.6 325 days 1 77183.60
## 2: 1.45 12347 Iceland 17.4 2 days 7 4310.00
## 3: 1.25 12348 Finland 150.0 75 days 4 1797.24
## 4: 7.50 12349 Italy 15.0 18 days 1 1757.55
## 5: 2.10 12350 Norway 25.2 310 days 1 334.40
After finish the pre-processing, let’s visualize the RFM variables
with box plots:
par(mfrow = c(1,3))
#boxplot(dataT$Recency,dataT$Frequency,dataT$Monetary,names = c("Recency","Frequency","Monetary"))
boxplot(dataT$Recency,xlab = "Recency")
boxplot(dataT$Frequency,xlab = "Frequency")
boxplot(dataT$Monetary,xlab = "Monetary")

It seems that there are extreme values in the RFM variables. So, I
remove these extreme values/outliers by keeping only the values that are
within the 99th percentile.
Rquan <- quantile(as.numeric(dataT$Recency),0.99)
Fquan <- quantile(dataT$Frequency,0.99)
Mquan <- quantile(dataT$Monetary,0.99)
RFM <- subset(dataT,dataT$Recency <= Rquan & dataT$Frequency <= Fquan & dataT$Monetary <= Mquan)
par(mfrow = c(1,3))
boxplot(RFM$Recency,xlab = "Recency")
boxplot(RFM$Frequency,xlab = "Frequency")
boxplot(RFM$Monetary,xlab = "Monetary")

Examining the Clusters
After we have the cluster,do we observe any difference between the
clusters which compute the average of RFM?
group_by(RFM_new,cluster) %>%
summarise(Avg_R = mean(Recency),Avg_F = mean(Frequency),Avg_M = mean(Monetary))
## # A tibble: 4 × 4
## cluster Avg_R Avg_F Avg_M
## <int> <drtn> <dbl> <dbl>
## 1 1 244.32232 days 1.51 445.
## 2 2 13.34177 days 17.6 8827.
## 3 3 49.24381 days 2.41 754.
## 4 4 27.71390 days 7.95 3070.
require(vtable)
## 載入需要的套件:vtable
## Warning: 套件 'vtable' 是用 R 版本 4.2.2 來建造的
## 載入需要的套件:kableExtra
##
## 載入套件:'kableExtra'
## 下列物件被遮斷自 'package:dplyr':
##
## group_rows
st(RFM_new, vars = c('Recency','Frequency','Monetary') ,group = 'cluster')
Summary Statistics
|
cluster
|
1
|
2
|
3
|
4
|
|
Variable
|
N
|
Mean
|
SD
|
N
|
Mean
|
SD
|
N
|
Mean
|
SD
|
N
|
Mean
|
SD
|
|
Recency
|
999
|
244.322
|
61.822
|
158
|
13.342
|
17.156
|
2342
|
49.244
|
37.551
|
734
|
27.714
|
32.752
|
|
Frequency
|
999
|
1.509
|
0.912
|
158
|
17.551
|
5.941
|
2342
|
2.413
|
1.365
|
734
|
7.946
|
2.905
|
|
Monetary
|
999
|
444.763
|
470.971
|
158
|
8826.556
|
3676.459
|
2342
|
754.051
|
594.682
|
734
|
3070.437
|
1392.713
|
For the Average of RFM for each cluster:
cluster1:The total customer is 999.The mean of Recency is 244.322.
The mean of Frequency is 1.509.
The mean of Monetary is 444.763.
cluster2:The total customer is 999.The mean of Recency is 13.342.
The mean of Frequency is 17.551.
The mean of Monetary is 8826.556.
cluster3:The total customer is 999.The mean of Recency is 49.244.
The mean of Frequency is 2.413.
The mean of Monetary is 754.051.
cluster4:The total customer is 999.The mean of Recency is 27.714.
The mean of Frequency is 7.946.
The mean of Monetary is 3070.437.
As cluster1, it had been long time that they did not come back
shopping, and they spent the least from all cluster, so we can label
them as “potentially lost customers”.
As cluster3, it had the most people in this cluster and their
Monetary is not very high,so it might be the low-to-medium consumer
groups. We can label them as “general customers”.
As cluster4, its monetary had three times larger than general
customers, and did shopping 7 times in a period which is also more often
than general customers. Hence, we can label them as “VIP
customers”.
As cluster2, it had the least days about coming back to shop, the
most times come to shop in a period, and the largest monetary by all the
clusters. Since there are a few people in this cluster, we can label
them as “high-level VIP customers”.
In my opinion, cluster4,vip customers, would be the most suitable
for us to run target marketing campaign, because they are regarded as a
medium-to-high consumer groups, they may be able to pay more money on
things that worth it but become hesitate by our service or other
things.
Therefore, we can do some strategy such as:
1.Promote a sense of superiority: giving them a special service like
“private car delivery service” when deliver thier items.We hope this
strategy can deliver their items immediately and let them be more
willingness to make the order on our website.
l) Based on the list of top selling products, I try to develop my
target marketing strategies. Therefore, I print out the top 5 most
selling products in terms of sales revenue for each cluster.
Customer_clus <- RFM_Scaled[,c(1,5)]
Retail_clus <- left_join(data2,Customer_clus, by = c("CustomerID" = "CustomerID"))
require(dplyr)
cluster_sale <- Retail_clus %>%
na.omit() %>%
select(InvoiceNo, StockCode, Description, Amount,CustomerID ,cluster) %>%
group_by(StockCode,Description ,cluster) %>%
summarise(Total_sales = sum(Amount),.groups = 'drop')
cluster1 -> potentially lost customers
subset(cluster_sale,cluster =="1") %>%
arrange(desc(Total_sales))%>%
head(5)
## # A tibble: 5 × 4
## StockCode Description cluster Total_sales
## <chr> <chr> <int> <dbl>
## 1 22423 REGENCY CAKESTAND 3 TIER 1 8206.
## 2 85123A WHITE HANGING HEART T-LIGHT HOLDER 1 5702.
## 3 47566 PARTY BUNTING 1 5307.
## 4 POST POSTAGE 1 4330
## 5 22328 ROUND SNACK BOXES SET OF 4 FRUITS 1 4169.
cluster2 -> high-level VIP customers
subset(cluster_sale,cluster =="2") %>%
arrange(desc(Total_sales))%>%
head(5)
## # A tibble: 5 × 4
## StockCode Description cluster Total_sales
## <chr> <chr> <int> <dbl>
## 1 22423 REGENCY CAKESTAND 3 TIER 2 33741.
## 2 M Manual 2 17674.
## 3 POST POSTAGE 2 14664.
## 4 85099B JUMBO BAG RED RETROSPOT 2 14029.
## 5 47566 PARTY BUNTING 2 12803.
cluster3 -> general customers
subset(cluster_sale,cluster =="3") %>%
arrange(desc(Total_sales))%>%
head(5)
## # A tibble: 5 × 4
## StockCode Description cluster Total_sales
## <chr> <chr> <int> <dbl>
## 1 POST POSTAGE 3 19613.
## 2 22423 REGENCY CAKESTAND 3 TIER 3 17197.
## 3 85123A WHITE HANGING HEART T-LIGHT HOLDER 3 14969.
## 4 85099B JUMBO BAG RED RETROSPOT 3 13510.
## 5 84879 ASSORTED COLOUR BIRD ORNAMENT 3 13206.
cluster4 -> VIP customers
subset(cluster_sale,cluster =="4") %>%
arrange(desc(Total_sales))%>%
head(5)
## # A tibble: 5 × 4
## StockCode Description cluster Total_sales
## <chr> <chr> <int> <dbl>
## 1 POST POSTAGE 4 27033.
## 2 22423 REGENCY CAKESTAND 3 TIER 4 27026.
## 3 85123A WHITE HANGING HEART T-LIGHT HOLDER 4 21998.
## 4 47566 PARTY BUNTING 4 21508.
## 5 85099B JUMBO BAG RED RETROSPOT 4 17222.
seasonality
I am interested in finding out if there is any seasonality
(variation by month) in purchase frequency of the 5 top/best sellers. As
a consequence, I compute purchase frequency of the top 5 selling
products by month and visualize it using ggplot2.
#View the top5 sellers firstly.
onlineRetail <- Retail_clus %>%
select(InvoiceNo, StockCode, Description, InvoiceDate, Amount,CustomerID)
onlineRetail %>%
group_by(StockCode,Description) %>%
summarise(Total_sales = sum(Amount),.groups = 'drop') %>%
arrange(desc(Total_sales)) %>%
head(6)
## # A tibble: 6 × 3
## StockCode Description Total_sales
## <chr> <chr> <dbl>
## 1 DOT DOTCOM POSTAGE 206249.
## 2 22423 REGENCY CAKESTAND 3 TIER 174485.
## 3 23843 PAPER CRAFT , LITTLE BIRDIE 168470.
## 4 85123A WHITE HANGING HEART T-LIGHT HOLDER 104340.
## 5 47566 PARTY BUNTING 99504.
## 6 85099B JUMBO BAG RED RETROSPOT 94340.
Retail2 <- subset(onlineRetail, Description%in%c("REGENCY CAKESTAND 3 TIER","PAPER CRAFT , LITTLE BIRDIE","WHITE HANGING HEART T-LIGHT HOLDER","PARTY BUNTING","JUMBO BAG RED RETROSPOT"), select = c(InvoiceNo, StockCode, Description, InvoiceDate, Amount,CustomerID))
Retail2$Invoice_month<-month(Retail2$InvoiceDate)
Retail2$Decription<-as.character(Retail2$Description)
ggplot(Retail2, aes(x=Invoice_month, y= length(InvoiceNo)))+ facet_wrap(~Description, ncol=2) +
geom_bar(stat="identity") +
labs(title = "Frequency by month", x = "Month", y = "Purchase Frequency")

According to the bar chart above, we do observe some seasonality.
Take ‘PARTY BUNTING’ as an example, we can obviously find that there is
a peak in May, which is in spring, and sold not very well in
winter.
On the previous part, I assume that the clusters are 4, and now I am
going to check whether k = 4 is a reasonable decision using the
Elbow/Silhouette method:
factoextra::fviz_nbclust(RFM.mat, kmeans, method = "wss") +
geom_vline(xintercept = 3, linetype = 2) +
labs(subtitle = "Elbow method")

factoextra::fviz_nbclust(RFM.mat, kmeans, method = "silhouette") +
labs(subtitle = "Silhouette method")

According to the methods we utilized and the rule of thumb for them,
we should say that it is not suitable for this data to divided to 4
clusters.
Instead, Due to the methods, k=3 will be a more reasonable decision
for the number of clusters.
That’s all my observation. See you next time!