Hi there! Welcome to my data visualization land. Today I will use the data set which is very classic containing all the transactions occurring between 12/01/2010 and 12/09/2011 for a UK-based and registered non-store online retail.

let’s see what we can find in it!

Import and Examine the Data

require(data.table)
## 載入需要的套件:data.table
data1 <- data.table::fread("C:/R-language/PBA/onlineRetail.csv")
require(tidyverse)
## 載入需要的套件:tidyverse
## Warning: 套件 'tidyverse' 是用 R 版本 4.2.2 來建造的
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2
## Warning: 套件 'ggplot2' 是用 R 版本 4.2.2 來建造的
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()   masks data.table::between()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::first()     masks data.table::first()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::last()      masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
glimpse(data1)
## Rows: 541,909
## Columns: 8
## $ InvoiceNo   <chr> "536365", "536365", "536365", "536365", "536365", "536365"…
## $ StockCode   <chr> "85123A", "71053", "84406B", "84029G", "84029E", "22752", …
## $ Description <chr> "WHITE HANGING HEART T-LIGHT HOLDER", "WHITE METAL LANTERN…
## $ Quantity    <int> 6, 6, 8, 6, 6, 2, 6, 6, 6, 32, 6, 6, 8, 6, 6, 3, 2, 3, 3, …
## $ InvoiceDate <chr> "12/1/10 8:26", "12/1/10 8:26", "12/1/10 8:26", "12/1/10 8…
## $ UnitPrice   <dbl> 2.55, 3.39, 2.75, 3.39, 3.39, 7.65, 4.25, 1.85, 1.85, 1.69…
## $ CustomerID  <int> 17850, 17850, 17850, 17850, 17850, 17850, 17850, 17850, 17…
## $ Country     <chr> "United Kingdom", "United Kingdom", "United Kingdom", "Uni…
summary(data1)
##   InvoiceNo          StockCode         Description           Quantity        
##  Length:541909      Length:541909      Length:541909      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     1.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3.00  
##                                                           Mean   :     9.55  
##                                                           3rd Qu.:    10.00  
##                                                           Max.   : 80995.00  
##                                                                              
##  InvoiceDate          UnitPrice           CustomerID       Country         
##  Length:541909      Min.   :-11062.06   Min.   :12346    Length:541909     
##  Class :character   1st Qu.:     1.25   1st Qu.:13953    Class :character  
##  Mode  :character   Median :     2.08   Median :15152    Mode  :character  
##                     Mean   :     4.61   Mean   :15288                      
##                     3rd Qu.:     4.13   3rd Qu.:16791                      
##                     Max.   : 38970.00   Max.   :18287                      
##                                         NA's   :135080
head(data1,10)
##     InvoiceNo StockCode                         Description Quantity
##  1:    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6
##  2:    536365     71053                 WHITE METAL LANTERN        6
##  3:    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8
##  4:    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6
##  5:    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6
##  6:    536365     22752        SET 7 BABUSHKA NESTING BOXES        2
##  7:    536365     21730   GLASS STAR FROSTED T-LIGHT HOLDER        6
##  8:    536366     22633              HAND WARMER UNION JACK        6
##  9:    536366     22632           HAND WARMER RED POLKA DOT        6
## 10:    536367     84879       ASSORTED COLOUR BIRD ORNAMENT       32
##      InvoiceDate UnitPrice CustomerID        Country
##  1: 12/1/10 8:26      2.55      17850 United Kingdom
##  2: 12/1/10 8:26      3.39      17850 United Kingdom
##  3: 12/1/10 8:26      2.75      17850 United Kingdom
##  4: 12/1/10 8:26      3.39      17850 United Kingdom
##  5: 12/1/10 8:26      3.39      17850 United Kingdom
##  6: 12/1/10 8:26      7.65      17850 United Kingdom
##  7: 12/1/10 8:26      4.25      17850 United Kingdom
##  8: 12/1/10 8:28      1.85      17850 United Kingdom
##  9: 12/1/10 8:28      1.85      17850 United Kingdom
## 10: 12/1/10 8:34      1.69      13047 United Kingdom
cat("the unique number of customers:",length(unique(data1$CustomerID)))
## the unique number of customers: 4373
cat("\nthe unique number of products purchased:",length(unique(data1$StockCode)))
## 
## the unique number of products purchased: 4070
cat("\nthe unique number of transactions:",length(unique(data1$InvoiceNo)))
## 
## the unique number of transactions: 25900
#Drop the value that Quantity or Unit Price are lower than 0.
data1 <- data1[data1$Quantity>=0];data1 <- data1[data1$UnitPrice>=0]
data1$CustomerID <- as.character(data1$CustomerID)
#Drop the InvoiceNo. which contains the letter "C"(but including NA value).
data2 <- subset(data1,grepl("C",data1$InvoiceNo) != TRUE)

RFM Variables

Then I create a variable called Frequency and Monetary for each customer in the data.

#frequency
data2 <- data2[order(data2$CustomerID)]
dataT <- cbind(dataT, Frequency = with(data2,
  as.numeric(by(InvoiceNo, CustomerID, function(x) length(unique(x))))))
#Monetary value
dataT <- cbind(dataT, Monetary = with(data2,
  as.numeric(by(Amount, CustomerID, function(x) sum(x)))))
head(dataT,5)
##    InvoiceNo StockCode                    Description Quantity InvoiceDate
## 1:    541431     23166 MEDIUM CERAMIC TOP STORAGE JAR    74215  2011-01-18
## 2:    581180     23497    CLASSIC CHROME BICYCLE BELL       12  2011-12-07
## 3:    568172     23077             DOUGHNUT LIP GLOSS      120  2011-09-25
## 4:    577609     23112       PARISIENNE CURIO CABINET        2  2011-11-21
## 5:    543037     21908  CHOCOLATE THIS WAY METAL SIGN       12  2011-02-02
##    UnitPrice CustomerID        Country  Amount  Recency Frequency Monetary
## 1:      1.04      12346 United Kingdom 77183.6 325 days         1 77183.60
## 2:      1.45      12347        Iceland    17.4   2 days         7  4310.00
## 3:      1.25      12348        Finland   150.0  75 days         4  1797.24
## 4:      7.50      12349          Italy    15.0  18 days         1  1757.55
## 5:      2.10      12350         Norway    25.2 310 days         1   334.40

After finish the pre-processing, let’s visualize the RFM variables with box plots:

par(mfrow = c(1,3))
#boxplot(dataT$Recency,dataT$Frequency,dataT$Monetary,names = c("Recency","Frequency","Monetary"))
boxplot(dataT$Recency,xlab = "Recency")
boxplot(dataT$Frequency,xlab = "Frequency")
boxplot(dataT$Monetary,xlab = "Monetary")

It seems that there are extreme values in the RFM variables. So, I remove these extreme values/outliers by keeping only the values that are within the 99th percentile.

Rquan <- quantile(as.numeric(dataT$Recency),0.99)
Fquan <- quantile(dataT$Frequency,0.99)
Mquan <- quantile(dataT$Monetary,0.99)
RFM <- subset(dataT,dataT$Recency <= Rquan & dataT$Frequency <= Fquan & dataT$Monetary <= Mquan)
par(mfrow = c(1,3))
boxplot(RFM$Recency,xlab = "Recency")
boxplot(RFM$Frequency,xlab = "Frequency")
boxplot(RFM$Monetary,xlab = "Monetary")

Now I’m gonna scaling the Variables.

To prep the data for clustering, we will need to scale the features/variables. First, I create another data.table object called RFM_Scaled which contains the CustomerID and the standardized RFM variables.

RFM_Scaled <- RFM[,c(7,10,11,12)]
RFM_Scaled$Recency <- scale(RFM_Scaled$Recency,center = TRUE, scale = TRUE)
RFM_Scaled$Frequency <- scale(RFM_Scaled$Frequency,center = TRUE, scale = TRUE)
RFM_Scaled$Monetary <- scale(RFM_Scaled$Monetary,center = TRUE, scale = TRUE)
head(RFM_Scaled,5)
##    CustomerID    Recency   Frequency   Monetary
## 1:      12347 -0.9141340  0.81050527  1.4495521
## 2:      12348 -0.1576197  0.06826784  0.2047077
## 3:      12349 -0.7483226 -0.67396958  0.1850449
## 4:      12350  2.2777345 -0.67396958 -0.5199967
## 5:      12352 -0.5617849  1.05791774  0.5558537

Second, I convert RFM_Scaled to a matrix. (also not forget to remove the CustomerID from the matrix.)

RFM.mat <- as.matrix(RFM_Scaled[,-1])

Third, I set seed at 2021 and run k-means clustering (set k = 4).

set.seed(2021) # Set seed for reproducibility
km.out <- kmeans(RFM.mat, centers = 4); km.out
## K-means clustering with 4 clusters of sizes 999, 158, 2342, 734
## 
## Cluster means:
##      Recency  Frequency   Monetary
## 1  1.5971030 -0.5481582 -0.4653216
## 2 -0.7965968  3.4208635  3.6870952
## 3 -0.4245365 -0.3242961 -0.3120980
## 4 -0.6476555  1.0444348  0.8354614
## 
## Clustering vector:
##    [1] 4 3 3 1 4 1 1 1 3 4 3 4 3 1 4 3 3 1 3 4 3 3 1 3 3 1 3 3 4 3 1 3 1 4 3 3 3
##   [38] 3 4 3 3 3 1 1 3 3 3 3 4 2 1 3 3 1 4 3 3 3 3 4 1 3 1 3 2 4 3 2 4 2 3 4 3 2
##   [75] 3 1 3 4 3 3 1 3 4 1 4 3 3 3 4 3 4 3 3 3 3 4 3 1 2 4 4 2 3 2 2 3 3 4 4 4 4
##  [112] 3 1 4 3 3 1 4 3 3 4 1 4 3 1 3 3 1 3 3 3 1 1 3 4 4 3 4 3 3 4 4 3 3 4 3 3 3
##  [149] 3 3 2 3 4 2 3 3 3 3 1 1 3 1 3 4 3 2 3 1 3 1 4 1 1 3 2 3 3 1 1 3 3 3 3 1 3
##  [186] 1 2 4 1 3 3 3 3 4 1 3 3 4 3 4 4 4 4 1 3 3 3 3 3 4 3 3 4 1 3 3 3 3 4 3 2 1
##  [223] 1 3 1 4 4 3 3 3 3 3 4 3 1 3 3 3 4 3 3 3 4 1 3 3 1 1 3 3 4 3 3 3 3 4 4 1 3
##  [260] 4 3 4 1 3 3 3 2 3 3 2 2 4 4 4 3 1 3 3 3 3 3 4 3 3 3 4 4 1 4 2 3 4 3 4 3 3
##  [297] 1 3 3 2 4 3 3 3 4 4 3 3 2 1 1 1 1 1 3 3 3 4 4 3 3 1 1 3 2 3 3 3 3 1 4 1 1
##  [334] 3 3 3 4 3 4 4 3 1 3 3 1 1 1 3 3 1 4 1 3 1 1 3 3 3 3 1 1 3 1 3 3 3 4 3 3 1
##  [371] 4 1 3 1 1 3 1 3 2 3 2 3 4 3 1 3 3 3 1 4 3 4 1 3 3 3 1 4 1 3 1 1 3 3 4 1 3
##  [408] 1 3 3 3 3 3 1 3 1 3 3 1 2 1 3 4 1 4 4 3 4 1 3 3 4 3 1 3 3 3 4 1 3 3 4 3 3
##  [445] 3 2 3 3 3 1 3 4 4 3 3 3 3 4 1 4 3 4 3 3 1 3 3 1 1 1 4 3 1 3 3 3 1 2 3 3 3
##  [482] 3 3 3 1 3 2 1 3 2 1 3 1 4 4 4 4 3 3 2 4 3 3 4 3 3 3 3 1 3 3 3 3 3 1 1 3 3
##  [519] 4 3 3 2 1 3 3 1 1 3 1 3 3 3 3 2 1 1 3 3 2 3 1 4 3 1 2 3 3 4 4 3 4 3 1 2 3
##  [556] 3 3 3 3 4 2 3 3 4 3 1 1 3 4 4 3 3 3 1 1 3 1 4 4 3 4 3 4 3 1 3 3 3 4 3 1 3
##  [593] 3 3 3 3 3 1 3 3 1 3 3 3 3 3 3 3 3 3 4 3 4 4 1 3 1 3 3 3 3 3 3 4 3 3 3 3 4
##  [630] 3 3 3 3 1 3 3 3 1 1 3 4 1 1 3 1 4 4 3 1 3 3 1 3 3 1 3 3 4 3 3 3 1 3 3 3 3
##  [667] 3 1 1 4 4 2 4 2 1 3 3 3 3 3 3 3 3 3 3 1 4 1 3 1 1 3 3 3 1 1 3 1 1 4 3 3 3
##  [704] 3 3 3 3 4 4 3 2 3 3 3 3 2 3 4 1 3 3 3 4 3 1 1 2 1 1 3 1 3 3 3 3 3 3 3 3 3
##  [741] 1 1 3 4 3 3 3 3 1 1 3 4 3 3 3 4 1 4 1 3 3 3 1 4 1 3 3 1 3 3 3 3 3 3 3 3 3
##  [778] 2 3 1 3 3 3 4 3 3 3 3 3 3 4 3 4 1 3 3 4 3 4 4 1 1 3 4 2 3 3 1 3 1 3 3 4 3
##  [815] 1 1 1 4 1 3 1 3 1 1 3 3 2 3 4 3 1 1 4 3 1 3 3 1 3 3 3 4 1 4 1 4 3 1 3 3 3
##  [852] 3 3 3 3 4 3 3 4 3 3 4 3 1 3 1 2 4 4 3 3 3 3 4 1 4 4 3 3 3 1 3 3 3 3 3 1 3
##  [889] 1 4 1 3 3 2 3 3 1 1 1 3 4 3 4 3 3 3 3 3 3 2 1 3 3 3 3 3 4 3 3 1 3 1 3 3 1
##  [926] 3 2 4 4 4 3 3 3 3 3 3 1 3 4 3 3 1 4 3 3 4 3 1 3 4 1 3 1 3 1 2 3 3 1 3 1 1
##  [963] 4 3 1 1 4 1 3 3 3 4 3 1 4 1 4 3 1 1 1 2 3 1 1 1 1 4 1 3 3 3 3 1 4 3 3 3 3
## [1000] 1 3 4 3 1 3 1 3 4 4 3 1 3 3 1 3 1 1 4 4 4 3 3 3 1 3 3 4 3 3 3 3 3 1 3 4 1
## [1037] 3 3 3 3 3 1 3 4 1 4 3 3 1 3 1 3 1 3 3 3 4 3 3 1 3 3 3 3 1 3 3 4 3 1 3 1 3
## [1074] 1 4 3 3 3 3 3 3 3 3 2 1 3 1 3 4 1 3 3 3 3 4 4 2 3 1 3 3 3 3 4 2 3 4 3 1 3
## [1111] 1 3 3 4 3 1 3 4 3 1 3 1 4 3 3 4 3 3 3 3 1 3 4 3 3 3 3 1 3 3 1 3 1 1 1 3 3
## [1148] 3 4 3 3 1 4 3 1 1 4 3 1 3 3 3 3 1 1 2 4 3 1 3 4 1 3 3 3 3 4 3 2 3 4 3 1 3
## [1185] 3 4 4 3 4 1 3 3 4 4 3 1 1 3 3 3 4 3 3 1 1 3 3 3 3 4 2 4 3 3 1 1 3 3 4 1 3
## [1222] 3 1 3 1 4 3 2 3 3 3 3 2 2 1 2 2 3 1 3 1 4 3 3 3 3 3 3 3 1 1 4 2 3 3 1 4 3
## [1259] 3 4 3 4 3 1 1 4 3 3 4 3 4 3 1 3 1 4 3 3 3 3 3 3 1 4 3 4 3 3 3 3 3 4 4 3 1
## [1296] 1 3 3 1 1 3 3 4 1 1 3 3 1 3 3 3 3 3 1 3 4 3 2 3 1 4 4 4 3 2 1 3 3 1 3 3 3
## [1333] 1 3 4 4 4 3 3 4 3 4 3 3 1 4 1 3 3 4 4 3 1 4 3 3 4 3 1 4 3 1 1 4 1 3 1 1 3
## [1370] 4 1 4 2 3 3 4 1 3 1 1 1 3 3 3 4 1 4 3 3 4 3 1 3 4 4 4 3 3 3 4 3 3 1 3 3 4
## [1407] 4 1 4 3 4 3 1 3 3 1 1 4 3 3 3 4 3 3 1 1 1 4 3 4 3 3 1 3 1 1 1 1 3 1 3 3 3
## [1444] 3 3 4 2 1 1 1 1 1 3 3 3 3 3 1 3 4 3 4 1 4 3 4 1 4 3 4 3 4 4 4 4 1 3 1 4 2
## [1481] 3 3 3 3 3 4 1 3 4 3 3 1 4 4 3 1 1 1 4 3 1 3 3 3 3 3 1 3 3 1 4 1 1 3 3 4 3
## [1518] 1 4 3 3 4 3 3 3 1 3 1 3 1 4 1 3 3 3 3 1 1 4 3 3 1 1 3 3 3 1 4 4 3 4 4 4 3
## [1555] 3 1 3 3 4 3 3 3 3 3 4 4 3 3 4 1 4 2 3 1 3 3 3 3 4 1 4 3 4 4 1 1 3 3 1 3 3
## [1592] 3 3 2 3 2 4 3 3 3 1 4 4 3 3 3 3 3 1 3 1 4 3 3 3 3 4 3 3 1 3 1 2 3 3 1 3 1
## [1629] 1 3 1 3 3 1 4 3 3 3 1 1 3 3 4 4 3 3 1 3 1 1 3 3 3 4 3 1 3 3 4 3 3 3 3 3 3
## [1666] 2 1 1 1 3 3 4 3 1 3 3 2 1 3 4 3 1 4 1 3 1 3 4 4 3 4 1 3 4 3 4 3 4 3 3 3 3
## [1703] 3 1 3 3 1 4 3 4 2 3 4 4 1 4 4 3 1 3 3 3 3 4 3 3 3 3 1 3 1 3 3 4 3 2 1 3 3
## [1740] 4 3 1 3 1 3 3 3 3 3 3 3 4 1 2 3 3 4 3 1 3 3 3 4 4 3 1 1 3 3 3 1 1 3 4 1 3
## [1777] 3 4 3 3 1 4 4 4 1 2 1 3 4 3 4 3 3 3 3 3 3 3 3 3 2 1 4 3 1 3 1 3 4 3 3 3 3
## [1814] 3 1 3 1 1 1 1 3 3 2 1 3 3 4 3 4 3 1 4 3 3 3 3 3 3 3 1 3 3 1 3 3 3 1 3 1 2
## [1851] 3 4 3 3 3 3 3 2 3 3 1 3 3 3 4 1 3 1 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 3 1 3 3
## [1888] 1 3 3 1 2 3 1 3 3 3 3 1 3 3 1 4 3 2 4 3 3 3 3 3 3 1 4 4 3 3 3 1 1 3 2 1 2
## [1925] 1 3 1 3 3 1 3 3 1 4 4 3 3 1 4 3 3 3 1 3 3 3 3 1 1 3 1 1 3 4 1 3 1 4 3 3 3
## [1962] 3 1 3 3 1 4 3 1 3 1 3 3 4 4 3 3 3 3 1 3 3 3 3 3 2 3 3 4 1 3 3 4 1 3 1 4 3
## [1999] 1 2 3 1 1 3 3 4 4 3 3 3 3 3 2 1 3 3 1 3 1 3 1 3 1 4 1 3 1 3 3 3 2 3 3 2 3
## [2036] 3 3 3 3 4 1 3 3 3 1 3 1 1 4 3 3 4 1 4 1 1 1 1 1 1 3 4 1 3 1 4 1 3 3 3 3 4
## [2073] 3 3 3 1 3 2 2 3 3 3 3 1 1 1 4 3 1 3 3 1 1 3 3 4 4 3 3 3 3 3 1 1 4 3 3 4 3
## [2110] 2 4 1 4 3 4 4 3 4 1 4 3 3 1 3 3 1 3 1 3 1 3 3 3 3 1 3 3 3 1 1 1 3 3 3 3 3
## [2147] 3 3 1 1 3 1 4 1 3 4 2 3 1 3 4 3 3 3 4 3 4 3 3 4 3 1 4 1 3 1 1 1 3 3 3 1 3
## [2184] 1 3 3 3 1 1 3 3 1 4 3 1 3 1 4 3 3 3 1 3 1 3 3 3 3 3 1 3 3 3 1 3 3 1 4 3 1
## [2221] 1 1 3 3 3 4 1 1 1 3 1 3 3 1 2 3 3 3 3 3 3 4 3 3 3 2 1 4 3 3 3 4 3 4 3 3 2
## [2258] 3 2 1 3 4 3 1 1 1 3 1 2 4 3 1 4 3 4 3 3 3 3 3 4 1 4 3 3 3 3 1 1 3 3 3 1 4
## [2295] 3 4 3 3 3 1 3 1 2 3 3 3 1 3 3 3 3 3 4 3 4 3 1 3 1 1 3 4 3 3 1 3 1 3 3 3 3
## [2332] 3 3 3 2 4 3 3 3 4 1 3 1 3 4 3 2 3 3 3 3 4 3 3 3 3 4 3 4 3 3 3 3 2 3 1 1 3
## [2369] 1 1 1 3 3 3 3 1 3 3 1 3 4 1 3 3 1 3 1 3 1 4 4 4 3 4 3 3 3 3 3 3 4 3 3 1 3
## [2406] 2 1 4 3 1 3 1 3 1 3 3 4 1 3 3 3 3 1 4 3 3 4 1 1 3 4 1 3 1 1 3 4 2 3 3 3 3
## [2443] 3 3 3 3 4 4 1 3 4 3 3 3 1 3 3 4 3 3 3 3 1 3 1 1 3 3 3 4 4 3 1 3 3 3 1 4 3
## [2480] 3 3 3 3 3 4 2 3 3 1 3 3 4 3 1 3 3 3 1 3 3 1 3 4 3 3 3 1 3 3 3 3 3 3 3 1 3
## [2517] 4 3 3 1 3 4 3 2 3 3 4 4 3 3 4 3 3 4 1 3 3 4 3 1 1 3 3 3 3 1 1 1 1 1 3 1 3
## [2554] 3 4 3 4 3 1 4 1 4 1 1 3 1 1 1 3 4 3 1 3 3 2 1 3 1 1 3 3 3 3 3 3 4 4 3 3 3
## [2591] 3 3 3 1 4 1 4 3 1 3 3 3 4 3 4 3 3 4 3 3 3 4 3 3 4 2 4 3 1 3 3 3 1 4 3 3 3
## [2628] 4 3 3 4 3 1 3 3 3 3 3 1 3 2 1 3 1 3 3 3 3 3 3 3 1 3 3 1 1 3 4 3 3 1 1 3 3
## [2665] 3 1 1 4 1 3 3 4 3 1 3 3 1 3 3 3 1 1 4 3 3 3 4 3 1 3 4 3 1 3 3 1 1 4 1 1 3
## [2702] 3 3 3 3 1 1 4 3 3 4 3 1 3 3 3 3 1 4 1 4 1 3 1 3 4 1 3 4 3 1 3 2 1 1 3 2 3
## [2739] 4 3 3 4 3 3 3 3 3 2 3 4 3 3 3 4 2 3 3 3 4 1 3 3 3 1 3 3 3 3 3 3 4 3 3 3 1
## [2776] 3 1 1 4 3 1 3 3 1 1 3 1 3 3 3 3 3 3 4 4 3 1 3 1 3 3 1 3 4 1 1 4 3 3 4 3 3
## [2813] 1 3 3 1 1 3 3 3 1 3 3 1 3 4 3 1 3 3 4 3 1 3 3 3 3 1 4 3 3 1 1 3 3 1 3 3 4
## [2850] 4 3 4 3 1 1 3 3 3 3 1 3 3 4 1 3 1 3 2 3 1 3 3 4 4 3 3 3 3 4 3 3 3 3 3 3 3
## [2887] 3 1 1 3 3 3 3 3 3 1 3 1 4 3 3 1 3 3 4 1 1 3 1 3 4 4 3 3 3 3 3 3 3 3 3 1 1
## [2924] 4 1 3 3 1 3 3 4 3 4 3 3 3 3 3 1 1 1 3 3 4 3 1 4 3 1 3 3 3 4 3 3 3 1 1 4 3
## [2961] 4 3 3 3 3 1 1 3 3 3 3 1 3 3 4 1 1 3 3 3 3 3 1 3 3 3 3 1 3 3 1 2 2 1 3 3 1
## [2998] 4 4 3 3 3 3 3 1 1 4 3 3 1 4 3 3 4 3 2 3 3 3 1 3 1 3 4 3 3 3 3 1 3 3 1 1 3
## [3035] 3 4 3 3 3 3 3 1 3 3 3 3 4 4 1 1 3 3 1 3 1 4 1 3 3 1 3 1 2 3 4 3 4 1 3 4 3
## [3072] 3 3 1 1 1 3 3 4 3 3 4 2 3 1 1 3 1 3 3 3 3 4 3 2 3 3 4 1 3 4 3 3 4 3 3 3 1
## [3109] 3 3 1 1 4 4 2 3 3 4 4 3 4 2 1 3 1 4 4 3 1 4 3 1 3 1 3 2 3 3 3 4 3 1 1 3 4
## [3146] 4 2 2 3 3 3 3 3 3 1 1 3 3 3 3 1 4 1 1 4 3 1 3 4 3 3 4 3 3 3 2 3 4 3 3 3 4
## [3183] 4 1 3 4 1 3 3 1 3 3 1 3 3 3 3 1 3 1 4 2 3 4 4 3 1 3 3 3 3 1 3 1 3 1 3 3 4
## [3220] 1 4 3 2 3 3 3 1 1 3 3 3 3 1 3 3 3 3 3 4 3 4 3 3 3 4 3 3 1 3 3 3 1 3 3 1 3
## [3257] 1 3 3 3 3 3 4 4 3 3 3 4 4 1 3 3 3 1 3 4 3 2 3 3 4 4 3 2 3 3 3 3 4 3 4 1 2
## [3294] 1 3 4 3 3 3 3 4 1 3 1 3 3 3 1 1 4 1 3 3 3 1 3 4 1 3 4 4 4 3 4 3 3 1 3 4 1
## [3331] 3 3 4 3 3 3 3 3 3 3 3 2 3 3 3 3 3 1 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 1 3 3 4
## [3368] 4 1 3 3 3 3 4 1 4 1 4 3 2 4 3 4 3 3 3 3 3 3 3 3 3 3 3 4 3 4 4 3 1 3 3 4 3
## [3405] 3 1 1 2 3 1 3 4 3 1 1 3 1 3 1 1 3 3 1 4 4 3 4 1 2 1 1 3 3 3 3 1 3 3 3 3 3
## [3442] 3 4 4 3 4 1 3 3 1 3 4 1 3 1 1 3 3 4 3 1 4 3 3 4 4 1 3 3 3 4 1 3 3 4 1 4 3
## [3479] 1 4 3 4 3 3 1 1 3 4 3 1 4 3 3 1 3 3 3 4 1 3 2 3 1 3 3 3 1 3 3 3 3 3 1 3 1
## [3516] 3 3 3 3 3 3 1 1 3 3 4 3 4 4 3 3 1 4 3 3 3 3 3 3 1 2 3 3 3 3 4 3 3 3 4 3 1
## [3553] 3 3 3 3 1 3 4 3 2 1 1 3 3 4 3 3 1 3 3 3 3 1 3 1 3 3 3 1 4 4 1 4 3 3 4 3 1
## [3590] 1 3 2 3 3 3 3 3 4 3 3 1 1 1 4 3 3 3 4 3 3 1 1 1 3 3 4 3 3 4 3 4 3 4 3 3 4
## [3627] 3 2 3 3 1 3 3 4 3 1 1 4 1 3 1 3 4 4 3 1 1 1 3 3 3 3 3 3 3 1 3 3 3 3 1 3 3
## [3664] 3 3 1 3 4 4 1 3 1 3 1 3 1 3 3 4 1 3 1 1 4 3 4 3 3 3 3 1 4 3 3 3 3 1 1 3 3
## [3701] 4 4 1 3 1 1 3 3 3 1 3 1 3 1 1 3 3 4 3 1 3 3 3 4 3 3 4 1 3 3 3 1 2 1 3 1 2
## [3738] 1 3 3 3 3 3 4 4 3 3 4 1 3 1 4 3 1 3 3 4 4 4 3 3 1 3 1 3 3 3 3 3 3 4 1 3 3
## [3775] 4 3 3 1 3 3 4 1 3 3 3 3 4 4 3 1 4 3 3 4 3 4 3 3 3 3 4 1 3 3 4 3 4 2 1 3 3
## [3812] 4 3 4 4 4 1 3 3 3 3 4 1 1 4 1 3 3 1 4 2 1 1 1 3 3 1 1 4 1 2 3 3 4 3 4 3 3
## [3849] 4 3 3 3 2 3 3 3 4 3 4 1 3 1 4 1 3 2 4 3 3 3 1 3 1 3 3 4 1 3 3 3 1 3 3 1 3
## [3886] 3 3 3 3 1 4 3 3 1 3 3 4 4 3 3 3 4 3 3 3 3 4 3 4 3 1 3 3 3 3 3 4 4 3 1 3 3
## [3923] 4 1 3 1 3 4 1 1 4 1 4 3 2 3 3 3 4 1 3 1 1 3 1 1 1 3 3 3 3 3 4 1 1 1 3 1 4
## [3960] 3 3 1 3 1 1 3 3 3 1 3 1 3 3 3 3 2 3 1 4 3 3 3 3 3 3 3 3 3 4 1 4 3 3 1 3 1
## [3997] 3 3 4 1 1 3 3 1 3 3 4 3 1 1 1 3 3 4 3 1 3 1 1 3 3 1 3 1 1 3 1 4 3 3 1 3 3
## [4034] 4 1 1 1 3 3 3 3 3 3 3 1 3 3 1 3 3 3 3 3 3 2 3 3 4 3 1 1 3 4 1 3 3 4 1 3 4
## [4071] 3 3 1 4 1 1 4 4 4 1 4 3 3 3 3 1 1 4 3 2 3 4 1 3 4 3 3 3 3 1 1 2 3 3 1 1 4
## [4108] 3 2 1 1 1 4 3 3 3 3 1 3 1 3 3 1 4 1 3 4 4 3 3 3 3 3 3 3 3 3 1 3 3 3 1 3 1
## [4145] 3 3 3 2 3 3 1 3 3 4 4 3 1 3 1 3 3 1 1 3 1 3 3 2 3 1 1 3 3 3 4 3 1 1 3 3 3
## [4182] 1 4 1 3 3 2 1 4 4 1 3 2 4 1 3 1 3 3 3 1 3 4 3 4 3 3 3 1 3 3 3 4 3 4 3 3 3
## [4219] 3 3 1 3 4 3 3 3 3 3 1 1 3 4 3
## 
## Within cluster sum of squares by cluster:
## [1] 514.7621 864.9687 824.8309 812.1645
##  (between_SS / total_SS =  76.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Then, I attach the cluster numbers (i.e., km.out$cluster) onto RFM_Scaled.

require(dplyr)
RFM_Scaled <- cbind(RFM_Scaled,km.out$cluster)
RFM_Scaled <- RFM_Scaled %>%
  rename(
    cluster = V2
  )
RFM_new <- cbind(RFM,RFM_Scaled$cluster)
RFM_new <- RFM_new %>%
  rename(
    cluster = V2
  )

Examining the Clusters

After we have the cluster,do we observe any difference between the clusters which compute the average of RFM?

group_by(RFM_new,cluster) %>%
  summarise(Avg_R = mean(Recency),Avg_F = mean(Frequency),Avg_M = mean(Monetary))
## # A tibble: 4 × 4
##   cluster Avg_R          Avg_F Avg_M
##     <int> <drtn>         <dbl> <dbl>
## 1       1 244.32232 days  1.51  445.
## 2       2  13.34177 days 17.6  8827.
## 3       3  49.24381 days  2.41  754.
## 4       4  27.71390 days  7.95 3070.
require(vtable)
## 載入需要的套件:vtable
## Warning: 套件 'vtable' 是用 R 版本 4.2.2 來建造的
## 載入需要的套件:kableExtra
## 
## 載入套件:'kableExtra'
## 下列物件被遮斷自 'package:dplyr':
## 
##     group_rows
st(RFM_new, vars = c('Recency','Frequency','Monetary') ,group = 'cluster')
Summary Statistics
cluster
1
2
3
4
Variable N Mean SD N Mean SD N Mean SD N Mean SD
Recency 999 244.322 61.822 158 13.342 17.156 2342 49.244 37.551 734 27.714 32.752
Frequency 999 1.509 0.912 158 17.551 5.941 2342 2.413 1.365 734 7.946 2.905
Monetary 999 444.763 470.971 158 8826.556 3676.459 2342 754.051 594.682 734 3070.437 1392.713

For the Average of RFM for each cluster:

cluster1:The total customer is 999.The mean of Recency is 244.322. The mean of Frequency is 1.509.

The mean of Monetary is 444.763.

cluster2:The total customer is 999.The mean of Recency is 13.342. The mean of Frequency is 17.551.

The mean of Monetary is 8826.556.

cluster3:The total customer is 999.The mean of Recency is 49.244. The mean of Frequency is 2.413.

The mean of Monetary is 754.051.

cluster4:The total customer is 999.The mean of Recency is 27.714. The mean of Frequency is 7.946.

The mean of Monetary is 3070.437.

As cluster1, it had been long time that they did not come back shopping, and they spent the least from all cluster, so we can label them as “potentially lost customers”.

As cluster3, it had the most people in this cluster and their Monetary is not very high,so it might be the low-to-medium consumer groups. We can label them as “general customers”.

As cluster4, its monetary had three times larger than general customers, and did shopping 7 times in a period which is also more often than general customers. Hence, we can label them as “VIP customers”.

As cluster2, it had the least days about coming back to shop, the most times come to shop in a period, and the largest monetary by all the clusters. Since there are a few people in this cluster, we can label them as “high-level VIP customers”.

In my opinion, cluster4,vip customers, would be the most suitable for us to run target marketing campaign, because they are regarded as a medium-to-high consumer groups, they may be able to pay more money on things that worth it but become hesitate by our service or other things.

Therefore, we can do some strategy such as:

1.Promote a sense of superiority: giving them a special service like “private car delivery service” when deliver thier items.We hope this strategy can deliver their items immediately and let them be more willingness to make the order on our website.

l) Based on the list of top selling products, I try to develop my target marketing strategies. Therefore, I print out the top 5 most selling products in terms of sales revenue for each cluster.

Customer_clus <- RFM_Scaled[,c(1,5)]
Retail_clus <- left_join(data2,Customer_clus, by = c("CustomerID" = "CustomerID"))
require(dplyr)
cluster_sale <- Retail_clus %>%
  na.omit() %>% 
  select(InvoiceNo, StockCode, Description, Amount,CustomerID ,cluster) %>%
  group_by(StockCode,Description ,cluster) %>%
  summarise(Total_sales = sum(Amount),.groups = 'drop')

cluster1 -> potentially lost customers

subset(cluster_sale,cluster =="1") %>%
  arrange(desc(Total_sales))%>%
  head(5)
## # A tibble: 5 × 4
##   StockCode Description                        cluster Total_sales
##   <chr>     <chr>                                <int>       <dbl>
## 1 22423     REGENCY CAKESTAND 3 TIER                 1       8206.
## 2 85123A    WHITE HANGING HEART T-LIGHT HOLDER       1       5702.
## 3 47566     PARTY BUNTING                            1       5307.
## 4 POST      POSTAGE                                  1       4330 
## 5 22328     ROUND SNACK BOXES SET OF 4 FRUITS        1       4169.

cluster2 -> high-level VIP customers

subset(cluster_sale,cluster =="2") %>%
  arrange(desc(Total_sales))%>%
  head(5)
## # A tibble: 5 × 4
##   StockCode Description              cluster Total_sales
##   <chr>     <chr>                      <int>       <dbl>
## 1 22423     REGENCY CAKESTAND 3 TIER       2      33741.
## 2 M         Manual                         2      17674.
## 3 POST      POSTAGE                        2      14664.
## 4 85099B    JUMBO BAG RED RETROSPOT        2      14029.
## 5 47566     PARTY BUNTING                  2      12803.

cluster3 -> general customers

subset(cluster_sale,cluster =="3") %>%
  arrange(desc(Total_sales))%>%
  head(5)
## # A tibble: 5 × 4
##   StockCode Description                        cluster Total_sales
##   <chr>     <chr>                                <int>       <dbl>
## 1 POST      POSTAGE                                  3      19613.
## 2 22423     REGENCY CAKESTAND 3 TIER                 3      17197.
## 3 85123A    WHITE HANGING HEART T-LIGHT HOLDER       3      14969.
## 4 85099B    JUMBO BAG RED RETROSPOT                  3      13510.
## 5 84879     ASSORTED COLOUR BIRD ORNAMENT            3      13206.

cluster4 -> VIP customers

subset(cluster_sale,cluster =="4") %>%
  arrange(desc(Total_sales))%>%
  head(5)
## # A tibble: 5 × 4
##   StockCode Description                        cluster Total_sales
##   <chr>     <chr>                                <int>       <dbl>
## 1 POST      POSTAGE                                  4      27033.
## 2 22423     REGENCY CAKESTAND 3 TIER                 4      27026.
## 3 85123A    WHITE HANGING HEART T-LIGHT HOLDER       4      21998.
## 4 47566     PARTY BUNTING                            4      21508.
## 5 85099B    JUMBO BAG RED RETROSPOT                  4      17222.

seasonality

I am interested in finding out if there is any seasonality (variation by month) in purchase frequency of the 5 top/best sellers. As a consequence, I compute purchase frequency of the top 5 selling products by month and visualize it using ggplot2.

#View the top5 sellers firstly.
onlineRetail <- Retail_clus %>%
  select(InvoiceNo, StockCode, Description, InvoiceDate, Amount,CustomerID)
onlineRetail %>%
  group_by(StockCode,Description) %>%
  summarise(Total_sales = sum(Amount),.groups = 'drop') %>%
  arrange(desc(Total_sales)) %>%
  head(6)
## # A tibble: 6 × 3
##   StockCode Description                        Total_sales
##   <chr>     <chr>                                    <dbl>
## 1 DOT       DOTCOM POSTAGE                         206249.
## 2 22423     REGENCY CAKESTAND 3 TIER               174485.
## 3 23843     PAPER CRAFT , LITTLE BIRDIE            168470.
## 4 85123A    WHITE HANGING HEART T-LIGHT HOLDER     104340.
## 5 47566     PARTY BUNTING                           99504.
## 6 85099B    JUMBO BAG RED RETROSPOT                 94340.
Retail2 <- subset(onlineRetail, Description%in%c("REGENCY CAKESTAND 3 TIER","PAPER CRAFT , LITTLE BIRDIE","WHITE HANGING HEART T-LIGHT HOLDER","PARTY BUNTING","JUMBO BAG RED RETROSPOT"), select = c(InvoiceNo, StockCode, Description, InvoiceDate, Amount,CustomerID))
Retail2$Invoice_month<-month(Retail2$InvoiceDate)
Retail2$Decription<-as.character(Retail2$Description)

ggplot(Retail2, aes(x=Invoice_month, y= length(InvoiceNo)))+ facet_wrap(~Description, ncol=2) + 
  geom_bar(stat="identity") + 
  labs(title = "Frequency by month", x = "Month", y = "Purchase Frequency")

According to the bar chart above, we do observe some seasonality. Take ‘PARTY BUNTING’ as an example, we can obviously find that there is a peak in May, which is in spring, and sold not very well in winter.

On the previous part, I assume that the clusters are 4, and now I am going to check whether k = 4 is a reasonable decision using the Elbow/Silhouette method:

factoextra::fviz_nbclust(RFM.mat, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2) +
  labs(subtitle = "Elbow method")

factoextra::fviz_nbclust(RFM.mat, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

According to the methods we utilized and the rule of thumb for them, we should say that it is not suitable for this data to divided to 4 clusters.

Instead, Due to the methods, k=3 will be a more reasonable decision for the number of clusters.

That’s all my observation. See you next time!