packages = c(
  "dplyr","ggplot2","googleVis","devtools","magrittr","slam","irlba","plotly",
  "arules","arulesViz","Matrix","recommenderlab")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))
LOAD = FALSE
library(dplyr)
library(ggplot2)
library(googleVis)
library(Matrix)
library(slam)
library(irlba)
library(plotly)
library(arules)
library(arulesViz)
library(recommenderlab)


A. RFM分群

load("data/tf0.rdata")
A = A0; X = X0; Z = Z0; rm(A0,X0,Z0); gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  2801293 149.7    4752805 253.9  4752805 253.9
## Vcells 10622267  81.1   14899527 113.7 10672654  81.5
k = 8; set.seed(777)
A$group = kmeans(scale(A[,2:6]), k)$cluster; table(A$group)
## 
##    1    2    3    4    5    6    7    8 
## 2371 2825 6789 4252 9758  317 4300 1629
df = group_by(A, group) %>% summarise(
  avg_frequency = mean(f),
  avg_monetary = mean(m),
  total_revenue = sum(rev),
  group_size = n(),
  avg_recency = mean(r),
  profit = sum(raw)
  ) %>% ungroup %>% 
  mutate(
    pc_revenue = round(100*total_revenue/sum(total_revenue),1), # % revenue contrib.
    pc_profit = round(100*profit/sum(profit),1),                # % profit contrib. 
    dummy = 2001  # to fit googleViz's data format
  ) %>% data.frame
head(df)
##   group avg_frequency avg_monetary total_revenue group_size avg_recency
## 1     1      1.630114    2224.0645       8192064       2371   29.684521
## 2     2     11.861593     984.1008      27447494       2825    9.084602
## 3     3      1.471056     776.7725       7522130       6789   90.874797
## 4     4      1.518109     626.1522       3988792       4252   13.433678
## 5     5      4.502972     735.5990      29183322       9758   17.784280
## 6     6     33.492114     890.7913       8065546        317    3.425868
##    profit pc_revenue pc_profit dummy
## 1 1391812        8.1       8.9  2001
## 2 4244727       27.0      27.2  2001
## 3 1134525        7.4       7.3  2001
## 4  579186        3.9       3.7  2001
## 5 4206232       28.7      26.9  2001
## 6 1227735        7.9       7.9  2001
plot( gvisMotionChart(
  df, "group", "dummy",
  options=list(width=800, height=600) 
  ))
## starting httpd help server ... done


【QUIZ】 如果我們把X,Y軸分別改成log(avg_monetary)和log(avg_recency),根據圖上的顯示 …



B. 顧客產品矩陣

n_distinct(Z$cust)  # 32256
## [1] 32256
n_distinct(Z$prod)  # 23789
## [1] 23789

操作矩陣運算之前,通常我會載入這兩個套件

library(Matrix)
library(slam)

製作顧客產品矩陣其實很快、也很容易

mx = xtabs(~ cust + prod, Z, sparse=T)

顧客產品矩陣通常是一個很稀疏的矩陣

mean(mx > 0)
## [1] 0.0009679946

有一些產品沒什麼人買

table(colSums(mx) < 10)       # 12588 
## 
## FALSE  TRUE 
## 11201 12588

刪去購買次數小於10的產品,然後刪去沒有購買產品的顧客

mx = mx[, colSums(mx) > 10]   # 
mx = mx[rowSums(mx) > 0, ]    #
dim(mx)                       # 32256>32066 23789>10675
## [1] 32066 10675

檢查一下矩陣裡面的值分布

max(mx)                       # 49
## [1] 49
table(mx@x) %>% prop.table %>% round(4) %>% head(10)
## 
##      1      2      3      4      5      6      7      8      9     10 
## 0.9235 0.0594 0.0112 0.0033 0.0013 0.0006 0.0003 0.0002 0.0001 0.0001


【QUIZ】 如果mx[i, j] = 3,這表示 ….



C. 依購買頻率分群

C1. 改變(稀疏)矩陣格式

稀疏矩陣有很多種格式,不同的工具會使用不同的格式

library(slam)
tmx = as(mx,"dgTMatrix")
tmx = simple_triplet_matrix(
  1+tmx@i, 1+tmx@j, tmx@x, dimnames=mx@Dimnames)
dim(tmx)
## [1] 32066 10675
C2. 估計各產品的平均重要性 (Average TF-IDF Scores)

我們借用文字分析裡面估計單字在文章之中的重要性的方法(TF-IDF),計算各產品在所有顧客之間的平均重要性

tfidf = tapply(tmx$v/row_sums(tmx)[tmx$i], tmx$j, mean) *
  log2(nrow(tmx)/col_sums(tmx > 0))
summary(tfidf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1369  0.3129  0.3880  0.4441  0.5066  4.4687
hist(tfidf)

C3. 篩選產品、重新排列

篩去平均重要性比較低的產品、然後將產品依被購買次數降冪排列

tmx = tmx[, tfidf > mean(tfidf)]    # remove products with low tfidf score
tmx = tmx[, order(-col_sums(tmx))]  # order product by frequency
dim(tmx)
## [1] 32066  3823
C4. 依(購買頻率最高的)產品對顧客分群
nop= 400  # no. product
k = 200   # no. cluster
set.seed(111); kg = kmeans(tmx[,1:nop], k)$cluster
table(kg) %>% as.vector %>% sort
##   [1]     1     1     1     1     1     1     1     1     1     1     1
##  [12]     1     1     1     2     2     2     2     3     3     3     4
##  [23]     5     5     5     6     6     6     6     6     7     7     7
##  [34]     7     7     7     8     8     8     8     9     9     9     9
##  [45]     9     9     9    10    10    10    11    12    12    12    13
##  [56]    14    14    14    14    15    16    17    19    19    21    22
##  [67]    23    24    25    26    27    29    30    32    33    34    35
##  [78]    35    35    36    37    38    39    39    40    41    41    43
##  [89]    43    43    44    44    45    46    47    47    48    48    48
## [100]    48    49    50    51    51    51    52    52    54    55    55
## [111]    55    56    56    57    57    57    58    58    58    60    60
## [122]    62    62    63    64    64    64    65    65    65    67    67
## [133]    68    69    70    70    70    72    72    72    72    73    73
## [144]    73    73    74    74    75    75    78    78    79    79    80
## [155]    80    82    82    87    87    88    88    89    91    93    93
## [166]    94   105   107   107   109   111   112   119   123   126   128
## [177]   131   136   137   138   140   141   141   145   147   148   148
## [188]   153   165   168   182   185   187   193   212   257   422   494
## [199]   556 19875
C5. 各群組平均屬性

將分群結果併入顧客資料框(A)

df = inner_join(A, data.frame(
  cust = as.integer(tmx$dimnames$cust), kg))
## Joining, by = "cust"

計算各群組的平均屬性

df = data.frame(
  aggregate(. ~ kg, df[,c(2:7,11)], mean),
  size = as.vector(table(kg))
  )
head(df)
##   kg        r         s         f         m       rev       raw size
## 1  1 14.03509 103.84211  8.982456 1351.5971 10364.456 1246.3333   57
## 2  2 28.46377  92.03623  6.927536  752.7416  4505.804  722.3841  138
## 3  3  6.50000 113.00000 16.500000 1810.2222 27129.500 3764.5000    2
## 4  4 47.12418 100.84967  4.764706 1603.9312  5964.882  784.0392  153
## 5  5 23.55556 102.66667  6.888889 1882.3126 10613.556 1405.2222    9
## 6  6 18.10000 105.40000  8.800000 1170.1295  9537.100 1566.5000   10
C6. 互動式泡泡圖
df$dummy = 2001                        # dummy column for googleViz
plot( gvisMotionChart(
  subset(df[,c(1,4,5,6,8,2,3,7,9)], 
         size >= 20 & size <= 5000     # range of group size
         ), 
  "kg", "dummy", options=list(width=800, height=600) ) )
C7. 各群組的代表性產品 (Signature Product)
Sig = function(gx, P=1000, H=10) {
  print(sprintf("Group %d: No. Customers = %d", gx, sum(kg==gx)))
  bx = tmx[,1:P]
  data.frame(n = col_sums(bx[kg==gx,])) %>%      # frequency
    mutate(
      share = round(100*n/col_sums(bx),2),       # %prod sold to this cluster
      conf = round(100*n/sum(kg==gx),2),         # %buy this product, given cluster
      base = round(100*col_sums(bx)/nrow(bx),2), # %buy this product, all cust 
      lift = round(conf/base,1),                 # conf/base  
      name = colnames(bx)                        # name of prod
    ) %>% arrange(desc(lift)) %>% head(H)
  }
Sig(1)
## [1] "Group 1: No. Customers = 57"
##      n share   conf base  lift          name
## 1  137 39.83 240.35 1.07 224.6 7610053910787
## 2    3  8.82   5.26 0.11  47.8 4710088500366
## 3    3  7.69   5.26 0.12  43.8 4710091111757
## 4    3  7.14   5.26 0.13  40.5 2230090000101
## 5   18  6.02  31.58 0.93  34.0 7610053910794
## 6    2  5.56   3.51 0.11  31.9 4710982090260
## 7    2  5.00   3.51 0.12  29.2 4710091111528
## 8    2  5.13   3.51 0.12  29.2 4971275703278
## 9    2  5.26   3.51 0.12  29.2      20546786
## 10   2  5.26   3.51 0.12  29.2 4710174011196
  • G族群一共買了n個P產品
  • P產品有share%是賣給G族群
  • G族群每人平均購買conf/100個P產品
  • 所有顧客之中,每人平均購買base/100個P產品
  • 跟所有顧客相比,G族群購買P產品的機率上升了lift-1

【QUIZ】 從互動式泡泡圖看來 …

  • 平均客單價、平均購買次數、平均營收貢獻、平均獲利貢獻最大的分別是哪一個族群?
  • 它們的特徵產品分別是什麼?

【QUIZ】 在以上這個段落裡面 …

  • 我們分群的區隔變數是什麼?
  • 我們在泡泡圖裡面觀察的變數是哪一些?
  • 它們是相同的嗎?
  • 這個分析的程序,跟Airlines CRM那一個例子有甚麼不同?
  • 我們從這個例子學到什麼?


D. 使用尺度縮減方法抽取顧客(產品)的特徵向量

D1. 巨大尺度縮減 (SVD, Sigular Value Decomposition)
library(irlba)
if(LOAD) {
  load("data/svd.rdata")
} else {
  smx = mx
  smx@x = pmin(smx@x, 2)            # cap at 2, similar to normalization  
  t0 = Sys.time()
  svd = irlba(smx, 
              nv=400,               # length of feature vector
              maxit=800, work=800)    
  print(Sys.time() - t0)            # 1.8795 mins
  save(svd, file = "data/svd.rdata")
}
## Time difference of 1.742891 mins
D2. 依顧客向量對顧客分群
set.seed(111); kg = kmeans(svd$u, 200)$cluster
table(kg) %>% as.vector %>% sort
##   [1]    1    1    1    1    1    1    1    1    1    1    1    1    1    1
##  [15]    1    1    1    1    1    1    1    1    1    1    1    1    1    1
##  [29]    1    1    1    1    1    1    1    1    1    1    1    1    1    1
##  [43]    1    1    1    1    1    1    1    1    1    1    1    1    1    1
##  [57]    1    1    1    1    1    1    2    2    2    2    2    3    3    4
##  [71]    4    4    5    7   14   17   20   28   29   30   33   38   41   43
##  [85]   44   44   48   48   50   51   52   56   57   58   61   64   66   67
##  [99]   67   67   75   83   84   89   93   99  100  102  107  110  113  114
## [113]  117  117  128  131  134  136  137  142  145  147  150  151  151  160
## [127]  160  165  166  167  168  169  169  171  175  175  178  178  180  180
## [141]  181  182  184  184  185  188  191  191  192  194  194  197  199  205
## [155]  208  210  210  211  211  211  213  215  216  218  219  221  222  223
## [169]  224  224  228  230  230  232  233  236  237  238  240  241  250  251
## [183]  252  255  268  274  283  285  290  293  305  318  329  384  440  718
## [197]  781  865 1102 8844
D3. 互動式泡泡圖 (Google Motion Chart)
# clustster summary
df = left_join(A, data.frame(         
  cust = as.integer(smx@Dimnames$cust), kg)) %>% 
  group_by(kg) %>% summarise(
    avg_frequency = mean(f),
    avg_monetary = mean(m),
    avg_revenue_contr = mean(rev),
    group_size = n(),
    avg_recency = mean(r),
    avg_gross_profit = mean(raw)) %>% 
  ungroup %>% 
  mutate(dummy = 2001, kg = sprintf("G%03d",kg)) %>% 
  data.frame
## Joining, by = "cust"
# Google Motion Chart
plot( gvisMotionChart(
  subset(df, group_size >= 20 & group_size <= 5000),     
  "kg", "dummy", options=list(width=800, height=600) ) )
D4. 互動式泡泡圖 (ggplot + plotly)
filter(df, group_size >= 20 & group_size <= 5000)$group_size %>% 
  sqrt %>% range    # for bubble size adjustment
## [1]  4.472136 33.196385
library(ggplot2)
library(plotly)

p = df %>% filter(group_size >= 20 & group_size <= 5000) %>% 
  ggplot(aes(x=avg_frequency, y=avg_monetary)) +
  geom_point(aes(size=group_size, col=avg_revenue_contr),alpha=0.7) +
  geom_text(aes(label=kg), alpha=0) +
  scale_size(range=c(1.5,12)) +
  #scale_color_gradient(low="green",high="magenta") +
  scale_colour_gradientn(
    colours = rev(c("red","yellow","green","lightblue","darkblue"))) +
  theme_bw() + guides(size=F) + labs(
    title="顧客集群(依購買產品)",
    color="平均營收貢獻", size="集群人數") +
  xlab("平均購買次數") + 
  ylab("平均購買金額")
plotly_build(p)
D5. 群組的代表性產品 (Signature Product)
Sig(138)
## [1] "Group 138: No. Customers = 228"
##      n share   conf base lift          name
## 1  316 58.63 138.60 1.68 82.5 8712045008539
## 2   35 15.35  15.35 0.71 21.6 8712045013038
## 3   32 11.55  14.04 0.86 16.3 4902430493437
## 4    4 11.76   1.75 0.11 15.9 4710091110491
## 5   24  8.03  10.53 0.93 11.3 7610053910794
## 6    8  7.34   3.51 0.34 10.3 4902430493451
## 7    3  7.32   1.32 0.13 10.2 4714050000021
## 8   17  6.97   7.46 0.76  9.8 4902430040334
## 9    4  7.02   1.75 0.18  9.7      20510183
## 10   5  6.58   2.19 0.24  9.1 4710036110067


E. 購物籃分析 Baskets Analysis

dim(mx)   # 32066 cust * 10675 prod
## [1] 32066 10675
E1. 準備資料 (for Association Rule Analysis)
library(arules)
library(arulesViz)
bx = subset(Z, prod %in% as.numeric(colnames(mx)), 
            select=c("cust","prod"))  # select product items
bx = split(bx$prod, bx$cust)          # split by customer
bx = as(bx, "transactions")           # data for arules package
## Warning in asMethod(object): removing duplicated items in transactions
E2. Top20 熱賣產品
itemFrequencyPlot(bx, topN=20, type="absolute", cex=0.8)

E3. 關聯規則和Apriori演算法

關聯規則(A => B)

  • support: A被購買的機率 (A的基礎機率)
  • confidence: A被購買時,B被購買的機率
  • lift: A被購買時,B被購買的機率增加的倍數 (與B的基礎機率相比)
  • 一般來講support、confidence和lift越高的關聯規則越重要
  • support、confidence和lift設的越低(高),找到的關聯規則越多(少)
  • 建議一開始把標準設低,先找到多一點規則,之後再用subset篩選出特定的規則來看
rules = apriori(bx, parameter=list(supp=0.005, conf=0.6))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5   0.005      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 160 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10675 item(s), 32066 transaction(s)] done [0.37s].
## sorting and recoding items ... [884 item(s)] done [0.02s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.02s].
## writing ... [67 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
summary(rules)
## set of 67 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4 
## 16 43  8 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   3.000   2.881   3.000   4.000 
## 
## summary of quality measures:
##     support           confidence          lift            count    
##  Min.   :0.005052   Min.   :0.6020   Min.   : 3.204   Min.   :162  
##  1st Qu.:0.005582   1st Qu.:0.6348   1st Qu.:16.798   1st Qu.:179  
##  Median :0.006393   Median :0.6762   Median :21.147   Median :205  
##  Mean   :0.007859   Mean   :0.7004   Mean   :22.535   Mean   :252  
##  3rd Qu.:0.009574   3rd Qu.:0.7507   3rd Qu.:27.986   3rd Qu.:307  
##  Max.   :0.019086   Max.   :0.8712   Max.   :70.632   Max.   :612  
## 
## mining info:
##  data ntransactions support confidence
##    bx         32066   0.005        0.6
E4. 檢視關聯規則

關聯規則 (A => B):

  • support: A被購買的機率 (A的基礎機率)
  • confidence: A被購買時,B被購買的機率
  • lift: A被購買時,B被購買的機率增加的倍數 (與B的基礎機率相比)
options(digits=4)
inspect(rules)
##      lhs                rhs              support confidence   lift count
## [1]  {4710030346103} => {4710030346059} 0.005146     0.6762 70.632   165
## [2]  {4719090701051} => {4719090790000} 0.005520     0.6367 39.111   177
## [3]  {719859796124}  => {719859796117}  0.007297     0.7222 45.859   234
## [4]  {4710321861209} => {4710321861186} 0.007017     0.6338 41.392   225
## [5]  {4711524000891} => {4711524001041} 0.005832     0.6561 63.564   187
## [6]  {4719090790017} => {4719090790000} 0.010510     0.8300 50.989   337
## [7]  {4719090790000} => {4719090790017} 0.010510     0.6456 50.989   337
## [8]  {4710011401142} => {4710011401128} 0.007765     0.6434 16.679   249
## [9]  {4710085120697} => {4710085120680} 0.012537     0.7992 30.691   402
## [10] {4710085120710} => {4710085120703} 0.010416     0.6243 29.790   334
## [11] {4710011402026} => {4710011402019} 0.010073     0.7210 29.376   323
## [12] {4710085172702} => {4710085172696} 0.009543     0.6120 21.354   306
## [13] {4710011409056} => {4710011401128} 0.014813     0.7353 19.061   475
## [14] {4710011401135} => {4710011401128} 0.019086     0.7897 20.470   612
## [15] {4710011405133} => {4710011401128} 0.016840     0.6968 18.062   540
## [16] {4710011406123} => {4710011401128} 0.015624     0.6358 16.481   501
## [17] {4714981010038,                                                    
##       4719090790017} => {4719090790000} 0.005489     0.8263 50.758   176
## [18] {4714981010038,                                                    
##       4719090790000} => {4719090790017} 0.005489     0.6692 52.854   176
## [19] {4710011401135,                                                    
##       4710011401142} => {4710011401128} 0.005114     0.7700 19.959   164
## [20] {4710011401128,                                                    
##       4710011401142} => {4710011401135} 0.005114     0.6586 27.251   164
## [21] {4710085120697,                                                    
##       4714981010038} => {4710085120680} 0.005395     0.8047 30.901   173
## [22] {4710060000099,                                                    
##       4711271000014} => {4714981010038} 0.005613     0.6667  3.548   180
## [23] {4710085120093,                                                    
##       4710085172702} => {4710085172696} 0.005551     0.7206 25.145   178
## [24] {4710085120093,                                                    
##       4710085172702} => {4710085120628} 0.005052     0.6559 17.778   162
## [25] {4710085172696,                                                    
##       4710085172702} => {4710085120628} 0.006112     0.6405 17.362   196
## [26] {4710085120628,                                                    
##       4710085172702} => {4710085172696} 0.006112     0.6712 23.421   196
## [27] {4710311703014,                                                    
##       4714981010038} => {4711271000014} 0.005863     0.6045  3.702   188
## [28] {4710683100015,                                                    
##       4714981010038} => {4711271000014} 0.006393     0.6949  4.256   205
## [29] {4710088620156,                                                    
##       4714981010038} => {4711271000014} 0.005645     0.6177  3.783   181
## [30] {4710063031106,                                                    
##       4711271000014} => {4714981010038} 0.007984     0.6139  3.267   256
## [31] {4710685440362,                                                    
##       4714981010038} => {4711271000014} 0.005520     0.6254  3.830   177
## [32] {4711022100017,                                                    
##       4711271000014} => {4714981010038} 0.005364     0.6277  3.340   172
## [33] {4710011401135,                                                    
##       4710011409056} => {4710011405133} 0.007266     0.6132 25.370   233
## [34] {4710011405133,                                                    
##       4710011409056} => {4710011401135} 0.007266     0.6833 28.271   233
## [35] {4710011406123,                                                    
##       4710011409056} => {4710011401135} 0.006144     0.6611 27.352   197
## [36] {4710011401135,                                                    
##       4710011409056} => {4710011401128} 0.009917     0.8368 21.693   318
## [37] {4710011401128,                                                    
##       4710011409056} => {4710011401135} 0.009917     0.6695 27.700   318
## [38] {4710011406123,                                                    
##       4710011409056} => {4710011405133} 0.005676     0.6107 25.270   182
## [39] {4710011405133,                                                    
##       4710011409056} => {4710011401128} 0.008389     0.7889 20.449   269
## [40] {4710011406123,                                                    
##       4710011409056} => {4710011401128} 0.007391     0.7953 20.616   237
## [41] {4710011409056,                                                    
##       4714981010038} => {4710011401128} 0.005707     0.7409 19.206   183
## [42] {4711271000014,                                                    
##       4714381003128} => {4714981010038} 0.006362     0.6296  3.350   204
## [43] {4710085120093,                                                    
##       4710085172696} => {4710085120628} 0.008545     0.6241 16.918   274
## [44] {4710011401135,                                                    
##       4710011405133} => {4710011401128} 0.010634     0.8158 21.147   341
## [45] {4710011401128,                                                    
##       4710011405133} => {4710011401135} 0.010634     0.6315 26.128   341
## [46] {4710011401135,                                                    
##       4710011406123} => {4710011401128} 0.009013     0.8353 21.652   289
## [47] {4710011401135,                                                    
##       4711271000014} => {4710011401128} 0.005801     0.7782 20.174   186
## [48] {4710011401135,                                                    
##       4714981010038} => {4710011401128} 0.007048     0.7958 20.628   226
## [49] {4710011405133,                                                    
##       4710011406123} => {4710011401128} 0.008015     0.7449 19.310   257
## [50] {4710011405133,                                                    
##       4711271000014} => {4710011401128} 0.005426     0.7468 19.358   174
## [51] {4710011405133,                                                    
##       4714981010038} => {4710011401128} 0.006674     0.7086 18.369   214
## [52] {4710011406123,                                                    
##       4714981010038} => {4710011401128} 0.006674     0.6751 17.500   214
## [53] {4710421090059,                                                    
##       4714981010038} => {4711271000014} 0.009605     0.6299  3.857   308
## [54] {4710583996008,                                                    
##       4719090900065} => {4714981010038} 0.006019     0.6942  3.694   193
## [55] {4710265849066,                                                    
##       4719090900065} => {4714981010038} 0.009200     0.6020  3.204   295
## [56] {4710265849066,                                                    
##       4711271000014} => {4714981010038} 0.011539     0.6390  3.400   370
## [57] {4713985863121,                                                    
##       4719090900065} => {4714981010038} 0.005894     0.7269  3.868   189
## [58] {4711271000014,                                                    
##       4713985863121} => {4714981010038} 0.010728     0.6922  3.683   344
## [59] {4711271000014,                                                    
##       4719090900065} => {4714981010038} 0.014533     0.6099  3.246   466
## [60] {4710011401135,                                                    
##       4710011405133,                                                    
##       4710011409056} => {4710011401128} 0.006331     0.8712 22.585   203
## [61] {4710011401128,                                                    
##       4710011401135,                                                    
##       4710011409056} => {4710011405133} 0.006331     0.6384 26.413   203
## [62] {4710011401128,                                                    
##       4710011405133,                                                    
##       4710011409056} => {4710011401135} 0.006331     0.7546 31.224   203
## [63] {4710011401135,                                                    
##       4710011406123,                                                    
##       4710011409056} => {4710011401128} 0.005333     0.8680 22.501   171
## [64] {4710011401128,                                                    
##       4710011406123,                                                    
##       4710011409056} => {4710011401135} 0.005333     0.7215 29.853   171
## [65] {4710011401135,                                                    
##       4710011405133,                                                    
##       4710011406123} => {4710011401128} 0.005520     0.8634 22.382   177
## [66] {4710011401128,                                                    
##       4710011401135,                                                    
##       4710011406123} => {4710011405133} 0.005520     0.6125 25.341   177
## [67] {4710011401128,                                                    
##       4710011405133,                                                    
##       4710011406123} => {4710011401135} 0.005520     0.6887 28.496   177
# install.packages(
#   "https://cran.r-project.org/bin/windows/contrib/3.5/arulesViz_1.3-1.zip",
#   repos=NULL)

# install.packages("arulesViz_1.3-1.zip", repos=NULL)
# library(plotly)
# plotly_arules(rules,colors=c("red","green"),
#               marker=list(opacity=.6,size=10))
# plotly_arules(rules,method="matrix",
#               shading="lift",
#               colors=c("red", "green"))
# 
E5. 互動圖表顯示
plot(rules,colors=c("red","green"),engine="htmlwidget",
     marker=list(opacity=.6,size=8))
plot(rules,method="matrix",shading="lift",engine="htmlwidget",
     colors=c("red", "green"))
E6. 篩選產品、互動式關聯圖
r1 = subset(rules, subset = rhs %in% c("4719090790000"))
summary(r1)
## set of 3 rules
## 
## rule length distribution (lhs + rhs):sizes
## 2 3 
## 2 1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    2.00    2.00    2.33    2.50    3.00 
## 
## summary of quality measures:
##     support          confidence         lift          count    
##  Min.   :0.00549   Min.   :0.637   Min.   :39.1   Min.   :176  
##  1st Qu.:0.00550   1st Qu.:0.732   1st Qu.:44.9   1st Qu.:176  
##  Median :0.00552   Median :0.826   Median :50.8   Median :177  
##  Mean   :0.00717   Mean   :0.764   Mean   :47.0   Mean   :230  
##  3rd Qu.:0.00801   3rd Qu.:0.828   3rd Qu.:50.9   3rd Qu.:257  
##  Max.   :0.01051   Max.   :0.830   Max.   :51.0   Max.   :337  
## 
## mining info:
##  data ntransactions support confidence
##    bx         32066   0.005        0.6
plot(r1,method="graph",engine="htmlwidget",itemCol="cyan") 
  • 泡泡大小:support: A被購買的機率 (A的基礎機率)
  • 泡泡顏色:lift: A被購買時,B被購買的機率增加的倍數 (與B的基礎機率相比)
r2 = subset(rules, subset = rhs %in% c("4710011401135"))
summary(r2)
## set of 8 rules
## 
## rule length distribution (lhs + rhs):sizes
## 3 4 
## 5 3 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00    3.00    3.00    3.38    4.00    4.00 
## 
## summary of quality measures:
##     support          confidence         lift          count    
##  Min.   :0.00511   Min.   :0.631   Min.   :26.1   Min.   :164  
##  1st Qu.:0.00547   1st Qu.:0.660   1st Qu.:27.3   1st Qu.:176  
##  Median :0.00624   Median :0.676   Median :28.0   Median :200  
##  Mean   :0.00703   Mean   :0.684   Mean   :28.3   Mean   :226  
##  3rd Qu.:0.00793   3rd Qu.:0.697   3rd Qu.:28.8   3rd Qu.:254  
##  Max.   :0.01063   Max.   :0.755   Max.   :31.2   Max.   :341  
## 
## mining info:
##  data ntransactions support confidence
##    bx         32066   0.005        0.6
plot(r2,method="graph",engine="htmlwidget",itemCol="cyan") 


F. 產品推薦 Product Recommendation

F1. 篩選顧客、產品

太少被購買的產品和購買太少產品的顧客都不適合使用Collaborative Filtering這種產品推薦方法,所以我們先對顧客和產品做一次篩選

library(recommenderlab)
rx = mx[, colSums(mx > 0) >= 50]
rx = rx[rowSums(rx > 0) >= 20 & rowSums(rx > 0) <= 300, ]
dim(rx)
## [1] 8860 3355
F2. 選擇產品評分方式

可以選擇要用

  • 購買次數 (realRatingMatrix) 或
  • 是否購買 (binaryRatingMatrix)

做模型。

rx = as(rx, "realRatingMatrix")  # realRatingMatrix
bx = binarize(rx, minRating=1)   # binaryRatingMatrix
dim(bx)
## [1] 8860 3355
F3. 設定模型(準確性)驗證方式
set.seed(4321)
scheme = evaluationScheme(     
  bx, method="split", train = .75,  given=5)
F4. 設定推薦方法(參數)
algorithms = list(            
  AR53 = list(name="AR", param=list(support=0.0005, confidence=0.3)),
  AR43 = list(name="AR", param=list(support=0.0004, confidence=0.3)),
  RANDOM = list(name="RANDOM", param=NULL),
  POPULAR = list(name="POPULAR", param=NULL),
  UBCF = list(name="UBCF", param=NULL),
  IBCF = list(name="IBCF", param=NULL) )
F5. 建模、預測、驗證(準確性)
if(LOAD) {
  load("results.rdata")
} else {
  t0 = Sys.time()
  results = evaluate(            
    scheme, algorithms, type="topNList",
    n=c(5, 10, 15, 20))
  print(Sys.time() - t0)
  save(results, file="results.rdata")
}
## AR run fold/sample [model time/prediction time]
##   1  [3.5sec/181.3sec] 
## AR run fold/sample [model time/prediction time]
##   1  [8.68sec/455sec] 
## RANDOM run fold/sample [model time/prediction time]
##   1  [0sec/8.23sec] 
## POPULAR run fold/sample [model time/prediction time]
##   1  [0sec/8.5sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0sec/62.4sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [170.2sec/1.43sec] 
## Time difference of 15.19 mins
F6. 模型準確性比較
# load("data/results.rdata")
par(mar=c(4,4,3,2),cex=0.8)
cols = c("red", "magenta", "gray", "orange", "blue", "green")
plot(results, annotate=c(1,3), legend="topleft", pch=19, lwd=2, col=cols)
abline(v=seq(0,0.006,0.001), h=seq(0,0.08,0.01), col='lightgray', lty=2)

F7. 儲存產品推薦模型
save(results, file="data/results.rdata")