packages = c(
  "dplyr","ggplot2","googleVis","devtools","magrittr","slam","irlba","plotly",
  "arules","arulesViz","Matrix","recommenderlab")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))
LOAD = TRUE
library(dplyr)
library(ggplot2)
library(googleVis)
library(Matrix)
library(slam)
library(irlba)
library(plotly)
library(arules)
library(arulesViz)
library(recommenderlab)


A. The Customer-Product Matrix

Load the data frames and rename them

load("data/tf0.rdata")
A = A0; X = X0; Z = Z0; rm(A0,X0,Z0); gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 2439683 130.3    3886542 207.6  3205452 171.2
## Vcells 9473576  72.3   13593784 103.8  9523388  72.7
Z = subset(Z, cust %in% A$cust)
n_distinct(Z$cust)  # 32241
## [1] 32241
n_distinct(Z$prod)  # 23787
## [1] 23787

Building the customer-product matrix is actually quick and easy

library(Matrix)
library(slam)
cpm = xtabs(~ cust + prod, Z, sparse=T)  # customer product matrix
dim(cpm)             # 32241 23787
## [1] 32241 23787
mean(cpm > 0)        # 0.0009674 # density of nonzero cells
## [1] 0.0009674258
# (cpm > 0) is a 0/1 matrix, so its mean is the density of the matrix
# (the fraction of cells with data), about 0.00097

The customer-product matrix is usually very sparse; some products are hardly bought by anyone

colSums(cpm) %>% quantile(seq(0,1,0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    1    1    2    4    6    8   13   20   35   76 8475
mean(colSums(cpm) > 10) # fraction of products bought more than 10 times; for logicals, mean() gives the proportion of TRUE
## [1] 0.4483541
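
As a quick reminder of that last point, mean() on a logical vector returns the share of TRUE values:

mean(c(TRUE, FALSE, TRUE, TRUE))  # 3 TRUEs out of 4
## [1] 0.75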

Remove products purchased fewer than six times, then remove customers who bought none of the remaining products

cpm = cpm[, colSums(cpm) >= 6]      # remove the least frequent products
# cpm = cpm[rowSums(cpm) > 0, ]     # remove non-buying customers
cpm = cpm[, order(-colSums(cpm))]   # order products by purchase frequency,
                                    # in descending order
dim(cpm)                            # 32241 14621 (products: 23787 -> 14621)
## [1] 32241 14621
max(cpm)         # 49
## [1] 49
mean(cpm > 0)    # 0.0015248
## [1] 0.001524785
table(cpm@x) %>% prop.table %>% round(4) %>% head(10)
## 
##      1      2      3      4      5      6      7      8      9     10 
## 0.9256 0.0579 0.0108 0.0032 0.0012 0.0006 0.0003 0.0002 0.0001 0.0001

Exercise: with a single command, list the ten most frequently purchased products and their purchase counts.

colSums(cpm[,1:10]) 
## 4714981010038 4711271000014 4719090900065 4711080010112 4710114128038 
##          8475          6119          2444          2249          2178 
## 4710265849066 4713985863121 4710088410139 4710583996008 4710908131589 
##          2017          1976          1869          1840          1679

# first verify that the customer order in cpm matches the order of cust in A
dim(cpm)
## [1] 32241 14621
# sum(A$cust == rownames(cpm)) # how many rows of cpm line up with A$cust?
# A = cbind(A , as.matrix(cpm[,1:20])) # merge in 20 product variables
                          # cpm is a sparse matrix, so convert it to a dense
                          # matrix before binding it into the data frame
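
A minimal sketch of that merge, assuming (as the commented check above suggests) that the rows of cpm are in the same order as A$cust; A20 is a hypothetical name:

stopifnot(all(A$cust == as.integer(rownames(cpm))))  # rows must align
A20 = cbind(A, as.matrix(cpm[, 1:20]))               # densify, then bind
dim(A20)                                             # 32241 x 29 (9 + 20 columns)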



B. Using Product Purchase Frequencies Directly as Variables

When we use product purchase frequencies as (customer) variables, the first N columns of cpm are the variables!

B1. Clustering Customers by Purchase Counts of the Most Popular Products
nop = 400 # no. of products = no. of variables
k = 200   # no. of clusters
set.seed(111); kg = kmeans(cpm[,1:nop], k)$cluster
table(kg) %>% as.vector %>% sort
##   [1]     1     1     1     1     1     1     1     1     1     1     1
##  [12]     1     1     1     1     1     1     1     1     1     1     1
##  [23]     2     2     2     2     2     2     3     3     3     3     3
##  [34]     3     3     4     4     4     4     4     4     4     5     5
##  [45]     6     6     6     6     7     7     7     8     8     8     9
##  [56]     9     9     9    10    10    10    10    11    11    11    11
##  [67]    11    12    13    13    15    15    15    16    16    18    19
##  [78]    20    20    20    20    21    22    22    22    24    24    25
##  [89]    25    27    28    28    32    32    35    36    39    40    41
## [100]    42    44    45    46    47    47    48    49    49    50    51
## [111]    52    53    56    58    58    61    63    66    67    68    69
## [122]    69    72    81    85    85    86    87    90    94    96    97
## [133]    97   100   100   101   110   111   113   114   116   118   123
## [144]   123   126   130   134   136   141   141   142   143   162   165
## [155]   172   175   178   179   182   182   184   187   195   210   222
## [166]   225   228   228   237   239   242   253   254   258   258   266
## [177]   268   272   287   293   301   311   325   329   350   351   363
## [188]   396   407   407   410   418   432   448   473   523   561  1156
## [199]  1266 11215
B2. Average Attributes of Each Cluster

Merge the clustering results into the customer data frame (A)

df = A %>% inner_join(data.frame(
  cust = as.integer(rownames(cpm)), 
  kg) )
## Joining, by = "cust"
head(df)  # 32241
##   cust  r   s f      m   rev  raw age area  kg
## 1 1069 19 108 4  486.0  1944   15   K    E 196
## 2 1113 54 109 4  557.5  2230  241   K    F  81
## 3 1250 19  25 2  791.5  1583  354   D    D 169
## 4 1359 87  87 1  364.0   364  104   K    G  49
## 5 1823 36 119 3  869.0  2607  498   K    D 109
## 6 2189 57  89 2 7028.0 14056 3299   K    B 158

Compute the average attributes of each cluster

df = data.frame(
  aggregate(. ~ kg, df[,c(2:7,10)], mean), # averages
  size = as.vector(table(kg)),   # no. customers in the group
  dummy = 2001                   # dummy column for googleVis
  )
head(df)
##   kg         r         s         f         m       rev       raw size
## 1  1 10.772727 113.54545 13.318182  872.7878 10697.909 1733.8182   22
## 2  2 43.934959  95.96748  3.455285 1252.9938  3819.837  533.0407  123
## 3  3  5.384615 111.15385 14.307692 1325.2611 12617.769 1951.8462   13
## 4  4  8.000000 119.00000 24.000000  955.6250 22935.000 3658.0000    1
## 5  5  6.666667 116.66667 20.333333  579.9135 10741.444 1554.2222    9
## 6  6  2.500000 114.00000 23.500000  411.9918  9775.000 1324.5000    2
##   dummy
## 1  2001
## 2  2001
## 3  2001
## 4  2001
## 5  2001
## 6  2001
B3. Interactive Bubble Chart
plot( gvisMotionChart(
  subset(df[,c(1,4,5,6,8,2,3,7,9)], 
         size >= 20 & size <= 1000),  # range of group size 
  "kg", "dummy", options=list(width=800, height=600) ) )
B4. Signature Products of Each Cluster
# use global variables: cpm, kg
Sig = function(gx, P=1000, H=10) {
  print(sprintf("Group %d: No. Customers = %d", gx, sum(kg==gx)))
  bx = cpm[,1:P]
  data.frame(n = col_sums(bx[kg==gx,])) %>%      # frequency
    mutate(
      share = round(100*n/col_sums(bx),2),       # %prod sold to this cluster
      conf = round(100*n/sum(kg==gx),2),         # %buy this product, given cluster
      base = round(100*col_sums(bx)/nrow(bx),2), # %buy this product, all cust 
      lift = round(conf/base,1),                 # conf/base  
      name = colnames(bx)                        # name of prod
    ) %>% arrange(desc(lift)) %>% head(H)
  }
Sig(130)
## [1] "Group 130: No. Customers = 97"
##      n share   conf  base lift          name
## 1   29  6.65  29.90  1.35 22.1 4719090106009
## 2   19  6.62  19.59  0.89 22.0 4710632001172
## 3   12  5.38  12.37  0.69 17.9 4710093080044
## 4  389  4.59 401.03 26.29 15.3 4714981010038
## 5   20  4.59  20.62  1.35 15.3 4719090105002
## 6   14  4.53  14.43  0.96 15.0 4719090106016
## 7    8  4.40   8.25  0.56 14.7 9310022862601
## 8   10  4.31  10.31  0.72 14.3 4710189820851
## 9    9  4.25   9.28  0.66 14.1 4710088434258
## 10  15  4.19  15.46  1.11 13.9 4710189851282

C. Extracting Customer (Product) Feature Vectors via Dimension Reduction

C1. Large-Scale Dimension Reduction (SVD, Singular Value Decomposition)
library(irlba)
if(LOAD) {
  load("data/svd2a.rdata")
} else {
  smx = cpm
  smx@x = pmin(smx@x, 2)            # cap counts at 2, a rough normalization
                                    # of the sparse matrix before the SVD
  t0 = Sys.time()
  svd = irlba(smx, 
              nv=400,               # length of feature vector
              maxit=800, work=800)    
  print(Sys.time() - t0)            # 1.8795 mins; print the elapsed run time
  save(svd, file = "data/svd2a.rdata")
}
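
A few quick checks on the decomposition (a sketch; irlba returns the singular values in svd$d and the customer feature vectors in svd$u):

length(svd$d)          # 400 singular values (nv=400)
dim(svd$u)             # 32241 x 400: one feature vector per customer
plot(svd$d, type="l")  # scree-style view of the singular values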
C2. Clustering Customers by Their Feature Vectors
set.seed(111); kg = kmeans(svd$u, 200)$cluster
table(kg) %>% as.vector %>% sort
##   [1]    1    1    1    1    1    1    1    1    1    1    1    1    1    1
##  [15]    1    1    1    1    1    1    1    1    1    1    1    1    1    1
##  [29]    1    1    1    1    1    1    1    1    1    1    1    1    1    1
##  [43]    1    1    1    1    1    1    1    1    1    1    1    1    1    1
##  [57]    1    2    2    2    2    2    3    4    4    5    7   10   14   30
##  [71]   31   32   36   38   38   39   39   40   40   41   44   45   46   47
##  [85]   49   54   59   62   62   69   71   77   79   79   80   82   82   84
##  [99]   87   91  101  103  109  110  111  113  117  120  123  127  127  129
## [113]  132  133  134  135  136  139  141  143  143  147  147  157  159  159
## [127]  160  160  160  166  168  169  172  175  180  181  181  182  183  184
## [141]  184  188  190  190  193  194  195  196  198  198  200  201  201  202
## [155]  202  204  204  204  207  209  209  210  213  214  216  219  219  222
## [169]  225  233  234  235  236  237  237  238  239  241  248  248  248  253
## [183]  256  257  258  259  261  261  264  269  277  281  285  293  305  411
## [197]  612  896 1092 8987
C3. Interactive Bubble Chart (Google Motion Chart)
# cluster summary
df = inner_join(A, data.frame(         
  cust = as.integer(rownames(cpm)), kg)) %>% 
  group_by(kg) %>% summarise(
    avg_frequency = mean(f),
    avg_monetary = mean(m),
    avg_revenue_contr = mean(rev),
    group_size = n(),
    avg_recency = mean(r),
    avg_gross_profit = mean(raw)) %>% 
  ungroup %>% 
  mutate(dummy = 2001, kg = sprintf("G%03d",kg)) %>% 
  data.frame
## Joining, by = "cust"
# Google Motion Chart
plot( gvisMotionChart(
  subset(df, group_size >= 20 & group_size <= 1200),     
  "kg", "dummy", options=list(width=800, height=600) ) )
C4. Signature Products of Each Cluster
Sig(162)
## [1] "Group 162: No. Customers = 87"
##      n share   conf base  lift          name
## 1  165 38.64 189.66 1.32 143.7 4710105045313
## 2  133 35.37 152.87 1.17 130.7 4710105045320
## 3   59 13.50  67.82 1.36  49.9 4710105045450
## 4   19  9.60  21.84 0.61  35.8 4710105051147
## 5   35  8.31  40.23 1.31  30.7 4710105045443
## 6    9  5.33  10.34 0.52  19.9 4710088432650
## 7    8  4.91   9.20 0.51  18.0 4710105051185
## 8    8  4.19   9.20 0.59  15.6 4710171028326
## 9    7  3.89   8.05 0.56  14.4 4710058137059
## 10   7  3.70   8.05 0.59  13.6 4710010020061


D. Basket Analysis

dim(cpm)   # 32241 14621; 14621 products were bought at least six times (>=6)
## [1] 32241 14621
D1. Preparing the Data for Association Rule Analysis
library(arules)
library(arulesViz)
# bx = subset(Z, prod %in% as.numeric(colnames(cpm)),
                                  # the set of products bought at least six times;
                                  # filters out the rarely purchased products
#       select=c("cust","prod"))  # select the columns (and rows) to keep
# bx = Z
# bx = split(bx$prod, bx$tid)         # split by transaction id
# bx = as(bx, "transactions")         # data structure for the arules package

bx = as(split(Z$prod, Z$tid), "transactions")
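
A quick look at the resulting transaction object (a sketch):

summary(bx)          # no. of transactions and items, density, basket sizes
inspect(bx[1:3])     # the first three baskets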
D2. Top 20 Best-Selling Products
itemFrequencyPlot(bx, topN=20, type="absolute", cex=0.8)

D3. Association Rules and the Apriori Algorithm

Association rules (A => B)

  • support: the probability that A and B are bought together in a transaction
  • confidence: the probability that B is bought, given that A is bought
  • lift: the multiplier on B's purchase probability when A is bought, relative to B's base rate
  • In general, the higher a rule's support, confidence, and lift, the more important it is
  • The lower (higher) you set support, confidence, and lift, the more (fewer) rules you find
  • Start with low thresholds to surface plenty of rules, then use subset() to filter for the specific rules you care about (see the sketch after the summary below)
rules = apriori(bx, parameter=list(supp=0.001, conf=0.4))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 119 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23787 item(s), 119407 transaction(s)] done [0.10s].
## sorting and recoding items ... [1452 item(s)] done [0.01s].
## creating transaction tree ... done [0.04s].
## checking subsets of size 1 2 3 4 done [0.04s].
## writing ... [133 rule(s)] done [0.00s].
## creating S4 object  ... done [0.01s].
summary(rules)
## set of 133 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4 
## 66 51 16 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   3.000   2.624   3.000   4.000 
## 
## summary of quality measures:
##     support           confidence          lift            count    
##  Min.   :0.001005   Min.   :0.4030   Min.   : 31.47   Min.   :120  
##  1st Qu.:0.001348   1st Qu.:0.4670   1st Qu.: 57.15   1st Qu.:161  
##  Median :0.001625   Median :0.5360   Median : 68.91   Median :194  
##  Mean   :0.002035   Mean   :0.5562   Mean   : 90.79   Mean   :243  
##  3rd Qu.:0.002445   3rd Qu.:0.6138   3rd Qu.: 89.84   3rd Qu.:292  
##  Max.   :0.005854   Max.   :0.8368   Max.   :301.37   Max.   :699  
## 
## mining info:
##  data ntransactions support confidence
##    bx        119407   0.001        0.4
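
Following the advice above (mine with loose thresholds, filter afterwards), a sketch of narrowing the rule set with subset() and sort():

strong = subset(rules, subset = lift > 100 & confidence > 0.6)
inspect(head(sort(strong, by = "lift"), 5))   # top 5 remaining rules by lift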
D4. Inspecting the Association Rules

Association rules (A => B):

  • support: the probability that A and B are bought together
  • confidence: the probability that B is bought, given that A is bought (a conditional probability)
  • lift: the multiplier on B's purchase probability when A is bought, relative to B's base rate
  • (B's base rate can be recovered as confidence/lift, e.g. 0.6762/70.841)
options(digits=4)
inspect(rules)
##       lhs                rhs              support confidence   lift count
## [1]   {4710030346110} => {4710030346103} 0.001005     0.6486 301.37   120
## [2]   {4710030346103} => {4710030346110} 0.001005     0.4669 301.37   120
## [3]   {4710030346097} => {4710030346059} 0.001097     0.5822 211.96   131
## [4]   {4710030346103} => {4710030346059} 0.001407     0.6537 237.98   168
## [5]   {4710030346059} => {4710030346103} 0.001407     0.5122 237.98   168
## [6]   {4716114000312} => {4716114000329} 0.001273     0.5527 231.58   152
## [7]   {4716114000329} => {4716114000312} 0.001273     0.5333 231.58   152
## [8]   {4719090701051} => {4719090790017} 0.001097     0.4295 118.17   131
## [9]   {4719090701051} => {4719090790000} 0.001474     0.5770 121.95   176
## [10]  {4711524000419} => {4711524000396} 0.001382     0.6790 219.13   165
## [11]  {4711524000396} => {4711524000419} 0.001382     0.4459 219.13   165
## [12]  {4711524000495} => {4711524000396} 0.001038     0.5000 161.36   124
## [13]  {4719090106016} => {4719090106009} 0.001239     0.4790 131.17   148
## [14]  {4710321861209} => {4710321871260} 0.001449     0.4701 151.71   173
## [15]  {4710321871260} => {4710321861209} 0.001449     0.4676 151.71   173
## [16]  {4710321861209} => {4710321861186} 0.001876     0.6087 137.92   224
## [17]  {4710321861186} => {4710321861209} 0.001876     0.4250 137.92   224
## [18]  {4710321871260} => {4710321861186} 0.001675     0.5405 122.47   200
## [19]  {4711524000907} => {4711524000891} 0.001290     0.5620 191.20   154
## [20]  {4711524000891} => {4711524000907} 0.001290     0.4387 191.20   154
## [21]  {4711524000907} => {4711524001041} 0.001491     0.6496 194.41   178
## [22]  {4711524001041} => {4711524000907} 0.001491     0.4461 194.41   178
## [23]  {4719090790017} => {4719090790000} 0.002940     0.8088 170.92   351
## [24]  {4719090790000} => {4719090790017} 0.002940     0.6212 170.92   351
## [25]  {719859796124}  => {719859796117}  0.002102     0.6972 147.61   251
## [26]  {719859796117}  => {719859796124}  0.002102     0.4450 147.61   251
## [27]  {4710762101018} => {4710762101025} 0.001566     0.4484 112.49   187
## [28]  {4710857000059} => {4710011401128} 0.001072     0.4310  31.47   128
## [29]  {4710466103073} => {4710466103080} 0.001332     0.4892  78.84   159
## [30]  {4711524000891} => {4711524001041} 0.001759     0.5983 179.05   210
## [31]  {4711524001041} => {4711524000891} 0.001759     0.5263 179.05   210
## [32]  {4711856000088} => {4711856000125} 0.002479     0.4625  61.50   296
## [33]  {4710011402026} => {4710011402019} 0.002822     0.6740  90.22   337
## [34]  {4711856020215} => {4711856000125} 0.002169     0.4368  58.08   259
## [35]  {4710085120697} => {4710085120680} 0.003467     0.7753 100.41   414
## [36]  {4710085120680} => {4710085120697} 0.003467     0.4490 100.41   414
## [37]  {4711856020208} => {4711856000125} 0.002127     0.4084  54.30   254
## [38]  {4710011401142} => {4710011409056} 0.001591     0.4308  67.96   190
## [39]  {4710011401142} => {4710011401135} 0.001826     0.4943  63.54   218
## [40]  {4710011401142} => {4710011405133} 0.001801     0.4875  62.06   215
## [41]  {4710011401142} => {4710011406123} 0.001524     0.4127  50.39   182
## [42]  {4710011401142} => {4710011401128} 0.002203     0.5964  43.55   263
## [43]  {4710088414328} => {4710088414311} 0.001792     0.4672  86.37   214
## [44]  {4710085172702} => {4710085120093} 0.002060     0.4581  60.91   246
## [45]  {4710085172702} => {4710085172696} 0.002429     0.5400  62.00   290
## [46]  {4710085172702} => {4710085120628} 0.002462     0.5475  48.25   294
## [47]  {4710254049323} => {4710254049521} 0.002010     0.4309  55.56   240
## [48]  {4710085120710} => {4710085120703} 0.002914     0.5613  89.84   348
## [49]  {4710085120703} => {4710085120710} 0.002914     0.4665  89.84   348
## [50]  {4710018004704} => {4710018004605} 0.002990     0.5360  46.08   357
## [51]  {4710011409056} => {4710011401135} 0.003383     0.5337  68.60   404
## [52]  {4710011401135} => {4710011409056} 0.003383     0.4349  68.60   404
## [53]  {4710011409056} => {4710011405133} 0.002973     0.4690  59.70   355
## [54]  {4710011409056} => {4710011406123} 0.002621     0.4135  50.48   313
## [55]  {4710011409056} => {4710011401128} 0.004430     0.6988  51.04   529
## [56]  {4710085120093} => {4710085172696} 0.003743     0.4978  57.15   447
## [57]  {4710085172696} => {4710085120093} 0.003743     0.4298  57.15   447
## [58]  {4710085120093} => {4710085120628} 0.003961     0.5267  46.42   473
## [59]  {4710011401135} => {4710011405133} 0.003668     0.4715  60.02   438
## [60]  {4710011405133} => {4710011401135} 0.003668     0.4670  60.02   438
## [61]  {4710011401135} => {4710011401128} 0.005854     0.7524  54.95   699
## [62]  {4710011401128} => {4710011401135} 0.005854     0.4275  54.95   699
## [63]  {4710085172696} => {4710085120628} 0.004355     0.5000  44.06   520
## [64]  {4710011405133} => {4710011406123} 0.003166     0.4030  49.20   378
## [65]  {4710011405133} => {4710011401128} 0.005176     0.6588  48.12   618
## [66]  {4710011406123} => {4710011401128} 0.004849     0.5920  43.24   579
## [67]  {4710011401142,                                                    
##        4710011409056} => {4710011401135} 0.001097     0.6895  88.62   131
## [68]  {4710011401135,                                                    
##        4710011401142} => {4710011409056} 0.001097     0.6009  94.79   131
## [69]  {4710011401142,                                                    
##        4710011409056} => {4710011401128} 0.001181     0.7421  54.20   141
## [70]  {4710011401128,                                                    
##        4710011401142} => {4710011409056} 0.001181     0.5361  84.57   141
## [71]  {4710011401135,                                                    
##        4710011401142} => {4710011405133} 0.001047     0.5734  72.99   125
## [72]  {4710011401142,                                                    
##        4710011405133} => {4710011401135} 0.001047     0.5814  74.73   125
## [73]  {4710011401135,                                                    
##        4710011401142} => {4710011401128} 0.001382     0.7569  55.28   165
## [74]  {4710011401128,                                                    
##        4710011401142} => {4710011401135} 0.001382     0.6274  80.64   165
## [75]  {4710011401142,                                                    
##        4710011405133} => {4710011401128} 0.001231     0.6837  49.93   147
## [76]  {4710011401128,                                                    
##        4710011401142} => {4710011405133} 0.001231     0.5589  71.15   147
## [77]  {4710011401142,                                                    
##        4710011406123} => {4710011401128} 0.001005     0.6593  48.15   120
## [78]  {4710011401128,                                                    
##        4710011401142} => {4710011406123} 0.001005     0.4563  55.71   120
## [79]  {4710085120093,                                                    
##        4710085172702} => {4710085172696} 0.001348     0.6545  75.14   161
## [80]  {4710085172696,                                                    
##        4710085172702} => {4710085120093} 0.001348     0.5552  73.82   161
## [81]  {4710085120093,                                                    
##        4710085172702} => {4710085120628} 0.001281     0.6220  54.81   153
## [82]  {4710085120628,                                                    
##        4710085172702} => {4710085120093} 0.001281     0.5204  69.20   153
## [83]  {4710085172696,                                                    
##        4710085172702} => {4710085120628} 0.001491     0.6138  54.09   178
## [84]  {4710085120628,                                                    
##        4710085172702} => {4710085172696} 0.001491     0.6054  69.51   178
## [85]  {4710011401135,                                                    
##        4710011409056} => {4710011405133} 0.001884     0.5569  70.90   225
## [86]  {4710011405133,                                                    
##        4710011409056} => {4710011401135} 0.001884     0.6338  81.46   225
## [87]  {4710011401135,                                                    
##        4710011405133} => {4710011409056} 0.001884     0.5137  81.03   225
## [88]  {4710011401135,                                                    
##        4710011409056} => {4710011406123} 0.001591     0.4703  57.42   190
## [89]  {4710011406123,                                                    
##        4710011409056} => {4710011401135} 0.001591     0.6070  78.02   190
## [90]  {4710011401135,                                                    
##        4710011406123} => {4710011409056} 0.001591     0.5220  82.34   190
## [91]  {4710011401135,                                                    
##        4710011409056} => {4710011401128} 0.002713     0.8020  58.57   324
## [92]  {4710011401128,                                                    
##        4710011409056} => {4710011401135} 0.002713     0.6125  78.72   324
## [93]  {4710011401128,                                                    
##        4710011401135} => {4710011409056} 0.002713     0.4635  73.11   324
## [94]  {4710011405133,                                                    
##        4710011409056} => {4710011406123} 0.001466     0.4930  60.19   175
## [95]  {4710011406123,                                                    
##        4710011409056} => {4710011405133} 0.001466     0.5591  71.17   175
## [96]  {4710011405133,                                                    
##        4710011406123} => {4710011409056} 0.001466     0.4630  73.03   175
## [97]  {4710011405133,                                                    
##        4710011409056} => {4710011401128} 0.002261     0.7606  55.55   270
## [98]  {4710011401128,                                                    
##        4710011409056} => {4710011405133} 0.002261     0.5104  64.97   270
## [99]  {4710011401128,                                                    
##        4710011405133} => {4710011409056} 0.002261     0.4369  68.91   270
## [100] {4710011406123,                                                    
##        4710011409056} => {4710011401128} 0.001985     0.7572  55.30   237
## [101] {4710011401128,                                                    
##        4710011409056} => {4710011406123} 0.001985     0.4480  54.70   237
## [102] {4710011401128,                                                    
##        4710011406123} => {4710011409056} 0.001985     0.4093  64.57   237
## [103] {4710085120093,                                                    
##        4710085172696} => {4710085120628} 0.002136     0.5705  50.27   255
## [104] {4710085120093,                                                    
##        4710085120628} => {4710085172696} 0.002136     0.5391  61.90   255
## [105] {4710085120628,                                                    
##        4710085172696} => {4710085120093} 0.002136     0.4904  65.21   255
## [106] {4710011401135,                                                    
##        4710011405133} => {4710011406123} 0.001625     0.4429  54.08   194
## [107] {4710011401135,                                                    
##        4710011406123} => {4710011405133} 0.001625     0.5330  67.85   194
## [108] {4710011405133,                                                    
##        4710011406123} => {4710011401135} 0.001625     0.5132  65.97   194
## [109] {4710011401135,                                                    
##        4710011405133} => {4710011401128} 0.002831     0.7717  56.36   338
## [110] {4710011401128,                                                    
##        4710011401135} => {4710011405133} 0.002831     0.4835  61.56   338
## [111] {4710011401128,                                                    
##        4710011405133} => {4710011401135} 0.002831     0.5469  70.30   338
## [112] {4710011401135,                                                    
##        4710011406123} => {4710011401128} 0.002445     0.8022  58.59   292
## [113] {4710011401128,                                                    
##        4710011401135} => {4710011406123} 0.002445     0.4177  51.00   292
## [114] {4710011401128,                                                    
##        4710011406123} => {4710011401135} 0.002445     0.5043  64.82   292
## [115] {4710011405133,                                                    
##        4710011406123} => {4710011401128} 0.002219     0.7011  51.20   265
## [116] {4710011401128,                                                    
##        4710011405133} => {4710011406123} 0.002219     0.4288  52.35   265
## [117] {4710011401128,                                                    
##        4710011406123} => {4710011405133} 0.002219     0.4577  58.26   265
## [118] {4710011401135,                                                    
##        4710011405133,                                                    
##        4710011409056} => {4710011401128} 0.001566     0.8311  60.70   187
## [119] {4710011401128,                                                    
##        4710011401135,                                                    
##        4710011409056} => {4710011405133} 0.001566     0.5772  73.47   187
## [120] {4710011401128,                                                    
##        4710011405133,                                                    
##        4710011409056} => {4710011401135} 0.001566     0.6926  89.02   187
## [121] {4710011401128,                                                    
##        4710011401135,                                                    
##        4710011405133} => {4710011409056} 0.001566     0.5533  87.27   187
## [122] {4710011401135,                                                    
##        4710011406123,                                                    
##        4710011409056} => {4710011401128} 0.001332     0.8368  61.12   159
## [123] {4710011401128,                                                    
##        4710011401135,                                                    
##        4710011409056} => {4710011406123} 0.001332     0.4907  59.92   159
## [124] {4710011401128,                                                    
##        4710011406123,                                                    
##        4710011409056} => {4710011401135} 0.001332     0.6709  86.23   159
## [125] {4710011401128,                                                    
##        4710011401135,                                                    
##        4710011406123} => {4710011409056} 0.001332     0.5445  85.89   159
## [126] {4710011405133,                                                    
##        4710011406123,                                                    
##        4710011409056} => {4710011401128} 0.001156     0.7886  57.59   138
## [127] {4710011401128,                                                    
##        4710011405133,                                                    
##        4710011409056} => {4710011406123} 0.001156     0.5111  62.40   138
## [128] {4710011401128,                                                    
##        4710011406123,                                                    
##        4710011409056} => {4710011405133} 0.001156     0.5823  74.12   138
## [129] {4710011401128,                                                    
##        4710011405133,                                                    
##        4710011406123} => {4710011409056} 0.001156     0.5208  82.14   138
## [130] {4710011401135,                                                    
##        4710011405133,                                                    
##        4710011406123} => {4710011401128} 0.001348     0.8299  60.61   161
## [131] {4710011401128,                                                    
##        4710011401135,                                                    
##        4710011405133} => {4710011406123} 0.001348     0.4763  58.16   161
## [132] {4710011401128,                                                    
##        4710011401135,                                                    
##        4710011406123} => {4710011405133} 0.001348     0.5514  70.19   161
## [133] {4710011401128,                                                    
##        4710011405133,                                                    
##        4710011406123} => {4710011401135} 0.001348     0.6075  78.09   161
# Older arulesViz releases drew these interactive charts with plotly_arules()
# instead of plot(..., engine="htmlwidget") as used below:
# install.packages(
#   "https://cran.r-project.org/bin/windows/contrib/3.5/arulesViz_1.3-1.zip",
#   repos=NULL)
# install.packages("arulesViz_1.3-1.zip", repos=NULL)
# library(plotly)
# plotly_arules(rules, colors=c("red","green"),
#               marker=list(opacity=.6, size=10))
# plotly_arules(rules, method="matrix",
#               shading="lift",
#               colors=c("red", "green"))
D5. Interactive Chart Display
plot(rules,colors=c("red","green"),engine="htmlwidget",
     marker=list(opacity=.6,size=8))
# To promote a product: if you want to sell B, this chart shows which A drives it
# It also informs shelf placement: put items side by side, or spread them apart
plot(rules,method="matrix",shading="lift",engine="htmlwidget",
     colors=c("red", "green"))
# We usually care most about the right-hand side (rhs):
# for a given B, we can see the many choices of A that lead to it
D6. Filtering Products; Interactive Association Graph
r1 = subset(rules, subset = rhs %in% c("4719090790000"))
summary(r1)
## set of 2 rules
## 
## rule length distribution (lhs + rhs):sizes
## 2 
## 2 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2 
## 
## summary of quality measures:
##     support          confidence         lift         count    
##  Min.   :0.00147   Min.   :0.577   Min.   :122   Min.   :176  
##  1st Qu.:0.00184   1st Qu.:0.635   1st Qu.:134   1st Qu.:220  
##  Median :0.00221   Median :0.693   Median :146   Median :264  
##  Mean   :0.00221   Mean   :0.693   Mean   :146   Mean   :264  
##  3rd Qu.:0.00257   3rd Qu.:0.751   3rd Qu.:159   3rd Qu.:307  
##  Max.   :0.00294   Max.   :0.809   Max.   :171   Max.   :351  
## 
## mining info:
##  data ntransactions support confidence
##    bx        119407   0.001        0.4
plot(r1,method="graph",engine="htmlwidget",itemCol="cyan") 
# use this whenever you want to know what tends to appear along with A
  • Bubble size: support, the probability of purchase (the base rate)
  • Bubble color: lift, the multiplier on B's purchase probability when A is bought, relative to B's base rate
r2 = subset(rules, subset = rhs %in% c("4710011401135"))
summary(r2)
## set of 16 rules
## 
## rule length distribution (lhs + rhs):sizes
## 2 3 4 
## 4 9 3 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    2.75    3.00    2.94    3.00    4.00 
## 
## summary of quality measures:
##     support          confidence         lift          count    
##  Min.   :0.00105   Min.   :0.427   Min.   :55.0   Min.   :125  
##  1st Qu.:0.00137   1st Qu.:0.511   1st Qu.:65.7   1st Qu.:164  
##  Median :0.00172   Median :0.594   Median :76.4   Median :206  
##  Mean   :0.00222   Mean   :0.576   Mean   :74.0   Mean   :266  
##  3rd Qu.:0.00274   3rd Qu.:0.629   3rd Qu.:80.8   3rd Qu.:328  
##  Max.   :0.00585   Max.   :0.693   Max.   :89.0   Max.   :699  
## 
## mining info:
##  data ntransactions support confidence
##    bx        119407   0.001        0.4
plot(r2,method="graph",engine="htmlwidget",itemCol="cyan") 


E. Product Recommendation

E1. Filtering Customers and Products

Rarely purchased products and customers who buy very few products are both ill-suited to a collaborative-filtering recommender, so we first filter the customers and the products

library(recommenderlab)
rx = cpm[, colSums(cpm > 0) >= 50] # keep products bought by at least 50 customers
rx = rx[rowSums(rx > 0) >= 20 & rowSums(rx > 0) <= 300, ]
# keep customers who bought at least 20 products, and drop those who bought
# more than 300: extreme buyers are probably not ordinary customers
dim(rx)  # 8846 3354
## [1] 8846 3354
E2. Choosing a Product Rating Scheme

The model can be built on either

  • purchase counts (realRatingMatrix) or
  • purchase indicators (binaryRatingMatrix).

rx = as(rx, "realRatingMatrix")  # realRatingMatrix
bx = binarize(rx, minRating=1)   # binaryRatingMatrix; binarize() turns counts into 0/1 ratings
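
A couple of sanity checks on the two rating matrices (a sketch):

rx                     # 8846 x 3354 rating matrix of class 'realRatingMatrix'
bx                     # 8846 x 3354 rating matrix of class 'binaryRatingMatrix'
hist(getRatings(rx))   # distribution of the purchase counts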
E3. Building the Model and Generating Recommendations - UBCF

UBCF: User-Based Collaborative Filtering

(rUBCF <- Recommender(bx[1:8800,], method = "UBCF"))
## Recommender of type 'UBCF' for 'binaryRatingMatrix' 
## learned using 8800 users.
# groups customers with similar purchase habits
pred = predict(rUBCF, bx[8801:8846,], n=4)
do.call(rbind, as(pred, "list")) %>% head(15)
##         [,1]            [,2]            [,3]            [,4]           
## 2170855 "4711271000014" "4710114128038" "4714981010038" "4713985863121"
## 2171265 "4719090900065" "4710254049521" "4710036008562" "4714981010038"
## 2171340 "723125488040"  "723125488064"  "723125485032"  "4714981010038"
## 2171425 "4710011401135" "4710011409056" "4711080010112" "4710011401142"
## 2171432 "4714981010038" "4710011406123" "4711258007371" "4710011401128"
## 2171555 "4719090900065" "4711271000014" "37000329169"   "4710943109352"
## 2171883 "4711271000014" "4710583996008" "4710291112172" "4710018004704"
## 2172194 "4711271000014" "4714981010038" "4710114128038" "4710114105046"
## 2172392 "4903111345717" "4710908131589" "4710168705056" "4711271000014"
## 2172569 "4711271000014" "4714981010038" "4710128030037" "4712162000038"
## 2172583 "4714981010038" "4710085120093" "4719090900065" "4710154015206"
## 2172590 "4710011406123" "4710011401142" "4710857000028" "4710011432856"
## 2172668 "4711271000014" "4710088620156" "4719090900065" "4712425010712"
## 2172705 "4711271000014" "4714981010038" "37000445111"   "37000440192"  
## 2172811 "4714981010038" "37000442127"   "4710719000333" "4710114128038"
# the remaining 46 customers serve as a test set; n=4 asks for 4 recommendations
# i.e., recommend four products to each of these customers
E4. Building the Model and Generating Recommendations - IBCF

IBCF: Item-Based Collaborative Filtering

(rIBCF <- Recommender(bx[1:8800,], method = "IBCF"))
## Recommender of type 'IBCF' for 'binaryRatingMatrix' 
## learned using 8800 users.
# uses the customers as segmentation variables to cluster the products
# assumes each product is bought by roughly the same kind of customers
  • IBCF clusters the products, assuming similar products are bought by much the same people, then works back from item similarity to recommendations; it takes longer to train, and its output sometimes resembles UBCF's
pred = predict(rIBCF, bx[8801:8846,], n=4)
do.call(rbind, as(pred, "list")) %>% head(15)
##         [,1]            [,2]            [,3]            [,4]           
## 2170855 "4719090900065" "4714981010038" "4711271000014" "4712162000038"
## 2171265 "4719090900065" "4710015103288" "4714981010038" "4711271000014"
## 2171340 "37000445111"   "4710036005608" "37000442127"   "723125485032" 
## 2171425 "4711311617899" "4711311218836" "4710011401135" "4710011409056"
## 2171432 "4714981010038" "4710321791698" "4710857000042" "4710626111252"
## 2171555 "93432641"      "93362993"      "4710105045320" "4711271000014"
## 2171883 "4710670200100" "4710670200407" "4711271000014" "3228020490329"
## 2172194 "4714108700019" "4714108700064" "4909978199111" "20332433"     
## 2172392 "4710706211759" "4710908131589" "4719090900058" "4710731040614"
## 2172569 "4711371850243" "84501297329"   "84501293529"   "4710085121007"
## 2172583 "4710085172702" "4710085120093" "34000100095"   "34000231508"  
## 2172590 "4710011406123" "4711271000014" "4711437000162" "4710011401142"
## 2172668 "4711371850243" "4719090900065" "4714981010038" "4711437000117"
## 2172705 "37000445111"   "4710018004605" "37000442127"   "37000304593"  
## 2172811 "4719581980293" "4712067899287" "4719581980279" "4710908131589"
save(rIBCF, rUBCF, file="data/recommenders.rdata")
E5. Setting Up the Model (Accuracy) Validation
set.seed(4321)
scheme = evaluationScheme(     
  bx, method="split", train = .75,  given=5)
# validation scheme: 25% of the customers are held out for testing; given=5
# means each test customer reveals five purchased items for the models to
# predict from, so customers who bought nothing cannot be tested
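
The pieces of the split can be pulled out with recommenderlab's getData() (a sketch):

getData(scheme, "train")    # the 75% of customers used for training
getData(scheme, "known")    # the 5 items revealed per test customer
getData(scheme, "unknown")  # the held-out items the models must recover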
E6. Specifying the Recommendation Methods and Their Parameters
algorithms = list(            
  AR53 = list(name="AR", param=list(support=0.0005, confidence=0.3)),
  AR43 = list(name="AR", param=list(support=0.0004, confidence=0.3)),
  RANDOM = list(name="RANDOM", param=NULL),
  POPULAR = list(name="POPULAR", param=NULL),
  UBCF = list(name="UBCF", param=NULL),
  IBCF = list(name="IBCF", param=NULL) )
E7. Modeling, Prediction, and Accuracy Validation
if(LOAD) {
  load("data/results2a.rdata")
} else {
  t0 = Sys.time()
  results = evaluate(            
    scheme, algorithms, 
    type="topNList",     # method of evaluation
    n=c(5, 10, 15, 20)   # numbers of recommendations to evaluate; with 5 items given, we test lists of 5, 10, 15, 20
    )
  print(Sys.time() - t0)
  save(results, file="data/results2a.rdata")
}
## AR run fold/sample [model time/prediction time]
##   1  [4.02sec/214.6sec] 
## AR run fold/sample [model time/prediction time]
##   1  [10.49sec/538.5sec] 
## RANDOM run fold/sample [model time/prediction time]
##   1  [0sec/9.48sec] 
## POPULAR run fold/sample [model time/prediction time]
##   1  [0sec/11.09sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0sec/75.42sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [198.2sec/1.63sec] 
## Time difference of 18.72 mins
## i.e., the whole evaluation takes roughly 19 minutes to run
E8. Comparing Model Accuracy
# load("data/results.rdata")
par(mar=c(4,4,3,2),cex=0.8)
cols = c("red", "magenta", "gray", "orange", "blue", "green")
plot(results, annotate=c(1,3), legend="topleft", pch=19, lwd=2, col=cols)
abline(v=seq(0,0.006,0.001), h=seq(0,0.08,0.01), col='lightgray', lty=2)

# poor recommendations can indirectly hurt the company
getConfusionMatrix(results$IBCF)
## [[1]]
##       TP     FP    FN   TN precision  recall     TPR      FPR
## 5  1.116  3.884 32.97 3311    0.2231 0.03899 0.03899 0.001171
## 10 1.699  8.301 32.39 3307    0.1699 0.05812 0.05812 0.002503
## 15 2.075 12.925 32.01 3302    0.1383 0.07021 0.07021 0.003898
## 20 2.385 17.615 31.70 3297    0.1193 0.08002 0.08002 0.005313
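
Beyond the ROC-style curve above, the same results can be viewed as precision/recall curves (a sketch; annotate=c(5,6) labels the UBCF and IBCF curves):

plot(results, "prec/rec", annotate=c(5,6), legend="topright", pch=19, lwd=2, col=cols)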