packages = c(
"dplyr","ggplot2","googleVis","devtools","magrittr","slam","irlba","plotly",
"arules","arulesViz","Matrix","recommenderlab")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)rm(list=ls(all=TRUE))
LOAD = TRUE
library(dplyr)
library(ggplot2)
library(googleVis)
library(Matrix)
library(slam)
library(irlba)
library(plotly)
library(arules)
library(arulesViz)
library(recommenderlab)Load data frame and rename
load("data/tf0.rdata")
A = A0; X = X0; Z = Z0; rm(A0,X0,Z0); gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2439683 130.3 3886542 207.6 3205452 171.2
## Vcells 9473576 72.3 13593784 103.8 9523388 72.7
Z = subset(Z, cust %in% A$cust)n_distinct(Z$cust) # 32241## [1] 32241
n_distinct(Z$prod) # 23787## [1] 23787
製作顧客產品矩陣其實很快、也很容易
library(Matrix)
library(slam)
cpm = xtabs(~ cust + prod, Z, sparse=T) # customer product matrix
dim(cpm) # 32241 23787## [1] 32241 23787
mean(cpm > 0) # 0.00096799 # 有資料的密度## [1] 0.0009674258
# cpm為0或1,矩陣的密度(有資料的密度)0.00096799顧客產品矩陣通常是一個很稀疏的矩,陣有一些產品沒什麼人買
colSums(cpm) %>% quantile(seq(0,1,0.1))## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 1 2 4 6 8 13 20 35 76 8475
mean(colSums(cpm) > 10) #每一個產品被買超過10次 mean在邏輯運算中是指TRUE的比率## [1] 0.4483541
刪去購買次數小於6的產品,然後刪去沒有購買產品的顧客
cpm = cpm[, colSums(cpm) >= 6] # remove the least frequent products
# cpm = cpm[rowSums(cpm) > 0, ] # remove non-buying customers
cpm = cpm[, order(-colSums(cpm))] # order product by frequency
# 以產品購買次數降冪排列
dim(cpm) # 32241 23787>14621## [1] 32241 14621
max(cpm) # 49## [1] 49
mean(cpm > 0) # 0.0015248## [1] 0.001524785
table(cpm@x) %>% prop.table %>% round(4) %>% head(10)##
## 1 2 3 4 5 6 7 8 9 10
## 0.9256 0.0579 0.0108 0.0032 0.0012 0.0006 0.0003 0.0002 0.0001 0.0001
請你用一個指令列出被購買最多次的10個產品,和它們被購買的次數。
colSums(cpm[,1:10]) ## 4714981010038 4711271000014 4719090900065 4711080010112 4710114128038
## 8475 6119 2444 2249 2178
## 4710265849066 4713985863121 4710088410139 4710583996008 4710908131589
## 2017 1976 1869 1840 1679
cbind()新變數併入顧客資料框呢?cbind()併入新變數的條件嗎? 我們要如何確認這一件事呢?# 需確認cpm裡面顧客的排列是和A裡的cust排列是一樣的
dim(cpm)## [1] 32241 14621
# sum(A$cust == rownames(cpm)) # cpm裡顧客的排列跟A的cust的排列是一樣的 有多少個?# A = cbind(A , as.matrix(cpm[,1:20])) # 併入20個變數
# cpm是一個稀疏矩陣 所以要併入的話 先轉成正常矩陣
以產品的被購買頻率製作(顧客)變數的時候,排cpm在最前邊的(N個)欄位就是變數!
nop= 400 # no. product = no. variables
k = 200 # no. cluster
set.seed(111); kg = kmeans(cpm[,1:nop], k)$cluster
table(kg) %>% as.vector %>% sort## [1] 1 1 1 1 1 1 1 1 1 1 1
## [12] 1 1 1 1 1 1 1 1 1 1 1
## [23] 2 2 2 2 2 2 3 3 3 3 3
## [34] 3 3 4 4 4 4 4 4 4 5 5
## [45] 6 6 6 6 7 7 7 8 8 8 9
## [56] 9 9 9 10 10 10 10 11 11 11 11
## [67] 11 12 13 13 15 15 15 16 16 18 19
## [78] 20 20 20 20 21 22 22 22 24 24 25
## [89] 25 27 28 28 32 32 35 36 39 40 41
## [100] 42 44 45 46 47 47 48 49 49 50 51
## [111] 52 53 56 58 58 61 63 66 67 68 69
## [122] 69 72 81 85 85 86 87 90 94 96 97
## [133] 97 100 100 101 110 111 113 114 116 118 123
## [144] 123 126 130 134 136 141 141 142 143 162 165
## [155] 172 175 178 179 182 182 184 187 195 210 222
## [166] 225 228 228 237 239 242 253 254 258 258 266
## [177] 268 272 287 293 301 311 325 329 350 351 363
## [188] 396 407 407 410 418 432 448 473 523 561 1156
## [199] 1266 11215
將分群結果併入顧客資料框(A)
df = A %>% inner_join(data.frame(
cust = as.integer(rownames(cpm)),
kg) )## Joining, by = "cust"
head(df) # 32241## cust r s f m rev raw age area kg
## 1 1069 19 108 4 486.0 1944 15 K E 196
## 2 1113 54 109 4 557.5 2230 241 K F 81
## 3 1250 19 25 2 791.5 1583 354 D D 169
## 4 1359 87 87 1 364.0 364 104 K G 49
## 5 1823 36 119 3 869.0 2607 498 K D 109
## 6 2189 57 89 2 7028.0 14056 3299 K B 158
計算各群組的平均屬性
df = data.frame(
aggregate(. ~ kg, df[,c(2:7,10)], mean), # averages
size = as.vector(table(kg)), # no. customers in the group
dummy = 2001 # dummy column for googleViz
)
head(df)## kg r s f m rev raw size
## 1 1 10.772727 113.54545 13.318182 872.7878 10697.909 1733.8182 22
## 2 2 43.934959 95.96748 3.455285 1252.9938 3819.837 533.0407 123
## 3 3 5.384615 111.15385 14.307692 1325.2611 12617.769 1951.8462 13
## 4 4 8.000000 119.00000 24.000000 955.6250 22935.000 3658.0000 1
## 5 5 6.666667 116.66667 20.333333 579.9135 10741.444 1554.2222 9
## 6 6 2.500000 114.00000 23.500000 411.9918 9775.000 1324.5000 2
## dummy
## 1 2001
## 2 2001
## 3 2001
## 4 2001
## 5 2001
## 6 2001
plot( gvisMotionChart(
subset(df[,c(1,4,5,6,8,2,3,7,9)],
size >= 20 & size <= 1000), # range of group size
"kg", "dummy", options=list(width=800, height=600) ) )## starting httpd help server ... done
# use global variables: cpm, kg
Sig = function(gx, P=1000, H=10) {
print(sprintf("Group %d: No. Customers = %d", gx, sum(kg==gx)))
bx = cpm[,1:P]
data.frame(n = col_sums(bx[kg==gx,])) %>% # frequency
mutate(
share = round(100*n/col_sums(bx),2), # %prod sold to this cluster
conf = round(100*n/sum(kg==gx),2), # %buy this product, given cluster
base = round(100*col_sums(bx)/nrow(bx),2), # %buy this product, all cust
lift = round(conf/base,1), # conf/base
name = colnames(bx) # name of prod
) %>% arrange(desc(lift)) %>% head(H)
}Sig(130)## [1] "Group 130: No. Customers = 97"
## Warning: package 'bindrcpp' was built under R version 3.4.4
## n share conf base lift name
## 1 29 6.65 29.90 1.35 22.1 4719090106009
## 2 19 6.62 19.59 0.89 22.0 4710632001172
## 3 12 5.38 12.37 0.69 17.9 4710093080044
## 4 389 4.59 401.03 26.29 15.3 4714981010038
## 5 20 4.59 20.62 1.35 15.3 4719090105002
## 6 14 4.53 14.43 0.96 15.0 4719090106016
## 7 8 4.40 8.25 0.56 14.7 9310022862601
## 8 10 4.31 10.31 0.72 14.3 4710189820851
## 9 9 4.25 9.28 0.66 14.1 4710088434258
## 10 15 4.19 15.46 1.11 13.9 4710189851282
library(irlba)
if(LOAD) {
load("data/svd2a.rdata")
} else {
smx = cpm
smx@x = pmin(smx@x, 2) # cap at 2, similar to normalization
# 現在要對稀疏矩陣做常態化 再做SVD
t0 = Sys.time()
svd = irlba(smx,
nv=400, # length of feature vector
maxit=800, work=800)
print(Sys.time() - t0) # 1.8795 mins #印出來跑的時間多久
save(svd, file = "data/svd2a.rdata")
}set.seed(111); kg = kmeans(svd$u, 200)$cluster
table(kg) %>% as.vector %>% sort## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [15] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [29] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [43] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [57] 1 2 2 2 2 2 3 4 4 5 7 10 14 30
## [71] 31 32 36 38 38 39 39 40 40 41 44 45 46 47
## [85] 49 54 59 62 62 69 71 77 79 79 80 82 82 84
## [99] 87 91 101 103 109 110 111 113 117 120 123 127 127 129
## [113] 132 133 134 135 136 139 141 143 143 147 147 157 159 159
## [127] 160 160 160 166 168 169 172 175 180 181 181 182 183 184
## [141] 184 188 190 190 193 194 195 196 198 198 200 201 201 202
## [155] 202 204 204 204 207 209 209 210 213 214 216 219 219 222
## [169] 225 233 234 235 236 237 237 238 239 241 248 248 248 253
## [183] 256 257 258 259 261 261 264 269 277 281 285 293 305 411
## [197] 612 896 1092 8987
# clustster summary
df = inner_join(A, data.frame(
cust = as.integer(rownames(cpm)), kg)) %>%
group_by(kg) %>% summarise(
avg_frequency = mean(f),
avg_monetary = mean(m),
avg_revenue_contr = mean(rev),
group_size = n(),
avg_recency = mean(r),
avg_gross_profit = mean(raw)) %>%
ungroup %>%
mutate(dummy = 2001, kg = sprintf("G%03d",kg)) %>%
data.frame## Joining, by = "cust"
# Google Motion Chart
plot( gvisMotionChart(
subset(df, group_size >= 20 & group_size <= 1200),
"kg", "dummy", options=list(width=800, height=600) ) )Sig(162)## [1] "Group 162: No. Customers = 87"
## n share conf base lift name
## 1 165 38.64 189.66 1.32 143.7 4710105045313
## 2 133 35.37 152.87 1.17 130.7 4710105045320
## 3 59 13.50 67.82 1.36 49.9 4710105045450
## 4 19 9.60 21.84 0.61 35.8 4710105051147
## 5 35 8.31 40.23 1.31 30.7 4710105045443
## 6 9 5.33 10.34 0.52 19.9 4710088432650
## 7 8 4.91 9.20 0.51 18.0 4710105051185
## 8 8 4.19 9.20 0.59 15.6 4710171028326
## 9 7 3.89 8.05 0.56 14.4 4710058137059
## 10 7 3.70 8.05 0.59 13.6 4710010020061
dim(cpm) # 32241 14621 #被買超過五次(>=6)的商品有14621## [1] 32241 14621
library(arules)
library(arulesViz)
# bx = subset(Z, prod %in% as.numeric(colnames(cpm)),
# 被購買超過五次的產品名稱的集合
# 濾掉一些購買次數太少的產品
# select=c("cust","prod")) # select product items 選rows 哪一筆資料要進來
# bx = Z
# bx = split(bx$prod, bx$tid) # split by transcation id
# bx = as(bx, "transactions") # data structure for arules package
bx = as(split(Z$prod, Z$tid), "transactions")itemFrequencyPlot(bx, topN=20, type="absolute", cex=0.8)關聯規則(A => B)
rules = apriori(bx, parameter=list(supp=0.001, conf=0.4))## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 119
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23787 item(s), 119407 transaction(s)] done [0.10s].
## sorting and recoding items ... [1452 item(s)] done [0.01s].
## creating transaction tree ... done [0.04s].
## checking subsets of size 1 2 3 4 done [0.04s].
## writing ... [133 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
summary(rules)## set of 133 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 66 51 16
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 3.000 2.624 3.000 4.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.001005 Min. :0.4030 Min. : 31.47 Min. :120
## 1st Qu.:0.001348 1st Qu.:0.4670 1st Qu.: 57.15 1st Qu.:161
## Median :0.001625 Median :0.5360 Median : 68.91 Median :194
## Mean :0.002035 Mean :0.5562 Mean : 90.79 Mean :243
## 3rd Qu.:0.002445 3rd Qu.:0.6138 3rd Qu.: 89.84 3rd Qu.:292
## Max. :0.005854 Max. :0.8368 Max. :301.37 Max. :699
##
## mining info:
## data ntransactions support confidence
## bx 119407 0.001 0.4
關聯規則 (A => B):
options(digits=4)
inspect(rules)## lhs rhs support confidence lift count
## [1] {4710030346110} => {4710030346103} 0.001005 0.6486 301.37 120
## [2] {4710030346103} => {4710030346110} 0.001005 0.4669 301.37 120
## [3] {4710030346097} => {4710030346059} 0.001097 0.5822 211.96 131
## [4] {4710030346103} => {4710030346059} 0.001407 0.6537 237.98 168
## [5] {4710030346059} => {4710030346103} 0.001407 0.5122 237.98 168
## [6] {4716114000312} => {4716114000329} 0.001273 0.5527 231.58 152
## [7] {4716114000329} => {4716114000312} 0.001273 0.5333 231.58 152
## [8] {4719090701051} => {4719090790017} 0.001097 0.4295 118.17 131
## [9] {4719090701051} => {4719090790000} 0.001474 0.5770 121.95 176
## [10] {4711524000419} => {4711524000396} 0.001382 0.6790 219.13 165
## [11] {4711524000396} => {4711524000419} 0.001382 0.4459 219.13 165
## [12] {4711524000495} => {4711524000396} 0.001038 0.5000 161.36 124
## [13] {4719090106016} => {4719090106009} 0.001239 0.4790 131.17 148
## [14] {4710321861209} => {4710321871260} 0.001449 0.4701 151.71 173
## [15] {4710321871260} => {4710321861209} 0.001449 0.4676 151.71 173
## [16] {4710321861209} => {4710321861186} 0.001876 0.6087 137.92 224
## [17] {4710321861186} => {4710321861209} 0.001876 0.4250 137.92 224
## [18] {4710321871260} => {4710321861186} 0.001675 0.5405 122.47 200
## [19] {4711524000907} => {4711524000891} 0.001290 0.5620 191.20 154
## [20] {4711524000891} => {4711524000907} 0.001290 0.4387 191.20 154
## [21] {4711524000907} => {4711524001041} 0.001491 0.6496 194.41 178
## [22] {4711524001041} => {4711524000907} 0.001491 0.4461 194.41 178
## [23] {4719090790017} => {4719090790000} 0.002940 0.8088 170.92 351
## [24] {4719090790000} => {4719090790017} 0.002940 0.6212 170.92 351
## [25] {719859796124} => {719859796117} 0.002102 0.6972 147.61 251
## [26] {719859796117} => {719859796124} 0.002102 0.4450 147.61 251
## [27] {4710762101018} => {4710762101025} 0.001566 0.4484 112.49 187
## [28] {4710857000059} => {4710011401128} 0.001072 0.4310 31.47 128
## [29] {4710466103073} => {4710466103080} 0.001332 0.4892 78.84 159
## [30] {4711524000891} => {4711524001041} 0.001759 0.5983 179.05 210
## [31] {4711524001041} => {4711524000891} 0.001759 0.5263 179.05 210
## [32] {4711856000088} => {4711856000125} 0.002479 0.4625 61.50 296
## [33] {4710011402026} => {4710011402019} 0.002822 0.6740 90.22 337
## [34] {4711856020215} => {4711856000125} 0.002169 0.4368 58.08 259
## [35] {4710085120697} => {4710085120680} 0.003467 0.7753 100.41 414
## [36] {4710085120680} => {4710085120697} 0.003467 0.4490 100.41 414
## [37] {4711856020208} => {4711856000125} 0.002127 0.4084 54.30 254
## [38] {4710011401142} => {4710011409056} 0.001591 0.4308 67.96 190
## [39] {4710011401142} => {4710011401135} 0.001826 0.4943 63.54 218
## [40] {4710011401142} => {4710011405133} 0.001801 0.4875 62.06 215
## [41] {4710011401142} => {4710011406123} 0.001524 0.4127 50.39 182
## [42] {4710011401142} => {4710011401128} 0.002203 0.5964 43.55 263
## [43] {4710088414328} => {4710088414311} 0.001792 0.4672 86.37 214
## [44] {4710085172702} => {4710085120093} 0.002060 0.4581 60.91 246
## [45] {4710085172702} => {4710085172696} 0.002429 0.5400 62.00 290
## [46] {4710085172702} => {4710085120628} 0.002462 0.5475 48.25 294
## [47] {4710254049323} => {4710254049521} 0.002010 0.4309 55.56 240
## [48] {4710085120710} => {4710085120703} 0.002914 0.5613 89.84 348
## [49] {4710085120703} => {4710085120710} 0.002914 0.4665 89.84 348
## [50] {4710018004704} => {4710018004605} 0.002990 0.5360 46.08 357
## [51] {4710011409056} => {4710011401135} 0.003383 0.5337 68.60 404
## [52] {4710011401135} => {4710011409056} 0.003383 0.4349 68.60 404
## [53] {4710011409056} => {4710011405133} 0.002973 0.4690 59.70 355
## [54] {4710011409056} => {4710011406123} 0.002621 0.4135 50.48 313
## [55] {4710011409056} => {4710011401128} 0.004430 0.6988 51.04 529
## [56] {4710085120093} => {4710085172696} 0.003743 0.4978 57.15 447
## [57] {4710085172696} => {4710085120093} 0.003743 0.4298 57.15 447
## [58] {4710085120093} => {4710085120628} 0.003961 0.5267 46.42 473
## [59] {4710011401135} => {4710011405133} 0.003668 0.4715 60.02 438
## [60] {4710011405133} => {4710011401135} 0.003668 0.4670 60.02 438
## [61] {4710011401135} => {4710011401128} 0.005854 0.7524 54.95 699
## [62] {4710011401128} => {4710011401135} 0.005854 0.4275 54.95 699
## [63] {4710085172696} => {4710085120628} 0.004355 0.5000 44.06 520
## [64] {4710011405133} => {4710011406123} 0.003166 0.4030 49.20 378
## [65] {4710011405133} => {4710011401128} 0.005176 0.6588 48.12 618
## [66] {4710011406123} => {4710011401128} 0.004849 0.5920 43.24 579
## [67] {4710011401142,
## 4710011409056} => {4710011401135} 0.001097 0.6895 88.62 131
## [68] {4710011401135,
## 4710011401142} => {4710011409056} 0.001097 0.6009 94.79 131
## [69] {4710011401142,
## 4710011409056} => {4710011401128} 0.001181 0.7421 54.20 141
## [70] {4710011401128,
## 4710011401142} => {4710011409056} 0.001181 0.5361 84.57 141
## [71] {4710011401135,
## 4710011401142} => {4710011405133} 0.001047 0.5734 72.99 125
## [72] {4710011401142,
## 4710011405133} => {4710011401135} 0.001047 0.5814 74.73 125
## [73] {4710011401135,
## 4710011401142} => {4710011401128} 0.001382 0.7569 55.28 165
## [74] {4710011401128,
## 4710011401142} => {4710011401135} 0.001382 0.6274 80.64 165
## [75] {4710011401142,
## 4710011405133} => {4710011401128} 0.001231 0.6837 49.93 147
## [76] {4710011401128,
## 4710011401142} => {4710011405133} 0.001231 0.5589 71.15 147
## [77] {4710011401142,
## 4710011406123} => {4710011401128} 0.001005 0.6593 48.15 120
## [78] {4710011401128,
## 4710011401142} => {4710011406123} 0.001005 0.4563 55.71 120
## [79] {4710085120093,
## 4710085172702} => {4710085172696} 0.001348 0.6545 75.14 161
## [80] {4710085172696,
## 4710085172702} => {4710085120093} 0.001348 0.5552 73.82 161
## [81] {4710085120093,
## 4710085172702} => {4710085120628} 0.001281 0.6220 54.81 153
## [82] {4710085120628,
## 4710085172702} => {4710085120093} 0.001281 0.5204 69.20 153
## [83] {4710085172696,
## 4710085172702} => {4710085120628} 0.001491 0.6138 54.09 178
## [84] {4710085120628,
## 4710085172702} => {4710085172696} 0.001491 0.6054 69.51 178
## [85] {4710011401135,
## 4710011409056} => {4710011405133} 0.001884 0.5569 70.90 225
## [86] {4710011405133,
## 4710011409056} => {4710011401135} 0.001884 0.6338 81.46 225
## [87] {4710011401135,
## 4710011405133} => {4710011409056} 0.001884 0.5137 81.03 225
## [88] {4710011401135,
## 4710011409056} => {4710011406123} 0.001591 0.4703 57.42 190
## [89] {4710011406123,
## 4710011409056} => {4710011401135} 0.001591 0.6070 78.02 190
## [90] {4710011401135,
## 4710011406123} => {4710011409056} 0.001591 0.5220 82.34 190
## [91] {4710011401135,
## 4710011409056} => {4710011401128} 0.002713 0.8020 58.57 324
## [92] {4710011401128,
## 4710011409056} => {4710011401135} 0.002713 0.6125 78.72 324
## [93] {4710011401128,
## 4710011401135} => {4710011409056} 0.002713 0.4635 73.11 324
## [94] {4710011405133,
## 4710011409056} => {4710011406123} 0.001466 0.4930 60.19 175
## [95] {4710011406123,
## 4710011409056} => {4710011405133} 0.001466 0.5591 71.17 175
## [96] {4710011405133,
## 4710011406123} => {4710011409056} 0.001466 0.4630 73.03 175
## [97] {4710011405133,
## 4710011409056} => {4710011401128} 0.002261 0.7606 55.55 270
## [98] {4710011401128,
## 4710011409056} => {4710011405133} 0.002261 0.5104 64.97 270
## [99] {4710011401128,
## 4710011405133} => {4710011409056} 0.002261 0.4369 68.91 270
## [100] {4710011406123,
## 4710011409056} => {4710011401128} 0.001985 0.7572 55.30 237
## [101] {4710011401128,
## 4710011409056} => {4710011406123} 0.001985 0.4480 54.70 237
## [102] {4710011401128,
## 4710011406123} => {4710011409056} 0.001985 0.4093 64.57 237
## [103] {4710085120093,
## 4710085172696} => {4710085120628} 0.002136 0.5705 50.27 255
## [104] {4710085120093,
## 4710085120628} => {4710085172696} 0.002136 0.5391 61.90 255
## [105] {4710085120628,
## 4710085172696} => {4710085120093} 0.002136 0.4904 65.21 255
## [106] {4710011401135,
## 4710011405133} => {4710011406123} 0.001625 0.4429 54.08 194
## [107] {4710011401135,
## 4710011406123} => {4710011405133} 0.001625 0.5330 67.85 194
## [108] {4710011405133,
## 4710011406123} => {4710011401135} 0.001625 0.5132 65.97 194
## [109] {4710011401135,
## 4710011405133} => {4710011401128} 0.002831 0.7717 56.36 338
## [110] {4710011401128,
## 4710011401135} => {4710011405133} 0.002831 0.4835 61.56 338
## [111] {4710011401128,
## 4710011405133} => {4710011401135} 0.002831 0.5469 70.30 338
## [112] {4710011401135,
## 4710011406123} => {4710011401128} 0.002445 0.8022 58.59 292
## [113] {4710011401128,
## 4710011401135} => {4710011406123} 0.002445 0.4177 51.00 292
## [114] {4710011401128,
## 4710011406123} => {4710011401135} 0.002445 0.5043 64.82 292
## [115] {4710011405133,
## 4710011406123} => {4710011401128} 0.002219 0.7011 51.20 265
## [116] {4710011401128,
## 4710011405133} => {4710011406123} 0.002219 0.4288 52.35 265
## [117] {4710011401128,
## 4710011406123} => {4710011405133} 0.002219 0.4577 58.26 265
## [118] {4710011401135,
## 4710011405133,
## 4710011409056} => {4710011401128} 0.001566 0.8311 60.70 187
## [119] {4710011401128,
## 4710011401135,
## 4710011409056} => {4710011405133} 0.001566 0.5772 73.47 187
## [120] {4710011401128,
## 4710011405133,
## 4710011409056} => {4710011401135} 0.001566 0.6926 89.02 187
## [121] {4710011401128,
## 4710011401135,
## 4710011405133} => {4710011409056} 0.001566 0.5533 87.27 187
## [122] {4710011401135,
## 4710011406123,
## 4710011409056} => {4710011401128} 0.001332 0.8368 61.12 159
## [123] {4710011401128,
## 4710011401135,
## 4710011409056} => {4710011406123} 0.001332 0.4907 59.92 159
## [124] {4710011401128,
## 4710011406123,
## 4710011409056} => {4710011401135} 0.001332 0.6709 86.23 159
## [125] {4710011401128,
## 4710011401135,
## 4710011406123} => {4710011409056} 0.001332 0.5445 85.89 159
## [126] {4710011405133,
## 4710011406123,
## 4710011409056} => {4710011401128} 0.001156 0.7886 57.59 138
## [127] {4710011401128,
## 4710011405133,
## 4710011409056} => {4710011406123} 0.001156 0.5111 62.40 138
## [128] {4710011401128,
## 4710011406123,
## 4710011409056} => {4710011405133} 0.001156 0.5823 74.12 138
## [129] {4710011401128,
## 4710011405133,
## 4710011406123} => {4710011409056} 0.001156 0.5208 82.14 138
## [130] {4710011401135,
## 4710011405133,
## 4710011406123} => {4710011401128} 0.001348 0.8299 60.61 161
## [131] {4710011401128,
## 4710011401135,
## 4710011405133} => {4710011406123} 0.001348 0.4763 58.16 161
## [132] {4710011401128,
## 4710011401135,
## 4710011406123} => {4710011405133} 0.001348 0.5514 70.19 161
## [133] {4710011401128,
## 4710011405133,
## 4710011406123} => {4710011401135} 0.001348 0.6075 78.09 161
# install.packages(
# "https://cran.r-project.org/bin/windows/contrib/3.5/arulesViz_1.3-1.zip",
# repos=NULL)
# install.packages("arulesViz_1.3-1.zip", repos=NULL)
# library(plotly)
# plotly_arules(rules,colors=c("red","green"),
# marker=list(opacity=.6,size=10))
# plotly_arules(rules,method="matrix",
# shading="lift",
# colors=c("red", "green"))
# plot(rules,colors=c("red","green"),engine="htmlwidget",
marker=list(opacity=.6,size=8))# 想要促銷某項產品,想銷售B所以要賣A,可以從這張圖看出
# 架位擺設方面,擺在旁邊或擺開一點plot(rules,method="matrix",shading="lift",engine="htmlwidget",
colors=c("red", "green"))# 通常比較關心righthandside
# 有很多選擇可以看到若要賣B,可以有哪些A的選擇r1 = subset(rules, subset = rhs %in% c("4719090790000"))
summary(r1)## set of 2 rules
##
## rule length distribution (lhs + rhs):sizes
## 2
## 2
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
##
## summary of quality measures:
## support confidence lift count
## Min. :0.00147 Min. :0.577 Min. :122 Min. :176
## 1st Qu.:0.00184 1st Qu.:0.635 1st Qu.:134 1st Qu.:220
## Median :0.00221 Median :0.693 Median :146 Median :264
## Mean :0.00221 Mean :0.693 Mean :146 Mean :264
## 3rd Qu.:0.00257 3rd Qu.:0.751 3rd Qu.:159 3rd Qu.:307
## Max. :0.00294 Max. :0.809 Max. :171 Max. :351
##
## mining info:
## data ntransactions support confidence
## bx 119407 0.001 0.4
plot(r1,method="graph",engine="htmlwidget",itemCol="cyan") # 只要想知道A出現時什麼會跟著出現,就可以用r2 = subset(rules, subset = rhs %in% c("4710011401135"))
summary(r2)## set of 16 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 4 9 3
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 2.75 3.00 2.94 3.00 4.00
##
## summary of quality measures:
## support confidence lift count
## Min. :0.00105 Min. :0.427 Min. :55.0 Min. :125
## 1st Qu.:0.00137 1st Qu.:0.511 1st Qu.:65.7 1st Qu.:164
## Median :0.00172 Median :0.594 Median :76.4 Median :206
## Mean :0.00222 Mean :0.576 Mean :74.0 Mean :266
## 3rd Qu.:0.00274 3rd Qu.:0.629 3rd Qu.:80.8 3rd Qu.:328
## Max. :0.00585 Max. :0.693 Max. :89.0 Max. :699
##
## mining info:
## data ntransactions support confidence
## bx 119407 0.001 0.4
plot(r2,method="graph",engine="htmlwidget",itemCol="cyan") 太少被購買的產品和購買太少產品的顧客都不適合使用Collaborative Filtering這種產品推薦方法,所以我們先對顧客和產品做一次篩選
library(recommenderlab)
rx = cpm[, colSums(cpm > 0) >= 50] # 選有50個人以上的
rx = rx[rowSums(rx > 0) >= 20 & rowSums(rx > 0) <= 300, ]
# 買超過20樣產品的,還有買太多也不要,非正常顧客
dim(rx) # 8846 3354## [1] 8846 3354
可以選擇要用
做模型。
rx = as(rx, "realRatingMatrix") # realRatingMatrix
bx = binarize(rx, minRating=1) # binaryRatingMatrix binarize()變成一個評分方式UBCF:User Based Collaborative Filtering
(rUBCF <- Recommender(bx[1:8800,], method = "UBCF"))## Recommender of type 'UBCF' for 'binaryRatingMatrix'
## learned using 8800 users.
# 將購買習慣相似的顧客分成一群pred = predict(rUBCF, bx[8801:8846,], n=4)
do.call(rbind, as(pred, "list")) %>% head(15)## [,1] [,2] [,3] [,4]
## 2170855 "4711271000014" "4710114128038" "4714981010038" "4713985863121"
## 2171265 "4719090900065" "4710254049521" "4710036008562" "4714981010038"
## 2171340 "723125488040" "723125488064" "723125485032" "4714981010038"
## 2171425 "4710011401135" "4710011409056" "4711080010112" "4710011401142"
## 2171432 "4714981010038" "4710011406123" "4711258007371" "4710011401128"
## 2171555 "4719090900065" "4711271000014" "37000329169" "4710943109352"
## 2171883 "4711271000014" "4710583996008" "4710291112172" "4710018004704"
## 2172194 "4711271000014" "4714981010038" "4710114128038" "4710114105046"
## 2172392 "4903111345717" "4710908131589" "4710168705056" "4711271000014"
## 2172569 "4711271000014" "4714981010038" "4710128030037" "4712162000038"
## 2172583 "4714981010038" "4710085120093" "4719090900065" "4710154015206"
## 2172590 "4710011406123" "4710011401142" "4710857000028" "4710011432856"
## 2172668 "4711271000014" "4710088620156" "4719090900065" "4712425010712"
## 2172705 "4711271000014" "4714981010038" "37000445111" "37000440192"
## 2172811 "4714981010038" "37000442127" "4710719000333" "4710114128038"
# 46可以拿來做測試,n=4推薦4件產品
# 開始推薦一個顧客四樣產品IBCF:Item Based Collaborative Filtering
(rIBCF <- Recommender(bx[1:8800,], method = "IBCF"))## Recommender of type 'IBCF' for 'binaryRatingMatrix'
## learned using 8800 users.
# 使用者當區隔變數 對產品分群
# 假設每一項產品是差不多哪一群差不多的人在買pred = predict(rIBCF, bx[8801:8846,], n=4)
do.call(rbind, as(pred, "list")) %>% head(15)## [,1] [,2] [,3] [,4]
## 2170855 "4719090900065" "4714981010038" "4711271000014" "4712162000038"
## 2171265 "4719090900065" "4710015103288" "4714981010038" "4711271000014"
## 2171340 "37000445111" "4710036005608" "37000442127" "723125485032"
## 2171425 "4711311617899" "4711311218836" "4710011401135" "4710011409056"
## 2171432 "4714981010038" "4710321791698" "4710857000042" "4710626111252"
## 2171555 "93432641" "93362993" "4710105045320" "4711271000014"
## 2171883 "4710670200100" "4710670200407" "4711271000014" "3228020490329"
## 2172194 "4714108700019" "4714108700064" "4909978199111" "20332433"
## 2172392 "4710706211759" "4710908131589" "4719090900058" "4710731040614"
## 2172569 "4711371850243" "84501297329" "84501293529" "4710085121007"
## 2172583 "4710085172702" "4710085120093" "34000100095" "34000231508"
## 2172590 "4710011406123" "4711271000014" "4711437000162" "4710011401142"
## 2172668 "4711371850243" "4719090900065" "4714981010038" "4711437000117"
## 2172705 "37000445111" "4710018004605" "37000442127" "37000304593"
## 2172811 "4719581980293" "4712067899287" "4719581980279" "4710908131589"
save(rIBCF, rUBCF, file="data/recommenders.rdata")set.seed(4321)
scheme = evaluationScheme(
bx, method="split", train = .75, given=5)
# 設定驗證方式,given=5我留下0.25test裡要給我五件產品才可以猜,不能完全沒買的algorithms = list(
AR53 = list(name="AR", param=list(support=0.0005, confidence=0.3)),
AR43 = list(name="AR", param=list(support=0.0004, confidence=0.3)),
RANDOM = list(name="RANDOM", param=NULL),
POPULAR = list(name="POPULAR", param=NULL),
UBCF = list(name="UBCF", param=NULL),
IBCF = list(name="IBCF", param=NULL) )if(LOAD) {
load("data/results2a.rdata")
} else {
t0 = Sys.time()
results = evaluate(
scheme, algorithms,
type="topNList", # method of evaluation
n=c(5, 10, 15, 20) # no. recom. to be evaluated 剛剛給五個產品,所以要猜5、10、15、20
)
print(Sys.time() - t0)
save(results, file="data/results2a.rdata")
}
## AR run fold/sample [model time/prediction time]
## 1 [4.02sec/214.6sec]
## AR run fold/sample [model time/prediction time]
## 1 [10.49sec/538.5sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/9.48sec]
## POPULAR run fold/sample [model time/prediction time]
## 1 [0sec/11.09sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0sec/75.42sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [198.2sec/1.63sec]
## Time difference of 18.72 mins
## 這些程式跑完需要15.16分鐘# load("data/results.rdata")
par(mar=c(4,4,3,2),cex=0.8)
cols = c("red", "magenta", "gray", "orange", "blue", "green")
plot(results, annotate=c(1,3), legend="topleft", pch=19, lwd=2, col=cols)
abline(v=seq(0,0.006,0.001), h=seq(0,0.08,0.01), col='lightgray', lty=2)# 若是推薦不好,間接對公司有殺傷力getConfusionMatrix(results$IBCF)## [[1]]
## TP FP FN TN precision recall TPR FPR
## 5 1.116 3.884 32.97 3311 0.2231 0.03899 0.03899 0.001171
## 10 1.699 8.301 32.39 3307 0.1699 0.05812 0.05812 0.002503
## 15 2.075 12.925 32.01 3302 0.1383 0.07021 0.07021 0.003898
## 20 2.385 17.615 31.70 3297 0.1193 0.08002 0.08002 0.005313