packages = c(
"dplyr","ggplot2","googleVis","devtools","magrittr","slam","irlba","plotly",
"arules","arulesViz","Matrix","recommenderlab")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))
LOAD = TRUE
library(dplyr)
library(ggplot2)
library(googleVis)
library(Matrix)
library(slam)
library(irlba)
library(plotly)
library(arules)
library(arulesViz)
library(recommenderlab)
Load the data frames and rename them
load("data/tf0.rdata")
A = A0; X = X0; Z = Z0; rm(A0,X0,Z0); gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2437276 130.2 3886542 207.6 3236611 172.9
Vcells 8537037 65.2 85635293 653.4 133428552 1018.0
Z = subset(Z, cust %in% A$cust)
n_distinct(Z$cust) # 32241
[1] 32241
n_distinct(Z$prod) # 23787
[1] 23787
Building the customer-product matrix is actually quick and easy
library(Matrix)
library(slam)
cpm = xtabs(~ cust + prod, Z, sparse=T) # customer product matrix
dim(cpm) # 32241 23787
[1] 32241 23787
mean(cpm > 0) # 0.0009674: density of the sparse matrix (>0 means a purchase, 0 means none)
[1] 0.0009674
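Equivalently, the density can be read straight off the sparse structure; a small sanity check (nnzero() is from the Matrix package, already loaded above):

Matrix::nnzero(cpm) / prod(dim(cpm))  # non-zero cells / total cells; should match mean(cpm > 0)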
A customer-product matrix is usually very sparse; some products are hardly bought by anyone.
# share of products bought more than ten times
colSums(cpm) %>% quantile(seq(0,1,0.1))
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
1 1 2 4 6 8 13 20 35 76 8475
mean(colSums(cpm) > 10)
[1] 0.4484
Remove products purchased fewer than 6 times, then remove customers who purchased no products.
cpm = cpm[, colSums(cpm) >= 6] # remove the least frequent products
# cpm = cpm[rowSums(cpm) > 0, ] # remove non-buying customers (skipped, so the rows still align with A for binding later)
cpm = cpm[, order(-colSums(cpm))] # order products by purchase frequency, descending
dim(cpm) # 32241 14621 (products: 23787 -> 14621)
[1] 32241 14621
# the number of customers is unchanged, since we only removed rarely-bought products
max(cpm) # 49
[1] 49
mean(cpm > 0) # 0.0015248
[1] 0.001525
table(cpm@x) %>% prop.table %>% round(4) %>% head(10)
1 2 3 4 5 6 7 8 9 10
0.9256 0.0579 0.0108 0.0032 0.0012 0.0006 0.0003 0.0002 0.0001 0.0001
Exercise: use a single command to list the 10 most frequently purchased products and their purchase counts.
cpm[,1:10] %>% colSums
4714981010038 4711271000014 4719090900065 4711080010112 4710114128038
8475 6119 2444 2249 2178
4710265849066 4713985863121 4710088410139 4710583996008 4710908131589
2017 1976 1869 1840 1679
■ Under what assumptions can we use the purchase counts of these ten products as variables to predict whether a customer will come back to buy in the next period?
■ How do we merge these ten variables into the customer data frame?
■ Can we (and under what conditions can we) merge the new variables into the customer data frame directly with cbind()? (See the sketch below.)
■ Does our midterm competition data satisfy the conditions for merging new variables directly with cbind()? How do we verify this?
dim(cpm)
[1] 32241 14621
sum(A$cust == rownames(cpm) ) # rownames: customers, colnames: products
[1] 32241
# do the customer ids match?
#A = cbind(A,as.matrix(cpm[,1:20]))
#A = merge(A,cpm[,1:20])
#A = cbind(svd$u,as.matrix(cpm[,1:20]))
#A = A[,1:9]
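Putting the pieces together: a minimal sketch of binding the top-20 product counts into the customer data frame. It is safe only because the id check above returned 32241, i.e. A$cust and rownames(cpm) line up row for row; A20 is a hypothetical name:

stopifnot(all(A$cust == rownames(cpm)))  # identical row order is the precondition for cbind()
A20 = cbind(A, as.matrix(cpm[, 1:20]))   # the 20 most-bought products become 20 new variables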
When we build (customer) variables from product purchase frequencies, the leading N columns of cpm (already sorted by frequency) are the variables!
nop= 400 # no. product = no. variables
k = 200 # no. cluster
set.seed(111); kg = kmeans(cpm[,1:nop], k)$cluster
table(kg) %>% as.vector %>% sort
[1] 1 1 1 1 1 1 1 1 1 1 1
[12] 1 1 1 1 1 1 1 1 1 1 1
[23] 2 2 2 2 2 2 3 3 3 3 3
[34] 3 3 4 4 4 4 4 4 4 5 5
[45] 6 6 6 6 7 7 7 8 8 8 9
[56] 9 9 9 10 10 10 10 11 11 11 11
[67] 11 12 13 13 15 15 15 16 16 18 19
[78] 20 20 20 20 21 22 22 22 24 24 25
[89] 25 27 28 28 32 32 35 36 39 40 41
[100] 42 44 45 46 47 47 48 49 49 50 51
[111] 52 53 56 58 58 61 63 66 67 68 69
[122] 69 72 81 85 85 86 87 90 94 96 97
[133] 97 100 100 101 110 111 113 114 116 118 123
[144] 123 126 130 134 136 141 141 142 143 162 165
[155] 172 175 178 179 182 182 184 187 195 210 222
[166] 225 228 228 237 239 242 253 254 258 258 266
[177] 268 272 287 293 301 311 325 329 350 351 363
[188] 396 407 407 410 418 432 448 473 523 561 1156
[199] 1266 11215
Merge the clustering result into the customer data frame (A)
df = A %>% inner_join(data.frame(
cust = as.integer(rownames(cpm)),
kg) )
Joining, by = "cust"
head(df) # 32241
Compute the average attributes of each cluster
df = data.frame(
aggregate(. ~ kg, df[,c(2:7,10)], mean), # averages
size = as.vector(table(kg)), # no. customers in the group
dummy = 2001 # dummy column for googleVis
)
head(df)
plot( gvisMotionChart(
subset(df[,c(1,4,5,6,8,2,3,7,9)],
size >= 20 & size <= 1000), # range of group size
"kg", "dummy", options=list(width=800, height=600) ) )
# use global variables: cpm, kg
Sig = function(gx, P=1000, H=10) {
print(sprintf("Group %d: No. Customers = %d", gx, sum(kg==gx)))
bx = cpm[,1:P]
data.frame(n = col_sums(bx[kg==gx,])) %>% # frequency
mutate(
share = round(100*n/col_sums(bx),2), # %prod sold to this cluster
conf = round(100*n/sum(kg==gx),2), # %buy this product, given cluster
base = round(100*col_sums(bx)/nrow(bx),2), # %buy this product, all cust
lift = round(conf/base,1), # conf/base
name = colnames(bx) # name of prod
) %>% arrange(desc(lift)) %>% head(H)
}
Sig(130)
[1] "Group 130: No. Customers = 97"
■ Under what assumptions can we use the feature vectors of customers' product purchases as variables to predict whether a customer will come back to buy in the next period?
■ If we can, how do we merge these feature vectors into the customer data frame? (A sketch follows the SVD computation below.)
■ Can we (and under what conditions can we) merge the feature vectors into the customer data frame directly with cbind()?
■ Does our midterm competition data satisfy the conditions for merging feature vectors directly with cbind()? How do we verify this?
library(irlba)
if(LOAD) {
load("data/svd2a.rdata")
} else {
smx = cpm
smx@x = pmin(smx@x, 2) # cap counts at 2, a crude normalization: keeps the product weights comparable
t0 = Sys.time()
svd = irlba(smx,
nv=400, # length of feature vector
maxit=800, work=800)
print(Sys.time() - t0) # 1.8795 mins
save(svd, file = "data/svd2a.rdata")
}
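Analogous to the product-count variables above, a minimal sketch of binding the first few latent features into A, assuming svd$u preserves cpm's row order (so the earlier id check still applies); Af and the column names are made up:

stopifnot(all(A$cust == rownames(cpm)))   # same alignment precondition as before
U = data.frame(svd$u[, 1:10])             # first 10 latent customer features
names(U) = sprintf("f%02d", 1:10)         # hypothetical feature names
Af = cbind(A, U)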
set.seed(111); kg = kmeans(svd$u, 200)$cluster
table(kg) %>% as.vector %>% sort
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[15] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[29] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[43] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[57] 1 2 2 2 2 2 3 4 4 5 7 10 14 30
[71] 31 32 36 38 38 39 39 40 40 41 44 45 46 47
[85] 49 54 59 62 62 69 71 77 79 79 80 82 82 84
[99] 87 91 101 103 109 110 111 113 117 120 123 127 127 129
[113] 132 133 134 135 136 139 141 143 143 147 147 157 159 159
[127] 160 160 160 166 168 169 172 175 180 181 181 182 183 184
[141] 184 188 190 190 193 194 195 196 198 198 200 201 201 202
[155] 202 204 204 204 207 209 209 210 213 214 216 219 219 222
[169] 225 233 234 235 236 237 237 238 239 241 248 248 248 253
[183] 256 257 258 259 261 261 264 269 277 281 285 293 305 411
[197] 612 896 1092 8987
# cluster summary
df = inner_join(A, data.frame(
cust = as.integer(rownames(cpm)), kg)) %>%
group_by(kg) %>% summarise(
avg_frequency = mean(f),
avg_monetary = mean(m),
avg_revenue_contr = mean(rev),
group_size = n(),
avg_recency = mean(r),
avg_gross_profit = mean(raw)) %>%
ungroup %>%
mutate(dummy = 2001, kg = sprintf("G%03d",kg)) %>%
data.frame
Joining, by = "cust"
# Google Motion Chart
plot( gvisMotionChart(
subset(df, group_size >= 20 & group_size <= 1200),
"kg", "dummy", options=list(width=800, height=600) ) )
Sig(162)
[1] "Group 162: No. Customers = 87"
dim(cpm) # 32241 14621
[1] 32241 14621
library(arules)
library(arulesViz)
# bx = subset(Z, prod %in% as.numeric(colnames(cpm)),  # keep only products that survived the cpm filter (bought >= 6 times)
#             select=c("cust","prod"))                 # select product items
bx = Z
bx = split(bx$prod, bx$tid) # split by transaction id: one basket (list element) of products per transaction
bx = as(bx, "transactions") # data structure for arules package
# Alternative: the same two steps in one line
bx = as(split(Z$prod, Z$tid), "transactions")
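A quick sanity check on the transactions object (dim() and inspect() are standard arules accessors):

dim(bx)          # transactions x items: 119407 23787
inspect(bx[1:2]) # print the first two baskets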
itemFrequencyPlot(bx, topN=20, type="absolute", cex=0.8)
Association rules (A => B)
rules = apriori(bx, parameter=list(supp=0.001, conf=0.4)) # the lower the thresholds, the more rules are found
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen
0.4 0.1 1 none FALSE TRUE 5 0.001 1
maxlen target ext
10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 119
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[23787 item(s), 119407 transaction(s)] done [0.12s].
sorting and recoding items ... [1452 item(s)] done [0.01s].
creating transaction tree ... done [0.05s].
checking subsets of size 1 2 3 4 done [0.04s].
writing ... [133 rule(s)] done [0.00s].
creating S4 object ... done [0.01s].
summary(rules)
set of 133 rules
rule length distribution (lhs + rhs):sizes
2 3 4
66 51 16
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 2.00 3.00 2.62 3.00 4.00
summary of quality measures:
support confidence lift count
Min. :0.00100 Min. :0.403 Min. : 31.5 Min. :120
1st Qu.:0.00135 1st Qu.:0.467 1st Qu.: 57.1 1st Qu.:161
Median :0.00162 Median :0.536 Median : 68.9 Median :194
Mean :0.00203 Mean :0.556 Mean : 90.8 Mean :243
3rd Qu.:0.00245 3rd Qu.:0.614 3rd Qu.: 89.8 3rd Qu.:292
Max. :0.00585 Max. :0.837 Max. :301.4 Max. :699
mining info:
data ntransactions support confidence
bx 119407 0.001 0.4
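Before inspecting all 133 rules, it may help to look at the strongest ones first; sorting by a quality measure is part of arules:

inspect(sort(rules, by = "lift")[1:5]) # the five rules with the highest lift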
Association rules (A => B):
options(digits=4)
inspect(rules)
lhs rhs support confidence lift count
[1] {4710030346110} => {4710030346103} 0.001005 0.6486 301.37 120
[2] {4710030346103} => {4710030346110} 0.001005 0.4669 301.37 120
[3] {4710030346097} => {4710030346059} 0.001097 0.5822 211.96 131
[4] {4710030346103} => {4710030346059} 0.001407 0.6537 237.98 168
[5] {4710030346059} => {4710030346103} 0.001407 0.5122 237.98 168
[6] {4716114000312} => {4716114000329} 0.001273 0.5527 231.58 152
[7] {4716114000329} => {4716114000312} 0.001273 0.5333 231.58 152
[8] {4719090701051} => {4719090790017} 0.001097 0.4295 118.17 131
[9] {4719090701051} => {4719090790000} 0.001474 0.5770 121.95 176
[10] {4711524000419} => {4711524000396} 0.001382 0.6790 219.13 165
[11] {4711524000396} => {4711524000419} 0.001382 0.4459 219.13 165
[12] {4711524000495} => {4711524000396} 0.001038 0.5000 161.36 124
[13] {4719090106016} => {4719090106009} 0.001239 0.4790 131.17 148
[14] {4710321861209} => {4710321871260} 0.001449 0.4701 151.71 173
[15] {4710321871260} => {4710321861209} 0.001449 0.4676 151.71 173
[16] {4710321861209} => {4710321861186} 0.001876 0.6087 137.92 224
[17] {4710321861186} => {4710321861209} 0.001876 0.4250 137.92 224
[18] {4710321871260} => {4710321861186} 0.001675 0.5405 122.47 200
[19] {4711524000907} => {4711524000891} 0.001290 0.5620 191.20 154
[20] {4711524000891} => {4711524000907} 0.001290 0.4387 191.20 154
[21] {4711524000907} => {4711524001041} 0.001491 0.6496 194.41 178
[22] {4711524001041} => {4711524000907} 0.001491 0.4461 194.41 178
[23] {4719090790017} => {4719090790000} 0.002940 0.8088 170.92 351
[24] {4719090790000} => {4719090790017} 0.002940 0.6212 170.92 351
[25] {719859796124} => {719859796117} 0.002102 0.6972 147.61 251
[26] {719859796117} => {719859796124} 0.002102 0.4450 147.61 251
[27] {4710762101018} => {4710762101025} 0.001566 0.4484 112.49 187
[28] {4710857000059} => {4710011401128} 0.001072 0.4310 31.47 128
[29] {4710466103073} => {4710466103080} 0.001332 0.4892 78.84 159
[30] {4711524000891} => {4711524001041} 0.001759 0.5983 179.05 210
[31] {4711524001041} => {4711524000891} 0.001759 0.5263 179.05 210
[32] {4711856000088} => {4711856000125} 0.002479 0.4625 61.50 296
[33] {4710011402026} => {4710011402019} 0.002822 0.6740 90.22 337
[34] {4711856020215} => {4711856000125} 0.002169 0.4368 58.08 259
[35] {4710085120697} => {4710085120680} 0.003467 0.7753 100.41 414
[36] {4710085120680} => {4710085120697} 0.003467 0.4490 100.41 414
[37] {4711856020208} => {4711856000125} 0.002127 0.4084 54.30 254
[38] {4710011401142} => {4710011409056} 0.001591 0.4308 67.96 190
[39] {4710011401142} => {4710011401135} 0.001826 0.4943 63.54 218
[40] {4710011401142} => {4710011405133} 0.001801 0.4875 62.06 215
[41] {4710011401142} => {4710011406123} 0.001524 0.4127 50.39 182
[42] {4710011401142} => {4710011401128} 0.002203 0.5964 43.55 263
[43] {4710088414328} => {4710088414311} 0.001792 0.4672 86.37 214
[44] {4710085172702} => {4710085120093} 0.002060 0.4581 60.91 246
[45] {4710085172702} => {4710085172696} 0.002429 0.5400 62.00 290
[46] {4710085172702} => {4710085120628} 0.002462 0.5475 48.25 294
[47] {4710254049323} => {4710254049521} 0.002010 0.4309 55.56 240
[48] {4710085120710} => {4710085120703} 0.002914 0.5613 89.84 348
[49] {4710085120703} => {4710085120710} 0.002914 0.4665 89.84 348
[50] {4710018004704} => {4710018004605} 0.002990 0.5360 46.08 357
[51] {4710011409056} => {4710011401135} 0.003383 0.5337 68.60 404
[52] {4710011401135} => {4710011409056} 0.003383 0.4349 68.60 404
[53] {4710011409056} => {4710011405133} 0.002973 0.4690 59.70 355
[54] {4710011409056} => {4710011406123} 0.002621 0.4135 50.48 313
[55] {4710011409056} => {4710011401128} 0.004430 0.6988 51.04 529
[56] {4710085120093} => {4710085172696} 0.003743 0.4978 57.15 447
[57] {4710085172696} => {4710085120093} 0.003743 0.4298 57.15 447
[58] {4710085120093} => {4710085120628} 0.003961 0.5267 46.42 473
[59] {4710011401135} => {4710011405133} 0.003668 0.4715 60.02 438
[60] {4710011405133} => {4710011401135} 0.003668 0.4670 60.02 438
[61] {4710011401135} => {4710011401128} 0.005854 0.7524 54.95 699
[62] {4710011401128} => {4710011401135} 0.005854 0.4275 54.95 699
[63] {4710085172696} => {4710085120628} 0.004355 0.5000 44.06 520
[64] {4710011405133} => {4710011406123} 0.003166 0.4030 49.20 378
[65] {4710011405133} => {4710011401128} 0.005176 0.6588 48.12 618
[66] {4710011406123} => {4710011401128} 0.004849 0.5920 43.24 579
[67] {4710011401142,
4710011409056} => {4710011401135} 0.001097 0.6895 88.62 131
[68] {4710011401135,
4710011401142} => {4710011409056} 0.001097 0.6009 94.79 131
[69] {4710011401142,
4710011409056} => {4710011401128} 0.001181 0.7421 54.20 141
[70] {4710011401128,
4710011401142} => {4710011409056} 0.001181 0.5361 84.57 141
[71] {4710011401135,
4710011401142} => {4710011405133} 0.001047 0.5734 72.99 125
[72] {4710011401142,
4710011405133} => {4710011401135} 0.001047 0.5814 74.73 125
[73] {4710011401135,
4710011401142} => {4710011401128} 0.001382 0.7569 55.28 165
[74] {4710011401128,
4710011401142} => {4710011401135} 0.001382 0.6274 80.64 165
[75] {4710011401142,
4710011405133} => {4710011401128} 0.001231 0.6837 49.93 147
[76] {4710011401128,
4710011401142} => {4710011405133} 0.001231 0.5589 71.15 147
[77] {4710011401142,
4710011406123} => {4710011401128} 0.001005 0.6593 48.15 120
[78] {4710011401128,
4710011401142} => {4710011406123} 0.001005 0.4563 55.71 120
[79] {4710085120093,
4710085172702} => {4710085172696} 0.001348 0.6545 75.14 161
[80] {4710085172696,
4710085172702} => {4710085120093} 0.001348 0.5552 73.82 161
[81] {4710085120093,
4710085172702} => {4710085120628} 0.001281 0.6220 54.81 153
[82] {4710085120628,
4710085172702} => {4710085120093} 0.001281 0.5204 69.20 153
[83] {4710085172696,
4710085172702} => {4710085120628} 0.001491 0.6138 54.09 178
[84] {4710085120628,
4710085172702} => {4710085172696} 0.001491 0.6054 69.51 178
[85] {4710011401135,
4710011409056} => {4710011405133} 0.001884 0.5569 70.90 225
[86] {4710011405133,
4710011409056} => {4710011401135} 0.001884 0.6338 81.46 225
[87] {4710011401135,
4710011405133} => {4710011409056} 0.001884 0.5137 81.03 225
[88] {4710011401135,
4710011409056} => {4710011406123} 0.001591 0.4703 57.42 190
[89] {4710011406123,
4710011409056} => {4710011401135} 0.001591 0.6070 78.02 190
[90] {4710011401135,
4710011406123} => {4710011409056} 0.001591 0.5220 82.34 190
[91] {4710011401135,
4710011409056} => {4710011401128} 0.002713 0.8020 58.57 324
[92] {4710011401128,
4710011409056} => {4710011401135} 0.002713 0.6125 78.72 324
[93] {4710011401128,
4710011401135} => {4710011409056} 0.002713 0.4635 73.11 324
[94] {4710011405133,
4710011409056} => {4710011406123} 0.001466 0.4930 60.19 175
[95] {4710011406123,
4710011409056} => {4710011405133} 0.001466 0.5591 71.17 175
[96] {4710011405133,
4710011406123} => {4710011409056} 0.001466 0.4630 73.03 175
[97] {4710011405133,
4710011409056} => {4710011401128} 0.002261 0.7606 55.55 270
[98] {4710011401128,
4710011409056} => {4710011405133} 0.002261 0.5104 64.97 270
[99] {4710011401128,
4710011405133} => {4710011409056} 0.002261 0.4369 68.91 270
[100] {4710011406123,
4710011409056} => {4710011401128} 0.001985 0.7572 55.30 237
[101] {4710011401128,
4710011409056} => {4710011406123} 0.001985 0.4480 54.70 237
[102] {4710011401128,
4710011406123} => {4710011409056} 0.001985 0.4093 64.57 237
[103] {4710085120093,
4710085172696} => {4710085120628} 0.002136 0.5705 50.27 255
[104] {4710085120093,
4710085120628} => {4710085172696} 0.002136 0.5391 61.90 255
[105] {4710085120628,
4710085172696} => {4710085120093} 0.002136 0.4904 65.21 255
[106] {4710011401135,
4710011405133} => {4710011406123} 0.001625 0.4429 54.08 194
[107] {4710011401135,
4710011406123} => {4710011405133} 0.001625 0.5330 67.85 194
[108] {4710011405133,
4710011406123} => {4710011401135} 0.001625 0.5132 65.97 194
[109] {4710011401135,
4710011405133} => {4710011401128} 0.002831 0.7717 56.36 338
[110] {4710011401128,
4710011401135} => {4710011405133} 0.002831 0.4835 61.56 338
[111] {4710011401128,
4710011405133} => {4710011401135} 0.002831 0.5469 70.30 338
[112] {4710011401135,
4710011406123} => {4710011401128} 0.002445 0.8022 58.59 292
[113] {4710011401128,
4710011401135} => {4710011406123} 0.002445 0.4177 51.00 292
[114] {4710011401128,
4710011406123} => {4710011401135} 0.002445 0.5043 64.82 292
[115] {4710011405133,
4710011406123} => {4710011401128} 0.002219 0.7011 51.20 265
[116] {4710011401128,
4710011405133} => {4710011406123} 0.002219 0.4288 52.35 265
[117] {4710011401128,
4710011406123} => {4710011405133} 0.002219 0.4577 58.26 265
[118] {4710011401135,
4710011405133,
4710011409056} => {4710011401128} 0.001566 0.8311 60.70 187
[119] {4710011401128,
4710011401135,
4710011409056} => {4710011405133} 0.001566 0.5772 73.47 187
[120] {4710011401128,
4710011405133,
4710011409056} => {4710011401135} 0.001566 0.6926 89.02 187
[121] {4710011401128,
4710011401135,
4710011405133} => {4710011409056} 0.001566 0.5533 87.27 187
[122] {4710011401135,
4710011406123,
4710011409056} => {4710011401128} 0.001332 0.8368 61.12 159
[123] {4710011401128,
4710011401135,
4710011409056} => {4710011406123} 0.001332 0.4907 59.92 159
[124] {4710011401128,
4710011406123,
4710011409056} => {4710011401135} 0.001332 0.6709 86.23 159
[125] {4710011401128,
4710011401135,
4710011406123} => {4710011409056} 0.001332 0.5445 85.89 159
[126] {4710011405133,
4710011406123,
4710011409056} => {4710011401128} 0.001156 0.7886 57.59 138
[127] {4710011401128,
4710011405133,
4710011409056} => {4710011406123} 0.001156 0.5111 62.40 138
[128] {4710011401128,
4710011406123,
4710011409056} => {4710011405133} 0.001156 0.5823 74.12 138
[129] {4710011401128,
4710011405133,
4710011406123} => {4710011409056} 0.001156 0.5208 82.14 138
[130] {4710011401135,
4710011405133,
4710011406123} => {4710011401128} 0.001348 0.8299 60.61 161
[131] {4710011401128,
4710011401135,
4710011405133} => {4710011406123} 0.001348 0.4763 58.16 161
[132] {4710011401128,
4710011401135,
4710011406123} => {4710011405133} 0.001348 0.5514 70.19 161
[133] {4710011401128,
4710011405133,
4710011406123} => {4710011401135} 0.001348 0.6075 78.09 161
# A: left-hand side, B: right-hand side -- buying A predicts buying B
# count:      number of transactions containing both A and B
# support:    fraction of all transactions containing both A and B
# confidence: probability of buying B, given that A is bought
# lift:       how many times more likely B is bought when A is bought, relative to B's baseline rate
# install.packages(
# "https://cran.r-project.org/bin/windows/contrib/3.5/arulesViz_1.3-1.zip",
# repos=NULL)
# install.packages("arulesViz_1.3-1.zip", repos=NULL)
# library(plotly)
# plotly_arules(rules,colors=c("red","green"),
# marker=list(opacity=.6,size=10))
# plotly_arules(rules,method="matrix",
# shading="lift",
# colors=c("red", "green"))
#
plot(rules,colors=c("red","green"),engine="htmlwidget",
marker=list(opacity=.6,size=8))
# the upper-right corner is best: high support and high confidence
plot(rules,method="matrix",shading="lift",engine="htmlwidget",
colors=c("red", "green"))
# conditioning on a right-hand side, we can see how many left-hand sides lead to it
r1 = subset(rules, subset = rhs %in% c("4710011409056"))
summary(r1)
set of 13 rules
rule length distribution (lhs + rhs):sizes
2 3 4
2 8 3
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 3.00 3.00 3.08 3.00 4.00
summary of quality measures:
support confidence lift count
Min. :0.00110 Min. :0.409 Min. :64.6 Min. :131
1st Qu.:0.00133 1st Qu.:0.437 1st Qu.:68.9 1st Qu.:159
Median :0.00159 Median :0.514 Median :81.0 Median :190
Mean :0.00178 Mean :0.495 Mean :78.0 Mean :213
3rd Qu.:0.00198 3rd Qu.:0.536 3rd Qu.:84.6 3rd Qu.:237
Max. :0.00338 Max. :0.601 Max. :94.8 Max. :404
mining info:
data ntransactions support confidence
bx 119407 0.001 0.4
plot(r1,method="graph",engine="htmlwidget",itemCol="cyan")
r2 = subset(rules, subset = rhs %in% c("4710011401135"))
summary(r2)
set of 16 rules
rule length distribution (lhs + rhs):sizes
2 3 4
4 9 3
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 2.75 3.00 2.94 3.00 4.00
summary of quality measures:
support confidence lift count
Min. :0.00105 Min. :0.427 Min. :55.0 Min. :125
1st Qu.:0.00137 1st Qu.:0.511 1st Qu.:65.7 1st Qu.:164
Median :0.00172 Median :0.594 Median :76.4 Median :206
Mean :0.00222 Mean :0.576 Mean :74.0 Mean :266
3rd Qu.:0.00274 3rd Qu.:0.629 3rd Qu.:80.8 3rd Qu.:328
Max. :0.00585 Max. :0.693 Max. :89.0 Max. :699
mining info:
data ntransactions support confidence
bx 119407 0.001 0.4
plot(r2,method="graph",engine="htmlwidget",itemCol="cyan")
Products that are bought too rarely, and customers who buy too few products, are both poor candidates for Collaborative Filtering as a recommendation method, so we first filter the customers and the products.
library(recommenderlab)
rx = cpm[, colSums(cpm > 0) >= 50] # keep products bought by at least 50 customers; rarely-bought products are hard to recommend
rx = rx[rowSums(rx > 0) >= 20 & rowSums(rx > 0) <= 300, ] # keep customers buying 20-300 distinct products; above 300 they may be resellers buying wholesale
dim(rx) # 8846 3354
[1] 8846 3354
We can choose to build the model on either a realRatingMatrix (purchase counts) or a binaryRatingMatrix (bought / not bought).
rx = as(rx, "realRatingMatrix") # realRatingMatrix
bx = binarize(rx, minRating=1) # binaryRatingMatrix: any purchase (count >= 1) becomes a 1
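As a sanity check that the filters took effect, recommenderlab's rowCounts()/colCounts() summarize the binary matrix:

summary(rowCounts(bx)) # distinct products per customer: should lie in [20, 300]
summary(colCounts(bx)) # customers per product: should be >= 50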
UBCF: User-Based Collaborative Filtering
(rUBCF <- Recommender(bx[1:8800,], method = "UBCF"))
Recommender of type 'UBCF' for 'binaryRatingMatrix'
learned using 8800 users.
# fit the model on the first 8800 users
pred = predict(rUBCF, bx[8801:8846,], n=4) # users 8801:8846 are the test data; recommend n=4 products to each
do.call(rbind, as(pred, "list")) %>% head(15) # the predictions come back as a list
[,1] [,2] [,3] [,4]
2170855 "4711271000014" "4710114128038" "4714981010038" "4713985863121"
2171265 "4719090900065" "4710254049521" "4710036008562" "4714981010038"
2171340 "723125488040" "723125488064" "723125485032" "4714981010038"
2171425 "4710011401135" "4710011409056" "4711080010112" "4710011401142"
2171432 "4714981010038" "4710011406123" "4711258007371" "4710011401128"
2171555 "4719090900065" "4711271000014" "37000329169" "4710943109352"
2171883 "4711271000014" "4710583996008" "4710291112172" "4710018004704"
2172194 "4711271000014" "4714981010038" "4710114128038" "4710114105046"
2172392 "4903111345717" "4710908131589" "4710168705056" "4711271000014"
2172569 "4711271000014" "4714981010038" "4710128030037" "4712162000038"
2172583 "4714981010038" "4710085120093" "4719090900065" "4710154015206"
2172590 "4710011406123" "4710011401142" "4710857000028" "4710011432856"
2172668 "4711271000014" "4710088620156" "4719090900065" "4712425010712"
2172705 "4711271000014" "4714981010038" "37000445111" "37000440192"
2172811 "4714981010038" "37000442127" "4710719000333" "4710114128038"
UBCF (user-based collaborative filtering) groups together customers with similar purchase habits. IBCF (item-based collaborative filtering) instead groups the items first (clusters the products), using the users as the segmenting variables, on the assumption that each product tends to be bought by largely the same set of people.
IBCF: Item-Based Collaborative Filtering
(rIBCF <- Recommender(bx[1:8800,], method = "IBCF"))
Recommender of type 'IBCF' for 'binaryRatingMatrix'
learned using 8800 users.
pred = predict(rIBCF, bx[8801:8846,], n=4)
do.call(rbind, as(pred, "list")) %>% head(15)
[,1] [,2] [,3] [,4]
2170855 "4719090900065" "4714981010038" "4711271000014" "4712162000038"
2171265 "4719090900065" "4710015103288" "4714981010038" "4711271000014"
2171340 "37000445111" "4710036005608" "37000442127" "723125485032"
2171425 "4711311617899" "4711311218836" "4710011401135" "4710011409056"
2171432 "4714981010038" "4710321791698" "4710857000042" "4710626111252"
2171555 "93432641" "93362993" "4710105045320" "4711271000014"
2171883 "4710670200100" "4710670200407" "4711271000014" "3228020490329"
2172194 "4714108700019" "4714108700064" "4909978199111" "20332433"
2172392 "4710706211759" "4710908131589" "4719090900058" "4710731040614"
2172569 "4711371850243" "84501297329" "84501293529" "4710085121007"
2172583 "4710085172702" "4710085120093" "34000100095" "34000231508"
2172590 "4710011406123" "4711271000014" "4711437000162" "4710011401142"
2172668 "4711371850243" "4719090900065" "4714981010038" "4711437000117"
2172705 "37000445111" "4710018004605" "37000442127" "37000304593"
2172811 "4719581980293" "4712067899287" "4719581980279" "4710908131589"
save(rIBCF, rUBCF, file="data/recommenders.rdata")
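The saved models can be reloaded later and applied to any users in the same binary matrix; a hypothetical reuse sketch:

load("data/recommenders.rdata")         # restores rIBCF and rUBCF
pred = predict(rUBCF, bx[1:3, ], n = 4) # top-4 recommendations for three users
as(pred, "list")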
set.seed(4321)
scheme = evaluationScheme(
bx, method="split", train = .75, given=5)
# split the users into train/test; at prediction time each test user reveals 5 items (given=5)
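What the split produced can be examined with getData(), part of recommenderlab's evaluation API:

getData(scheme, "train")   # 75% of the users, for fitting
getData(scheme, "known")   # the 5 given items per test user
getData(scheme, "unknown") # the held-out items the models must recover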
algorithms = list(
  AR53 = list(name="AR", param=list(support=0.0005, confidence=0.3)), # association rules (market basket)
  AR43 = list(name="AR", param=list(support=0.0004, confidence=0.3)),
  RANDOM = list(name="RANDOM", param=NULL),   # random guesses: the naive baseline
  POPULAR = list(name="POPULAR", param=NULL), # smart baseline: recommend whatever is popular
  UBCF = list(name="UBCF", param=NULL),       # param=NULL means use the default parameters
  IBCF = list(name="IBCF", param=NULL) )
if(LOAD) {
load("data/results2a.rdata")
} else {
t0 = Sys.time()
results = evaluate(
scheme, algorithms,
type="topNList", # method of evaluation
n=c(5, 10, 15, 20) # no. of recommendations to evaluate: top-5, 10, 15, and 20 lists
)
print(Sys.time() - t0)
save(results, file="data/results2a.rdata")
}
## AR run fold/sample [model time/prediction time]
## 1 [4.02sec/214.6sec]
## AR run fold/sample [model time/prediction time]
## 1 [10.49sec/538.5sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/9.48sec]
## POPULAR run fold/sample [model time/prediction time]
## 1 [0sec/11.09sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0sec/75.42sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [198.2sec/1.63sec]
## Time difference of 18.72 mins
# load("data/results.rdata")
par(mar=c(4,4,3,2),cex=0.8)
cols = c("red", "magenta", "gray", "orange", "blue", "green")
plot(results, annotate=c(1,3), legend="topleft", pch=19, lwd=2, col=cols)
abline(v=seq(0,0.006,0.001), h=seq(0,0.08,0.01), col='lightgray', lty=2)
# higher TPR is better, lower FPR is better
getConfusionMatrix(results$IBCF)
[[1]]
TP FP FN TN precision recall TPR FPR
5 1.116 3.884 32.97 3311 0.2231 0.03899 0.03899 0.001171
10 1.699 8.301 32.39 3307 0.1699 0.05812 0.05812 0.002503
15 2.075 12.925 32.01 3302 0.1383 0.07021 0.07021 0.003898
20 2.385 17.615 31.70 3297 0.1193 0.08002 0.08002 0.005313
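The same confusion-matrix numbers can be averaged for every algorithm at once, which makes a numeric comparison easier than reading the plot; avg() is a recommenderlab function:

lapply(results, avg) # one averaged confusion matrix per algorithm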