packages = c(
"dplyr","ggplot2","googleVis","devtools","magrittr","slam","irlba","plotly",
"arules","arulesViz","Matrix","recommenderlab")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

rm(list=ls(all=TRUE))
LOAD = FALSE
library(dplyr)
library(ggplot2)
library(googleVis)
library(Matrix)
library(slam)
library(irlba)
library(plotly)
library(arules)
library(arulesViz)
library(recommenderlab)

load("data/tf0.rdata")
A = A0; X = X0; Z = Z0; rm(A0,X0,Z0); gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 2801293 149.7 4752805 253.9 4752805 253.9
## Vcells 10622267 81.1 14899527 113.7 10672654 81.5
k = 8; set.seed(777)
A$group = kmeans(scale(A[,2:6]), k)$cluster; table(A$group)
##
## 1 2 3 4 5 6 7 8
## 2371 2825 6789 4252 9758 317 4300 1629
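The choice of k = 8 here is a judgment call; a common sanity check is an elbow plot of the within-group sum of squares over a range of k. The sketch below is not part of the original analysis:

# Sketch: elbow plot to sanity-check the number of clusters.
wss = sapply(2:12, function(k) {
  set.seed(777)
  kmeans(scale(A[,2:6]), k)$tot.withinss
})
plot(2:12, wss, type="b", xlab="k", ylab="total within-group sum of squares")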
df = group_by(A, group) %>% summarise(
avg_frequency = mean(f),
avg_monetary = mean(m),
total_revenue = sum(rev),
group_size = n(),
avg_recency = mean(r),
profit = sum(raw)
) %>% ungroup %>%
mutate(
pc_revenue = round(100*total_revenue/sum(total_revenue),1), # % revenue contrib.
pc_profit = round(100*profit/sum(profit),1), # % profit contrib.
    dummy = 2001 # to fit googleVis's data format
  ) %>% data.frame

head(df)
##   group avg_frequency avg_monetary total_revenue group_size avg_recency
## 1 1 1.630114 2224.0645 8192064 2371 29.684521
## 2 2 11.861593 984.1008 27447494 2825 9.084602
## 3 3 1.471056 776.7725 7522130 6789 90.874797
## 4 4 1.518109 626.1522 3988792 4252 13.433678
## 5 5 4.502972 735.5990 29183322 9758 17.784280
## 6 6 33.492114 890.7913 8065546 317 3.425868
## profit pc_revenue pc_profit dummy
## 1 1391812 8.1 8.9 2001
## 2 4244727 27.0 27.2 2001
## 3 1134525 7.4 7.3 2001
## 4 579186 3.9 3.7 2001
## 5 4206232 28.7 26.9 2001
## 6 1227735 7.9 7.9 2001
plot( gvisMotionChart(
df, "group", "dummy",
options=list(width=800, height=600)
))
## starting httpd help server ... done
【QUIZ】 If we change the X and Y axes to log(avg_monetary) and log(avg_recency) respectively, then based on what the chart shows …
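To try the quiz, you can add the log-scaled columns yourself before redrawing the chart. A minimal sketch; the column names below are my own, not from the original:

# Sketch: add log-scaled columns so the motion chart can plot them directly.
df$log_avg_monetary = log(df$avg_monetary)
df$log_avg_recency  = log(df$avg_recency)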
n_distinct(Z$cust)   # 32256
## [1] 32256
n_distinct(Z$prod)   # 23789
## [1] 23789
Before doing matrix operations, I usually load these two packages:
library(Matrix)
library(slam)

Building the customer-product matrix is actually quick and easy:
mx = xtabs(~ cust + prod, Z, sparse=T)

The customer-product matrix is usually very sparse:
mean(mx > 0)
## [1] 0.0009679946
Some products are bought by hardly anyone:
table(colSums(mx) < 10)   # 12588
##
## FALSE TRUE
## 11201 12588
Remove products purchased 10 times or fewer, then drop customers who bought none of the remaining products:
mx = mx[, colSums(mx) > 10]   # drop rarely purchased products
mx = mx[rowSums(mx) > 0, ]    # drop customers with no remaining purchases
dim(mx)   # cust: 32256 -> 32066; prod: 23789 -> 10675
## [1] 32066 10675
Check the distribution of values inside the matrix:
max(mx)   # 49
## [1] 49
table(mx@x) %>% prop.table %>% round(4) %>% head(10)
##
## 1 2 3 4 5 6 7 8 9 10
## 0.9235 0.0594 0.0112 0.0033 0.0013 0.0006 0.0003 0.0002 0.0001 0.0001
【QUIZ】 If mx[i, j] = 3, this means …
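As a hint, each cell of mx counts the transaction records in Z for one customer-product pair, and you can inspect it directly. A quick sketch:

# Sketch: peek at a corner of the matrix and count one customer's distinct products.
mx[1:3, 1:5]
sum(mx[1, ] > 0)   # number of distinct products the first customer ever bought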
稀疏矩陣有很多種格式,不同的工具會使用不同的格式
library(slam)
tmx = as(mx,"dgTMatrix")
tmx = simple_triplet_matrix(
1+tmx@i, 1+tmx@j, tmx@x, dimnames=mx@Dimnames)
dim(tmx)
## [1] 32066 10675
We borrow TF-IDF, the text-analysis technique for estimating a word's importance within a document, to compute each product's average importance across all customers:
tfidf = tapply(tmx$v/row_sums(tmx)[tmx$i], tmx$j, mean) *
log2(nrow(tmx)/col_sums(tmx > 0))
summary(tfidf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
## 0.1369 0.3129 0.3880 0.4441 0.5066 4.4687
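To make the formula concrete, here is a sketch that recomputes the score for a single product column by hand (column 1 is an arbitrary choice):

# Sketch: recompute the TF-IDF score for product column 1 by hand.
sel = tmx$j == 1
tf  = mean(tmx$v[sel] / row_sums(tmx)[tmx$i[sel]])  # avg. share of each buyer's basket
idf = log2(nrow(tmx) / sum(sel))                    # rarity across all customers
tf * idf                                            # should match tfidf["1"]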
hist(tfidf)

Filter out products whose average importance is below the mean, then sort products by purchase frequency in descending order:
tmx = tmx[, tfidf > mean(tfidf)] # remove products with low tfidf score
tmx = tmx[, order(-col_sums(tmx))] # order product by frequency
dim(tmx)
## [1] 32066 3823
nop = 400   # no. of products (columns) used for clustering
k = 200     # no. of clusters
set.seed(111); kg = kmeans(tmx[,1:nop], k)$cluster
table(kg) %>% as.vector %>% sort
##   [1]     1     1     1     1     1     1     1     1     1     1     1
## [12] 1 1 1 2 2 2 2 3 3 3 4
## [23] 5 5 5 6 6 6 6 6 7 7 7
## [34] 7 7 7 8 8 8 8 9 9 9 9
## [45] 9 9 9 10 10 10 11 12 12 12 13
## [56] 14 14 14 14 15 16 17 19 19 21 22
## [67] 23 24 25 26 27 29 30 32 33 34 35
## [78] 35 35 36 37 38 39 39 40 41 41 43
## [89] 43 43 44 44 45 46 47 47 48 48 48
## [100] 48 49 50 51 51 51 52 52 54 55 55
## [111] 55 56 56 57 57 57 58 58 58 60 60
## [122] 62 62 63 64 64 64 65 65 65 67 67
## [133] 68 69 70 70 70 72 72 72 72 73 73
## [144] 73 73 74 74 75 75 78 78 79 79 80
## [155] 80 82 82 87 87 88 88 89 91 93 93
## [166] 94 105 107 107 109 111 112 119 123 126 128
## [177] 131 136 137 138 140 141 141 145 147 148 148
## [188] 153 165 168 182 185 187 193 212 257 422 494
## [199] 556 19875
Merge the clustering result into the customer data frame (A):
df = inner_join(A, data.frame(
  cust = as.integer(tmx$dimnames$cust), kg))
## Joining, by = "cust"
Compute each cluster's average attributes:
df = data.frame(
aggregate(. ~ kg, df[,c(2:7,11)], mean),
size = as.vector(table(kg))
)
head(df)
##   kg        r         s         f         m       rev       raw size
## 1 1 14.03509 103.84211 8.982456 1351.5971 10364.456 1246.3333 57
## 2 2 28.46377 92.03623 6.927536 752.7416 4505.804 722.3841 138
## 3 3 6.50000 113.00000 16.500000 1810.2222 27129.500 3764.5000 2
## 4 4 47.12418 100.84967 4.764706 1603.9312 5964.882 784.0392 153
## 5 5 23.55556 102.66667 6.888889 1882.3126 10613.556 1405.2222 9
## 6 6 18.10000 105.40000 8.800000 1170.1295 9537.100 1566.5000 10
df$dummy = 2001 # dummy column for googleVis
plot( gvisMotionChart(
subset(df[,c(1,4,5,6,8,2,3,7,9)],
size >= 20 & size <= 5000 # range of group size
),
"kg", "dummy", options=list(width=800, height=600) ) )Sig = function(gx, P=1000, H=10) {
print(sprintf("Group %d: No. Customers = %d", gx, sum(kg==gx)))
bx = tmx[,1:P]
data.frame(n = col_sums(bx[kg==gx,])) %>% # frequency
mutate(
share = round(100*n/col_sums(bx),2), # %prod sold to this cluster
conf = round(100*n/sum(kg==gx),2), # %buy this product, given cluster
base = round(100*col_sums(bx)/nrow(bx),2), # %buy this product, all cust
lift = round(conf/base,1), # conf/base
name = colnames(bx) # name of prod
) %>% arrange(desc(lift)) %>% head(H)
}

Sig(1)
## [1] "Group 1: No. Customers = 57"
## n share conf base lift name
## 1 137 39.83 240.35 1.07 224.6 7610053910787
## 2 3 8.82 5.26 0.11 47.8 4710088500366
## 3 3 7.69 5.26 0.12 43.8 4710091111757
## 4 3 7.14 5.26 0.13 40.5 2230090000101
## 5 18 6.02 31.58 0.93 34.0 7610053910794
## 6 2 5.56 3.51 0.11 31.9 4710982090260
## 7 2 5.00 3.51 0.12 29.2 4710091111528
## 8 2 5.13 3.51 0.12 29.2 4971275703278
## 9 2 5.26 3.51 0.12 29.2 20546786
## 10 2 5.26 3.51 0.12 29.2 4710174011196
Reading a row of the table: group G bought n units of product P, which accounts for share% of all units of P sold; conf is the number of units bought per 100 group members, base is the same ratio over all customers, and lift = conf/base, i.e. the group buys P at lift − 1 times above the baseline rate.

【QUIZ】 Judging from the interactive bubble chart …
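Note that conf can exceed 100 because n sums purchase counts rather than distinct buyers. The lift column is simply conf/base, which you can verify against the top row of Sig(1):

# Sketch: lift is conf/base; check the first row of Sig(1) by hand.
round(240.35 / 1.07, 1)   # 224.6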
【QUIZ】 In the passage above …
library(irlba)
if(LOAD) {
load("data/svd.rdata")
} else {
smx = mx
smx@x = pmin(smx@x, 2) # cap at 2, similar to normalization
t0 = Sys.time()
svd = irlba(smx,
nv=400, # length of feature vector
maxit=800, work=800)
print(Sys.time() - t0) # 1.8795 mins
save(svd, file = "data/svd.rdata")
}
## Time difference of 1.742891 mins
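Before clustering on the latent features, it can help to glance at the singular-value spectrum to judge whether nv = 400 dimensions are enough. A sketch, not part of the original pipeline:

# Sketch: a steeply decaying spectrum suggests 400 latent dimensions suffice.
plot(svd$d, type="l", xlab="component", ylab="singular value")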
set.seed(111); kg = kmeans(svd$u, 200)$cluster
table(kg) %>% as.vector %>% sort
##   [1]    1    1    1    1    1    1    1    1    1    1    1    1    1    1
## [15] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [29] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [43] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [57] 1 1 1 1 1 1 2 2 2 2 2 3 3 4
## [71] 4 4 5 7 14 17 20 28 29 30 33 38 41 43
## [85] 44 44 48 48 50 51 52 56 57 58 61 64 66 67
## [99] 67 67 75 83 84 89 93 99 100 102 107 110 113 114
## [113] 117 117 128 131 134 136 137 142 145 147 150 151 151 160
## [127] 160 165 166 167 168 169 169 171 175 175 178 178 180 180
## [141] 181 182 184 184 185 188 191 191 192 194 194 197 199 205
## [155] 208 210 210 211 211 211 213 215 216 218 219 221 222 223
## [169] 224 224 228 230 230 232 233 236 237 238 240 241 250 251
## [183] 252 255 268 274 283 285 290 293 305 318 329 384 440 718
## [197] 781 865 1102 8844
# cluster summary
df = left_join(A, data.frame(
cust = as.integer(smx@Dimnames$cust), kg)) %>%
group_by(kg) %>% summarise(
avg_frequency = mean(f),
avg_monetary = mean(m),
avg_revenue_contr = mean(rev),
group_size = n(),
avg_recency = mean(r),
avg_gross_profit = mean(raw)) %>%
ungroup %>%
mutate(dummy = 2001, kg = sprintf("G%03d",kg)) %>%
  data.frame
## Joining, by = "cust"
# Google Motion Chart
plot( gvisMotionChart(
subset(df, group_size >= 20 & group_size <= 5000),
"kg", "dummy", options=list(width=800, height=600) ) )filter(df, group_size >= 20 & group_size <= 5000)$group_size %>%
  sqrt %>% range   # for bubble size adjustment
## [1]  4.472136 33.196385
library(ggplot2)
library(plotly)
p = df %>% filter(group_size >= 20 & group_size <= 5000) %>%
ggplot(aes(x=avg_frequency, y=avg_monetary)) +
geom_point(aes(size=group_size, col=avg_revenue_contr),alpha=0.7) +
geom_text(aes(label=kg), alpha=0) +
scale_size(range=c(1.5,12)) +
#scale_color_gradient(low="green",high="magenta") +
scale_colour_gradientn(
colours = rev(c("red","yellow","green","lightblue","darkblue"))) +
  theme_bw() + guides(size=F) + labs(
    title="Customer Clusters (by Products Purchased)",
    color="Avg. Revenue Contribution", size="Cluster Size") +
  xlab("Avg. Purchase Frequency") +
  ylab("Avg. Purchase Amount")
plotly_build(p)

Sig(138)
## [1] "Group 138: No. Customers = 228"
## n share conf base lift name
## 1 316 58.63 138.60 1.68 82.5 8712045008539
## 2 35 15.35 15.35 0.71 21.6 8712045013038
## 3 32 11.55 14.04 0.86 16.3 4902430493437
## 4 4 11.76 1.75 0.11 15.9 4710091110491
## 5 24 8.03 10.53 0.93 11.3 7610053910794
## 6 8 7.34 3.51 0.34 10.3 4902430493451
## 7 3 7.32 1.32 0.13 10.2 4714050000021
## 8 17 6.97 7.46 0.76 9.8 4902430040334
## 9 4 7.02 1.75 0.18 9.7 20510183
## 10 5 6.58 2.19 0.24 9.1 4710036110067
dim(mx)   # 32066 cust * 10675 prod
## [1] 32066 10675
library(arules)
library(arulesViz)
bx = subset(Z, prod %in% as.numeric(colnames(mx)),
select=c("cust","prod")) # select product items
bx = split(bx$prod, bx$cust) # split by customer
bx = as(bx, "transactions") # data for arules package## Warning in asMethod(object): removing duplicated items in transactions
itemFrequencyPlot(bx, topN=20, type="absolute", cex=0.8)

Association rules (A => B):
rules = apriori(bx, parameter=list(supp=0.005, conf=0.6))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.005 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 160
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10675 item(s), 32066 transaction(s)] done [0.37s].
## sorting and recoding items ... [884 item(s)] done [0.02s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.02s].
## writing ... [67 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(rules)
## set of 67 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 16 43 8
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 3.000 2.881 3.000 4.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.005052 Min. :0.6020 Min. : 3.204 Min. :162
## 1st Qu.:0.005582 1st Qu.:0.6348 1st Qu.:16.798 1st Qu.:179
## Median :0.006393 Median :0.6762 Median :21.147 Median :205
## Mean :0.007859 Mean :0.7004 Mean :22.535 Mean :252
## 3rd Qu.:0.009574 3rd Qu.:0.7507 3rd Qu.:27.986 3rd Qu.:307
## Max. :0.019086 Max. :0.8712 Max. :70.632 Max. :612
##
## mining info:
## data ntransactions support confidence
## bx 32066 0.005 0.6
Association rules (A => B):
options(digits=4)
inspect(rules)
##      lhs                  rhs             support confidence   lift count
## [1] {4710030346103} => {4710030346059} 0.005146 0.6762 70.632 165
## [2] {4719090701051} => {4719090790000} 0.005520 0.6367 39.111 177
## [3] {719859796124} => {719859796117} 0.007297 0.7222 45.859 234
## [4] {4710321861209} => {4710321861186} 0.007017 0.6338 41.392 225
## [5] {4711524000891} => {4711524001041} 0.005832 0.6561 63.564 187
## [6] {4719090790017} => {4719090790000} 0.010510 0.8300 50.989 337
## [7] {4719090790000} => {4719090790017} 0.010510 0.6456 50.989 337
## [8] {4710011401142} => {4710011401128} 0.007765 0.6434 16.679 249
## [9] {4710085120697} => {4710085120680} 0.012537 0.7992 30.691 402
## [10] {4710085120710} => {4710085120703} 0.010416 0.6243 29.790 334
## [11] {4710011402026} => {4710011402019} 0.010073 0.7210 29.376 323
## [12] {4710085172702} => {4710085172696} 0.009543 0.6120 21.354 306
## [13] {4710011409056} => {4710011401128} 0.014813 0.7353 19.061 475
## [14] {4710011401135} => {4710011401128} 0.019086 0.7897 20.470 612
## [15] {4710011405133} => {4710011401128} 0.016840 0.6968 18.062 540
## [16] {4710011406123} => {4710011401128} 0.015624 0.6358 16.481 501
## [17] {4714981010038,
## 4719090790017} => {4719090790000} 0.005489 0.8263 50.758 176
## [18] {4714981010038,
## 4719090790000} => {4719090790017} 0.005489 0.6692 52.854 176
## [19] {4710011401135,
## 4710011401142} => {4710011401128} 0.005114 0.7700 19.959 164
## [20] {4710011401128,
## 4710011401142} => {4710011401135} 0.005114 0.6586 27.251 164
## [21] {4710085120697,
## 4714981010038} => {4710085120680} 0.005395 0.8047 30.901 173
## [22] {4710060000099,
## 4711271000014} => {4714981010038} 0.005613 0.6667 3.548 180
## [23] {4710085120093,
## 4710085172702} => {4710085172696} 0.005551 0.7206 25.145 178
## [24] {4710085120093,
## 4710085172702} => {4710085120628} 0.005052 0.6559 17.778 162
## [25] {4710085172696,
## 4710085172702} => {4710085120628} 0.006112 0.6405 17.362 196
## [26] {4710085120628,
## 4710085172702} => {4710085172696} 0.006112 0.6712 23.421 196
## [27] {4710311703014,
## 4714981010038} => {4711271000014} 0.005863 0.6045 3.702 188
## [28] {4710683100015,
## 4714981010038} => {4711271000014} 0.006393 0.6949 4.256 205
## [29] {4710088620156,
## 4714981010038} => {4711271000014} 0.005645 0.6177 3.783 181
## [30] {4710063031106,
## 4711271000014} => {4714981010038} 0.007984 0.6139 3.267 256
## [31] {4710685440362,
## 4714981010038} => {4711271000014} 0.005520 0.6254 3.830 177
## [32] {4711022100017,
## 4711271000014} => {4714981010038} 0.005364 0.6277 3.340 172
## [33] {4710011401135,
## 4710011409056} => {4710011405133} 0.007266 0.6132 25.370 233
## [34] {4710011405133,
## 4710011409056} => {4710011401135} 0.007266 0.6833 28.271 233
## [35] {4710011406123,
## 4710011409056} => {4710011401135} 0.006144 0.6611 27.352 197
## [36] {4710011401135,
## 4710011409056} => {4710011401128} 0.009917 0.8368 21.693 318
## [37] {4710011401128,
## 4710011409056} => {4710011401135} 0.009917 0.6695 27.700 318
## [38] {4710011406123,
## 4710011409056} => {4710011405133} 0.005676 0.6107 25.270 182
## [39] {4710011405133,
## 4710011409056} => {4710011401128} 0.008389 0.7889 20.449 269
## [40] {4710011406123,
## 4710011409056} => {4710011401128} 0.007391 0.7953 20.616 237
## [41] {4710011409056,
## 4714981010038} => {4710011401128} 0.005707 0.7409 19.206 183
## [42] {4711271000014,
## 4714381003128} => {4714981010038} 0.006362 0.6296 3.350 204
## [43] {4710085120093,
## 4710085172696} => {4710085120628} 0.008545 0.6241 16.918 274
## [44] {4710011401135,
## 4710011405133} => {4710011401128} 0.010634 0.8158 21.147 341
## [45] {4710011401128,
## 4710011405133} => {4710011401135} 0.010634 0.6315 26.128 341
## [46] {4710011401135,
## 4710011406123} => {4710011401128} 0.009013 0.8353 21.652 289
## [47] {4710011401135,
## 4711271000014} => {4710011401128} 0.005801 0.7782 20.174 186
## [48] {4710011401135,
## 4714981010038} => {4710011401128} 0.007048 0.7958 20.628 226
## [49] {4710011405133,
## 4710011406123} => {4710011401128} 0.008015 0.7449 19.310 257
## [50] {4710011405133,
## 4711271000014} => {4710011401128} 0.005426 0.7468 19.358 174
## [51] {4710011405133,
## 4714981010038} => {4710011401128} 0.006674 0.7086 18.369 214
## [52] {4710011406123,
## 4714981010038} => {4710011401128} 0.006674 0.6751 17.500 214
## [53] {4710421090059,
## 4714981010038} => {4711271000014} 0.009605 0.6299 3.857 308
## [54] {4710583996008,
## 4719090900065} => {4714981010038} 0.006019 0.6942 3.694 193
## [55] {4710265849066,
## 4719090900065} => {4714981010038} 0.009200 0.6020 3.204 295
## [56] {4710265849066,
## 4711271000014} => {4714981010038} 0.011539 0.6390 3.400 370
## [57] {4713985863121,
## 4719090900065} => {4714981010038} 0.005894 0.7269 3.868 189
## [58] {4711271000014,
## 4713985863121} => {4714981010038} 0.010728 0.6922 3.683 344
## [59] {4711271000014,
## 4719090900065} => {4714981010038} 0.014533 0.6099 3.246 466
## [60] {4710011401135,
## 4710011405133,
## 4710011409056} => {4710011401128} 0.006331 0.8712 22.585 203
## [61] {4710011401128,
## 4710011401135,
## 4710011409056} => {4710011405133} 0.006331 0.6384 26.413 203
## [62] {4710011401128,
## 4710011405133,
## 4710011409056} => {4710011401135} 0.006331 0.7546 31.224 203
## [63] {4710011401135,
## 4710011406123,
## 4710011409056} => {4710011401128} 0.005333 0.8680 22.501 171
## [64] {4710011401128,
## 4710011406123,
## 4710011409056} => {4710011401135} 0.005333 0.7215 29.853 171
## [65] {4710011401135,
## 4710011405133,
## 4710011406123} => {4710011401128} 0.005520 0.8634 22.382 177
## [66] {4710011401128,
## 4710011401135,
## 4710011406123} => {4710011405133} 0.005520 0.6125 25.341 177
## [67] {4710011401128,
## 4710011405133,
## 4710011406123} => {4710011401135} 0.005520 0.6887 28.496 177
# install.packages(
# "https://cran.r-project.org/bin/windows/contrib/3.5/arulesViz_1.3-1.zip",
# repos=NULL)
# install.packages("arulesViz_1.3-1.zip", repos=NULL)
# library(plotly)
# plotly_arules(rules,colors=c("red","green"),
# marker=list(opacity=.6,size=10))
# plotly_arules(rules,method="matrix",
# shading="lift",
# colors=c("red", "green"))
# plot(rules,colors=c("red","green"),engine="htmlwidget",
#      marker=list(opacity=.6,size=8))
plot(rules, method="matrix", shading="lift", engine="htmlwidget",
     colors=c("red", "green"))

r1 = subset(rules, subset = rhs %in% c("4719090790000"))
summary(r1)
## set of 3 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 2.00 2.00 2.33 2.50 3.00
##
## summary of quality measures:
## support confidence lift count
## Min. :0.00549 Min. :0.637 Min. :39.1 Min. :176
## 1st Qu.:0.00550 1st Qu.:0.732 1st Qu.:44.9 1st Qu.:176
## Median :0.00552 Median :0.826 Median :50.8 Median :177
## Mean :0.00717 Mean :0.764 Mean :47.0 Mean :230
## 3rd Qu.:0.00801 3rd Qu.:0.828 3rd Qu.:50.9 3rd Qu.:257
## Max. :0.01051 Max. :0.830 Max. :51.0 Max. :337
##
## mining info:
## data ntransactions support confidence
## bx 32066 0.005 0.6
plot(r1,method="graph",engine="htmlwidget",itemCol="cyan") r2 = subset(rules, subset = rhs %in% c("4710011401135"))
summary(r2)
## set of 8 rules
##
## rule length distribution (lhs + rhs):sizes
## 3 4
## 5 3
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 3.00 3.00 3.38 4.00 4.00
##
## summary of quality measures:
## support confidence lift count
## Min. :0.00511 Min. :0.631 Min. :26.1 Min. :164
## 1st Qu.:0.00547 1st Qu.:0.660 1st Qu.:27.3 1st Qu.:176
## Median :0.00624 Median :0.676 Median :28.0 Median :200
## Mean :0.00703 Mean :0.684 Mean :28.3 Mean :226
## 3rd Qu.:0.00793 3rd Qu.:0.697 3rd Qu.:28.8 3rd Qu.:254
## Max. :0.01063 Max. :0.755 Max. :31.2 Max. :341
##
## mining info:
## data ntransactions support confidence
## bx 32066 0.005 0.6
plot(r2,method="graph",engine="htmlwidget",itemCol="cyan") 太少被購買的產品和購買太少產品的顧客都不適合使用Collaborative Filtering這種產品推薦方法,所以我們先對顧客和產品做一次篩選
library(recommenderlab)
rx = mx[, colSums(mx > 0) >= 50]                          # products bought by at least 50 customers
rx = rx[rowSums(rx > 0) >= 20 & rowSums(rx > 0) <= 300, ] # customers who bought 20 to 300 distinct products
dim(rx)
## [1] 8860 3355
You can choose whether to build the model on a realRatingMatrix or a binaryRatingMatrix:
rx = as(rx, "realRatingMatrix") # realRatingMatrix
bx = binarize(rx, minRating=1) # binaryRatingMatrix
dim(bx)
## [1] 8860 3355
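Before comparing algorithms, here is a minimal sketch (not part of the original flow) of training a single UBCF recommender and producing top-5 recommendations for a few customers:

# Sketch: train UBCF on most customers, recommend for three held-out ones.
rec  = Recommender(bx[1:8000], method="UBCF")
pred = predict(rec, bx[8001:8003], n=5)
as(pred, "list")   # top-5 product barcodes per customer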
set.seed(4321)
scheme = evaluationScheme(
bx, method="split", train = .75, given=5)algorithms = list(
AR53 = list(name="AR", param=list(support=0.0005, confidence=0.3)),
AR43 = list(name="AR", param=list(support=0.0004, confidence=0.3)),
RANDOM = list(name="RANDOM", param=NULL),
POPULAR = list(name="POPULAR", param=NULL),
UBCF = list(name="UBCF", param=NULL),
IBCF = list(name="IBCF", param=NULL) )if(LOAD) {
load("results.rdata")
} else {
t0 = Sys.time()
results = evaluate(
scheme, algorithms, type="topNList",
n=c(5, 10, 15, 20))
print(Sys.time() - t0)
save(results, file="results.rdata")
}
## AR run fold/sample [model time/prediction time]
## 1 [3.5sec/181.3sec]
## AR run fold/sample [model time/prediction time]
## 1 [8.68sec/455sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/8.23sec]
## POPULAR run fold/sample [model time/prediction time]
## 1 [0sec/8.5sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0sec/62.4sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [170.2sec/1.43sec]
## Time difference of 15.19 mins
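Besides the ROC-style plot below, the raw numbers behind each curve can be pulled out directly. A sketch:

# Sketch: average confusion-matrix statistics (TP/FP/precision/recall) for UBCF.
avg(results[["UBCF"]])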
# load("data/results.rdata")
par(mar=c(4,4,3,2),cex=0.8)
cols = c("red", "magenta", "gray", "orange", "blue", "green")
plot(results, annotate=c(1,3), legend="topleft", pch=19, lwd=2, col=cols)
abline(v=seq(0,0.006,0.001), h=seq(0,0.08,0.01), col='lightgray', lty=2)

save(results, file="data/results.rdata")