packages = c(
"dplyr","ggplot2","googleVis","devtools","magrittr","slam","irlba","plotly",
"arules","arulesViz","Matrix","recommenderlab")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)rm(list=ls(all=TRUE))
LOAD = TRUE
library(dplyr)
library(ggplot2)
library(googleVis)
library(Matrix)
library(slam)
library(irlba)
library(plotly)
library(arules)
library(arulesViz)
library(recommenderlab)Load data frame and rename
load("data/tf0.rdata")
A = A0; X = X0; Z = Z0; rm(A0,X0,Z0); gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2536407 135.5 3886542 207.6 3205452 171.2
## Vcells 9943933 75.9 14442948 110.2 9993740 76.3
Z = subset(Z, cust %in% A$cust)n_distinct(Z$cust) # 32241## [1] 32241
n_distinct(Z$prod) # 23787## [1] 23787
製作顧客產品矩陣其實很快、也很容易
library(Matrix)
library(slam)
cpm = xtabs(~ cust + prod, Z, sparse=T) # customer product matrix
dim(cpm) # 32241 23787## [1] 32241 23787
mean(cpm > 0) # 0.00096799## [1] 0.0009674258
顧客產品矩陣通常是一個很稀疏的矩,陣有一些產品沒什麼人買
colSums(cpm) %>% quantile(seq(0,1,0.1))## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 1 2 4 6 8 13 20 35 76 8475
mean(colSums(cpm) > 10)## [1] 0.4483541
刪去購買次數小於6的產品;為了方便顧客產品矩陣和顧客資料框的合併,我們選擇先保留沒有購買產品的顧客
cpm = cpm[, colSums(cpm) >= 6] # remove the least frequent products
# cpm = cpm[rowSums(cpm) > 0, ] # remove non-buying customers
cpm = cpm[, order(-colSums(cpm))] # order product by frequency
dim(cpm) # 32241 23787>14621## [1] 32241 14621
max(cpm) # 49## [1] 49
mean(cpm > 0) # 0.0015248## [1] 0.001524785
table(cpm@x) %>% prop.table %>% round(4) %>% head(10)##
## 1 2 3 4 5 6 7 8 9 10
## 0.9256 0.0579 0.0108 0.0032 0.0012 0.0006 0.0003 0.0002 0.0001 0.0001
請你用一個指令列出被購買最多次的10個產品,和它們被購買的次數。
#
■ 在什麼前提之下,我們可以把購買這十個產品的次數當作變數,用來預測顧客在下一期會不會來購買呢?
■ 我們如何把這十個變數,併入顧客資料框呢?
■ 我們可不可以(在什麼前提之下我們可以)直接用cbind()新變數併入顧客資料框呢?
■ 我們期中競賽的資料,符合直接用cbind()併入新變數的條件嗎? 我們要如何確認這一件事呢?
以產品的被購買頻率製作(顧客)變數的時候,排cpm在最前邊的(N個)欄位就是變數!
nop= 400 # no. product = no. variables
k = 200 # no. cluster
set.seed(111); kg = kmeans(cpm[,1:nop], k)$cluster
table(kg) %>% as.vector %>% sort## [1] 1 1 1 1 1 1 1 1 1 1 1
## [12] 1 1 1 1 1 1 1 1 1 1 1
## [23] 2 2 2 2 2 2 3 3 3 3 3
## [34] 3 3 4 4 4 4 4 4 4 5 5
## [45] 6 6 6 6 7 7 7 8 8 8 9
## [56] 9 9 9 10 10 10 10 11 11 11 11
## [67] 11 12 13 13 15 15 15 16 16 18 19
## [78] 20 20 20 20 21 22 22 22 24 24 25
## [89] 25 27 28 28 32 32 35 36 39 40 41
## [100] 42 44 45 46 47 47 48 49 49 50 51
## [111] 52 53 56 58 58 61 63 66 67 68 69
## [122] 69 72 81 85 85 86 87 90 94 96 97
## [133] 97 100 100 101 110 111 113 114 116 118 123
## [144] 123 126 130 134 136 141 141 142 143 162 165
## [155] 172 175 178 179 182 182 184 187 195 210 222
## [166] 225 228 228 237 239 242 253 254 258 258 266
## [177] 268 272 287 293 301 311 325 329 350 351 363
## [188] 396 407 407 410 418 432 448 473 523 561 1156
## [199] 1266 11215
將分群結果併入顧客資料框(A)
df = A %>% inner_join(data.frame(
cust = as.integer(rownames(cpm)),
kg) )## Joining, by = "cust"
head(df) # 32241## cust r s f m rev raw age area kg
## 1 1069 19 108 4 486.0 1944 15 K E 196
## 2 1113 54 109 4 557.5 2230 241 K F 81
## 3 1250 19 25 2 791.5 1583 354 D D 169
## 4 1359 87 87 1 364.0 364 104 K G 49
## 5 1823 36 119 3 869.0 2607 498 K D 109
## 6 2189 57 89 2 7028.0 14056 3299 K B 158
計算各群組的平均屬性
df = data.frame(
aggregate(. ~ kg, df[,c(2:7,10)], mean), # averages
size = as.vector(table(kg)), # no. customers in the group
dummy = 2001 # dummy column for googleViz
)
head(df)## kg r s f m rev raw size
## 1 1 10.772727 113.54545 13.318182 872.7878 10697.909 1733.8182 22
## 2 2 43.934959 95.96748 3.455285 1252.9938 3819.837 533.0407 123
## 3 3 5.384615 111.15385 14.307692 1325.2611 12617.769 1951.8462 13
## 4 4 8.000000 119.00000 24.000000 955.6250 22935.000 3658.0000 1
## 5 5 6.666667 116.66667 20.333333 579.9135 10741.444 1554.2222 9
## 6 6 2.500000 114.00000 23.500000 411.9918 9775.000 1324.5000 2
## dummy
## 1 2001
## 2 2001
## 3 2001
## 4 2001
## 5 2001
## 6 2001
library(googleVis)
op <- options(gvis.plot.tag='chart')plot( gvisMotionChart(
subset(df[,c(1,4,5,6,8,2,3,7,9)],
size >= 20 & size <= 1000), # range of group size
"kg", "dummy", options=list(width=800, height=600) ) )## Set options back to original options
## options(op)# use global variables: cpm, kg
Sig = function(gx, P=1000, H=10) {
print(sprintf("Group %d: No. Customers = %d", gx, sum(kg==gx)))
bx = cpm[,1:P]
data.frame(n = col_sums(bx[kg==gx,])) %>% # frequency
mutate(
share = round(100*n/col_sums(bx),2), # %prod sold to this cluster
conf = round(100*n/sum(kg==gx),2), # %buy this product, given cluster
base = round(100*col_sums(bx)/nrow(bx),2), # %buy this product, all cust
lift = round(conf/base,1), # conf/base
name = colnames(bx) # name of prod
) %>% arrange(desc(lift)) %>% head(H)
}Sig(130)## [1] "Group 130: No. Customers = 97"
## n share conf base lift name
## 1 29 6.65 29.90 1.35 22.1 4719090106009
## 2 19 6.62 19.59 0.89 22.0 4710632001172
## 3 12 5.38 12.37 0.69 17.9 4710093080044
## 4 389 4.59 401.03 26.29 15.3 4714981010038
## 5 20 4.59 20.62 1.35 15.3 4719090105002
## 6 14 4.53 14.43 0.96 15.0 4719090106016
## 7 8 4.40 8.25 0.56 14.7 9310022862601
## 8 10 4.31 10.31 0.72 14.3 4710189820851
## 9 9 4.25 9.28 0.66 14.1 4710088434258
## 10 15 4.19 15.46 1.11 13.9 4710189851282
library(irlba)
if(LOAD) {
load("data/svd2a.rdata")
} else {
smx = cpm
smx@x = pmin(smx@x, 2) # cap at 2, similar to normalization
t0 = Sys.time()
svd = irlba(smx,
nv=400, # length of feature vector
maxit=800, work=800)
print(Sys.time() - t0) # 1.8795 mins
save(svd, file = "data/svd2a.rdata")
}
■ 在什麼前提之下,我們可以把顧客購買產品的特徵向量當作變數,用來預測顧客在下一期會不會來購買呢?
■ 如果可以的話,我們如何把顧客購買產品的特徵向量,併入顧客資料框呢?
■ 我們可不可以(在什麼前提之下我們可以)直接用cbind()將特徵向量併入顧客資料框呢?
■ 我們期中競賽的資料,符合直接用cbind()併入特徵向量的條件嗎? 我們要如何確認這一件事呢?
set.seed(111); kg = kmeans(svd$u, 200)$cluster
table(kg) %>% as.vector %>% sort## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [15] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [29] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [43] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [57] 1 2 2 2 2 2 3 4 4 5 7 10 14 30
## [71] 31 32 36 38 38 39 39 40 40 41 44 45 46 47
## [85] 49 54 59 62 62 69 71 77 79 79 80 82 82 84
## [99] 87 91 101 103 109 110 111 113 117 120 123 127 127 129
## [113] 132 133 134 135 136 139 141 143 143 147 147 157 159 159
## [127] 160 160 160 166 168 169 172 175 180 181 181 182 183 184
## [141] 184 188 190 190 193 194 195 196 198 198 200 201 201 202
## [155] 202 204 204 204 207 209 209 210 213 214 216 219 219 222
## [169] 225 233 234 235 236 237 237 238 239 241 248 248 248 253
## [183] 256 257 258 259 261 261 264 269 277 281 285 293 305 411
## [197] 612 896 1092 8987
# clustster summary
df = inner_join(A, data.frame(
cust = as.integer(rownames(cpm)), kg)) %>%
group_by(kg) %>% summarise(
avg_frequency = mean(f),
avg_monetary = mean(m),
avg_revenue_contr = mean(rev),
group_size = n(),
avg_recency = mean(r),
avg_gross_profit = mean(raw)) %>%
ungroup %>%
mutate(dummy = 2001, kg = sprintf("G%03d",kg)) %>%
data.frame## Joining, by = "cust"
# Google Motion Chart
plot( gvisMotionChart(
subset(df, group_size >= 20 & group_size <= 1200),
"kg", "dummy", options=list(width=800, height=600) ) )Sig(162)## [1] "Group 162: No. Customers = 87"
## n share conf base lift name
## 1 165 38.64 189.66 1.32 143.7 4710105045313
## 2 133 35.37 152.87 1.17 130.7 4710105045320
## 3 59 13.50 67.82 1.36 49.9 4710105045450
## 4 19 9.60 21.84 0.61 35.8 4710105051147
## 5 35 8.31 40.23 1.31 30.7 4710105045443
## 6 9 5.33 10.34 0.52 19.9 4710088432650
## 7 8 4.91 9.20 0.51 18.0 4710105051185
## 8 8 4.19 9.20 0.59 15.6 4710171028326
## 9 7 3.89 8.05 0.56 14.4 4710058137059
## 10 7 3.70 8.05 0.59 13.6 4710010020061
n_distinct(Z$tid)## [1] 119407
n_distinct(Z$prod)## [1] 23787
購物籃分析會使用arules這個套件,它要用的資料很容易準備,直接使用交易項目資料,一行程式就可以搞定。
library(arules)
library(arulesViz)
bx = as(split(Z$prod, Z$tid), "transactions") itemFrequencyPlot(bx, topN=20, type="absolute", cex=0.8)關聯規則(A => B)
rules = apriori(bx, parameter=list(supp=0.002, conf=0.6))## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.002 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 238
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23787 item(s), 119407 transaction(s)] done [0.15s].
## sorting and recoding items ... [569 item(s)] done [0.01s].
## creating transaction tree ... done [0.05s].
## checking subsets of size 1 2 3 done [0.01s].
## writing ... [14 rule(s)] done [0.01s].
## creating S4 object ... done [0.01s].
summary(rules)## set of 14 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 8 6
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 2.000 2.429 3.000 3.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.002102 Min. :0.6125 Min. : 48.12 Min. :251.0
## 1st Qu.:0.002512 1st Qu.:0.6798 1st Qu.: 55.10 1st Qu.:300.0
## Median :0.002826 Median :0.7267 Median : 58.58 Median :337.5
## Mean :0.003208 Mean :0.7240 Mean : 85.23 Mean :383.1
## 3rd Qu.:0.003335 3rd Qu.:0.7744 3rd Qu.: 97.86 3rd Qu.:398.2
## Max. :0.005854 Max. :0.8088 Max. :170.92 Max. :699.0
##
## mining info:
## data ntransactions support confidence
## bx 119407 0.002 0.6
關聯規則 (A => B):
options(digits=4)
inspect(rules)## lhs rhs support confidence
## [1] {4719090790017} => {4719090790000} 0.002940 0.8088
## [2] {4719090790000} => {4719090790017} 0.002940 0.6212
## [3] {719859796124} => {719859796117} 0.002102 0.6972
## [4] {4710011402026} => {4710011402019} 0.002822 0.6740
## [5] {4710085120697} => {4710085120680} 0.003467 0.7753
## [6] {4710011409056} => {4710011401128} 0.004430 0.6988
## [7] {4710011401135} => {4710011401128} 0.005854 0.7524
## [8] {4710011405133} => {4710011401128} 0.005176 0.6588
## [9] {4710011401135,4710011409056} => {4710011401128} 0.002713 0.8020
## [10] {4710011401128,4710011409056} => {4710011401135} 0.002713 0.6125
## [11] {4710011405133,4710011409056} => {4710011401128} 0.002261 0.7606
## [12] {4710011401135,4710011405133} => {4710011401128} 0.002831 0.7717
## [13] {4710011401135,4710011406123} => {4710011401128} 0.002445 0.8022
## [14] {4710011405133,4710011406123} => {4710011401128} 0.002219 0.7011
## lift count
## [1] 170.92 351
## [2] 170.92 351
## [3] 147.61 251
## [4] 90.22 337
## [5] 100.41 414
## [6] 51.04 529
## [7] 54.95 699
## [8] 48.12 618
## [9] 58.57 324
## [10] 78.72 324
## [11] 55.55 270
## [12] 56.36 338
## [13] 58.59 292
## [14] 51.20 265
有互動圖表的幫助,我們可以把條件放寬,多找一些關聯規則進來
rules = apriori(bx, parameter=list(supp=0.0003, conf=0.5))## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 3e-04 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 35
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23787 item(s), 119407 transaction(s)] done [0.15s].
## sorting and recoding items ... [4691 item(s)] done [0.01s].
## creating transaction tree ... done [0.06s].
## checking subsets of size 1 2 3 4 5 6
## Warning in apriori(bx, parameter = list(supp = 3e-04, conf = 0.5)): Mining
## stopped (maxlen reached). Only patterns up to a length of 6 returned!
## done [0.24s].
## writing ... [581 rule(s)] done [0.04s].
## creating S4 object ... done [0.03s].
plot(rules,colors=c("red","green"),engine="htmlwidget",
marker=list(opacity=.6,size=8))## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
plot(rules,method="matrix",shading="lift",engine="htmlwidget",
colors=c("red", "green"))r1 = subset(rules, subset = rhs %in% c("93362993"))
plot(r1,method="graph",engine="htmlwidget",itemCol="cyan") 太少被購買的產品和購買太少產品的顧客都不適合使用Collaborative Filtering這種產品推薦方法,所以我們先對顧客和產品做一次篩選
library(recommenderlab)
rx = cpm[, colSums(cpm > 0) >= 50]
rx = rx[rowSums(rx > 0) >= 20 & rowSums(rx > 0) <= 300, ]
dim(rx) # 8846 3354## [1] 8846 3354
可以選擇要用
做模型。
rx = as(rx, "realRatingMatrix") # realRatingMatrix
bx = binarize(rx, minRating=1) # binaryRatingMatrixUBCF:User Based Collaborative Filtering
rUBCF = Recommender(bx[1:8800,], method = "UBCF")
pred = predict(rUBCF, bx[8801:8846,], n=4)
do.call(rbind, as(pred, "list")) %>% head(15)## [,1] [,2] [,3] [,4]
## 2170855 "4711271000014" "4710114128038" "4714981010038" "4713985863121"
## 2171265 "4719090900065" "4710254049521" "4710036008562" "4714981010038"
## 2171340 "723125488040" "723125488064" "723125485032" "4714981010038"
## 2171425 "4710011401135" "4710011409056" "4711080010112" "4710011401142"
## 2171432 "4714981010038" "4710011406123" "4711258007371" "4710011401128"
## 2171555 "4719090900065" "4711271000014" "37000329169" "4710943109352"
## 2171883 "4711271000014" "4710583996008" "4710291112172" "4710018004704"
## 2172194 "4711271000014" "4714981010038" "4710114128038" "4710114105046"
## 2172392 "4903111345717" "4710908131589" "4710168705056" "4711271000014"
## 2172569 "4711271000014" "4714981010038" "4710128030037" "4712162000038"
## 2172583 "4714981010038" "4710085120093" "4719090900065" "4710154015206"
## 2172590 "4710011406123" "4710011401142" "4710857000028" "4710011432856"
## 2172668 "4711271000014" "4710088620156" "4719090900065" "4712425010712"
## 2172705 "4711271000014" "4714981010038" "37000445111" "37000440192"
## 2172811 "4714981010038" "37000442127" "4710719000333" "4710114128038"
IBCF:Item Based Collaborative Filtering
if(LOAD) {
load("data/recommenders.rdata")
} else{
rIBCF <- Recommender(bx[1:6000,], method = "IBCF")
}
pred = predict(rIBCF, bx[8801:8846,], n=4)
do.call(rbind, as(pred, "list")) %>% head(15)## [,1] [,2] [,3] [,4]
## 2170855 "4719090900065" "4714981010038" "4711271000014" "4712162000038"
## 2171265 "4719090900065" "4710015103288" "4714981010038" "4711271000014"
## 2171340 "37000445111" "4710036005608" "37000442127" "723125485032"
## 2171425 "4711311617899" "4711311218836" "4710011401135" "4710011409056"
## 2171432 "4714981010038" "4710321791698" "4710857000042" "4710626111252"
## 2171555 "93432641" "93362993" "4710105045320" "4711271000014"
## 2171883 "4710670200100" "4710670200407" "4711271000014" "3228020490329"
## 2172194 "4714108700019" "4714108700064" "4909978199111" "20332433"
## 2172392 "4710706211759" "4710908131589" "4719090900058" "4710731040614"
## 2172569 "4711371850243" "84501297329" "84501293529" "4710085121007"
## 2172583 "4710085172702" "4710085120093" "34000100095" "34000231508"
## 2172590 "4710011406123" "4711271000014" "4711437000162" "4710011401142"
## 2172668 "4711371850243" "4719090900065" "4714981010038" "4711437000117"
## 2172705 "37000445111" "4710018004605" "37000442127" "37000304593"
## 2172811 "4719581980293" "4712067899287" "4719581980279" "4710908131589"
save(rIBCF, rUBCF, file="data/recommenders.rdata")set.seed(4321)
scheme = evaluationScheme(
bx, method="split", train = .75, given=5)algorithms = list(
AR53 = list(name="AR", param=list(support=0.0005, confidence=0.3)),
AR43 = list(name="AR", param=list(support=0.0004, confidence=0.3)),
RANDOM = list(name="RANDOM", param=NULL),
POPULAR = list(name="POPULAR", param=NULL),
UBCF = list(name="UBCF", param=NULL),
IBCF = list(name="IBCF", param=NULL) )if(LOAD) {
load("data/results2a.rdata")
} else {
t0 = Sys.time()
results = evaluate(
scheme, algorithms,
type="topNList", # method of evaluation
n=c(5, 10, 15, 20) # no. recom. to be evaluated
)
print(Sys.time() - t0)
save(results, file="data/results2a.rdata")
}
## AR run fold/sample [model time/prediction time]
## 1 [4.02sec/214.6sec]
## AR run fold/sample [model time/prediction time]
## 1 [10.49sec/538.5sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/9.48sec]
## POPULAR run fold/sample [model time/prediction time]
## 1 [0sec/11.09sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0sec/75.42sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [198.2sec/1.63sec]
## Time difference of 18.72 mins# load("data/results.rdata")
par(mar=c(4,4,3,2),cex=0.8)
cols = c("red", "magenta", "gray", "orange", "blue", "green")
plot(results, annotate=c(1,3), legend="topleft", pch=19, lwd=2, col=cols)
abline(v=seq(0,0.006,0.001), h=seq(0,0.08,0.01), col='lightgray', lty=2)getConfusionMatrix(results$IBCF)## [[1]]
## TP FP FN TN precision recall TPR FPR
## 5 1.116 3.884 32.97 3311 0.2231 0.03899 0.03899 0.001171
## 10 1.699 8.301 32.39 3307 0.1699 0.05812 0.05812 0.002503
## 15 2.075 12.925 32.01 3302 0.1383 0.07021 0.07021 0.003898
## 20 2.385 17.615 31.70 3297 0.1193 0.08002 0.08002 0.005313