packages = c(
"dplyr","ggplot2","googleVis","devtools","magrittr","slam","irlba","plotly",
"arules","arulesViz","Matrix","recommenderlab")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))
LOAD = FALSE
library(dplyr)
library(ggplot2)
library(googleVis)
library(Matrix)
library(slam)
library(irlba)
library(plotly)
library(arules)
library(arulesViz)
library(recommenderlab)
Load data frame and rename
load("data/tf0.rdata")
A = A0; X = X0; Z = Z0; rm(A0,X0,Z0); gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 3211456 171.6 9601876 512.8 12002346 641
Vcells 16727491 127.7 105458009 804.6 228211284 1741
Z = subset(Z, cust %in% A$cust)
n_distinct(Z$cust) # 32241
[1] 32241
n_distinct(Z$prod) # 23787
[1] 23787
製作顧客產品矩陣其實很快、也很容易
library(Matrix)
library(slam)
cpm = xtabs(~ cust + prod, Z, sparse=T) # customer product matrix
dim(cpm) # 32241 23787
[1] 32241 23787
mean(cpm > 0) # 0.00096799
[1] 0.0009674
顧客產品矩陣通常是一個很稀疏的矩,陣有一些產品沒什麼人買
colSums(cpm) %>% quantile(seq(0,1,0.1))
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
1 1 2 4 6 8 13 20 35 76 8475
mean(colSums(cpm) > 10)
[1] 0.4484
刪去購買次數小於6的產品;為了方便顧客產品矩陣和顧客資料框的合併,我們選擇先保留沒有購買產品的顧客
cpm = cpm[, colSums(cpm) >= 6] # remove the least frequent products
# cpm = cpm[rowSums(cpm) > 0, ] # remove non-buying customers
cpm = cpm[, order(-colSums(cpm))] # order product by frequency
dim(cpm) # 32241 23787>14621
[1] 32241 14621
max(cpm) # 49
[1] 49
mean(cpm > 0) # 0.0015248
[1] 0.001525
table(cpm@x) %>% prop.table %>% round(4) %>% head(10)
1 2 3 4 5 6 7 8 9 10
0.9256 0.0579 0.0108 0.0032 0.0012 0.0006 0.0003 0.0002 0.0001 0.0001
# X單筆交易紀錄彙整
請你用一個指令列出被購買最多次的10個產品,和它們被購買的次數。
colSums(cpm)%>% head(10)
4714981010038 4711271000014 4719090900065 4711080010112 4710114128038
8475 6119 2444 2249 2178
4710265849066 4713985863121 4710088410139 4710583996008 4710908131589
2017 1976 1869 1840 1679
■ 在什麼前提之下,我們可以把購買這十個產品的次數當作變數,用來預測顧客在下一期會不會來購買呢?
1.no
2.這10個產品是依照購買次數但不一定能對顧客下次是否能來購買產生解釋力。
■ 我們如何把這十個變數,併入顧客資料框呢?
1.從Z資料中,找出每位顧客購買Top10產品的次數
2.再以CID併入資料集A中
■ 我們可不可以(在什麼前提之下我們可以)直接用cbind()新變數併入顧客資料框呢?
1.cbind()把矩陣橫向合併成一個大矩陣(列方式)
2.可以。前提是,row數目要相同且二個資料框cid 的次序要相同。
■ 我們期中競賽的資料,符合直接用cbind()併入新變數的條件嗎? 我們要如何確認這一件事呢?
1.符合。
2.要確認資料框row的次序要相同。
以產品的被購買頻率製作(顧客)變數的時候,排cpm在最前邊的(N個)欄+位就是變數!
nop= 400 # no. product = no. variables
k = 200 # no. cluster
set.seed(111); kg = kmeans(cpm[,1:nop], k)$cluster
table(kg) %>% as.vector %>% sort
[1] 1 1 1 1 1 1 1 1 1 1 1
[12] 1 1 1 1 1 1 1 1 1 1 1
[23] 2 2 2 2 2 2 3 3 3 3 3
[34] 3 3 4 4 4 4 4 4 4 5 5
[45] 6 6 6 6 7 7 7 8 8 8 9
[56] 9 9 9 10 10 10 10 11 11 11 11
[67] 11 12 13 13 15 15 15 16 16 18 19
[78] 20 20 20 20 21 22 22 22 24 24 25
[89] 25 27 28 28 32 32 35 36 39 40 41
[100] 42 44 45 46 47 47 48 49 49 50 51
[111] 52 53 56 58 58 61 63 66 67 68 69
[122] 69 72 81 85 85 86 87 90 94 96 97
[133] 97 100 100 101 110 111 113 114 116 118 123
[144] 123 126 130 134 136 141 141 142 143 162 165
[155] 172 175 178 179 182 182 184 187 195 210 222
[166] 225 228 228 237 239 242 253 254 258 258 266
[177] 268 272 287 293 301 311 325 329 350 351 363
[188] 396 407 407 410 418 432 448 473 523 561 1156
[199] 1266 11215
將分群結果併入顧客資料框(A)
df = A %>% inner_join(data.frame(
cust = as.integer(rownames(cpm)),
kg) )
Joining, by = "cust"
head(df) # 32241
計算各群組的平均屬性
plot( gvisMotionChart(
subset(df[,c(1,4,5,6,8,2,3,7,9)],
size >= 20 & size <= 1000), # range of group size
"kg", "dummy", options=list(width=800, height=600) ) )
plot( gvisMotionChart(
subset(df[,c(1,4,5,6,8,2,3,7,9)],
size >= 20 & size <= 1000), # range of group size
"kg", "dummy", options=list(width=800, height=600) ) )
# use global variables: cpm, kg
Sig = function(gx, P=1000, H=10) {
print(sprintf("Group %d: No. Customers = %d", gx, sum(kg==gx)))
bx = cpm[,1:P]
data.frame(n = col_sums(bx[kg==gx,])) %>% # frequency
mutate(
share = round(100*n/col_sums(bx),2), # %prod sold to this cluster (%prod已售出此群集)
conf = round(100*n/sum(kg==gx),2), # %buy this product, given cluster (%購買此產品,給定群集)
base = round(100*col_sums(bx)/nrow(bx),2), # %buy this product, all cust (%購買此產品,所有cust
lift = round(conf/base,1), # conf/base
name = colnames(bx) # name of prod # prod的名稱
) %>% arrange(desc(lift)) %>% head(H)
}
Sig(130)
[1] "Group 130: No. Customers = 97"
library(irlba)
if(LOAD) {
load("data/svd2a.rdata")
} else {
smx = cpm
smx@x = pmin(smx@x, 2) # cap at 2, similar to normalization
t0 = Sys.time()
svd = irlba(smx,
nv=400, # length of feature vector
maxit=800, work=800)
print(Sys.time() - t0) # 1.8795 mins
save(svd, file = "data/svd2a.rdata")
}
Time difference of 1.765 mins
■ 在什麼前提之下,我們可以把顧客購買產品的特徵向量當作變數,用來預測顧客在下一期會不會來購買呢?
1.可以。
2.如果要將產品的特徵向量(X)去預測(Y),(X)必須要對預測(Y)有意義,如X與Y要有相關性、預測力或影響力。
■ 如果可以的話,我們如何把顧客購買產品的特徵向併入顧客資料框呢?
1.從svd資料中,找出每位顧客對於400個產品的特徵向量
2.再以CID併入資料集A中
■ 我們可不可以(在什麼前提之下我們可以)直接用cbind()將特徵向量併入顧客資料框呢?
1.cbind()把矩陣橫向合併成一個大矩陣(列方式)
2.可以。前提是,row數目要相同且二個資料框cid 的次序要相同。
■ 我們期中競賽的資料,符合直接用cbind()併入特徵向量的條件嗎? 我們要如何確認這一件事呢?
1.符合。
2.要確認資料框row的次序要相同。
set.seed(111); kg = kmeans(svd$u, 200)$cluster
table(kg) %>% as.vector %>% sort
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1
[14] 1 1 1 1 1 1 1 1 1 1 1 1 1
[27] 1 1 1 1 1 1 1 1 1 1 1 1 1
[40] 1 1 1 1 1 1 1 1 1 1 1 1 1
[53] 1 1 1 1 1 2 2 2 2 2 3 4 4
[66] 5 7 10 14 30 31 32 36 38 38 39 39 40
[79] 40 41 44 45 46 47 49 54 59 62 62 69 71
[92] 77 79 79 80 82 82 84 87 91 101 103 109 110
[105] 111 113 117 120 123 127 127 129 132 133 134 135 136
[118] 139 141 143 143 147 147 157 159 159 160 160 160 166
[131] 168 169 172 175 180 181 181 182 183 184 184 188 190
[144] 190 193 194 195 196 198 198 200 201 201 202 202 204
[157] 204 204 207 209 209 210 213 214 216 219 219 222 225
[170] 233 234 235 236 237 237 238 239 241 248 248 248 253
[183] 256 257 258 259 261 261 264 269 277 281 285 293 305
[196] 411 612 896 1092 8987
# clustster summary 簇摘要
df = inner_join(A, data.frame(
cust = as.integer(rownames(cpm)), kg)) %>%
group_by(kg) %>% summarise(
avg_frequency = mean(f),
avg_monetary = mean(m),
avg_revenue_contr = mean(rev),
group_size = n(),
avg_recency = mean(r), #近因
avg_gross_profit = mean(raw)) %>% #毛利
ungroup %>%
mutate(dummy = 2001, kg = sprintf("G%03d",kg)) %>%
data.frame
Joining, by = "cust"
# Google Motion Chart
plot( gvisMotionChart(
subset(df, group_size >= 20 & group_size <= 1200),
"kg", "dummy", options=list(width=800, height=600) ) )
Sig(162) #代表性。G162平均購買頻率:11.5
[1] "Group 162: No. Customers = 87"
n_distinct(Z$tid)
[1] 119407
n_distinct(Z$prod)
[1] 23787
購物籃分析會使用arules這個套件,它要用的資料很容易準備,直接使用交易項目資料,一行程式就可以搞定。
library(arules)
library(arulesViz)
bx = as(split(Z$prod, Z$tid), "transactions")
itemFrequencyPlot(bx, topN=20, type="absolute", cex=0.8) #項目頻率圖
關聯規則(A => B)
rules = apriori(bx, parameter=list(supp=0.002, conf=0.6))
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support
0.6 0.1 1 none FALSE TRUE 5 0.002
minlen maxlen target ext
1 10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 238
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[23787 item(s), 119407 transaction(s)] done [0.16s].
sorting and recoding items ... [569 item(s)] done [0.01s].
creating transaction tree ... done [0.07s].
checking subsets of size 1 2 3 done [0.01s].
writing ... [14 rule(s)] done [0.00s].
creating S4 object ... done [0.03s].
summary(rules)
set of 14 rules
rule length distribution (lhs + rhs):sizes
2 3
8 6
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 2.00 2.00 2.43 3.00 3.00
summary of quality measures:
support confidence lift count
Min. :0.00210 Min. :0.613 Min. : 48.1 Min. :251
1st Qu.:0.00251 1st Qu.:0.680 1st Qu.: 55.1 1st Qu.:300
Median :0.00283 Median :0.727 Median : 58.6 Median :338
Mean :0.00321 Mean :0.724 Mean : 85.2 Mean :383
3rd Qu.:0.00333 3rd Qu.:0.774 3rd Qu.: 97.9 3rd Qu.:398
Max. :0.00585 Max. :0.809 Max. :170.9 Max. :699
mining info:
data ntransactions support confidence
bx 119407 0.002 0.6
關聯規則 (A => B):
options(digits=4)
inspect(rules)
lhs rhs support
[1] {4719090790017} => {4719090790000} 0.002940
[2] {4719090790000} => {4719090790017} 0.002940
[3] {719859796124} => {719859796117} 0.002102
[4] {4710011402026} => {4710011402019} 0.002822
[5] {4710085120697} => {4710085120680} 0.003467
[6] {4710011409056} => {4710011401128} 0.004430
[7] {4710011401135} => {4710011401128} 0.005854
[8] {4710011405133} => {4710011401128} 0.005176
[9] {4710011401135,4710011409056} => {4710011401128} 0.002713
[10] {4710011401128,4710011409056} => {4710011401135} 0.002713
[11] {4710011405133,4710011409056} => {4710011401128} 0.002261
[12] {4710011401135,4710011405133} => {4710011401128} 0.002831
[13] {4710011401135,4710011406123} => {4710011401128} 0.002445
[14] {4710011405133,4710011406123} => {4710011401128} 0.002219
confidence lift count
[1] 0.8088 170.92 351
[2] 0.6212 170.92 351
[3] 0.6972 147.61 251
[4] 0.6740 90.22 337
[5] 0.7753 100.41 414
[6] 0.6988 51.04 529
[7] 0.7524 54.95 699
[8] 0.6588 48.12 618
[9] 0.8020 58.57 324
[10] 0.6125 78.72 324
[11] 0.7606 55.55 270
[12] 0.7717 56.36 338
[13] 0.8022 58.59 292
[14] 0.7011 51.20 265
#lift:A被購買時,B被購買的機率增加的倍數 (與B的基礎機率相比)
#lhs(left hand side)和rhs(right hand side),表示左操作数右操作数
有互動圖表的幫助,我們可以把條件放寬,多找一些關聯規則進來
rules = apriori(bx, parameter=list(supp=0.0003, conf=0.5))
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support
0.5 0.1 1 none FALSE TRUE 5 0.0003
minlen maxlen target ext
1 10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 35
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[23787 item(s), 119407 transaction(s)] done [0.16s].
sorting and recoding items ... [4691 item(s)] done [0.02s].
creating transaction tree ... done [0.07s].
checking subsets of size 1 2 3 4 5 6
Mining stopped (maxlen reached). Only patterns up to a length of 6 returned!
done [0.22s].
writing ... [581 rule(s)] done [0.04s].
creating S4 object ... done [0.02s].
plot(rules,colors=c("red","green"),engine="htmlwidget",
marker=list(opacity=.6,size=8))
To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
plot(rules,method="matrix",shading="lift",engine="htmlwidget",
colors=c("red", "green"))
r1 = subset(rules, subset = rhs %in% c("93362993"))
plot(r1,method="graph",engine="htmlwidget",itemCol="cyan")
太少被購買的產品和購買太少產品的顧客都不適合使用Collaborative Filtering這種產品推薦方法,所以我們先對顧客和產品做一次篩選
library(recommenderlab)
rx = cpm[, colSums(cpm > 0) >= 50]
rx = rx[rowSums(rx > 0) >= 20 & rowSums(rx > 0) <= 300, ]
dim(rx) # 8846 3354
[1] 8846 3354
可以選擇要用
rx = as(rx, "realRatingMatrix") # realRatingMatrix 真正的評級矩陣
bx = binarize(rx, minRating=1) # binaryRatingMatrix 二元評級矩陣
UBCF:User Based Collaborative Filtering
rUBCF = Recommender(bx[1:8800,], method = "UBCF")
pred = predict(rUBCF, bx[8801:8846,], n=4)
do.call(rbind, as(pred, "list")) %>% head(15)
[,1] [,2] [,3] [,4]
2170855 "4711271000014" "4710114128038" "4714981010038" "4713985863121"
2171265 "4719090900065" "4710254049521" "4710036008562" "4714981010038"
2171340 "723125488040" "723125488064" "723125485032" "4714981010038"
2171425 "4710011401135" "4710011409056" "4711080010112" "4710011401142"
2171432 "4714981010038" "4710011406123" "4711258007371" "4710011401128"
2171555 "4719090900065" "4711271000014" "37000329169" "4710943109352"
2171883 "4711271000014" "4710583996008" "4710291112172" "4710018004704"
2172194 "4711271000014" "4714981010038" "4710114128038" "4710114105046"
2172392 "4903111345717" "4710908131589" "4710168705056" "4711271000014"
2172569 "4711271000014" "4714981010038" "4710128030037" "4712162000038"
2172583 "4714981010038" "4710085120093" "4719090900065" "4710154015206"
2172590 "4710011406123" "4710011401142" "4710857000028" "4710011432856"
2172668 "4711271000014" "4710088620156" "4719090900065" "4712425010712"
2172705 "4711271000014" "4714981010038" "37000445111" "37000440192"
2172811 "4714981010038" "37000442127" "4710719000333" "4710114128038"
IBCF:Item Based Collaborative Filtering
if(LOAD) {
load("data/recommenders.rdata")
} else{
rIBCF <- Recommender(bx[1:6000,], method = "IBCF")
}
pred = predict(rIBCF, bx[8801:8846,], n=4)
do.call(rbind, as(pred, "list")) %>% head(15)
[,1] [,2] [,3] [,4]
2170855 "4719090900065" "4714981010038" "4711271000014" "4712162000038"
2171265 "4719090900065" "4714981010038" "4711271000014" "4712162000038"
2171340 "37000445111" "723125488064" "4710036008586" "723125485032"
2171425 "4711311617899" "4711311218836" "4710011401135" "4710011409056"
2171432 "4714981010038" "4710857000042" "4710321791698" "2250078000350"
2171555 "93432641" "93362993" "4711271000014" "4713985863121"
2171883 "4710670200100" "4710670200407" "4711271000014" "4719581980248"
2172194 "4714108700019" "4714108700064" "4902430493437" "4909978199111"
2172392 "4710908131589" "4710706211759" "4719090900058" "4712425010712"
2172569 "4711371850243" "84501293529" "4710085121007" "4710375112210"
2172583 "4710085120093" "4710734001186" "4710085172702" "34000231508"
2172590 "4710011406123" "4711271000014" "4711437000162" "4710777110265"
2172668 "4711371850243" "4710126020474" "4719090900065" "723125485032"
2172705 "37000445111" "37000304593" "37000442127" "37000441809"
2172811 "4719581980293" "4719581980279" "4712067899287" "4710908131589"
save(rIBCF, rUBCF, file="data/recommenders.rdata")
set.seed(4321)
scheme = evaluationScheme(
bx, method="split", train = .75, given=5)
algorithms = list(
AR53 = list(name="AR", param=list(support=0.0005, confidence=0.3)),
AR43 = list(name="AR", param=list(support=0.0004, confidence=0.3)),
RANDOM = list(name="RANDOM", param=NULL),
POPULAR = list(name="POPULAR", param=NULL),
UBCF = list(name="UBCF", param=NULL),
IBCF = list(name="IBCF", param=NULL) )
if(LOAD) {
load("data/results2a.rdata")
} else {
t0 = Sys.time()
results = evaluate(
scheme, algorithms,
type="topNList", # method of evaluation 評估方法
n=c(5, 10, 15, 20) # no. recom. to be evaluated #RECOM待評估
)
print(Sys.time() - t0)
save(results, file="data/results2a.rdata")
}
AR run fold/sample [model time/prediction time]
1 [3.64sec/173.4sec]
AR run fold/sample [model time/prediction time]
1 [8.54sec/445.6sec]
RANDOM run fold/sample [model time/prediction time]
1 [0.01sec/10.62sec]
POPULAR run fold/sample [model time/prediction time]
1 [0.02sec/9.63sec]
UBCF run fold/sample [model time/prediction time]
1 [0sec/60.53sec]
IBCF run fold/sample [model time/prediction time]
1 [159.5sec/1.44sec]
Time difference of 14.71 mins
## AR run fold/sample [model time/prediction time]
## 1 [4.02sec/214.6sec]
## AR run fold/sample [model time/prediction time]
## 1 [10.49sec/538.5sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/9.48sec]
## POPULAR run fold/sample [model time/prediction time]
## 1 [0sec/11.09sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0sec/75.42sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [198.2sec/1.63sec]
## Time difference of 18.72 mins #時差52.66分鐘
# load("data/results.rdata")
par(mar=c(4,4,3,2),cex=0.8)
cols = c("red", "magenta", "gray", "orange", "blue", "green")
plot(results, annotate=c(1,3), legend="topleft", pch=19, lwd=2, col=cols)
abline(v=seq(0,0.006,0.001), h=seq(0,0.08,0.01), col='lightgray', lty=2)
getConfusionMatrix(results$IBCF)
[[1]]
TP FP FN TN precision recall TPR FPR
5 1.116 3.884 32.97 3311 0.2231 0.03899 0.03899 0.001171
10 1.699 8.301 32.39 3307 0.1699 0.05812 0.05812 0.002503
15 2.075 12.925 32.01 3302 0.1383 0.07021 0.07021 0.003898
20 2.385 17.615 31.70 3297 0.1193 0.08002 0.08002 0.005313