packages = c(
"dplyr","ggplot2","googleVis","devtools","magrittr","slam","irlba","plotly",
"arules","arulesViz","Matrix","recommenderlab")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))
LOAD = TRUE
Sys.setlocale("LC_ALL","C")
library(dplyr)
library(ggplot2)
library(googleVis)
library(Matrix)
library(slam)
library(irlba)
library(plotly)
library(arules)
library(arulesViz)
library(recommenderlab)
Load the data frames and rename them
load("data/tf0.rdata")
A = A0; X = X0; Z = Z0; rm(A0,X0,Z0); gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2599666 138.9 3886542 207.6 3205452 171.2
Vcells 11331894 86.5 14320023 109.3 11332917 86.5
Z = subset(Z, cust %in% A$cust)
n_distinct(Z$cust) # 32241: the number of customers
[1] 32241
n_distinct(Z$prod) # 23787: the number of products
[1] 23787
Building the customer-product matrix is actually quick and easy
library(Matrix)
library(slam)
cpm = xtabs(~ cust + prod, Z, sparse=T) # customer product matrix
dim(cpm) # 32241 23787
[1] 32241 23787
mean(cpm > 0) # 0.0009674: the density (share of nonzero cells) of this matrix
[1] 0.0009674258
The customer-product matrix is usually very sparse; some products are hardly bought by anyone.
colSums(cpm) %>% quantile(seq(0,1,0.1))
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
1 1 2 4 6 8 13 20 35 76 8475
mean(colSums(cpm) > 10) # share of products bought more than 10 times; mean() of a logical vector gives the proportion of TRUE
[1] 0.4483541
Remove products bought fewer than 6 times, then remove customers who bought none of the remaining products.
cpm = cpm[, colSums(cpm) >= 6] # remove the least frequent products
# cpm = cpm[rowSums(cpm) > 0, ] # remove non-buying customers
cpm = cpm[, order(-colSums(cpm))] # order product by frequency
dim(cpm) # 32241 14621: from 23787 down to 14621 products
[1] 32241 14621
max(cpm) # 49
[1] 49
mean(cpm > 0) # 0.0015248
[1] 0.001524785
table(cpm@x) %>% prop.table %>% round(4) %>% head(10)
1 2 3 4 5 6 7 8 9 10
0.9256 0.0579 0.0108 0.0032 0.0012 0.0006 0.0003 0.0002 0.0001 0.0001
Exercise: with a single command, list the 10 most frequently purchased products and their purchase counts.
cpm[,1:10] %>% colSums
4714981010038 4711271000014 4719090900065 4711080010112 4710114128038 4710265849066 4713985863121
8475 6119 2444 2249 2178 2017 1976
4710088410139 4710583996008 4710908131589
1869 1840 1679
■ Under what assumptions can we treat the purchase counts of these ten products as variables for predicting whether a customer will buy again next period?
■ How do we merge these ten variables into the customer data frame?
■ Can we (and under what conditions can we) simply cbind() the new variables onto the customer data frame?
■ Does our midterm-competition data meet the conditions for a direct cbind()? How do we verify that?
As long as we have reason to believe x has predictive power for y, the purchase counts of these ten products can serve as variables. We can merge them into the customer data frame with cbind() or merge(): if the customer ids are in the same order, cbind() works directly; otherwise use merge(). (See the sketch after the checks below.)
# dim(cpm)                        # confirm cpm and A have the same number of rows
# sum( A$cust == rownames(cpm) )  # check whether A and cpm are in the same order (rownames = cust);
#                                 # summing the logical vector should give 32241 when all match
# With dimension reduction, we could likewise do:
# A = cbind(A , svd$u[ , 1:20])
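Putting the check and the merge together, a minimal sketch (top10 and the names p1...p10 are made up for illustration):
top10 = as.matrix(cpm[, 1:10])         # purchase counts of the 10 best sellers
colnames(top10) = paste0("p", 1:10)    # hypothetical variable names
if (all(A$cust == rownames(cpm))) {
  A10 = cbind(A, top10)                # same row order: cbind() is safe
} else {                               # otherwise join on the customer id
  A10 = merge(A, data.frame(cust = as.integer(rownames(cpm)), top10), by = "cust")
}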
When making (customer) variables from product purchase frequencies, the first N columns of the frequency-ordered cpm are the variables!
nop= 400 # no. product = no. variables
k = 200 # no. cluster
set.seed(111); kg = kmeans(cpm[,1:nop], k)$cluster
table(kg) %>% as.vector %>% sort
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[17] 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3
[33] 3 3 3 4 4 4 4 4 4 4 5 5 6 6 6 6
[49] 7 7 7 8 8 8 9 9 9 9 10 10 10 10 11 11
[65] 11 11 11 12 13 13 15 15 15 16 16 18 19 20 20 20
[81] 20 21 22 22 22 24 24 25 25 27 28 28 32 32 35 36
[97] 39 40 41 42 44 45 46 47 47 48 49 49 50 51 52 53
[113] 56 58 58 61 63 66 67 68 69 69 72 81 85 85 86 87
[129] 90 94 96 97 97 100 100 101 110 111 113 114 116 118 123 123
[145] 126 130 134 136 141 141 142 143 162 165 172 175 178 179 182 182
[161] 184 187 195 210 222 225 228 228 237 239 242 253 254 258 258 266
[177] 268 272 287 293 301 311 325 329 350 351 363 396 407 407 410 418
[193] 432 448 473 523 561 1156 1266 11215
Merge the clustering result into the customer data frame (A)
df = A %>% inner_join(data.frame(
cust = as.integer(rownames(cpm)),
kg) )
Joining, by = "cust"
head(df) # 32241
Compute the average attributes of each group
df = data.frame(
aggregate(. ~ kg, df[,c(2:7,10)], mean), # averages
size = as.vector(table(kg)), # no. customers in the group
dummy = 2001 # dummy column for googleVis
)
head(df)
plot( gvisMotionChart(
subset(df[,c(1,4,5,6,8,2,3,7,9)],
size >= 20 & size <= 1000), # range of group size
"kg", "dummy", options=list(width=800, height=600) ) )
# use global variables: cpm, kg
Sig = function(gx, P=1000, H=10) {
print(sprintf("Group %d: No. Customers = %d", gx, sum(kg==gx)))
bx = cpm[,1:P]
data.frame(n = col_sums(bx[kg==gx,])) %>% # frequency
mutate(
share = round(100*n/col_sums(bx),2), # %prod sold to this cluster
conf = round(100*n/sum(kg==gx),2), # %buy this product, given cluster
base = round(100*col_sums(bx)/nrow(bx),2), # %buy this product, all cust
lift = round(conf/base,1), # conf/base
name = colnames(bx) # name of prod
) %>% arrange(desc(lift)) %>% head(H)
}
Sig(130)
[1] "Group 130: No. Customers = 97"
■ Under what assumptions can we treat a customer's product-purchase feature vector as variables for predicting whether the customer will buy again next period?
■ If we can, how do we merge the feature vector into the customer data frame?
■ Can we (and under what conditions can we) simply cbind() the feature vector onto the customer data frame?
■ Does our midterm-competition data meet the conditions for a direct cbind()? How do we verify that?
As long as we have reason to believe x has predictive power for y, the feature vector of a customer's product purchases can serve as variables, and we can merge it into the customer data frame with cbind() or merge(): if the customer ids are in the same order, cbind() works directly; otherwise use merge(). The midterm-competition data does meet the conditions for a direct cbind(); we only need to check that its cust ordering matches. (See the sketch after the SVD computation below.)
library(irlba)
if(LOAD) {
load("data/svd2a.rdata")
} else {
smx = cpm
smx@x = pmin(smx@x, 2) # cap at 2, similar to normalization
t0 = Sys.time()
svd = irlba(smx,        # capping the sparse matrix at 2 is what gives the normalizing effect
  nv=400,               # length of feature vector
  maxit=800, work=800)  # nv=400 is rather large; as a rule of thumb, set maxit and work to 2*nv
print(Sys.time() - t0) # 1.8795 mins
save(svd, file = "data/svd2a.rdata")
}
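As noted above, once row alignment is confirmed, the latent features in svd$u can be appended to A directly. A minimal sketch (the names f1...f20 are hypothetical):
stopifnot(all(A$cust == rownames(cpm)))  # rows of svd$u follow the rows of cpm
feat = svd$u[, 1:20]                     # first 20 feature-vector columns
colnames(feat) = paste0("f", 1:20)       # hypothetical variable names
A20 = cbind(A, feat)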
set.seed(111); kg = kmeans(svd$u, 200)$cluster
table(kg) %>% as.vector %>% sort
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[20] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[58] 2 2 2 2 2 3 4 4 5 7 10 14 30 31 32 36 38 38 39
[77] 39 40 40 41 44 45 46 47 49 54 59 62 62 69 71 77 79 79 80
[96] 82 82 84 87 91 101 103 109 110 111 113 117 120 123 127 127 129 132 133
[115] 134 135 136 139 141 143 143 147 147 157 159 159 160 160 160 166 168 169 172
[134] 175 180 181 181 182 183 184 184 188 190 190 193 194 195 196 198 198 200 201
[153] 201 202 202 204 204 204 207 209 209 210 213 214 216 219 219 222 225 233 234
[172] 235 236 237 237 238 239 241 248 248 248 253 256 257 258 259 261 261 264 269
[191] 277 281 285 293 305 411 612 896 1092 8987
# cluster summary
df = inner_join(A, data.frame(
cust = as.integer(rownames(cpm)), kg)) %>%
group_by(kg) %>% summarise(
avg_frequency = mean(f),
avg_monetary = mean(m),
avg_revenue_contr = mean(rev),
group_size = n(),
avg_recency = mean(r),
avg_gross_profit = mean(raw)) %>%
ungroup %>%
mutate(dummy = 2001, kg = sprintf("G%03d",kg)) %>%
data.frame
Joining, by = "cust"
# Google Motion Chart
plot( gvisMotionChart(
subset(df, group_size >= 20 & group_size <= 1200),
"kg", "dummy", options=list(width=800, height=600) ) )
Sig(162)
[1] "Group 162: No. Customers = 87"
dim(cpm) # 32241 14621
[1] 32241 14621
bx and cpm carry the same information, just in different formats.
library(arules)
library(arulesViz) # for visualization
# bx = subset(Z, prod %in% as.numeric(colnames(cpm)),
#             select=c("cust","prod"))  # select product items
# bx = Z
# bx = split(bx$prod, bx$tid)           # split by transaction id
# bx = as(bx, "transactions")           # data structure for arules package
# Starting from Z:
#   prod %in% as.numeric(colnames(cpm)) keeps the rows whose product was bought 6+ times
#   select=c("cust","prod")             keeps those two columns
# bx is really just the customer-product data in another format
bx = as(split(Z$prod , Z$tid),'transactions')
itemFrequencyPlot(bx, topN=20, type="absolute", cex=0.8)
Association rules (A => B)
rules = apriori(bx, parameter=list(supp=0.002, conf=0.5))
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen maxlen target ext
0.5 0.1 1 none FALSE TRUE 5 0.002 1 10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 238
set item appearances ...[0 item(s)] done [0.01s].
set transactions ...[23787 item(s), 119407 transaction(s)] done [0.20s].
sorting and recoding items ... [569 item(s)] done [0.02s].
creating transaction tree ... done [0.07s].
checking subsets of size 1 2 3 done [0.03s].
writing ... [28 rule(s)] done [0.01s].
creating S4 object ... done [0.03s].
summary(rules)
set of 28 rules
rule length distribution (lhs + rhs):sizes
2 3
17 11
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.000 2.000 2.393 3.000 3.000
summary of quality measures:
support confidence lift count
Min. :0.002102 Min. :0.5000 Min. : 43.24 Min. :251.0
1st Qu.:0.002387 1st Qu.:0.5398 1st Qu.: 49.77 1st Qu.:285.0
Median :0.002826 Median :0.6044 Median : 58.58 Median :337.5
Mean :0.003081 Mean :0.6336 Mean : 71.34 Mean :367.9
3rd Qu.:0.003404 3rd Qu.:0.7139 3rd Qu.: 72.40 3rd Qu.:406.5
Max. :0.005854 Max. :0.8088 Max. :170.92 Max. :699.0
mining info:
data ntransactions support confidence
bx 119407 0.002 0.5
# We might want rules with support > 0.005 and confidence > 0.6; these thresholds are
# tunable: start high and lower them. The lower the thresholds, the more rules appear.
# Here the mining found 28 association rules.
Association rule (A => B):
lift: how many times more likely B is bought when A is bought, relative to B's base probability
rhs: Right Hand Side
The higher the support, confidence, and lift, the more important the rule.
# inspect the 28 rules
options(digits=4)
inspect(rules)
lhs rhs support confidence lift count
[1] {4719090790017} => {4719090790000} 0.002940 0.8088 170.92 351
[2] {4719090790000} => {4719090790017} 0.002940 0.6212 170.92 351
[3] {719859796124} => {719859796117} 0.002102 0.6972 147.61 251
[4] {4710011402026} => {4710011402019} 0.002822 0.6740 90.22 337
[5] {4710085120697} => {4710085120680} 0.003467 0.7753 100.41 414
[6] {4710011401142} => {4710011401128} 0.002203 0.5964 43.55 263
[7] {4710085172702} => {4710085172696} 0.002429 0.5400 62.00 290
[8] {4710085172702} => {4710085120628} 0.002462 0.5475 48.25 294
[9] {4710085120710} => {4710085120703} 0.002914 0.5613 89.84 348
[10] {4710018004704} => {4710018004605} 0.002990 0.5360 46.08 357
[11] {4710011409056} => {4710011401135} 0.003383 0.5337 68.60 404
[12] {4710011409056} => {4710011401128} 0.004430 0.6988 51.04 529
[13] {4710085120093} => {4710085120628} 0.003961 0.5267 46.42 473
[14] {4710011401135} => {4710011401128} 0.005854 0.7524 54.95 699
[15] {4710085172696} => {4710085120628} 0.004355 0.5000 44.06 520
[16] {4710011405133} => {4710011401128} 0.005176 0.6588 48.12 618
[17] {4710011406123} => {4710011401128} 0.004849 0.5920 43.24 579
[18] {4710011401135,4710011409056} => {4710011401128} 0.002713 0.8020 58.57 324
[19] {4710011401128,4710011409056} => {4710011401135} 0.002713 0.6125 78.72 324
[20] {4710011405133,4710011409056} => {4710011401128} 0.002261 0.7606 55.55 270
[21] {4710011401128,4710011409056} => {4710011405133} 0.002261 0.5104 64.97 270
[22] {4710085120093,4710085172696} => {4710085120628} 0.002136 0.5705 50.27 255
[23] {4710085120093,4710085120628} => {4710085172696} 0.002136 0.5391 61.90 255
[24] {4710011401135,4710011405133} => {4710011401128} 0.002831 0.7717 56.36 338
[25] {4710011401128,4710011405133} => {4710011401135} 0.002831 0.5469 70.30 338
[26] {4710011401135,4710011406123} => {4710011401128} 0.002445 0.8022 58.59 292
[27] {4710011401128,4710011406123} => {4710011401135} 0.002445 0.5043 64.82 292
[28] {4710011405133,4710011406123} => {4710011401128} 0.002219 0.7011 51.20 265
# lhs: left hand side; rhs: right hand side
# We want support to be high.
# Basket analysis never sees the person; it only sees one transaction at a time.
# B's base probability is the share of all baskets in which B appears,
# so confidence / lift = base probability of B.
# A rule with high support, confidence, and lift is a very important rule.
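As a sanity check, rule [1]'s lift can be recomputed by hand with arules' itemFrequency(), which returns each item's share of baskets. A small sketch:
baseB = itemFrequency(bx)["4719090790000"]  # base probability of the RHS item
0.8088 / baseB                              # confidence / base, approx. 170.9, matching rule [1]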
# install.packages(
# "https://cran.r-project.org/bin/windows/contrib/3.5/arulesViz_1.3-1.zip",
# repos=NULL)
# install.packages("arulesViz_1.3-1.zip", repos=NULL)
# library(plotly)
# plotly_arules(rules,colors=c("red","green"),
# marker=list(opacity=.6,size=10))
# plotly_arules(rules,method="matrix",
# shading="lift",
# colors=c("red", "green"))
In the plot below, every point is a rule and lift is mapped to color; this can serve as a reference for shelf placement.
plot(rules,colors=c("red","green"),engine="htmlwidget",
marker=list(opacity=.6,size=8))
# each point is one rule
plot(rules,method="matrix",shading="lift",engine="htmlwidget",
colors=c("red", "green"))
# a long horizontal streak means many rules point to the same right-hand side
r1 = subset(rules, subset = rhs %in% c("4719090790000"))
summary(r1)
set of 1 rules
rule length distribution (lhs + rhs):sizes
2
1
Min. 1st Qu. Median Mean 3rd Qu. Max.
2 2 2 2 2 2
summary of quality measures:
support confidence lift count
Min. :0.00294 Min. :0.809 Min. :171 Min. :351
1st Qu.:0.00294 1st Qu.:0.809 1st Qu.:171 1st Qu.:351
Median :0.00294 Median :0.809 Median :171 Median :351
Mean :0.00294 Mean :0.809 Mean :171 Mean :351
3rd Qu.:0.00294 3rd Qu.:0.809 3rd Qu.:171 3rd Qu.:351
Max. :0.00294 Max. :0.809 Max. :171 Max. :351
mining info:
data ntransactions support confidence
bx 119407 0.002 0.5
plot(r1,method="graph",engine="htmlwidget",itemCol="cyan")
# pick out, from the original rules, those whose right-hand side contains this product
r2 = subset(rules, subset = rhs %in% c("4710011401135"))
summary(r2)
set of 4 rules
rule length distribution (lhs + rhs):sizes
2 3
1 3
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 2.75 3.00 2.75 3.00 3.00
summary of quality measures:
support confidence lift count
Min. :0.00245 Min. :0.504 Min. :64.8 Min. :292
1st Qu.:0.00265 1st Qu.:0.526 1st Qu.:67.7 1st Qu.:316
Median :0.00277 Median :0.540 Median :69.5 Median :331
Mean :0.00284 Mean :0.549 Mean :70.6 Mean :340
3rd Qu.:0.00297 3rd Qu.:0.563 3rd Qu.:72.4 3rd Qu.:354
Max. :0.00338 Max. :0.613 Max. :78.7 Max. :404
mining info:
data ntransactions support confidence
bx 119407 0.002 0.5
plot(r2,method="graph",engine="htmlwidget",itemCol="cyan")
# Whenever the question is "given that certain items appear, what else appears?", association rules can answer it.
Products that are bought too rarely and customers who buy too few products are both unsuitable for a collaborative-filtering recommender, so we first filter the customers and the products.
library(recommenderlab)
rx = cpm[, colSums(cpm > 0) >= 50] # keep only products bought by at least 50 customers
rx = rx[rowSums(rx > 0) >= 20 & rowSums(rx > 0) <= 300, ] # keep customers who bought 20 to 300 distinct products; this line filters rows (customers)
dim(rx) # 8846 3354
[1] 8846 3354
We can choose to build the model on either a realRatingMatrix or a binaryRatingMatrix.
rx = as(rx, "realRatingMatrix") # realRatingMatrix
bx = binarize(rx, minRating=1) # binaryRatingMatrix
# minRating=1: any count >= 1 becomes 1; with minRating=2, only products bought twice or more become 1
# retailers usually care more about the binary version
# here we use bx
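To see how the two representations differ, we can coerce a small corner of each back to an ordinary matrix. A quick sketch (unbought cells appear as NA in the real-valued version):
as(rx, "matrix")[1:3, 1:5]  # purchase counts
as(bx, "matrix")[1:3, 1:5]  # TRUE/FALSE: bought at least minRating times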
UBCF: User-Based Collaborative Filtering
(rUBCF <- Recommender(bx[1:8800,], method = "UBCF"))
Recommender of type 'UBCF' for 'binaryRatingMatrix'
learned using 8800 users.
# fit the model on the first 8800 users and store it in rUBCF
# UBCF first groups together users with similar purchase habits
pred = predict(rUBCF, bx[8801:8846,], n=4) # test on the remaining 46 users, asking for 4 recommendations each
do.call(rbind, as(pred, "list")) %>% head(15) # drop head() to see all 46 customers' results
[,1] [,2] [,3] [,4]
2170855 "4711271000014" "4710114128038" "4714981010038" "4713985863121"
2171265 "4719090900065" "4710254049521" "4710036008562" "4714981010038"
2171340 "723125488040" "723125488064" "723125485032" "4714981010038"
2171425 "4710011401135" "4710011409056" "4711080010112" "4710011401142"
2171432 "4714981010038" "4710011406123" "4711258007371" "4710011401128"
2171555 "4719090900065" "4711271000014" "37000329169" "4710943109352"
2171883 "4711271000014" "4710583996008" "4710291112172" "4710018004704"
2172194 "4711271000014" "4714981010038" "4710114128038" "4710114105046"
2172392 "4903111345717" "4710908131589" "4710168705056" "4711271000014"
2172569 "4711271000014" "4714981010038" "4710128030037" "4712162000038"
2172583 "4714981010038" "4710085120093" "4719090900065" "4710154015206"
2172590 "4710011406123" "4710011401142" "4710857000028" "4710011432856"
2172668 "4711271000014" "4710088620156" "4719090900065" "4712425010712"
2172705 "4711271000014" "4714981010038" "37000445111" "37000440192"
2172811 "4714981010038" "37000442127" "4710719000333" "4710114128038"
IBCF: Item-Based Collaborative Filtering
(rIBCF <- Recommender(bx[1:8800,], method = "IBCF"))
pred = predict(rIBCF, bx[8801:8846,], n=4)
do.call(rbind, as(pred, "list")) %>% head(15)
[,1] [,2] [,3] [,4]
2170855 "4719090900065" "4714981010038" "4711271000014" "4712162000038"
2171265 "4719090900065" "4710015103288" "4714981010038" "4711271000014"
2171340 "37000445111" "4710036005608" "37000442127" "723125485032"
2171425 "4711311617899" "4711311218836" "4710011401135" "4710011409056"
2171432 "4714981010038" "4710321791698" "4710857000042" "4710626111252"
2171555 "93432641" "93362993" "4710105045320" "4711271000014"
2171883 "4710670200100" "4710670200407" "4711271000014" "3228020490329"
2172194 "4714108700019" "4714108700064" "4909978199111" "20332433"
2172392 "4710706211759" "4710908131589" "4719090900058" "4710731040614"
2172569 "4711371850243" "84501297329" "84501293529" "4710085121007"
2172583 "4710085172702" "4710085120093" "34000100095" "34000231508"
2172590 "4710011406123" "4711271000014" "4711437000162" "4710011401142"
2172668 "4711371850243" "4719090900065" "4714981010038" "4711437000117"
2172705 "37000445111" "4710018004605" "37000442127" "37000304593"
2172811 "4719581980293" "4712067899287" "4719581980279" "4710908131589"
save(rIBCF, rUBCF, file="data/recommenders.rdata")
set.seed(4321)
scheme = evaluationScheme(
bx, method="split", train = .75, given=5)
# split the data, training on 75%
# given=5: for each user in the remaining 25% test set, 5 purchased products are shown to the model, which must then predict the rest
algorithms = list(
AR53 = list(name="AR", param=list(support=0.0005, confidence=0.3)),
AR43 = list(name="AR", param=list(support=0.0004, confidence=0.3)),
RANDOM = list(name="RANDOM", param=NULL), # random guessing, essentially the baseline model
POPULAR = list(name="POPULAR", param=NULL), # recommend the most popular items you have not bought yet, a smart baseline
UBCF = list(name="UBCF", param=NULL), # param=NULL means use the method's default parameters
IBCF = list(name="IBCF", param=NULL) )
if(LOAD) {
load("data/results2a.rdata")
} else {
t0 = Sys.time()
results = evaluate(
scheme, algorithms,
type="topNList", # method of evaluation
n=c(5, 10, 15, 20) # no. of recommendations to evaluate: given 5 items per user, predict top 5, 10, 15, and 20
)
print(Sys.time() - t0)
save(results, file="data/results2a.rdata")
}
## AR run fold/sample [model time/prediction time]
## 1 [4.02sec/214.6sec]
## AR run fold/sample [model time/prediction time]
## 1 [10.49sec/538.5sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/9.48sec]
## POPULAR run fold/sample [model time/prediction time]
## 1 [0sec/11.09sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0sec/75.42sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [198.2sec/1.63sec]
## Time difference of 18.72 mins
## this block takes more than ten minutes to run
If we recommend 10 products, AR53 performs best: it has the highest True Positive Rate and the lowest False Positive Rate. If we recommend 15 or 20 products, IBCF is the best overall.
# load("data/results.rdata")
par(mar=c(4,4,3,2),cex=0.8)
cols = c("red", "magenta", "gray", "orange", "blue", "green")
plot(results, annotate=c(1,3), legend="topleft", pch=19, lwd=2, col=cols)
abline(v=seq(0,0.006,0.001), h=seq(0,0.08,0.01), col='lightgray', lty=2)
# ROC Curve
# the more items we recommend, the higher the chance of a hit
# gray is random guessing; orange is the smart baseline (POPULAR)
# higher TPR and lower FPR are better
# compare each method's TPR and FPR at n = 5, 10, 15, 20
getConfusionMatrix(results$IBCF)
[[1]]
TP FP FN TN precision recall TPR FPR
5 1.116 3.884 32.97 3311 0.2231 0.03899 0.03899 0.001171
10 1.699 8.301 32.39 3307 0.1699 0.05812 0.05812 0.002503
15 2.075 12.925 32.01 3302 0.1383 0.07021 0.07021 0.003898
20 2.385 17.615 31.70 3297 0.1193 0.08002 0.08002 0.005313
Group 4:
B034020012 謝雨靜
B034020027 陳韻卉
B044012015 王譯苓
M064111025 黃威豪
M064111039 王 欣
行傳所 孟祥瑄