library(arules)
library(arulesViz)
library(tidyverse)
Mylist<-list(
c("A","C","D"),
c("B","C","E"),
c("A","B","C","E"),
c("B","E")
)
names(Mylist)<-paste("Tr",c(1:4),sep="")
Mylist
## $Tr1
## [1] "A" "C" "D"
##
## $Tr2
## [1] "B" "C" "E"
##
## $Tr3
## [1] "A" "B" "C" "E"
##
## $Tr4
## [1] "B" "E"
#将Mylist转化成交易数据库
MyTrans<-as(Mylist,"transactions")
MyTrans
## transactions in sparse format with
## 4 transactions (rows) and
## 5 items (columns)
summary(MyTrans)
## transactions as itemMatrix in sparse format with
## 4 rows (elements/itemsets/transactions) and
## 5 columns (items) and a density of 0.6
##
## most frequent items:
## B C E A D (Other)
## 3 3 3 2 1 0
##
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4
## 1 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 2.75 3.00 3.00 3.25 4.00
##
## includes extended item information - examples:
## labels
## 1 A
## 2 B
## 3 C
##
## includes extended transaction information - examples:
## transactionID
## 1 Tr1
## 2 Tr2
## 3 Tr3
inspect(MyTrans)
## items transactionID
## [1] {A, C, D} Tr1
## [2] {B, C, E} Tr2
## [3] {A, B, C, E} Tr3
## [4] {B, E} Tr4
itemFrequency(MyTrans)
## A B C D E
## 0.50 0.75 0.75 0.25 0.75
par(mfrow=c(1,2))
itemFrequencyPlot(MyTrans)
itemFrequencyPlot(MyTrans,topN=5)
MyFact<-matrix(c(
1,0,1,1,0,
0,1,1,0,1,
1,1,1,0,1,
0,1,0,0,1),
nrow = 4,ncol = 5,byrow=TRUE)
MyFact
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 0 1 1 0
## [2,] 0 1 1 0 1
## [3,] 1 1 1 0 1
## [4,] 0 1 0 0 1
dimnames(MyFact)<-list(paste("Tr",c(1:4),sep=""),c("A","B","C","D","E"))
MyFact
## A B C D E
## Tr1 1 0 1 1 0
## Tr2 0 1 1 0 1
## Tr3 1 1 1 0 1
## Tr4 0 1 0 0 1
MyTrans1<-as(MyFact,"transactions")
MyTrans1
## transactions in sparse format with
## 4 transactions (rows) and
## 5 items (columns)
summary(MyTrans1)
## transactions as itemMatrix in sparse format with
## 4 rows (elements/itemsets/transactions) and
## 5 columns (items) and a density of 0.6
##
## most frequent items:
## B C E A D (Other)
## 3 3 3 2 1 0
##
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4
## 1 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 2.75 3.00 3.00 3.25 4.00
##
## includes extended item information - examples:
## labels
## 1 A
## 2 B
## 3 C
##
## includes extended transaction information - examples:
## transactionID
## 1 Tr1
## 2 Tr2
## 3 Tr3
inspect(MyTrans1)
## items transactionID
## [1] {A, C, D} Tr1
## [2] {B, C, E} Tr2
## [3] {A, B, C, E} Tr3
## [4] {B, E} Tr4
itemFrequency(MyTrans1)
## A B C D E
## 0.50 0.75 0.75 0.25 0.75
par(mfrow=c(1,2))
itemFrequencyPlot(MyTrans1)
itemFrequencyPlot(MyTrans1,topN=5)
MyT<-data.frame(
TID=c(1,1,1,2,2,2,3,3,3,3,4,4),
items=c("A","C","D","B","C","E","A","B","C","E","B","E")
)
MyT
## TID items
## 1 1 A
## 2 1 C
## 3 1 D
## 4 2 B
## 5 2 C
## 6 2 E
## 7 3 A
## 8 3 B
## 9 3 C
## 10 3 E
## 11 4 B
## 12 4 E
(Mylist<-split(MyT[,"items"],MyT[,"TID"]))
## $`1`
## [1] "A" "C" "D"
##
## $`2`
## [1] "B" "C" "E"
##
## $`3`
## [1] "A" "B" "C" "E"
##
## $`4`
## [1] "B" "E"
MyTrans2<-as(Mylist,"transactions")
MyTrans2
## transactions in sparse format with
## 4 transactions (rows) and
## 5 items (columns)
summary(MyTrans2)
## transactions as itemMatrix in sparse format with
## 4 rows (elements/itemsets/transactions) and
## 5 columns (items) and a density of 0.6
##
## most frequent items:
## B C E A D (Other)
## 3 3 3 2 1 0
##
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4
## 1 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 2.75 3.00 3.00 3.25 4.00
##
## includes extended item information - examples:
## labels
## 1 A
## 2 B
## 3 C
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
inspect(MyTrans2)
## items transactionID
## [1] {A, C, D} 1
## [2] {B, C, E} 2
## [3] {A, B, C, E} 3
## [4] {B, E} 4
itemFrequency(MyTrans2)
## A B C D E
## 0.50 0.75 0.75 0.25 0.75
par(mfrow=c(1,2))
itemFrequencyPlot(MyTrans2)
itemFrequencyPlot(MyTrans2,topN=5)
data("Groceries")
Groceries
## transactions in sparse format with
## 9835 transactions (rows) and
## 169 items (columns)
inspect(Groceries[1:10])
## items
## [1] {citrus fruit,
## semi-finished bread,
## margarine,
## ready soups}
## [2] {tropical fruit,
## yogurt,
## coffee}
## [3] {whole milk}
## [4] {pip fruit,
## yogurt,
## cream cheese ,
## meat spreads}
## [5] {other vegetables,
## whole milk,
## condensed milk,
## long life bakery product}
## [6] {whole milk,
## butter,
## yogurt,
## rice,
## abrasive cleaner}
## [7] {rolls/buns}
## [8] {other vegetables,
## UHT-milk,
## rolls/buns,
## bottled beer,
## liquor (appetizer)}
## [9] {pot plants}
## [10] {whole milk,
## cereals}
先使用绝对频率再使用相对频率查看频率最高的前15个项目。
itemFrequency<-Groceries %>% itemFrequency() %>% sort(decreasing = TRUE)
itemFrequency[1:15] %>% round(3) %>% data.frame()
## .
## whole milk 0.256
## other vegetables 0.193
## rolls/buns 0.184
## soda 0.174
## yogurt 0.140
## bottled water 0.111
## root vegetables 0.109
## tropical fruit 0.105
## shopping bags 0.099
## sausage 0.094
## pastry 0.089
## citrus fruit 0.083
## bottled beer 0.081
## newspapers 0.080
## canned beer 0.078
par(mfrow=c(3,2))
itemFrequencyPlot(Groceries,topN=15,type="absolute")
itemFrequencyPlot(Groceries,topN=15,type="relative")
itemFrequencyPlot(Groceries,topN=15,type="absolute",horiz=T)
itemFrequencyPlot(Groceries,topN=15,type="relative",horiz=T)
itemFrequencyPlot(Groceries,topN=15,horiz=T)
itemFrequencyPlot(Groceries,topN=15,support=0.1,horiz=T)
对于参数组,我们需要设置最小支持度,最小置信度以及项集中的最大项数和最小项数。 通过上面的频率图和反复试错法,我们设定最小支持度为0.001,最小置信度为0.9,项集中最大项数为4
options(digits=3)
parameter1<-list(supp=0.001,conf=0.9,maxlen=4)
rules<-Groceries %>% apriori(parameter = parameter1)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.9 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 4 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.01s].
## writing ... [67 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
rules_lift<-rules %>% sort(by="lift",decreasing = TRUE)
inspect(rules_lift[1:5])
## lhs rhs support confidence coverage lift count
## [1] {liquor,
## red/blush wine} => {bottled beer} 0.00193 0.905 0.00214 11.24 19
## [2] {root vegetables,
## butter,
## cream cheese } => {yogurt} 0.00102 0.909 0.00112 6.52 10
## [3] {citrus fruit,
## root vegetables,
## soft cheese} => {other vegetables} 0.00102 1.000 0.00102 5.17 10
## [4] {pip fruit,
## whipped/sour cream,
## brown bread} => {other vegetables} 0.00112 1.000 0.00112 5.17 11
## [5] {butter,
## whipped/sour cream,
## soda} => {other vegetables} 0.00132 0.929 0.00142 4.80 13
rules_conf<-rules %>% sort(by="confidence",decreasing = TRUE)
inspect(rules_conf[1:20])
## lhs rhs support confidence coverage lift count
## [1] {rice,
## sugar} => {whole milk} 0.00122 1.000 0.00122 3.91 12
## [2] {canned fish,
## hygiene articles} => {whole milk} 0.00112 1.000 0.00112 3.91 11
## [3] {root vegetables,
## butter,
## rice} => {whole milk} 0.00102 1.000 0.00102 3.91 10
## [4] {root vegetables,
## whipped/sour cream,
## flour} => {whole milk} 0.00173 1.000 0.00173 3.91 17
## [5] {butter,
## soft cheese,
## domestic eggs} => {whole milk} 0.00102 1.000 0.00102 3.91 10
## [6] {citrus fruit,
## root vegetables,
## soft cheese} => {other vegetables} 0.00102 1.000 0.00102 5.17 10
## [7] {pip fruit,
## butter,
## hygiene articles} => {whole milk} 0.00102 1.000 0.00102 3.91 10
## [8] {root vegetables,
## whipped/sour cream,
## hygiene articles} => {whole milk} 0.00102 1.000 0.00102 3.91 10
## [9] {pip fruit,
## root vegetables,
## hygiene articles} => {whole milk} 0.00102 1.000 0.00102 3.91 10
## [10] {cream cheese ,
## domestic eggs,
## sugar} => {whole milk} 0.00112 1.000 0.00112 3.91 11
## [11] {curd,
## domestic eggs,
## sugar} => {whole milk} 0.00102 1.000 0.00102 3.91 10
## [12] {cream cheese ,
## domestic eggs,
## napkins} => {whole milk} 0.00112 1.000 0.00112 3.91 11
## [13] {pip fruit,
## whipped/sour cream,
## brown bread} => {other vegetables} 0.00112 1.000 0.00112 5.17 11
## [14] {other vegetables,
## cream cheese ,
## sugar} => {whole milk} 0.00153 0.938 0.00163 3.67 15
## [15] {citrus fruit,
## domestic eggs,
## sugar} => {whole milk} 0.00142 0.933 0.00153 3.65 14
## [16] {yogurt,
## domestic eggs,
## sugar} => {whole milk} 0.00142 0.933 0.00153 3.65 14
## [17] {pip fruit,
## whipped/sour cream,
## cream cheese } => {whole milk} 0.00132 0.929 0.00142 3.63 13
## [18] {butter,
## whipped/sour cream,
## soda} => {other vegetables} 0.00132 0.929 0.00142 4.80 13
## [19] {pip fruit,
## butter,
## pastry} => {other vegetables} 0.00132 0.929 0.00142 4.80 13
## [20] {whipped/sour cream,
## house keeping products} => {whole milk} 0.00122 0.923 0.00132 3.61 12
下面我们专门探讨一下与啤酒有关的交易。 首先利用crossTable()函数建立交叉表,然后检查商品之间的共同购买关系。
tab<-Groceries %>% crossTable()
tab[1:5,1:5]
## frankfurter sausage liver loaf ham meat
## frankfurter 580 99 7 25 32
## sausage 99 924 10 49 52
## liver loaf 7 10 50 3 0
## ham 25 49 3 256 9
## meat 32 52 0 9 254
tab["bottled beer","bottled beer"]
## [1] 792
tab["canned beer","canned beer"]
## [1] 764
tab["bottled beer","canned beer"]
## [1] 26
parameter2<-list(support=0.0015,confidence=0.3)
appearance1<-list(default="lhs",rhs="bottled beer")
beer.rules<-Groceries %>% apriori(parameter = parameter2,appearance = appearance1)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.3 0.1 1 none FALSE TRUE 5 0.0015 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 14
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [153 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [4 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
beer.rules %>% inspect()
## lhs rhs support confidence
## [1] {liquor} => {bottled beer} 0.00468 0.422
## [2] {liquor, red/blush wine} => {bottled beer} 0.00193 0.905
## [3] {soda, red/blush wine} => {bottled beer} 0.00163 0.356
## [4] {other vegetables, red/blush wine} => {bottled beer} 0.00153 0.306
## coverage lift count
## [1] 0.01108 5.24 46
## [2] 0.00214 11.24 19
## [3] 0.00458 4.42 16
## [4] 0.00498 3.80 15
library(arulesViz)
beer.rules %>% plot(method="graph",measure="lift",
shading="confidence")