软件包arules、arulesViz、tidyverse包的安装和载入

library(arules)
library(arulesViz)
library(tidyverse)

transactions型数据创建

方式一

Mylist<-list(
  c("A","C","D"),
  c("B","C","E"),
  c("A","B","C","E"),
  c("B","E")
 )

names(Mylist)<-paste("Tr",c(1:4),sep="")
Mylist
## $Tr1
## [1] "A" "C" "D"
## 
## $Tr2
## [1] "B" "C" "E"
## 
## $Tr3
## [1] "A" "B" "C" "E"
## 
## $Tr4
## [1] "B" "E"
#将Mylist转化成交易数据库
MyTrans<-as(Mylist,"transactions")
MyTrans
## transactions in sparse format with
##  4 transactions (rows) and
##  5 items (columns)
summary(MyTrans)
## transactions as itemMatrix in sparse format with
##  4 rows (elements/itemsets/transactions) and
##  5 columns (items) and a density of 0.6 
## 
## most frequent items:
##       B       C       E       A       D (Other) 
##       3       3       3       2       1       0 
## 
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4 
## 1 2 1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    2.75    3.00    3.00    3.25    4.00 
## 
## includes extended item information - examples:
##   labels
## 1      A
## 2      B
## 3      C
## 
## includes extended transaction information - examples:
##   transactionID
## 1           Tr1
## 2           Tr2
## 3           Tr3
inspect(MyTrans)
##     items        transactionID
## [1] {A, C, D}    Tr1          
## [2] {B, C, E}    Tr2          
## [3] {A, B, C, E} Tr3          
## [4] {B, E}       Tr4
itemFrequency(MyTrans)
##    A    B    C    D    E 
## 0.50 0.75 0.75 0.25 0.75
par(mfrow=c(1,2))
itemFrequencyPlot(MyTrans)
itemFrequencyPlot(MyTrans,topN=5)

方式二

MyFact<-matrix(c(
  1,0,1,1,0,
  0,1,1,0,1,
  1,1,1,0,1,
  0,1,0,0,1),
  nrow = 4,ncol = 5,byrow=TRUE)
MyFact
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    0    1    1    0
## [2,]    0    1    1    0    1
## [3,]    1    1    1    0    1
## [4,]    0    1    0    0    1
dimnames(MyFact)<-list(paste("Tr",c(1:4),sep=""),c("A","B","C","D","E"))
MyFact
##     A B C D E
## Tr1 1 0 1 1 0
## Tr2 0 1 1 0 1
## Tr3 1 1 1 0 1
## Tr4 0 1 0 0 1
MyTrans1<-as(MyFact,"transactions")
MyTrans1
## transactions in sparse format with
##  4 transactions (rows) and
##  5 items (columns)
summary(MyTrans1)
## transactions as itemMatrix in sparse format with
##  4 rows (elements/itemsets/transactions) and
##  5 columns (items) and a density of 0.6 
## 
## most frequent items:
##       B       C       E       A       D (Other) 
##       3       3       3       2       1       0 
## 
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4 
## 1 2 1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    2.75    3.00    3.00    3.25    4.00 
## 
## includes extended item information - examples:
##   labels
## 1      A
## 2      B
## 3      C
## 
## includes extended transaction information - examples:
##   transactionID
## 1           Tr1
## 2           Tr2
## 3           Tr3
inspect(MyTrans1)
##     items        transactionID
## [1] {A, C, D}    Tr1          
## [2] {B, C, E}    Tr2          
## [3] {A, B, C, E} Tr3          
## [4] {B, E}       Tr4
itemFrequency(MyTrans1)
##    A    B    C    D    E 
## 0.50 0.75 0.75 0.25 0.75
par(mfrow=c(1,2))
itemFrequencyPlot(MyTrans1)
itemFrequencyPlot(MyTrans1,topN=5)

方式三

MyT<-data.frame(
  TID=c(1,1,1,2,2,2,3,3,3,3,4,4),
  items=c("A","C","D","B","C","E","A","B","C","E","B","E")
)
MyT
##    TID items
## 1    1     A
## 2    1     C
## 3    1     D
## 4    2     B
## 5    2     C
## 6    2     E
## 7    3     A
## 8    3     B
## 9    3     C
## 10   3     E
## 11   4     B
## 12   4     E
(Mylist<-split(MyT[,"items"],MyT[,"TID"]))
## $`1`
## [1] "A" "C" "D"
## 
## $`2`
## [1] "B" "C" "E"
## 
## $`3`
## [1] "A" "B" "C" "E"
## 
## $`4`
## [1] "B" "E"
MyTrans2<-as(Mylist,"transactions")
MyTrans2
## transactions in sparse format with
##  4 transactions (rows) and
##  5 items (columns)
summary(MyTrans2)
## transactions as itemMatrix in sparse format with
##  4 rows (elements/itemsets/transactions) and
##  5 columns (items) and a density of 0.6 
## 
## most frequent items:
##       B       C       E       A       D (Other) 
##       3       3       3       2       1       0 
## 
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4 
## 1 2 1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    2.75    3.00    3.00    3.25    4.00 
## 
## includes extended item information - examples:
##   labels
## 1      A
## 2      B
## 3      C
## 
## includes extended transaction information - examples:
##   transactionID
## 1             1
## 2             2
## 3             3
inspect(MyTrans2)
##     items        transactionID
## [1] {A, C, D}    1            
## [2] {B, C, E}    2            
## [3] {A, B, C, E} 3            
## [4] {B, E}       4
itemFrequency(MyTrans2)
##    A    B    C    D    E 
## 0.50 0.75 0.75 0.25 0.75
par(mfrow=c(1,2))
itemFrequencyPlot(MyTrans2)
itemFrequencyPlot(MyTrans2,topN=5)

读取数据集Groceries

data("Groceries")
Groceries
## transactions in sparse format with
##  9835 transactions (rows) and
##  169 items (columns)
inspect(Groceries[1:10])
##      items                     
## [1]  {citrus fruit,            
##       semi-finished bread,     
##       margarine,               
##       ready soups}             
## [2]  {tropical fruit,          
##       yogurt,                  
##       coffee}                  
## [3]  {whole milk}              
## [4]  {pip fruit,               
##       yogurt,                  
##       cream cheese ,           
##       meat spreads}            
## [5]  {other vegetables,        
##       whole milk,              
##       condensed milk,          
##       long life bakery product}
## [6]  {whole milk,              
##       butter,                  
##       yogurt,                  
##       rice,                    
##       abrasive cleaner}        
## [7]  {rolls/buns}              
## [8]  {other vegetables,        
##       UHT-milk,                
##       rolls/buns,              
##       bottled beer,            
##       liquor (appetizer)}      
## [9]  {pot plants}              
## [10] {whole milk,              
##       cereals}

使用arules包中的itemFrequencyPlot()函数生成项目频率图

先使用绝对频率再使用相对频率查看频率最高的前15个项目。

itemFrequency<-Groceries %>% itemFrequency() %>% sort(decreasing = TRUE)
itemFrequency[1:15] %>% round(3) %>% data.frame()
##                      .
## whole milk       0.256
## other vegetables 0.193
## rolls/buns       0.184
## soda             0.174
## yogurt           0.140
## bottled water    0.111
## root vegetables  0.109
## tropical fruit   0.105
## shopping bags    0.099
## sausage          0.094
## pastry           0.089
## citrus fruit     0.083
## bottled beer     0.081
## newspapers       0.080
## canned beer      0.078
par(mfrow=c(3,2))
itemFrequencyPlot(Groceries,topN=15,type="absolute")
itemFrequencyPlot(Groceries,topN=15,type="relative")
itemFrequencyPlot(Groceries,topN=15,type="absolute",horiz=T)
itemFrequencyPlot(Groceries,topN=15,type="relative",horiz=T)
itemFrequencyPlot(Groceries,topN=15,horiz=T)
itemFrequencyPlot(Groceries,topN=15,support=0.1,horiz=T)

模型构建与模型评价

使用apriori()函数建模

对于参数组,我们需要设置最小支持度,最小置信度以及项集中的最大项数和最小项数。 通过上面的频率图和反复试错法,我们设定最小支持度为0.001,最小置信度为0.9,项集中最大项数为4

options(digits=3)
parameter1<-list(supp=0.001,conf=0.9,maxlen=4)
rules<-Groceries %>% apriori(parameter = parameter1)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##       4  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.01s].
## writing ... [67 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

模型评价

将规则按照提升度lift降序排列,并查看前5条规则

rules_lift<-rules %>% sort(by="lift",decreasing = TRUE)
inspect(rules_lift[1:5])
##     lhs                      rhs                support confidence coverage  lift count
## [1] {liquor,                                                                           
##      red/blush wine}      => {bottled beer}     0.00193      0.905  0.00214 11.24    19
## [2] {root vegetables,                                                                  
##      butter,                                                                           
##      cream cheese }       => {yogurt}           0.00102      0.909  0.00112  6.52    10
## [3] {citrus fruit,                                                                     
##      root vegetables,                                                                  
##      soft cheese}         => {other vegetables} 0.00102      1.000  0.00102  5.17    10
## [4] {pip fruit,                                                                        
##      whipped/sour cream,                                                               
##      brown bread}         => {other vegetables} 0.00112      1.000  0.00112  5.17    11
## [5] {butter,                                                                           
##      whipped/sour cream,                                                               
##      soda}                => {other vegetables} 0.00132      0.929  0.00142  4.80    13

将规则按照置信度confidence降序排列,并查看前5条规则

rules_conf<-rules %>% sort(by="confidence",decreasing = TRUE)
inspect(rules_conf[1:20])
##      lhs                         rhs                support confidence coverage lift count
## [1]  {rice,                                                                               
##       sugar}                  => {whole milk}       0.00122      1.000  0.00122 3.91    12
## [2]  {canned fish,                                                                        
##       hygiene articles}       => {whole milk}       0.00112      1.000  0.00112 3.91    11
## [3]  {root vegetables,                                                                    
##       butter,                                                                             
##       rice}                   => {whole milk}       0.00102      1.000  0.00102 3.91    10
## [4]  {root vegetables,                                                                    
##       whipped/sour cream,                                                                 
##       flour}                  => {whole milk}       0.00173      1.000  0.00173 3.91    17
## [5]  {butter,                                                                             
##       soft cheese,                                                                        
##       domestic eggs}          => {whole milk}       0.00102      1.000  0.00102 3.91    10
## [6]  {citrus fruit,                                                                       
##       root vegetables,                                                                    
##       soft cheese}            => {other vegetables} 0.00102      1.000  0.00102 5.17    10
## [7]  {pip fruit,                                                                          
##       butter,                                                                             
##       hygiene articles}       => {whole milk}       0.00102      1.000  0.00102 3.91    10
## [8]  {root vegetables,                                                                    
##       whipped/sour cream,                                                                 
##       hygiene articles}       => {whole milk}       0.00102      1.000  0.00102 3.91    10
## [9]  {pip fruit,                                                                          
##       root vegetables,                                                                    
##       hygiene articles}       => {whole milk}       0.00102      1.000  0.00102 3.91    10
## [10] {cream cheese ,                                                                      
##       domestic eggs,                                                                      
##       sugar}                  => {whole milk}       0.00112      1.000  0.00112 3.91    11
## [11] {curd,                                                                               
##       domestic eggs,                                                                      
##       sugar}                  => {whole milk}       0.00102      1.000  0.00102 3.91    10
## [12] {cream cheese ,                                                                      
##       domestic eggs,                                                                      
##       napkins}                => {whole milk}       0.00112      1.000  0.00112 3.91    11
## [13] {pip fruit,                                                                          
##       whipped/sour cream,                                                                 
##       brown bread}            => {other vegetables} 0.00112      1.000  0.00112 5.17    11
## [14] {other vegetables,                                                                   
##       cream cheese ,                                                                      
##       sugar}                  => {whole milk}       0.00153      0.938  0.00163 3.67    15
## [15] {citrus fruit,                                                                       
##       domestic eggs,                                                                      
##       sugar}                  => {whole milk}       0.00142      0.933  0.00153 3.65    14
## [16] {yogurt,                                                                             
##       domestic eggs,                                                                      
##       sugar}                  => {whole milk}       0.00142      0.933  0.00153 3.65    14
## [17] {pip fruit,                                                                          
##       whipped/sour cream,                                                                 
##       cream cheese }          => {whole milk}       0.00132      0.929  0.00142 3.63    13
## [18] {butter,                                                                             
##       whipped/sour cream,                                                                 
##       soda}                   => {other vegetables} 0.00132      0.929  0.00142 4.80    13
## [19] {pip fruit,                                                                          
##       butter,                                                                             
##       pastry}                 => {other vegetables} 0.00132      0.929  0.00142 4.80    13
## [20] {whipped/sour cream,                                                                 
##       house keeping products} => {whole milk}       0.00122      0.923  0.00132 3.61    12

设置前项或者后项限制条件,进行规则选取

下面我们专门探讨一下与啤酒有关的交易。 首先利用crossTable()函数建立交叉表,然后检查商品之间的共同购买关系。

tab<-Groceries %>% crossTable()
tab[1:5,1:5]
##             frankfurter sausage liver loaf ham meat
## frankfurter         580      99          7  25   32
## sausage              99     924         10  49   52
## liver loaf            7      10         50   3    0
## ham                  25      49          3 256    9
## meat                 32      52          0   9  254
tab["bottled beer","bottled beer"]
## [1] 792
tab["canned beer","canned beer"]
## [1] 764
tab["bottled beer","canned beer"]
## [1] 26
parameter2<-list(support=0.0015,confidence=0.3)
appearance1<-list(default="lhs",rhs="bottled beer")
beer.rules<-Groceries %>% apriori(parameter = parameter2,appearance = appearance1)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.3    0.1    1 none FALSE            TRUE       5  0.0015      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 14 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [153 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [4 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
beer.rules %>% inspect()
##     lhs                                   rhs            support confidence
## [1] {liquor}                           => {bottled beer} 0.00468 0.422     
## [2] {liquor, red/blush wine}           => {bottled beer} 0.00193 0.905     
## [3] {soda, red/blush wine}             => {bottled beer} 0.00163 0.356     
## [4] {other vegetables, red/blush wine} => {bottled beer} 0.00153 0.306     
##     coverage lift  count
## [1] 0.01108   5.24 46   
## [2] 0.00214  11.24 19   
## [3] 0.00458   4.42 16   
## [4] 0.00498   3.80 15

利用arulesViz包中的plot()函数进行关联规则可视化

library(arulesViz)
beer.rules %>% plot(method="graph",measure="lift",
                    shading="confidence")