#install.packages("arules")
#install.packages("arulesViz")
library(arules)
library(arulesViz)
library(tidyverse)
# Load the Groceries transaction data set shipped with arules and show its
# internal structure.
data("Groceries", package = "arules")
str(Groceries)
## Formal class 'transactions' [package "arules"] with 3 slots
## ..@ data :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
## .. .. ..@ i : int [1:43367] 13 60 69 78 14 29 98 24 15 29 ...
## .. .. ..@ p : int [1:9836] 0 4 7 8 12 16 21 22 27 28 ...
## .. .. ..@ Dim : int [1:2] 169 9835
## .. .. ..@ Dimnames:List of 2
## .. .. .. ..$ : NULL
## .. .. .. ..$ : NULL
## .. .. ..@ factors : list()
## ..@ itemInfo :'data.frame': 169 obs. of 3 variables:
## .. ..$ labels: chr [1:169] "frankfurter" "sausage" "liver loaf" "ham" ...
## .. ..$ level2: Factor w/ 55 levels "baby food","bags",..: 44 44 44 44 44 44 44 42 42 41 ...
## .. ..$ level1: Factor w/ 10 levels "canned food",..: 6 6 6 6 6 6 6 6 6 6 ...
## ..@ itemsetInfo:'data.frame': 0 obs. of 0 variables
先计算各商品的相对频率(支持度)并列出频率最高的前15个商品,再分别按绝对频率和相对频率绘制前15个商品的频率图。
# Relative frequency of every item, sorted from most to least frequent.
# NOTE(review): this variable has the same name as the arules function
# itemFrequency(); the name is kept because later code may refer to it.
itemFrequency <- sort(itemFrequency(Groceries), decreasing = TRUE)

# Show the first 15 entries, rounded to three decimals, as a one-column table.
round(itemFrequency[1:15], 3) %>%
  data.frame()
## .
## whole milk 0.256
## other vegetables 0.193
## rolls/buns 0.184
## soda 0.174
## yogurt 0.140
## bottled water 0.111
## root vegetables 0.109
## tropical fruit 0.105
## shopping bags 0.099
## sausage 0.094
## pastry 0.089
## citrus fruit 0.083
## bottled beer 0.081
## newspapers 0.080
## canned beer 0.078
# Draw four item-frequency bar charts in a 2 x 2 grid:
#   1. absolute counts of the 15 most frequent items,
#   2. relative frequencies of the same items,
#   3. the same chart drawn horizontally,
#   4. horizontal chart limited to items with support of at least 0.1.
# The previous graphics parameters are saved and restored afterwards so the
# 2 x 2 layout does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
itemFrequencyPlot(Groceries, topN = 15, type = "absolute")
itemFrequencyPlot(Groceries, topN = 15, type = "relative")
# TRUE instead of T: T is an ordinary variable that can be reassigned.
itemFrequencyPlot(Groceries, topN = 15, horiz = TRUE)
itemFrequencyPlot(Groceries, topN = 15, support = 0.1, horiz = TRUE)
par(old_par)
我们发现,啤酒在杂货店“最常被购买的商品”中只排第13位(瓶装)和第15位(罐装)。瓶装啤酒和罐装啤酒各自只出现在不到10%的购买记录中(分别约为8.1%和7.8%)。
如果频繁项集 L 的所有超集都是非频繁项集,那么称 L 为最大频繁项集(或最大频繁模式),记为 MFI(Maximal Frequent Itemset)。
(1)对于参数组,我们需要设置最小支持度和 target 参数值。通过上面的频率图和反复试错,我们将最小支持度设定为0.005,分别生成频繁项集和最大频繁项集。
# Print numbers with three significant digits from here on.
options(digits = 3)
# Mine the maximal frequent itemsets with ECLAT at a minimum support of 0.005
# and sort the result by support.
parameter1 <- list(
  support = 0.005,
  target = "maximally frequent itemsets"
)
item_maxF <- sort(eclat(Groceries, parameter = parameter1), by = "support")
## Eclat
##
## parameter specification:
## tidLists support minlen maxlen target ext
## FALSE 0.005 1 10 maximally frequent itemsets TRUE
##
## algorithmic control:
## sparse sort verbose
## 7 -2 TRUE
##
## Absolute minimum support count: 49
##
## create itemset ...
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [120 item(s)] done [0.00s].
## creating sparse bit matrix ... [120 row(s), 9835 column(s)] done [0.00s].
## writing ... [708 set(s)] done [0.01s].
## Creating S4 object ... done [0.00s].
# Keep the first 49 maximal frequent itemsets (already sorted by support),
# print them, and convert them to a data frame for export.
# inspect() is a display function, so the table used for the CSV is built with
# the explicit coercion as(, "data.frame") rather than from inspect()'s
# return value.
item_maxF <- item_maxF[1:49]
inspect(item_maxF)
item_maxF <- as(item_maxF, "data.frame")
## items support count
## [1] {red/blush wine} 0.0192 189
## [2] {white wine} 0.0190 187
## [3] {soda, newspapers} 0.0146 144
## [4] {seasonal products} 0.0142 140
## [5] {other vegetables, whole milk, soda} 0.0139 137
## [6] {soda, canned beer} 0.0138 136
## [7] {whole milk, dessert} 0.0137 135
## [8] {tropical fruit, shopping bags} 0.0135 133
## [9] {pip fruit, soda} 0.0133 131
## [10] {packaged fruit/vegetables} 0.0130 128
## [11] {citrus fruit, soda} 0.0128 126
## [12] {whole milk, waffles} 0.0127 125
## [13] {other vegetables, whole milk, domestic eggs} 0.0123 121
## [14] {soda, napkins} 0.0120 118
## [15] {pork, soda} 0.0119 117
## [16] {pastry, shopping bags} 0.0119 117
## [17] {berries, whole milk} 0.0118 116
## [18] {rolls/buns, chocolate} 0.0118 116
## [19] {frozen fish} 0.0117 115
## [20] {whole milk, butter milk} 0.0116 114
## [21] {other vegetables, dessert} 0.0116 114
## [22] {ham, whole milk} 0.0115 113
## [23] {other vegetables, whole milk, butter} 0.0115 113
## [24] {canned beer, shopping bags} 0.0114 112
## [25] {cling film/bags} 0.0114 112
## [26] {rolls/buns, canned beer} 0.0113 111
## [27] {frankfurter, soda} 0.0113 111
## [28] {bottled water, newspapers} 0.0113 111
## [29] {sausage, citrus fruit} 0.0113 111
## [30] {whole milk, salty snack} 0.0112 110
## [31] {spread cheese} 0.0112 110
## [32] {liquor} 0.0111 109
## [33] {rolls/buns, coffee} 0.0110 108
## [34] {bottled water, shopping bags} 0.0110 108
## [35] {tropical fruit, whole milk, rolls/buns} 0.0110 108
## [36] {whole milk, sliced cheese} 0.0108 106
## [37] {other vegetables, salty snack} 0.0108 106
## [38] {other vegetables, whole milk, bottled water} 0.0108 106
## [39] {canned vegetables} 0.0108 106
## [40] {frozen dessert} 0.0108 106
## [41] {salt} 0.0108 106
## [42] {sausage, brown bread} 0.0107 105
## [43] {fruit/vegetable juice, shopping bags} 0.0107 105
## [44] {berries, yogurt} 0.0106 104
## [45] {pip fruit, bottled water} 0.0106 104
## [46] {other vegetables, whole milk, pastry} 0.0106 104
## [47] {whole milk, yogurt, soda} 0.0105 103
## [48] {dish cleaner} 0.0105 103
## [49] {other vegetables, butter milk} 0.0104 102
# Save the maximal-frequent-itemset table to disk.
write.csv(x = item_maxF, file = "item_maxF.csv")
# Mine all frequent itemsets with ECLAT at a minimum support of 0.005 and sort
# the result by support.
parameter2 <- list(
  support = 0.005,
  target = "frequent itemsets"
)
item_F <- sort(eclat(Groceries, parameter = parameter2), by = "support")
## Eclat
##
## parameter specification:
## tidLists support minlen maxlen target ext
## FALSE 0.005 1 10 frequent itemsets TRUE
##
## algorithmic control:
## sparse sort verbose
## 7 -2 TRUE
##
## Absolute minimum support count: 49
##
## create itemset ...
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [120 item(s)] done [0.00s].
## creating sparse bit matrix ... [120 row(s), 9835 column(s)] done [0.00s].
## writing ... [1001 set(s)] done [0.01s].
## Creating S4 object ... done [0.00s].
# Keep the first 49 frequent itemsets (already sorted by support), print them,
# and convert them to a data frame for export.
# inspect() is a display function, so the table is built with the explicit
# coercion as(, "data.frame") rather than from inspect()'s return value.
item_F <- item_F[1:49]
inspect(item_F)
item_F <- as(item_F, "data.frame")
## items support count
## [1] {whole milk} 0.2555 2513
## [2] {other vegetables} 0.1935 1903
## [3] {rolls/buns} 0.1839 1809
## [4] {soda} 0.1744 1715
## [5] {yogurt} 0.1395 1372
## [6] {bottled water} 0.1105 1087
## [7] {root vegetables} 0.1090 1072
## [8] {tropical fruit} 0.1049 1032
## [9] {shopping bags} 0.0985 969
## [10] {sausage} 0.0940 924
## [11] {pastry} 0.0890 875
## [12] {citrus fruit} 0.0828 814
## [13] {bottled beer} 0.0805 792
## [14] {newspapers} 0.0798 785
## [15] {canned beer} 0.0777 764
## [16] {pip fruit} 0.0756 744
## [17] {other vegetables, whole milk} 0.0748 736
## [18] {fruit/vegetable juice} 0.0723 711
## [19] {whipped/sour cream} 0.0717 705
## [20] {brown bread} 0.0649 638
## [21] {domestic eggs} 0.0634 624
## [22] {frankfurter} 0.0590 580
## [23] {margarine} 0.0586 576
## [24] {coffee} 0.0581 571
## [25] {pork} 0.0577 567
## [26] {whole milk, rolls/buns} 0.0566 557
## [27] {whole milk, yogurt} 0.0560 551
## [28] {butter} 0.0554 545
## [29] {curd} 0.0533 524
## [30] {beef} 0.0525 516
## [31] {napkins} 0.0524 515
## [32] {chocolate} 0.0496 488
## [33] {root vegetables, whole milk} 0.0489 481
## [34] {frozen vegetables} 0.0481 473
## [35] {root vegetables, other vegetables} 0.0474 466
## [36] {other vegetables, yogurt} 0.0434 427
## [37] {chicken} 0.0429 422
## [38] {other vegetables, rolls/buns} 0.0426 419
## [39] {tropical fruit, whole milk} 0.0423 416
## [40] {white bread} 0.0421 414
## [41] {whole milk, soda} 0.0401 394
## [42] {cream cheese } 0.0397 390
## [43] {waffles} 0.0384 378
## [44] {rolls/buns, soda} 0.0383 377
## [45] {salty snack} 0.0378 372
## [46] {long life bakery product} 0.0374 368
## [47] {dessert} 0.0371 365
## [48] {tropical fruit, other vegetables} 0.0359 353
## [49] {whole milk, bottled water} 0.0344 338
# Save the frequent-itemset table to its own file. The original line was a
# copy of the earlier export and wrote item_maxF to "item_maxF.csv" a second
# time, so the frequent itemsets were never saved.
write.csv(item_F, "item_F.csv")
# Print numbers with three significant digits.
options(digits = 3)
# Mine the frequent itemsets again (minimum support 0.005): item_F was
# reassigned after the first mining run and no longer holds the itemsets
# object that rule induction needs.
parameter2 <- list(
  support = 0.005,
  target = "frequent itemsets"
)
item_F <- eclat(Groceries, parameter = parameter2)
## Eclat
##
## parameter specification:
## tidLists support minlen maxlen target ext
## FALSE 0.005 1 10 frequent itemsets TRUE
##
## algorithmic control:
## sparse sort verbose
## 7 -2 TRUE
##
## Absolute minimum support count: 49
##
## create itemset ...
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [120 item(s)] done [0.00s].
## creating sparse bit matrix ... [120 row(s), 9835 column(s)] done [0.00s].
## writing ... [1001 set(s)] done [0.01s].
## Creating S4 object ... done [0.00s].
# Induce association rules from the frequent itemsets with a minimum
# confidence of 0.6 and sort them by lift.
MyRules_lift <- item_F %>%
  ruleInduction(transactions = Groceries, confidence = 0.6) %>%
  sort(by = "lift")

# Print the rules, then build the export table with an explicit coercion.
# inspect() is a display function (here it wraps long rules over several
# lines), so its return value is not used as the table for the CSV.
inspect(MyRules_lift)
MyRules_l <- as(MyRules_lift, "data.frame")
## lhs rhs support confidence lift itemset
## [1] {citrus fruit,
## root vegetables,
## whole milk} => {other vegetables} 0.00580 0.633 3.27 734
## [2] {pip fruit,
## root vegetables,
## whole milk} => {other vegetables} 0.00549 0.614 3.17 681
## [3] {pip fruit,
## whipped/sour cream} => {other vegetables} 0.00559 0.604 3.12 641
## [4] {root vegetables,
## onions} => {other vegetables} 0.00569 0.602 3.11 111
## [5] {tropical fruit,
## root vegetables,
## yogurt} => {whole milk} 0.00569 0.700 2.74 823
## [6] {pip fruit,
## root vegetables,
## other vegetables} => {whole milk} 0.00549 0.675 2.64 681
## [7] {butter,
## whipped/sour cream} => {whole milk} 0.00671 0.660 2.58 530
## [8] {pip fruit,
## whipped/sour cream} => {whole milk} 0.00600 0.648 2.54 640
## [9] {butter,
## yogurt} => {whole milk} 0.00935 0.639 2.50 538
## [10] {root vegetables,
## butter} => {whole milk} 0.00824 0.638 2.50 536
## [11] {tropical fruit,
## curd} => {whole milk} 0.00651 0.634 2.48 358
## [12] {pip fruit,
## other vegetables,
## yogurt} => {whole milk} 0.00508 0.625 2.45 685
## [13] {pip fruit,
## domestic eggs} => {whole milk} 0.00539 0.624 2.44 586
## [14] {tropical fruit,
## butter} => {whole milk} 0.00620 0.622 2.44 534
## [15] {domestic eggs,
## margarine} => {whole milk} 0.00519 0.622 2.43 504
## [16] {butter,
## domestic eggs} => {whole milk} 0.00600 0.621 2.43 529
## [17] {tropical fruit,
## other vegetables,
## yogurt} => {whole milk} 0.00763 0.620 2.43 833
## [18] {other vegetables,
## yogurt,
## fruit/vegetable juice} => {whole milk} 0.00508 0.617 2.42 621
## [19] {tropical fruit,
## domestic eggs} => {whole milk} 0.00691 0.607 2.38 588
## [20] {root vegetables,
## other vegetables,
## whipped/sour cream} => {whole milk} 0.00519 0.607 2.38 648
## [21] {root vegetables,
## other vegetables,
## yogurt} => {whole milk} 0.00783 0.606 2.37 848
## [22] {butter,
## bottled water} => {whole milk} 0.00539 0.602 2.36 533
# Save the lift-sorted rule table to disk.
write.csv(x = MyRules_l, file = "MyRules_lift.csv")
# Keep only the rules whose right-hand side is "whole milk" and whose lift
# exceeds 1.2, then print them.
options(digits = 3)
rules_milk <- subset(
  MyRules_lift,
  subset = rhs %in% "whole milk" & lift > 1.2
)
inspect(rules_milk)
## lhs rhs support confidence lift itemset
## [1] {tropical fruit,
## root vegetables,
## yogurt} => {whole milk} 0.00569 0.700 2.74 823
## [2] {pip fruit,
## root vegetables,
## other vegetables} => {whole milk} 0.00549 0.675 2.64 681
## [3] {butter,
## whipped/sour cream} => {whole milk} 0.00671 0.660 2.58 530
## [4] {pip fruit,
## whipped/sour cream} => {whole milk} 0.00600 0.648 2.54 640
## [5] {butter,
## yogurt} => {whole milk} 0.00935 0.639 2.50 538
## [6] {root vegetables,
## butter} => {whole milk} 0.00824 0.638 2.50 536
## [7] {tropical fruit,
## curd} => {whole milk} 0.00651 0.634 2.48 358
## [8] {pip fruit,
## other vegetables,
## yogurt} => {whole milk} 0.00508 0.625 2.45 685
## [9] {pip fruit,
## domestic eggs} => {whole milk} 0.00539 0.624 2.44 586
## [10] {tropical fruit,
## butter} => {whole milk} 0.00620 0.622 2.44 534
## [11] {domestic eggs,
## margarine} => {whole milk} 0.00519 0.622 2.43 504
## [12] {butter,
## domestic eggs} => {whole milk} 0.00600 0.621 2.43 529
## [13] {tropical fruit,
## other vegetables,
## yogurt} => {whole milk} 0.00763 0.620 2.43 833
## [14] {other vegetables,
## yogurt,
## fruit/vegetable juice} => {whole milk} 0.00508 0.617 2.42 621
## [15] {tropical fruit,
## domestic eggs} => {whole milk} 0.00691 0.607 2.38 588
## [16] {root vegetables,
## other vegetables,
## whipped/sour cream} => {whole milk} 0.00519 0.607 2.38 648
## [17] {root vegetables,
## other vegetables,
## yogurt} => {whole milk} 0.00783 0.606 2.37 848
## [18] {butter,
## bottled water} => {whole milk} 0.00539 0.602 2.36 533
# Draw the "whole milk" rules as a graph.
plot(rules_milk, method = "graph")
# Select the rules that contain "pip fruit" anywhere in the rule (left- or
# right-hand side) and print them.
fruit_rules <- subset(MyRules_lift, items %in% "pip fruit")
fruit_rules %>%
  inspect()
## lhs rhs support confidence lift itemset
## [1] {pip fruit,
## root vegetables,
## whole milk} => {other vegetables} 0.00549 0.614 3.17 681
## [2] {pip fruit,
## whipped/sour cream} => {other vegetables} 0.00559 0.604 3.12 641
## [3] {pip fruit,
## root vegetables,
## other vegetables} => {whole milk} 0.00549 0.675 2.64 681
## [4] {pip fruit,
## whipped/sour cream} => {whole milk} 0.00600 0.648 2.54 640
## [5] {pip fruit,
## other vegetables,
## yogurt} => {whole milk} 0.00508 0.625 2.45 685
## [6] {pip fruit,
## domestic eggs} => {whole milk} 0.00539 0.624 2.44 586
# Visualise the "pip fruit" rules twice, with method "graph" and then with
# method "group".
fruit_rules %>%
  plot(method = "graph")
fruit_rules %>%
  plot(method = "group")
library(arulesSequences)
# Read the web-browsing log as transactions. Fields are comma-separated and
# the first two fields of each line are read as sequenceID and eventID.
MyTrans <- read_baskets(
  con = "网页浏览数据.txt",
  sep = ",",
  info = c("sequenceID", "eventID")
)
# Print a summary of the transactions object.
MyTrans
## transactions in sparse format with
## 8737 transactions (rows) and
## 63 items (columns)
# Print the first ten transactions with their sequence and event ids.
inspect(MyTrans[seq_len(10), ])
## items sequenceID eventID
## [1] {News North America} 1 1
## [2] {Football} 1 2
## [3] {Football} 1 3
## [4] {Baseball} 1 4
## [5] {Basketball} 1 5
## [6] {Weather} 2 1
## [7] {Weather} 2 2
## [8] {Weather} 2 3
## [9] {Weather} 2 4
## [10] {Weather} 2 5
# Bar charts of the 20 most frequent items: absolute counts first, then
# relative frequencies.
MyTrans %>%
  itemFrequencyPlot(topN = 20, type = "absolute")
MyTrans %>%
  itemFrequencyPlot(topN = 20, type = "relative")
# Mine frequent sequences with cSPADE at a minimum support of 0.05, sort them
# by support, and print them.
MyFsets <- sort(
  cspade(MyTrans, parameter = list(support = 0.05)),
  by = "support"
)
inspect(MyFsets)
## items support
## 1 <{News North America}> 0.2085
## 2 <{Weather}> 0.1985
## 3 <{Football}> 0.1320
## 4 <{Flight}> 0.1275
## 5 <{Hotel}> 0.1225
## 6 <{News North America},
## {News North America}> 0.1175
## 7 <{News North America},
## {Weather}> 0.1150
## 8 <{Basketball}> 0.1065
## 9 <{Baseball}> 0.1055
## 10 <{Flight},
## {Hotel}> 0.1050
## 11 <{Music}> 0.1035
## 12 <{Movie}> 0.1030
## 13 <{Shopping Music}> 0.1000
## 14 <{Weather},
## {Weather}> 0.0945
## 15 <{Health}> 0.0880
## 16 <{Shopping Music},
## {Movie}> 0.0860
## 17 <{Baseball},
## {Basketball}> 0.0855
## 18 <{Music},
## {Shopping Music}> 0.0835
## 19 <{Medicine}> 0.0810
## 20 <{News North America},
## {News North America},
## {Weather}> 0.0810
## 21 <{Music},
## {Movie}> 0.0770
## 22 <{Music},
## {Shopping Music},
## {Movie}> 0.0770
## 23 <{Football},
## {Baseball}> 0.0755
## 24 <{News North America},
## {News North America},
## {News North America}> 0.0750
## 25 <{Football},
## {Basketball}> 0.0745
## 26 <{Football},
## {Baseball},
## {Basketball}> 0.0740
## 27 <{News North America},
## {Football}> 0.0735
## 28 <{News North America},
## {Basketball}> 0.0735
## 29 <{News North America},
## {Football},
## {Basketball}> 0.0735
## 30 <{News North America},
## {Baseball},
## {Basketball}> 0.0735
## 31 <{News North America},
## {Football},
## {Baseball},
## {Basketball}> 0.0735
## 32 <{News North America},
## {Baseball}> 0.0735
## 33 <{News North America},
## {Football},
## {Baseball}> 0.0735
## 34 <{Shopping Computer}> 0.0685
## 35 <{Shopping House}> 0.0615
## 36 <{Shopping Electronics}> 0.0610
## 37 <{Health},
## {Medicine}> 0.0610
## 38 <{News Europe}> 0.0595
## 39 <{Insurance}> 0.0565
## 40 <{Loan}> 0.0560
## 41 <{News North America},
## {News North America},
## {News North America},
## {Weather}> 0.0560
## 42 <{News North America},
## {News North America},
## {News North America},
## {News North America}> 0.0540
## 43 <{Weather},
## {Weather},
## {Weather}> 0.0515
##
# Induce sequence rules from the frequent sequences with a minimum confidence
# of 0.5, sort them by lift, and print them.
MyRules <- sort(
  ruleInduction(MyFsets, confidence = 0.5),
  by = "lift"
)
inspect(MyRules)
## lhs rhs support confidence lift
## 1 <{News North America},
## {Football}> => <{Baseball}> 0.0735 1.000 9.48
## 2 <{News North America},
## {Football}> => <{Basketball}> 0.0735 1.000 9.39
## 3 <{News North America},
## {Baseball}> => <{Basketball}> 0.0735 1.000 9.39
## 4 <{News North America},
## {Football},
## {Baseball}> => <{Basketball}> 0.0735 1.000 9.39
## 5 <{Football},
## {Baseball}> => <{Basketball}> 0.0740 0.980 9.20
## 6 <{Music},
## {Shopping Music}> => <{Movie}> 0.0770 0.922 8.95
## 7 <{Health}> => <{Medicine}> 0.0610 0.693 8.56
## 8 <{Shopping Music}> => <{Movie}> 0.0860 0.860 8.35
## 9 <{Music}> => <{Shopping Music}> 0.0835 0.807 8.07
## 10 <{Baseball}> => <{Basketball}> 0.0855 0.810 7.61
## 11 <{Music}> => <{Movie}> 0.0770 0.744 7.22
## 12 <{Flight}> => <{Hotel}> 0.1050 0.824 6.72
## 13 <{Football}> => <{Baseball}> 0.0755 0.572 5.42
## 14 <{Football}> => <{Basketball}> 0.0745 0.564 5.30
## 15 <{News North America},
## {News North America},
## {News North America}> => <{Weather}> 0.0560 0.747 3.76
## 16 <{News North America},
## {News North America}> => <{Weather}> 0.0810 0.689 3.47
## 17 <{News North America},
## {News North America},
## {News North America}> => <{News North America}> 0.0540 0.720 3.45
## 18 <{News North America},
## {News North America}> => <{News North America}> 0.0750 0.638 3.06
## 19 <{News North America}> => <{Weather}> 0.1150 0.552 2.78
## 20 <{Weather},
## {Weather}> => <{Weather}> 0.0515 0.545 2.75
## 21 <{News North America}> => <{News North America}> 0.1175 0.564 2.70
##