# Install any missing packages BEFORE attaching them: the library() calls
# below stop with an error when a package is absent, so the check has to
# run first to be of any use on a fresh machine.
packages <- c(
  "dplyr", "ggplot2", "googleVis", "devtools", "magrittr", "slam", "irlba",
  "plotly", "arules", "arulesViz", "Matrix", "recommenderlab"
)
# first column of installed.packages() holds the package names
existing <- as.character(installed.packages()[, 1])
for (pkg in setdiff(packages, existing)) install.packages(pkg)
library(arules) # association rules
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz) # data visualization of association rules
## Loading required package: grid
library(RColorBrewer) # color palettes for plots
# load the grocery transactions object from the arules package
data("Groceries")
# overview: size, density, most frequent items and basket-size distribution
summary(Groceries)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55
## 16 17 18 19 20 21 22 23 24 26 27 28 29 32
## 46 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels level2 level1
## 1 frankfurter sausage meat and sausage
## 2 sausage sausage meat and sausage
## 3 liver loaf sausage meat and sausage
# dimensions of the transactions object: baskets (rows) by items (columns)
print(dim(Groceries))
## [1] 9835 169
print(nrow(Groceries)) # 9835 market baskets for shopping trips
## [1] 9835
print(ncol(Groceries)) # 169 initial store items
## [1] 169
# list the contents of the first five market baskets
inspect(Groceries[seq_len(5)])
## items
## [1] {citrus fruit,
## semi-finished bread,
## margarine,
## ready soups}
## [2] {tropical fruit,
## yogurt,
## coffee}
## [3] {whole milk}
## [4] {pip fruit,
## yogurt,
## cream cheese ,
## meat spreads}
## [5] {other vegetables,
## whole milk,
## condensed milk,
## long life bakery product}
# contents of the first market basket on its own
inspect(Groceries[1L])
## items
## [1] {citrus fruit,
## semi-finished bread,
## margarine,
## ready soups}
# examine frequency for each item with support greater than 0.025
# the plot is written to a letter-size PDF file rather than to the screen
pdf(file="fig_market_basket_initial_item_support.pdf",
  width = 8.5, height = 11)
itemFrequencyPlot(Groceries, support = 0.025, cex.names=0.8, xlim = c(0,0.3),
  type = "relative", horiz = TRUE, col = "dark red", las = 1,
  xlab = paste("Proportion of Market Baskets Containing Item",
    "\n(Item Relative Frequency or Support)"))
# close the PDF device so the file is finalized on disk and later plots
# are not silently appended to it
invisible(dev.off())
# absolute counts for the 20 most frequent items, drawn on the default device
itemFrequencyPlot(Groceries, topN=20, type="absolute",col = "dark red", cex=0.8)
# explore possibilities for combining similar items:
# the item metadata carries two category columns, level2 and level1
df <- itemInfo(Groceries)
# structure of the metadata: level1 has 10 levels, level2 has 55
str(df)
## 'data.frame': 169 obs. of 3 variables:
## $ labels: chr "frankfurter" "sausage" "liver loaf" "ham" ...
## $ level2: Factor w/ 55 levels "baby food","bags",..: 44 44 44 44 44 44 44 42 42 41 ...
## $ level1: Factor w/ 10 levels "canned food",..: 6 6 6 6 6 6 6 6 6 6 ...
# collapse the 169 items into their 55 level2 food categories
# to create a more meaningful set of items
groceries <- aggregate(Groceries, itemInfo(Groceries)$level2)
print(nrow(groceries)) # 9835 market baskets for shopping trips
## [1] 9835
print(ncol(groceries)) # 55 final store items (categories)
## [1] 55
# relative frequency of each category with support above 0.025
itemFrequencyPlot(
  groceries,
  support = 0.025,
  type = "relative",
  horiz = TRUE,
  xlim = c(0, 0.5),
  col = "blue",
  las = 1,
  cex.names = 1.0,
  xlab = paste(
    "Proportion of Market Baskets Containing Item",
    "\n(Item Relative Frequency or Support)"
  )
)
# mine a deliberately large rule set over the category-level baskets of all
# shoppers by setting very low thresholds for support and confidence
first.rules <- apriori(
  data = groceries,
  parameter = list(confidence = 0.05, support = 0.001)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.05 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[55 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [54 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 done [0.02s].
## writing ... [69921 rule(s)] done [0.01s].
## creating S4 object ... done [0.02s].
# yields 69,921 rules... too many
print(x = summary(object = first.rules))
## set of 69921 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6 7 8
## 21 1205 10467 23895 22560 9888 1813 72
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 4.000 4.502 5.000 8.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.001017 Min. :0.0500 Min. : 0.4475 Min. : 10.00
## 1st Qu.:0.001118 1st Qu.:0.2110 1st Qu.: 1.8315 1st Qu.: 11.00
## Median :0.001525 Median :0.4231 Median : 2.2573 Median : 15.00
## Mean :0.002488 Mean :0.4364 Mean : 2.5382 Mean : 24.47
## 3rd Qu.:0.002339 3rd Qu.:0.6269 3rd Qu.: 2.9662 3rd Qu.: 23.00
## Max. :0.443010 Max. :1.0000 Max. :16.1760 Max. :4357.00
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.001 0.05
# re-mine with a stricter support threshold (confidence unchanged)
# to obtain a manageable set of association rules
second.rules <- apriori(
  data = groceries,
  parameter = list(confidence = 0.05, support = 0.025)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.05 0.1 1 none FALSE TRUE 5 0.025 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 245
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[55 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [32 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [344 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# yields 344 rules
print(x = summary(object = second.rules))
## set of 344 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4
## 21 162 129 32
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 2.0 2.0 2.5 3.0 4.0
##
## summary of quality measures:
## support confidence lift count
## Min. :0.02542 Min. :0.05043 Min. :0.6669 Min. : 250.0
## 1st Qu.:0.03030 1st Qu.:0.18202 1st Qu.:1.2498 1st Qu.: 298.0
## Median :0.03854 Median :0.39522 Median :1.4770 Median : 379.0
## Mean :0.05276 Mean :0.37658 Mean :1.4831 Mean : 518.9
## 3rd Qu.:0.05236 3rd Qu.:0.51271 3rd Qu.:1.7094 3rd Qu.: 515.0
## Max. :0.44301 Max. :0.79841 Max. :2.4073 Max. :4357.0
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.025 0.05
# interactive scatter plot of the association rules
plot(
  x = second.rules,
  engine = "htmlwidget",
  colors = c("red", "green"),
  marker = list(size = 8, opacity = 0.6)
)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# interactive matrix view of the same rules, shaded by lift
plot(
  x = second.rules,
  method = "matrix",
  shading = "lift",
  engine = "htmlwidget",
  colors = c("red", "green")
)
# keep only the rules whose consequent (right-hand side) mentions vegetables
# NOTE(review): %pin% matches item labels partially, so any other label
# containing "vegetables" would also be selected -- confirm this is intended
vegie.rules <- subset(x = second.rules, subset = rhs %pin% "vegetables")
# list the selected rules: 41 rules
inspect(vegie.rules)
## lhs rhs support confidence lift count
## [1] {} => {vegetables} 0.27300458 0.2730046 1.0000000 2685
## [2] {poultry} => {vegetables} 0.02897814 0.5745968 2.1047148 285
## [3] {pork} => {vegetables} 0.03009659 0.5220459 1.9122238 296
## [4] {staple foods} => {vegetables} 0.02613116 0.5160643 1.8903136 257
## [5] {eggs} => {vegetables} 0.03141840 0.4951923 1.8138608 309
## [6] {games/books/hobby} => {vegetables} 0.02785968 0.3145809 1.1522918 274
## [7] {long-life bakery products} => {vegetables} 0.02907982 0.3492063 1.2791227 286
## [8] {perfumery} => {vegetables} 0.03213015 0.4056483 1.4858662 316
## [9] {beef} => {vegetables} 0.04585663 0.5595533 2.0496116 451
## [10] {bags} => {vegetables} 0.03141840 0.3175745 1.1632571 309
## [11] {vinegar/oils} => {vegetables} 0.04199288 0.4666667 1.7093731 413
## [12] {chocolate} => {vegetables} 0.03192679 0.2934579 1.0749195 314
## [13] {beer} => {vegetables} 0.03406202 0.2189542 0.8020168 335
## [14] {frozen foods} => {vegetables} 0.04738180 0.4052174 1.4842879 466
## [15] {cheese} => {vegetables} 0.05531266 0.4365971 1.5992300 544
## [16] {sausage} => {vegetables} 0.07625826 0.4032258 1.4769929 750
## [17] {fruit} => {vegetables} 0.10706660 0.4297959 1.5743176 1053
## [18] {non-alc. drinks} => {vegetables} 0.09456024 0.2974097 1.0893944 930
## [19] {bread and backed goods} => {vegetables} 0.11621759 0.3363743 1.2321198 1143
## [20] {dairy produce} => {vegetables} 0.17041179 0.3846683 1.4090180 1676
## [21] {beef,
## dairy produce} => {vegetables} 0.02989324 0.6074380 2.2250104 294
## [22] {dairy produce,
## vinegar/oils} => {vegetables} 0.03141840 0.5355286 1.9616103 309
## [23] {dairy produce,
## frozen foods} => {vegetables} 0.03436706 0.5121212 1.8758704 338
## [24] {cheese,
## fruit} => {vegetables} 0.02674123 0.5197628 1.9038613 263
## [25] {bread and backed goods,
## cheese} => {vegetables} 0.02887646 0.4536741 1.6617821 284
## [26] {cheese,
## dairy produce} => {vegetables} 0.04219624 0.4987981 1.8270686 415
## [27] {fruit,
## sausage} => {vegetables} 0.03426538 0.5290424 1.9378517 337
## [28] {non-alc. drinks,
## sausage} => {vegetables} 0.03029995 0.4156206 1.5223944 298
## [29] {bread and backed goods,
## sausage} => {vegetables} 0.04382308 0.4229637 1.5492916 431
## [30] {dairy produce,
## sausage} => {vegetables} 0.05266904 0.4905303 1.7967842 518
## [31] {fruit,
## non-alc. drinks} => {vegetables} 0.04361973 0.4657980 1.7061914 429
## [32] {bread and backed goods,
## fruit} => {vegetables} 0.05124555 0.4763705 1.7449177 504
## [33] {dairy produce,
## fruit} => {vegetables} 0.07869853 0.5032510 1.8433793 774
## [34] {bread and backed goods,
## non-alc. drinks} => {vegetables} 0.04636502 0.3731588 1.3668590 456
## [35] {dairy produce,
## non-alc. drinks} => {vegetables} 0.06446365 0.4243641 1.5544213 634
## [36] {bread and backed goods,
## dairy produce} => {vegetables} 0.08195221 0.4366197 1.5993128 806
## [37] {dairy produce,
## fruit,
## sausage} => {vegetables} 0.02714794 0.5741935 2.1032378 267
## [38] {bread and backed goods,
## dairy produce,
## sausage} => {vegetables} 0.03284189 0.5135135 1.8809704 323
## [39] {dairy produce,
## fruit,
## non-alc. drinks} => {vegetables} 0.03304525 0.5183413 1.8986543 325
## [40] {bread and backed goods,
## dairy produce,
## fruit} => {vegetables} 0.04077275 0.5276316 1.9326840 401
## [41] {bread and backed goods,
## dairy produce,
## non-alc. drinks} => {vegetables} 0.03345196 0.4627286 1.6949480 329
# order the vegetable rules from highest to lowest lift and keep the first 10
top.vegie.rules <- head(
  sort(vegie.rules, by = "lift", decreasing = TRUE),
  n = 10
)
inspect(top.vegie.rules)
## lhs rhs support confidence lift count
## [1] {beef,
## dairy produce} => {vegetables} 0.02989324 0.6074380 2.225010 294
## [2] {poultry} => {vegetables} 0.02897814 0.5745968 2.104715 285
## [3] {dairy produce,
## fruit,
## sausage} => {vegetables} 0.02714794 0.5741935 2.103238 267
## [4] {beef} => {vegetables} 0.04585663 0.5595533 2.049612 451
## [5] {dairy produce,
## vinegar/oils} => {vegetables} 0.03141840 0.5355286 1.961610 309
## [6] {fruit,
## sausage} => {vegetables} 0.03426538 0.5290424 1.937852 337
## [7] {bread and backed goods,
## dairy produce,
## fruit} => {vegetables} 0.04077275 0.5276316 1.932684 401
## [8] {pork} => {vegetables} 0.03009659 0.5220459 1.912224 296
## [9] {cheese,
## fruit} => {vegetables} 0.02674123 0.5197628 1.903861 263
## [10] {dairy produce,
## fruit,
## non-alc. drinks} => {vegetables} 0.03304525 0.5183413 1.898654 325
# interactive graph view of the vegetable rules
plot(
  x = vegie.rules,
  method = "graph",
  engine = "htmlwidget",
  itemCol = "cyan"
)