support(X) = count(X) / N, where N is the total number of transactions
confidence(X -> Y) = count(X, Y) / count(X) = support(X, Y) / support(X)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# Show the current working directory (handy when diagnosing path problems).
getwd()
## [1] "C:/Users/Maxwell/Desktop/Learn R"
# "groceries.csv" is read further down via a relative path, so the working
# directory must be the folder that contains it. The folder below is specific
# to one machine: switch to it only when it exists, and warn (instead of
# aborting the whole script) when it does not.
data_dir <- "C:/Users/Maxwell/Desktop/Learn R"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  warning(
    "Directory not found: ", data_dir,
    "; staying in ", getwd(),
    call. = FALSE
  )
}
#install.packages("arules")
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Load the comma-separated basket file (one transaction per line) into an
# arules transactions object.
grocery <- read.transactions(file = "groceries.csv", sep = ",")
# Overview of the data: dimensions, density, most frequent items and the
# distribution of transaction sizes.
summary(object = grocery)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55
## 16 17 18 19 20 21 22 23 24 26 27 28 29 32
## 46 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
Take a closer look at the data.
# Print the items contained in the first five transactions.
inspect(grocery[1:5, ])
## items
## [1] {citrus fruit,
## margarine,
## ready soups,
## semi-finished bread}
## [2] {coffee,
## tropical fruit,
## yogurt}
## [3] {whole milk}
## [4] {cream cheese,
## meat spreads,
## pip fruit,
## yogurt}
## [5] {condensed milk,
## long life bakery product,
## other vegetables,
## whole milk}
# Relative frequency (support) of the first five item columns.
itemFrequency(grocery[, 1:5])
## abrasive cleaner artif. sweetener baby cosmetics baby food
## 0.0035587189 0.0032536858 0.0006100661 0.0001016777
## bags
## 0.0004067107
# Item-frequency bar chart restricted to items with support of at least 0.08.
itemFrequencyPlot(x = grocery, support = 0.08)
# Item-frequency bar chart restricted to the 10 most frequent items.
itemFrequencyPlot(x = grocery, topN = 10)
# Visualise the sparse transaction-by-item matrix for the first five rows.
image(grocery[1:5, ])
# Same visualisation for a random sample of 10 transactions.
image(sample(grocery, 10))
## enough with the visualization.
library(arules)
# Mine association rules with the Apriori algorithm, keeping rules with
# support >= 0.005 and confidence >= 0.2.
g_rule <- apriori(
  data = grocery,
  parameter = list(support = 0.005, confidence = 0.2, target = "rules")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.2 0.1 1 none FALSE TRUE 5 0.005 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 49
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [120 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [873 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Show the first five mined rules with their support, confidence and lift.
inspect(g_rule[seq_len(5)])
## lhs rhs support confidence lift
## [1] {} => {whole milk} 0.255516014 0.2555160 1.000000
## [2] {cake bar} => {whole milk} 0.005592272 0.4230769 1.655775
## [3] {dishes} => {other vegetables} 0.005998983 0.3410405 1.762550
## [4] {dishes} => {whole milk} 0.005287239 0.3005780 1.176357
## [5] {mustard} => {whole milk} 0.005185562 0.4322034 1.691492
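\[\text{lift}(X \rightarrow Y) = \frac{\text{confidence}(X \rightarrow Y)}{\text{support}(Y)} = \frac{\text{support}(X, Y) / \text{support}(X)}{\text{support}(Y)} = \frac{\text{support}(X, Y)}{\text{support}(X) \cdot \text{support}(Y)}\]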
A large lift value is therefore a strong indicator that a rule is important, and reflects a true connection between the items.
# Order the rules by lift and show the ten with the highest lift.
inspect(sort(g_rule, by = "lift")[1:10])
## lhs rhs support confidence lift
## [1] {citrus fruit,
## other vegetables,
## whole milk} => {root vegetables} 0.005795628 0.4453125 4.085493
## [2] {butter,
## other vegetables} => {whipped/sour cream} 0.005795628 0.2893401 4.036397
## [3] {herbs} => {root vegetables} 0.007015760 0.4312500 3.956477
## [4] {citrus fruit,
## pip fruit} => {tropical fruit} 0.005592272 0.4044118 3.854060
## [5] {berries} => {whipped/sour cream} 0.009049314 0.2721713 3.796886
## [6] {other vegetables,
## tropical fruit,
## whole milk} => {root vegetables} 0.007015760 0.4107143 3.768074
## [7] {whipped/sour cream,
## whole milk} => {butter} 0.006710727 0.2082019 3.757185
## [8] {root vegetables,
## whole milk,
## yogurt} => {tropical fruit} 0.005693950 0.3916084 3.732043
## [9] {other vegetables,
## pip fruit,
## whole milk} => {root vegetables} 0.005490595 0.4060150 3.724961
## [10] {citrus fruit,
## tropical fruit} => {pip fruit} 0.005592272 0.2806122 3.709437
# Keep only the rules that involve the item "berries", then list them.
berryrules <- subset(x = g_rule, subset = items %in% "berries")
inspect(berryrules)
## lhs rhs support confidence lift
## [1] {berries} => {whipped/sour cream} 0.009049314 0.2721713 3.796886
## [2] {berries} => {tropical fruit} 0.006710727 0.2018349 1.923494
## [3] {berries} => {soda} 0.007320793 0.2201835 1.262685
## [4] {berries} => {yogurt} 0.010574479 0.3180428 2.279848
## [5] {berries} => {other vegetables} 0.010269446 0.3088685 1.596280
## [6] {berries} => {whole milk} 0.011794611 0.3547401 1.388328
A common approach is to take the association rules and divide them into the following three categories: actionable, trivial, and inexplicable.
Note: as.data.frame() does not work on a rules object; coerce it with as(rules, "data.frame") instead.
# Coerce the rules object to a plain data frame (one row per rule) and
# check the resulting column types.
b_rule <- as(berryrules, Class = "data.frame")
str(b_rule)
## 'data.frame': 6 obs. of 4 variables:
## $ rules : Factor w/ 6 levels "{berries} => {other vegetables}",..: 4 3 2 6 1 5
## $ support : num 0.00905 0.00671 0.00732 0.01057 0.01027 ...
## $ confidence: num 0.272 0.202 0.22 0.318 0.309 ...
## $ lift : num 3.8 1.92 1.26 2.28 1.6 ...
# Auto-print the converted data frame of berry rules.
b_rule
## rules support confidence lift
## 101 {berries} => {whipped/sour cream} 0.009049314 0.2721713 3.796886
## 102 {berries} => {tropical fruit} 0.006710727 0.2018349 1.923494
## 103 {berries} => {soda} 0.007320793 0.2201835 1.262685
## 104 {berries} => {yogurt} 0.010574479 0.3180428 2.279848
## 105 {berries} => {other vegetables} 0.010269446 0.3088685 1.596280
## 106 {berries} => {whole milk} 0.011794611 0.3547401 1.388328