Recap of important concepts

support(X)=count(X)/N

confidence(X->Y)=count(X,Y)/count(X)=support(X,Y)/support(X)

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
# Report the directory the session is currently rooted in.
getwd()
## [1] "C:/Users/Maxwell/Desktop/Learn R"
# Root the session in that folder so the relative "groceries.csv" read resolves there.
# NOTE(review): hard-coded, machine-specific path -- confirm it exists before running elsewhere.
setwd("C:/Users/Maxwell/Desktop/Learn R")
# install.packages("arules")  # uncomment for a one-time install
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
# Parse the comma-separated basket file into an arules transactions object.
grocery <- read.transactions(file = "groceries.csv", sep = ",")

# Overview: dimensions, density, most frequent items and basket-size distribution.
summary(grocery)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55 
##   16   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   46   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics

Take a closer look at the data

# Print the item sets of the first five baskets.
inspect(grocery[1:5, ])
##     items                     
## [1] {citrus fruit,            
##      margarine,               
##      ready soups,             
##      semi-finished bread}     
## [2] {coffee,                  
##      tropical fruit,          
##      yogurt}                  
## [3] {whole milk}              
## [4] {cream cheese,            
##      meat spreads,            
##      pip fruit,               
##      yogurt}                  
## [5] {condensed milk,          
##      long life bakery product,
##      other vegetables,        
##      whole milk}
# Share of baskets containing each of the first five item columns.
itemFrequency(grocery[, 1:5])
## abrasive cleaner artif. sweetener   baby cosmetics        baby food 
##     0.0035587189     0.0032536858     0.0006100661     0.0001016777 
##             bags 
##     0.0004067107
# Bar chart of every item that appears in at least 8% of baskets.
itemFrequencyPlot(grocery, support = 0.08)

# Bar chart of the ten most frequent items.
itemFrequencyPlot(grocery, topN = 10)

# Sparse-matrix picture of the first five baskets.
image(grocery[1:5, ])

# Same picture for ten randomly drawn baskets.
# NOTE(review): the draw is unseeded, so this figure changes on every run.
image(sample(grocery, 10))

## enough with the visualization.

Let us train the model for unsupervised learning.

library(arules)

# Mine association rules with Apriori, keeping rules with support of at least
# 0.005 and confidence of at least 0.2.
g_rule <- apriori(
  data = grocery,
  parameter = list(support = 0.005, confidence = 0.2, target = "rules")
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.2    0.1    1 none FALSE            TRUE       5   0.005      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 49 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [120 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [873 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Show the first five rules in the order apriori() returned them.
inspect(g_rule[seq_len(5)])
##     lhs           rhs                support     confidence lift    
## [1] {}         => {whole milk}       0.255516014 0.2555160  1.000000
## [2] {cake bar} => {whole milk}       0.005592272 0.4230769  1.655775
## [3] {dishes}   => {other vegetables} 0.005998983 0.3410405  1.762550
## [4] {dishes}   => {whole milk}       0.005287239 0.3005780  1.176357
## [5] {mustard}  => {whole milk}       0.005185562 0.4322034  1.691492

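\[\mathrm{lift}(X \to Y) = \frac{\mathrm{confidence}(X \to Y)}{\mathrm{support}(Y)} = \frac{\mathrm{support}(X, Y) / \mathrm{support}(X)}{\mathrm{support}(Y)} = \frac{\mathrm{support}(X, Y)}{\mathrm{support}(X) \cdot \mathrm{support}(Y)}\]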

A large lift value is therefore a strong indicator that a rule is important, and reflects a true connection between the items.

# Rank the rules by lift and show the first ten of that ranking.
inspect(sort(g_rule, by = "lift")[1:10])
##      lhs                     rhs                      support confidence     lift
## [1]  {citrus fruit,                                                              
##       other vegetables,                                                          
##       whole milk}         => {root vegetables}    0.005795628  0.4453125 4.085493
## [2]  {butter,                                                                    
##       other vegetables}   => {whipped/sour cream} 0.005795628  0.2893401 4.036397
## [3]  {herbs}              => {root vegetables}    0.007015760  0.4312500 3.956477
## [4]  {citrus fruit,                                                              
##       pip fruit}          => {tropical fruit}     0.005592272  0.4044118 3.854060
## [5]  {berries}            => {whipped/sour cream} 0.009049314  0.2721713 3.796886
## [6]  {other vegetables,                                                          
##       tropical fruit,                                                            
##       whole milk}         => {root vegetables}    0.007015760  0.4107143 3.768074
## [7]  {whipped/sour cream,                                                        
##       whole milk}         => {butter}             0.006710727  0.2082019 3.757185
## [8]  {root vegetables,                                                           
##       whole milk,                                                                
##       yogurt}             => {tropical fruit}     0.005693950  0.3916084 3.732043
## [9]  {other vegetables,                                                          
##       pip fruit,                                                                 
##       whole milk}         => {root vegetables}    0.005490595  0.4060150 3.724961
## [10] {citrus fruit,                                                              
##       tropical fruit}     => {pip fruit}          0.005592272  0.2806122 3.709437
# Keep only the rules whose items include "berries".
berryrules <- subset(g_rule, items %in% "berries")


# Show the rules that survived the filter.
inspect(berryrules)
##     lhs          rhs                  support     confidence lift    
## [1] {berries} => {whipped/sour cream} 0.009049314 0.2721713  3.796886
## [2] {berries} => {tropical fruit}     0.006710727 0.2018349  1.923494
## [3] {berries} => {soda}               0.007320793 0.2201835  1.262685
## [4] {berries} => {yogurt}             0.010574479 0.3180428  2.279848
## [5] {berries} => {other vegetables}   0.010269446 0.3088685  1.596280
## [6] {berries} => {whole milk}         0.011794611 0.3547401  1.388328

A common approach is to take the association rules and divide them into the following three categories: actionable, trivial, and inexplicable.

Convert the rules into a data frame.

`as.data.frame()` does not work on a rules object; use `as(<rules>, "data.frame")` instead.

# Coerce the berries rules to a plain data frame.
b_rule <- as(berryrules, "data.frame")

# Check the column types of the converted table.
str(b_rule)
## 'data.frame':    6 obs. of  4 variables:
##  $ rules     : Factor w/ 6 levels "{berries} => {other vegetables}",..: 4 3 2 6 1 5
##  $ support   : num  0.00905 0.00671 0.00732 0.01057 0.01027 ...
##  $ confidence: num  0.272 0.202 0.22 0.318 0.309 ...
##  $ lift      : num  3.8 1.92 1.26 2.28 1.6 ...
# Auto-print the converted data frame.
b_rule
##                                 rules     support confidence     lift
## 101 {berries} => {whipped/sour cream} 0.009049314  0.2721713 3.796886
## 102     {berries} => {tropical fruit} 0.006710727  0.2018349 1.923494
## 103               {berries} => {soda} 0.007320793  0.2201835 1.262685
## 104             {berries} => {yogurt} 0.010574479  0.3180428 2.279848
## 105   {berries} => {other vegetables} 0.010269446  0.3088685 1.596280
## 106         {berries} => {whole milk} 0.011794611  0.3547401 1.388328