APRIORI ALGORITHM FOR FINDING ASSOCIATION RULES IN DATA MINING

# Loading necessary libraries
library('arules')

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

library('arulesViz')
# Load the Groceries transaction dataset and print its overview.
data("Groceries")
print(Groceries)

## transactions in sparse format with
##  9835 transactions (rows) and
##  169 items (columns)

# Summary of the dataset: dimensions and density, the most frequent items,
# the distribution of transaction lengths, and the extended item information.
summary(Groceries)

## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##        labels  level2           level1
## 1 frankfurter sausage meat and sausage
## 2     sausage sausage meat and sausage
## 3  liver loaf sausage meat and sausage

# Display the first 10 transactions of the Groceries dataset, one string per
# transaction, with the purchased item labels joined by ", ".
# Columns 1:10 of the `data` slot are selected and each column is used as an
# index into the item labels.
apply(
  Groceries@data[, 1:10], 2,
  function(in_basket) paste(itemLabels(Groceries)[in_basket], collapse = ", ")
)

##  [1] "citrus fruit, semi-finished bread, margarine, ready soups"               
##  [2] "tropical fruit, yogurt, coffee"                                          
##  [3] "whole milk"                                                              
##  [4] "pip fruit, yogurt, cream cheese , meat spreads"                          
##  [5] "other vegetables, whole milk, condensed milk, long life bakery product"  
##  [6] "whole milk, butter, yogurt, rice, abrasive cleaner"                      
##  [7] "rolls/buns"                                                              
##  [8] "other vegetables, UHT-milk, rolls/buns, bottled beer, liquor (appetizer)"
##  [9] "pot plants"                                                              
## [10] "whole milk, cereals"

# Bar chart of the 10 most frequently purchased items, using absolute counts.
itemFrequencyPlot(
  Groceries,
  topN = 10,
  type = "absolute",
  col = c("red4", "yellow4", "magenta", "tan", "green4"),
  main = "Top 10 Most Frequently Purchased Items"
)

# Mine the frequent itemsets (minimum length 1) with a minimum support of 0.02.
itemsets <- apriori(
  Groceries,
  parameter = list(
    minlen = 1,
    support = 0.02,
    target = "frequent itemsets"
  )
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5    0.02      1
##  maxlen            target  ext
##      10 frequent itemsets TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 196 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [59 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## sorting transactions ... done [0.00s].
## writing ... [122 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Show the 10 frequent itemsets with the highest support.
inspect(head(itemsets, n = 10, by = "support"))

##      items              support    count
## [1]  {whole milk}       0.25551601 2513 
## [2]  {other vegetables} 0.19349263 1903 
## [3]  {rolls/buns}       0.18393493 1809 
## [4]  {soda}             0.17437722 1715 
## [5]  {yogurt}           0.13950178 1372 
## [6]  {bottled water}    0.11052364 1087 
## [7]  {root vegetables}  0.10899847 1072 
## [8]  {tropical fruit}   0.10493137 1032 
## [9]  {shopping bags}    0.09852567  969 
## [10] {sausage}          0.09395018  924

# Rule generation: mine association rules with a minimum support of 0.001
# and a minimum confidence of 0.6. The rules are visualised further below.
rules <- apriori(
  Groceries,
  parameter = list(support = 0.001, confidence = 0.6, target = "rules")
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [2918 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Summary of the mined rules: rule-length distribution, summary statistics of
# the quality measures, and the mining parameters that were used.
summary(rules)

## set of 2918 rules
## 
## rule length distribution (lhs + rhs):sizes
##    2    3    4    5    6 
##    3  490 1765  626   34 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   4.000   4.000   4.068   4.000   6.000 
## 
## summary of quality measures:
##     support           confidence        coverage             lift       
##  Min.   :0.001017   Min.   :0.6000   Min.   :0.001017   Min.   : 2.348  
##  1st Qu.:0.001118   1st Qu.:0.6316   1st Qu.:0.001525   1st Qu.: 2.668  
##  Median :0.001220   Median :0.6818   Median :0.001830   Median : 3.168  
##  Mean   :0.001480   Mean   :0.7028   Mean   :0.002157   Mean   : 3.450  
##  3rd Qu.:0.001525   3rd Qu.:0.7500   3rd Qu.:0.002339   3rd Qu.: 3.692  
##  Max.   :0.009354   Max.   :1.0000   Max.   :0.014642   Max.   :18.996  
##      count      
##  Min.   :10.00  
##  1st Qu.:11.00  
##  Median :12.00  
##  Mean   :14.55  
##  3rd Qu.:15.00  
##  Max.   :92.00  
## 
## mining info:
##       data ntransactions support confidence
##  Groceries          9835   0.001        0.6
##                                                                                              call
##  apriori(data = Groceries, parameter = list(support = 0.001, confidence = 0.6, target = "rules"))

# Scatter plot of all mined rules (minimum support 0.001, minimum
# confidence 0.6). The rule count in the title is taken from the rule set
# itself so the title stays correct if the mining thresholds change.
plot(rules, main = paste("Scatter Plot For", length(rules), "rules"))

## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

# Scatterplot matrix of the support, confidence, and lift of the mined rules.
# The measures are selected by name rather than by position, so the plot does
# not depend on the column order of the quality data frame.
plot(quality(rules)[, c("support", "confidence", "lift")], col = "red4")

# Compute 1 / Support(Y) for every rule as lift / confidence, rounded to two
# decimals and sorted in increasing order.
slope <- sort(round(with(quality(rules), lift / confidence), 2))
# Count how many rules share each distinct slope value.
lengths(split(slope, slope))

##  3.91  5.17  5.44  5.73  7.17  9.05  9.17  9.53 10.64 12.08 12.42 13.22 13.83 
##  1585   940    12     7   188     1   102    55     1     4     1     5     2 
## 13.95 18.05 23.76 26.44 30.08 
##     9     3     1     1     1

# Display the 10 rules with the highest lift to see which items they involve.
inspect(head(rules, n = 10, by = "lift"))

##      lhs                         rhs                      support confidence    coverage      lift count
## [1]  {Instant food products,                                                                            
##       soda}                   => {hamburger meat}     0.001220132  0.6315789 0.001931876 18.995654    12
## [2]  {soda,                                                                                             
##       popcorn}                => {salty snack}        0.001220132  0.6315789 0.001931876 16.697793    12
## [3]  {ham,                                                                                              
##       processed cheese}       => {white bread}        0.001931876  0.6333333 0.003050330 15.045491    19
## [4]  {tropical fruit,                                                                                   
##       other vegetables,                                                                                 
##       yogurt,                                                                                           
##       white bread}            => {butter}             0.001016777  0.6666667 0.001525165 12.030581    10
## [5]  {hamburger meat,                                                                                   
##       yogurt,                                                                                           
##       whipped/sour cream}     => {butter}             0.001016777  0.6250000 0.001626843 11.278670    10
## [6]  {tropical fruit,                                                                                   
##       other vegetables,                                                                                 
##       whole milk,                                                                                       
##       yogurt,                                                                                           
##       domestic eggs}          => {butter}             0.001016777  0.6250000 0.001626843 11.278670    10
## [7]  {liquor,                                                                                           
##       red/blush wine}         => {bottled beer}       0.001931876  0.9047619 0.002135231 11.235269    19
## [8]  {other vegetables,                                                                                 
##       butter,                                                                                           
##       sugar}                  => {whipped/sour cream} 0.001016777  0.7142857 0.001423488  9.964539    10
## [9]  {whole milk,                                                                                       
##       butter,                                                                                           
##       hard cheese}            => {whipped/sour cream} 0.001423488  0.6666667 0.002135231  9.300236    14
## [10] {tropical fruit,                                                                                   
##       other vegetables,                                                                                 
##       butter,                                                                                           
##       fruit/vegetable juice}  => {whipped/sour cream} 0.001016777  0.6666667 0.001525165  9.300236    10

# Keep the rules whose confidence is above 0.9, then display the 10 of them
# with the highest confidence.
confidentRules <- subset(rules, subset = confidence > 0.9)
inspect(head(confidentRules, n = 10, by = "confidence"))

##      lhs                      rhs                    support confidence    coverage     lift count
## [1]  {rice,                                                                                       
##       sugar}               => {whole milk}       0.001220132          1 0.001220132 3.913649    12
## [2]  {canned fish,                                                                                
##       hygiene articles}    => {whole milk}       0.001118454          1 0.001118454 3.913649    11
## [3]  {root vegetables,                                                                            
##       butter,                                                                                     
##       rice}                => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [4]  {root vegetables,                                                                            
##       whipped/sour cream,                                                                         
##       flour}               => {whole milk}       0.001728521          1 0.001728521 3.913649    17
## [5]  {butter,                                                                                     
##       soft cheese,                                                                                
##       domestic eggs}       => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [6]  {citrus fruit,                                                                               
##       root vegetables,                                                                            
##       soft cheese}         => {other vegetables} 0.001016777          1 0.001016777 5.168156    10
## [7]  {pip fruit,                                                                                  
##       butter,                                                                                     
##       hygiene articles}    => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [8]  {root vegetables,                                                                            
##       whipped/sour cream,                                                                         
##       hygiene articles}    => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [9]  {pip fruit,                                                                                  
##       root vegetables,                                                                            
##       hygiene articles}    => {whole milk}       0.001016777          1 0.001016777 3.913649    10
## [10] {cream cheese ,                                                                              
##       domestic eggs,                                                                              
##       sugar}               => {whole milk}       0.001118454          1 0.001118454 3.913649    11

# Graph visualization of the five rules with the highest lift.
# No `control` list is passed: this plot method reports `type` as an unknown
# control parameter and ignores it, so supplying it only produces a warning.
top_5 <- head(sort(rules, by = "lift"), 5)
plot(top_5, method = "graph")

## Warning: Unknown control parameters: type

## Available control parameters (with default values):
## layout    =  stress
## circular  =  FALSE
## ggraphdots    =  NULL
## edges     =  <environment>
## nodes     =  <environment>
## nodetext  =  <environment>
## colors    =  c("#EE0000FF", "#EEEEEEFF")
## engine    =  ggplot2
## max   =  100
## verbose   =  FALSE

APRIORI ALGORITHM FOR FINDING ASSOCIATION RULES IN DATA MINING

Groceries Dataset Analysis