library(openxlsx)
library(tm)
library(car)
library(foreign)
library(readr)
library(dplyr)
library(RWeka)
library(RODBC)
library(class)
library(gmodels)
library(arules)
This project explores a basic application of market basket analysis using “association rule algorithms”. The data are used for practice and were drawn from the “grocery” data set that accompanies the text “Machine Learning with R”.
Apriori
# Read the comma-separated grocery file into a sparse transactions object
groceries <- read.transactions(
  file = "C:\\Users\\Jaire\\OneDrive\\Desktop\\Exploratory Research\\ML\\grocery.txt",
  sep = ","
)
## Warning in readLines(file, encoding = encoding): incomplete final line found on
## 'C:\Users\Jaire\OneDrive\Desktop\Exploratory Research\ML\grocery.txt'
# Overview of the transactions: dimensions, density, most frequent items,
# and the distribution of basket sizes
summary(object = groceries)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46
## 17 18 19 20 21 22 23 24 26 27 28 29 32
## 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
# List the items contained in the first ten transactions
inspect(groceries[seq_len(10)])
## items
## [1] {citrus fruit,
## margarine,
## ready soups,
## semi-finished bread}
## [2] {coffee,
## tropical fruit,
## yogurt}
## [3] {whole milk}
## [4] {cream cheese,
## meat spreads,
## pip fruit,
## yogurt}
## [5] {condensed milk,
## long life bakery product,
## other vegetables,
## whole milk}
## [6] {abrasive cleaner,
## butter,
## rice,
## whole milk,
## yogurt}
## [7] {rolls/buns}
## [8] {bottled beer,
## liquor (appetizer),
## other vegetables,
## rolls/buns,
## UHT-milk}
## [9] {pot plants}
## [10] {cereals,
## whole milk}
# Support (relative frequency) of the first ten item columns
itemFrequency(groceries[, seq_len(10)], type = "relative")
## abrasive cleaner artif. sweetener baby cosmetics baby food
## 0.0035587189 0.0032536858 0.0006100661 0.0001016777
## bags baking powder bathroom cleaner beef
## 0.0004067107 0.0176919166 0.0027452974 0.0524656838
## berries beverages
## 0.0332486019 0.0260294865
# Visual checks of the sparse matrix:
#   - bar charts of item support (items with support >= 0.06, then the top 10)
#   - matrix images of the first 10 transactions and of 500 sampled ones
# The sample is drawn at random, so the last image differs between runs
# unless a seed is set beforehand.
itemFrequencyPlot(x = groceries, support = 0.06)
itemFrequencyPlot(x = groceries, topN = 10)
image(x = groceries[seq_len(10)])
image(x = sample(groceries, size = 500))
# Baseline run with apriori's default parameters (support 0.1, confidence 0.8)
# before tuning any thresholds
apriori(data = groceries)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 983
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [8 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## set of 0 rules
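No rules were learned: at the default support of 0.1 (at least 983 transactions) only 8 of the 169 items are frequent enough, and none of them yields a rule that satisfies both the support threshold and the default confidence of 0.8.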
# Fit the model with lower thresholds: support 0.006, confidence 0.25,
# and at least two items per rule
groceryrules <- apriori(
  data = groceries,
  parameter = list(support = 0.006, confidence = 0.25, minlen = 2)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.25 0.1 1 none FALSE TRUE 5 0.006 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 59
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [109 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [463 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Rule-set overview: rule lengths, quality measures, and mining parameters
summary(object = groceryrules)
## set of 463 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 150 297 16
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 3.000 2.711 3.000 4.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.006101 Min. :0.2500 Min. :0.009964 Min. :0.9932
## 1st Qu.:0.007117 1st Qu.:0.2971 1st Qu.:0.018709 1st Qu.:1.6229
## Median :0.008744 Median :0.3554 Median :0.024809 Median :1.9332
## Mean :0.011539 Mean :0.3786 Mean :0.032608 Mean :2.0351
## 3rd Qu.:0.012303 3rd Qu.:0.4495 3rd Qu.:0.035892 3rd Qu.:2.3565
## Max. :0.074835 Max. :0.6600 Max. :0.255516 Max. :3.9565
## count
## Min. : 60.0
## 1st Qu.: 70.0
## Median : 86.0
## Mean :113.5
## 3rd Qu.:121.0
## Max. :736.0
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.006 0.25
## call
## apriori(data = groceries, parameter = list(support = 0.006, confidence = 0.25, minlen = 2))
The object contains 463 rules.
# View the first 10 rules. head() returns at most 10 rules, so this keeps
# working (rather than failing on out-of-range indices) if a stricter
# threshold leaves fewer than 10 rules.
inspect(head(groceryrules, n = 10))
## lhs rhs support confidence
## [1] {pot plants} => {whole milk} 0.006914082 0.4000000
## [2] {pasta} => {whole milk} 0.006100661 0.4054054
## [3] {herbs} => {root vegetables} 0.007015760 0.4312500
## [4] {herbs} => {other vegetables} 0.007727504 0.4750000
## [5] {herbs} => {whole milk} 0.007727504 0.4750000
## [6] {processed cheese} => {whole milk} 0.007015760 0.4233129
## [7] {semi-finished bread} => {whole milk} 0.007117438 0.4022989
## [8] {beverages} => {whole milk} 0.006812405 0.2617188
## [9] {detergent} => {other vegetables} 0.006405694 0.3333333
## [10] {detergent} => {whole milk} 0.008947636 0.4656085
## coverage lift count
## [1] 0.01728521 1.565460 68
## [2] 0.01504830 1.586614 60
## [3] 0.01626843 3.956477 69
## [4] 0.01626843 2.454874 76
## [5] 0.01626843 1.858983 76
## [6] 0.01657346 1.656698 69
## [7] 0.01769192 1.574457 70
## [8] 0.02602949 1.024275 67
## [9] 0.01921708 1.722719 63
## [10] 0.01921708 1.822228 88
Some of the rules are a bit trivial, but others seem actionable.
# Look for better rules by ranking on lift and showing the 10 strongest.
# head(by = ) orders the rules by the chosen measure (decreasing) and returns
# at most 10, so it does not fail when fewer rules are available.
inspect(head(groceryrules, n = 10, by = "lift"))
## lhs rhs support confidence coverage lift count
## [1] {herbs} => {root vegetables} 0.007015760 0.4312500 0.01626843 3.956477 69
## [2] {berries} => {whipped/sour cream} 0.009049314 0.2721713 0.03324860 3.796886 89
## [3] {other vegetables,
## tropical fruit,
## whole milk} => {root vegetables} 0.007015760 0.4107143 0.01708185 3.768074 69
## [4] {beef,
## other vegetables} => {root vegetables} 0.007930859 0.4020619 0.01972547 3.688692 78
## [5] {other vegetables,
## tropical fruit} => {pip fruit} 0.009456024 0.2634561 0.03589222 3.482649 93
## [6] {beef,
## whole milk} => {root vegetables} 0.008032537 0.3779904 0.02125064 3.467851 79
## [7] {other vegetables,
## pip fruit} => {tropical fruit} 0.009456024 0.3618677 0.02613116 3.448613 93
## [8] {pip fruit,
## yogurt} => {tropical fruit} 0.006405694 0.3559322 0.01799695 3.392048 63
## [9] {citrus fruit,
## other vegetables} => {root vegetables} 0.010371124 0.3591549 0.02887646 3.295045 102
## [10] {other vegetables,
## whole milk,
## yogurt} => {tropical fruit} 0.007625826 0.3424658 0.02226741 3.263712 75
# Filter the rule set by item to see what else lands in the same basket.
# %in% keeps a rule when it contains ANY of the listed items.
herbs_veggiesrules <- subset(
  groceryrules,
  items %in% c("herbs", "root vegetables")
)
herbsrules <- subset(groceryrules, items %in% "herbs")

# Rules that involve herbs
inspect(herbsrules)
## lhs rhs support confidence coverage lift
## [1] {herbs} => {root vegetables} 0.007015760 0.43125 0.01626843 3.956477
## [2] {herbs} => {other vegetables} 0.007727504 0.47500 0.01626843 2.454874
## [3] {herbs} => {whole milk} 0.007727504 0.47500 0.01626843 1.858983
## count
## [1] 69
## [2] 76
## [3] 76
# Five strongest herb / root-vegetable rules by lift. head(by = ) sorts by the
# measure (decreasing) and returns at most 5 rules, so it does not fail when
# the subset holds fewer than 5.
inspect(head(herbs_veggiesrules, n = 5, by = "lift"))
## lhs rhs support confidence coverage lift count
## [1] {herbs} => {root vegetables} 0.007015760 0.4312500 0.01626843 3.956477 69
## [2] {other vegetables,
## tropical fruit,
## whole milk} => {root vegetables} 0.007015760 0.4107143 0.01708185 3.768074 69
## [3] {beef,
## other vegetables} => {root vegetables} 0.007930859 0.4020619 0.01972547 3.688692 78
## [4] {beef,
## whole milk} => {root vegetables} 0.008032537 0.3779904 0.02125064 3.467851 79
## [5] {citrus fruit,
## other vegetables} => {root vegetables} 0.010371124 0.3591549 0.02887646 3.295045 102
Herbs are frequently purchased together with root vegetables, other vegetables, and whole milk. I also assess rules that involve root vegetables.
# Persist the rules: either write them to a CSV file (left disabled here) or
# coerce them to a data frame for further processing.
# write(groceryrules, file = "groceryrules.csv", sep = ",", quote = TRUE, row.names = FALSE)
groceryrules_df <- methods::as(object = groceryrules, Class = "data.frame")

# Check the structure of the resulting data frame
str(object = groceryrules_df)
## 'data.frame': 463 obs. of 6 variables:
## $ rules : chr "{pot plants} => {whole milk}" "{pasta} => {whole milk}" "{herbs} => {root vegetables}" "{herbs} => {other vegetables}" ...
## $ support : num 0.00691 0.0061 0.00702 0.00773 0.00773 ...
## $ confidence: num 0.4 0.405 0.431 0.475 0.475 ...
## $ coverage : num 0.0173 0.015 0.0163 0.0163 0.0163 ...
## $ lift : num 1.57 1.59 3.96 2.45 1.86 ...
## $ count : int 68 60 69 76 76 69 70 67 63 88 ...