library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(arules)
## Warning: package 'arules' was built under R version 4.1.2
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
data set link: https://www.kaggle.com/aslanahmedov/market-basket-analysis/
sales <- read.csv2("Assignment-1_Data.csv")
head(sales,3)
## BillNo Itemname Quantity Date Price
## 1 536365 WHITE HANGING HEART T-LIGHT HOLDER 6 01.12.2010 08:26 2.55
## 2 536365 WHITE METAL LANTERN 6 01.12.2010 08:26 3.39
## 3 536365 CREAM CUPID HEARTS COAT HANGER 8 01.12.2010 08:26 2.75
## CustomerID Country
## 1 17850 United Kingdom
## 2 17850 United Kingdom
## 3 17850 United Kingdom
data set consists of 522064 observations
colSums(is.na(sales))
## BillNo Itemname Quantity Date Price CustomerID Country
## 0 0 0 0 0 134041 0
there are 134041 observations without CustomerID
sales <- sales[complete.cases(sales), ]
sales <- sales %>% select(c("BillNo","Itemname"))
trans1 <- as(split(sales[,"Itemname"], sales[,"BillNo"]), "transactions")
## Warning in asMethod(object): removing duplicated items in transactions
summary(trans1)
## transactions as itemMatrix in sparse format with
## 18163 rows (elements/itemsets/transactions) and
## 3846 columns (items) and a density of 0.005408855
##
## most frequent items:
## WHITE HANGING HEART T-LIGHT HOLDER REGENCY CAKESTAND 3 TIER
## 1919 1627
## JUMBO BAG RED RETROSPOT ASSORTED COLOUR BIRD ORNAMENT
## 1573 1358
## PARTY BUNTING (Other)
## 1352 370006
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1379 728 621 612 649 592 582 584 588 515 529 482 487 511 530 538
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 449 427 469 416 383 331 338 297 239 247 233 232 260 213 186 167
## 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 155 168 128 121 121 116 125 114 112 96 88 91 85 83 71 76
## 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
## 74 76 48 54 68 65 62 41 56 38 31 51 35 23 38 35
## 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 32 35 28 33 23 30 28 20 22 26 24 19 17 19 11 11
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
## 16 18 15 19 14 14 9 13 11 9 9 14 12 7 3 9
## 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
## 8 11 4 10 8 3 6 7 2 3 6 3 2 4 4 3
## 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
## 3 6 6 8 3 4 5 4 5 7 3 4 3 2 5 1
## 129 130 131 132 134 135 136 137 138 139 140 141 142 144 145 146
## 1 2 3 2 2 2 2 2 2 1 1 4 1 1 2 2
## 148 149 150 151 153 154 156 157 163 165 169 170 171 175 176 178
## 1 3 1 1 1 1 1 1 1 2 1 1 1 1 2 1
## 179 180 181 184 187 192 193 195 202 204 208 210 219 227 249 259
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 262 270 280 333 347 352 363 375 386 419 434 439 525 529 541
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 6.0 15.0 20.8 27.0 541.0
##
## includes extended item information - examples:
## labels
## 1 10 COLOUR SPACEBOY PEN
## 2 12 COLOURED PARTY BALLOONS
## 3 12 DAISY PEGS IN WOOD BOX
##
## includes extended transaction information - examples:
## transactionID
## 1 536365
## 2 536366
## 3 536367
Three most popular items are: - WHITE HANGING HEART T-LIGHT HOLDER with 1919 occurrences - REGENCY CAKESTAND 3 TIER with 1627 occurrences - JUMBO BAG RED RETROSPOT with 1573 occurrences
itemFrequencyPlot(trans1, topN=25, type="absolute", main="ItemFrequency", col="darkgreen")
itemFrequencyPlot(trans1, topN=25, type="relative", main="ItemFrequency", col="green")
rules <- apriori(trans1)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 1816
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[3846 item(s), 18163 transaction(s)] done [0.20s].
## sorting and recoding items ... [1 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
*running apriori mechanism with default settings results in 0 rules
rules <- apriori(trans1, parameter = list(supp=0.0125, conf=0.8))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.0125 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 227
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[3846 item(s), 18163 transaction(s)] done [0.20s].
## sorting and recoding items ... [473 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.02s].
## writing ... [8 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
running with amended parameters results in 8 rules
plot(rules, method="paracoord", control=list(reorder=TRUE))
plot(rules, method="graph")
inspect(head(sort(rules, by ="lift"),5))
## lhs rhs support confidence coverage lift count
## [1] {SET/6 RED SPOTTY PAPER CUPS} => {SET/6 RED SPOTTY PAPER PLATES} 0.01260805 0.8237410 0.01530584 47.34686 229
## [2] {POPPY'S PLAYHOUSE BEDROOM} => {POPPY'S PLAYHOUSE KITCHEN} 0.01354402 0.8013029 0.01690249 43.31567 246
## [3] {PINK REGENCY TEACUP AND SAUCER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.01998569 0.8918919 0.02240819 24.80771 363
## [4] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER} => {GREEN REGENCY TEACUP AND SAUCER} 0.01376425 0.8710801 0.01580135 24.22883 250
## [5] {PINK REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.02378462 0.8212928 0.02895997 22.84401 432
Rules above (sorted by lift - tendency of buying both products over bying them separately) can be translated as follows: - People who bought SET/6 RED SPOTTY PAPER CUPS also bought SET/6 RED SPOTTY PAPER PLATES in 82% of cases - When people buy two items (PINK REGENCY TEACUP AND SAUCER and ROSES REGENCY TEACUP AND SAUCER) they also buy GREEN REGENCY TEACUP AND SAUCER with a probability of 89% - Buying POPPY’S PLAYHOUSE BEDROOM leads to purchase of POPPY’S PLAYHOUSE KITCHEN in more than 80% of cases
We managed to prepare data set and discover association rules between items. Above are just striking examples of possible combinations of purchased items. We can found more by changing parameters. We also can check results for specific item. Below is the example. Received data about associations has a great value for marketing and promotion purposes. For example, based of confidence metric we can take decision to promote couple of items. While looking at lift metric we can focus our attention on stock availabilty of certain goods to prevent lost sales due to missing pairing item.
green_regency_rules <- sort(subset(rules, subset = rhs %in% "POPPY'S PLAYHOUSE KITCHEN"), by = "confidence")
summary(green_regency_rules)
## set of 1 rules
##
## rule length distribution (lhs + rhs):sizes
## 2
## 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01354 Min. :0.8013 Min. :0.0169 Min. :43.32
## 1st Qu.:0.01354 1st Qu.:0.8013 1st Qu.:0.0169 1st Qu.:43.32
## Median :0.01354 Median :0.8013 Median :0.0169 Median :43.32
## Mean :0.01354 Mean :0.8013 Mean :0.0169 Mean :43.32
## 3rd Qu.:0.01354 3rd Qu.:0.8013 3rd Qu.:0.0169 3rd Qu.:43.32
## Max. :0.01354 Max. :0.8013 Max. :0.0169 Max. :43.32
## count
## Min. :246
## 1st Qu.:246
## Median :246
## Mean :246
## 3rd Qu.:246
## Max. :246
##
## mining info:
## data ntransactions support confidence
## trans1 18163 0.0125 0.8
## call
## apriori(data = trans1, parameter = list(supp = 0.0125, conf = 0.8))