the data set for this activity consist of the .csv file Online Retail, downloaded from the UCI repository. the data set contains transactions of online shopping including variables such as item descriptions, price, invoice number and others. a lot of this data was unnecessary so it was trimmed and cleaned by removing such variables and punctuation symbols.
library(readxl)
## Warning: package 'readxl' was built under R version 4.0.3
library(plyr)
library(arules)
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.0.3
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Loading required package: grid
#Read in the data
data <- read_xlsx("Online Retail.xlsx")
#Convert to dataframe
data = data.frame(data)
head(data)
## InvoiceNo StockCode lower
## 1 536365 85123A white hanging heart t-light holder
## 2 536365 71053 white metal lantern
## 3 536365 84406B cream cupid hearts coat hanger
## 4 536365 84029G knitted union flag hot water bottle
## 5 536365 84029E red woolly hottie white heart.
## 6 536365 22752 set 7 babushka nesting boxes
## Description Quantity InvoiceDate UnitPrice
## 1 WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 08:26:00 2.55
## 2 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39
## 3 CREAM CUPID HEARTS COAT HANGER 8 2010-12-01 08:26:00 2.75
## 4 KNITTED UNION FLAG HOT WATER BOTTLE 6 2010-12-01 08:26:00 3.39
## 5 RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01 08:26:00 3.39
## 6 SET 7 BABUSHKA NESTING BOXES 2 2010-12-01 08:26:00 7.65
## CustomerID Country
## 1 17850 United Kingdom
## 2 17850 United Kingdom
## 3 17850 United Kingdom
## 4 17850 United Kingdom
## 5 17850 United Kingdom
## 6 17850 United Kingdom
#Remove irrelevant items below one dollar
data2 <- data[which(data$UnitPrice > 0),]
#only keep sale transactions(no refunds, etc.)
data2 <- subset(data2, StockCode!="D" & StockCode!="DOT" & StockCode!="S" & StockCode!="POST" & StockCode!="M" & StockCode!="C2" & StockCode!="AMAZONFEE" & StockCode!="B" & StockCode!="BANK CHARGES" & StockCode!="CRUK" & StockCode!="m" & StockCode!="PADS")
#Removing gift cards
#find rows starting with 'gift' and place in vector
giftcardrows <- as.vector(subset(data2, subset = grepl(glob2rx("gift*"), StockCode))[,2])
#Remove the gift rows data with vector above
data2 <- data2[-which(data2$StockCode %in% giftcardrows),]
#remove commas and place in new df
data2[,'Description'] <- gsub(",","",data2[,'Description'])
#Remove more irelevant columns
data2 <- data2[,-c(5,6,7,8)]
# Format new csv file
#takes items in description variable for eace invoice and joins items by commas
data_basket <- ddply(data2,"InvoiceNo", function(df1)paste(df1$Description, collapse = ","))
#Write new formatted file to csv
write.csv(data_basket,"ItemList.csv", quote = FALSE, row.names = TRUE)
#Read in transaction files
#Turns off warnings
options(warn=-1)
txn = read.transactions(file="ItemList.csv", format="basket",sep=",", cols=1)
#Have chosen to leave duplicate rows in as it is entirely possible for two separate customers to purchase identical baskets
#Removes quotes
txn@itemInfo$labels <- gsub("\"","",txn@itemInfo$labels)
#Turns warnings back on
options(warn=0)
summary(txn)
## transactions as itemMatrix in sparse format with
## 23196 rows (elements/itemsets/transactions) and
## 31915 columns (items) and a density of 0.0005927446
##
## most frequent items:
## WHITE HANGING HEART T-LIGHT HOLDER REGENCY CAKESTAND 3 TIER
## 1965 1882
## JUMBO BAG RED RETROSPOT PARTY BUNTING
## 1733 1463
## LUNCH BAG RED RETROSPOT (Other)
## 1382 430384
##
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 3510 1662 1199 962 884 789 731 701 688 632 630 544 550 545 557 534
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
## 476 468 497 430 383 324 335 264 251 244 238 236 237 211 165 166
## 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## 133 151 144 104 116 114 119 94 100 89 83 85 64 76 64 71
## 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
## 53 58 59 51 52 54 54 33 42 37 30 30 29 15 28 35
## 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
## 24 27 27 26 19 17 20 20 17 17 11 16 12 16 16 11
## 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
## 17 12 9 14 15 16 13 8 12 8 10 12 5 7 5 8
## 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
## 9 7 7 6 3 3 4 6 3 6 9 7 4 4 8 1
## 114 115 116 117 118 119 120 121 122 123 124 125 126 127 129 130
## 3 1 4 5 2 4 3 8 4 2 4 1 7 3 3 6
## 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
## 2 3 5 1 5 2 2 3 3 5 2 4 1 5 4 5
## 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 5 1 3 2 5 4 2 7 2 2 5 3 3 1 1 1
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
## 1 1 3 3 2 3 2 4 3 1 4 1 2 2 4 1
## 179 180 181 183 184 186 188 190 192 193 194 195 196 197 198 199
## 1 2 3 3 1 2 1 2 2 2 1 3 1 1 1 1
## 201 202 203 204 205 206 207 208 209 215 216 217 218 220 221 223
## 1 1 2 2 1 1 2 2 1 4 1 1 1 1 1 2
## 224 225 226 227 228 231 232 234 236 237 238 241 247 248 250 254
## 1 1 2 1 1 2 1 1 1 1 1 1 1 3 2 2
## 255 256 258 263 266 268 271 273 277 282 290 297 298 300 301 308
## 1 1 1 2 1 1 1 1 1 2 1 1 1 1 2 1
## 309 312 313 317 320 323 324 328 333 334 339 342 343 346 354 359
## 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## 362 363 368 374 376 389 390 393 395 397 410 415 417 424 430 439
## 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## 443 456 464 470 506 510 528 583 637
## 1 1 1 2 1 1 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 4.00 11.00 18.92 23.00 637.00
##
## includes extended item information - examples:
## labels
## 1 *Boombox Ipod Classic
## 2 *USB Office Mirror Ball
## 3 10 COLOUR SPACEBOY PEN
##
## includes extended transaction information - examples:
## transactionID
## 1
## 2 1
## 3 2
As seen above, the data must then be formatted for further analysis. To use the data effectively, the items under the description variable are grouped together separated by commas and this new format is saved as a new .csv file, ItemList.csv.Below, the 8 most purchased items are identified and the apriori process begins. the support is set to .001 so we look at items purchased at least 23 times(23,196 transactions x .001 =23), confidence is set to .8.
#plot 8 most purchased products
x = 8
itemFrequencyPlot(txn, topN = x, main=bquote(paste("Top ",.(x)," Most Frequently Purchased Items")))
#Run apriori with support of .001 and conf 0.8
basket_rules <- apriori(txn,parameter = list(sup = 0.001, conf = 0.8, target="rules"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 23
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[31915 item(s), 23196 transaction(s)] done [0.30s].
## sorting and recoding items ... [2522 item(s)] done [0.02s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(txn, parameter = list(sup = 0.001, conf = 0.8, target =
## "rules")): Mining stopped (maxlen reached). Only patterns up to a length of 10
## returned!
## done [1.92s].
## writing ... [724571 rule(s)] done [0.24s].
## creating S4 object ... done [0.48s].
#Summary statistics
summary(basket_rules)
## set of 724571 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6 7 8 9 10
## 52 8419 63160 146816 198592 170263 95256 34387 7626
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 5.00 6.00 6.27 7.00 10.00
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001035 Min. :0.8000 Min. :0.001035 Min. : 9.444
## 1st Qu.:0.001035 1st Qu.:0.8710 1st Qu.:0.001078 1st Qu.: 23.127
## Median :0.001078 Median :0.9310 Median :0.001207 Median : 32.795
## Mean :0.001160 Mean :0.9275 Mean :0.001260 Mean : 38.716
## 3rd Qu.:0.001164 3rd Qu.:1.0000 3rd Qu.:0.001293 3rd Qu.: 45.816
## Max. :0.019529 Max. :1.0000 Max. :0.022935 Max. :533.241
## count
## Min. : 24.00
## 1st Qu.: 24.00
## Median : 25.00
## Mean : 26.91
## 3rd Qu.: 27.00
## Max. :453.00
##
## mining info:
## data ntransactions support confidence
## txn 23196 0.001 0.8
inspect(head(basket_rules))
## lhs rhs support confidence coverage lift count
## [1] {MIRRORED WALL ART LADIES} => {MIRRORED WALL ART GENTS} 0.001034661 0.8275862 0.001250216 533.24138 24
## [2] {BLUE FELT EASTER EGG BASKET} => {CREAM FELT EASTER EGG BASKET} 0.002457320 0.8636364 0.002845318 183.78816 57
## [3] {VINTAGE RED ENAMEL TRIM PLATE} => {VINTAGE RED TRIM ENAMEL BOWL} 0.002586653 0.8000000 0.003233316 195.33474 60
## [4] {BLUE POLKADOT BEAKER} => {RED POLKADOT BEAKER} 0.003060873 0.8352941 0.003664425 156.25389 71
## [5] {MIXED NUTS LIGHT GREEN BOWL} => {SMALL DOLLY MIX DESIGN ORANGE BOWL} 0.001595103 0.8604651 0.001853768 53.79878 37
## [6] {PARTY PIZZA DISH GREEN POLKADOT} => {PARTY PIZZA DISH BLUE POLKADOT} 0.001638213 0.8085106 0.002026211 329.02128 38
#Find minimum lift
min(basket_rules@quality$lift)
## [1] 9.443664
length(basket_rules)
## [1] 724571
the output of the algorithm gave us 724,571 rules with a min lift of 9.4 and max of 533.2. looking at the rules, there is an 82 percent chance mirrored wall art gents is purchases when mirrored wall art ladies is purchased. a customer who purchases the gents is 533 times more likely to also purchase the ladies. inspecting the confidence of the rules below, items on the rhs are purchased when items on the lhs are purchased. a confidence rating of 1 means they will be purchased together 100 percent of the time. their confidence are ranked in decreasing order. The rules are plotted below to visualize the support and confidence. they are plotted with x as the support and y as the confidence.
#Inspecting rules in order of confidence
rules_conf <- sort (basket_rules, by="confidence", decreasing=TRUE) # 'high-confidence' rules.
#Summary Statistics
inspect(head(rules_conf))
## lhs rhs support confidence coverage lift count
## [1] {CHRISTMAS TREE PAINTED ZINC,
## WOODEN STAR CHRISTMAS SCANDINAVIAN} => {WOODEN TREE CHRISTMAS SCANDINAVIAN} 0.001077772 1 0.001077772 92.78400 25
## [2] {CHRISTMAS TREE PAINTED ZINC,
## WOODEN HEART CHRISTMAS SCANDINAVIAN} => {WOODEN TREE CHRISTMAS SCANDINAVIAN} 0.001120883 1 0.001120883 92.78400 26
## [3] {CHILDS GARDEN RAKE BLUE,
## CHILDS GARDEN SPADE PINK} => {CHILDS GARDEN SPADE BLUE} 0.001250216 1 0.001250216 257.73333 29
## [4] {MIXED NUTS LIGHT GREEN BOWL,
## SMALL CHOCOLATES PINK BOWL} => {SMALL DOLLY MIX DESIGN ORANGE BOWL} 0.001207105 1 0.001207105 62.52291 28
## [5] {HERB MARKER ROSEMARY,
## IVORY GIANT GARDEN THERMOMETER} => {HERB MARKER MINT} 0.001163994 1 0.001163994 111.51923 27
## [6] {FELTCRAFT DOLL ROSIE,
## FELTCRAFT GIRL NICOLE KIT} => {FELTCRAFT GIRL AMELIE KIT} 0.001077772 1 0.001077772 100.85217 25
summary(rules_conf)
## set of 724571 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6 7 8 9 10
## 52 8419 63160 146816 198592 170263 95256 34387 7626
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 5.00 6.00 6.27 7.00 10.00
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001035 Min. :0.8000 Min. :0.001035 Min. : 9.444
## 1st Qu.:0.001035 1st Qu.:0.8710 1st Qu.:0.001078 1st Qu.: 23.127
## Median :0.001078 Median :0.9310 Median :0.001207 Median : 32.795
## Mean :0.001160 Mean :0.9275 Mean :0.001260 Mean : 38.716
## 3rd Qu.:0.001164 3rd Qu.:1.0000 3rd Qu.:0.001293 3rd Qu.: 45.816
## Max. :0.019529 Max. :1.0000 Max. :0.022935 Max. :533.241
## count
## Min. : 24.00
## 1st Qu.: 24.00
## Median : 25.00
## Mean : 26.91
## 3rd Qu.: 27.00
## Max. :453.00
##
## mining info:
## data ntransactions support confidence
## txn 23196 0.001 0.8
#Plotting all basket rules
plot(basket_rules)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
plot(basket_rules,method="two-key plot")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
#Plot of top 10 rules
top10Rules <- head(basket_rules, n = 10, by = "confidence")
plot(top10Rules, method="paracoord")
plot(top10Rules, method="graph", control=list(type="itemsets"), itemLabels=TRUE)
## Warning: Unknown control parameters: type
## Available control parameters (with default values):
## main = Graph for 10 rules
## nodeColors = c("#66CC6680", "#9999CC80")
## nodeCol = c("#EE0000FF", "#EE0303FF", "#EE0606FF", "#EE0909FF", "#EE0C0CFF", "#EE0F0FFF", "#EE1212FF", "#EE1515FF", "#EE1818FF", "#EE1B1BFF", "#EE1E1EFF", "#EE2222FF", "#EE2525FF", "#EE2828FF", "#EE2B2BFF", "#EE2E2EFF", "#EE3131FF", "#EE3434FF", "#EE3737FF", "#EE3A3AFF", "#EE3D3DFF", "#EE4040FF", "#EE4444FF", "#EE4747FF", "#EE4A4AFF", "#EE4D4DFF", "#EE5050FF", "#EE5353FF", "#EE5656FF", "#EE5959FF", "#EE5C5CFF", "#EE5F5FFF", "#EE6262FF", "#EE6666FF", "#EE6969FF", "#EE6C6CFF", "#EE6F6FFF", "#EE7272FF", "#EE7575FF", "#EE7878FF", "#EE7B7BFF", "#EE7E7EFF", "#EE8181FF", "#EE8484FF", "#EE8888FF", "#EE8B8BFF", "#EE8E8EFF", "#EE9191FF", "#EE9494FF", "#EE9797FF", "#EE9999FF", "#EE9B9BFF", "#EE9D9DFF", "#EE9F9FFF", "#EEA0A0FF", "#EEA2A2FF", "#EEA4A4FF", "#EEA5A5FF", "#EEA7A7FF", "#EEA9A9FF", "#EEABABFF", "#EEACACFF", "#EEAEAEFF", "#EEB0B0FF", "#EEB1B1FF", "#EEB3B3FF", "#EEB5B5FF", "#EEB7B7FF", "#EEB8B8FF", "#EEBABAFF", "#EEBCBCFF", "#EEBDBDFF", "#EEBFBFFF", "#EEC1C1FF", "#EEC3C3FF", "#EEC4C4FF", "#EEC6C6FF", "#EEC8C8FF", "#EEC9C9FF", "#EECBCBFF", "#EECDCDFF", "#EECFCFFF", "#EED0D0FF", "#EED2D2FF", "#EED4D4FF", "#EED5D5FF", "#EED7D7FF", "#EED9D9FF", "#EEDBDBFF", "#EEDCDCFF", "#EEDEDEFF", "#EEE0E0FF", "#EEE1E1FF", "#EEE3E3FF", "#EEE5E5FF", "#EEE7E7FF", "#EEE8E8FF", "#EEEAEAFF", "#EEECECFF", "#EEEEEEFF")
## edgeCol = c("#474747FF", "#494949FF", "#4B4B4BFF", "#4D4D4DFF", "#4F4F4FFF", "#515151FF", "#535353FF", "#555555FF", "#575757FF", "#595959FF", "#5B5B5BFF", "#5E5E5EFF", "#606060FF", "#626262FF", "#646464FF", "#666666FF", "#686868FF", "#6A6A6AFF", "#6C6C6CFF", "#6E6E6EFF", "#707070FF", "#727272FF", "#747474FF", "#767676FF", "#787878FF", "#7A7A7AFF", "#7C7C7CFF", "#7E7E7EFF", "#808080FF", "#828282FF", "#848484FF", "#868686FF", "#888888FF", "#8A8A8AFF", "#8C8C8CFF", "#8D8D8DFF", "#8F8F8FFF", "#919191FF", "#939393FF", "#959595FF", "#979797FF", "#999999FF", "#9A9A9AFF", "#9C9C9CFF", "#9E9E9EFF", "#A0A0A0FF", "#A2A2A2FF", "#A3A3A3FF", "#A5A5A5FF", "#A7A7A7FF", "#A9A9A9FF", "#AAAAAAFF", "#ACACACFF", "#AEAEAEFF", "#AFAFAFFF", "#B1B1B1FF", "#B3B3B3FF", "#B4B4B4FF", "#B6B6B6FF", "#B7B7B7FF", "#B9B9B9FF", "#BBBBBBFF", "#BCBCBCFF", "#BEBEBEFF", "#BFBFBFFF", "#C1C1C1FF", "#C2C2C2FF", "#C3C3C4FF", "#C5C5C5FF", "#C6C6C6FF", "#C8C8C8FF", "#C9C9C9FF", "#CACACAFF", "#CCCCCCFF", "#CDCDCDFF", "#CECECEFF", "#CFCFCFFF", "#D1D1D1FF", "#D2D2D2FF", "#D3D3D3FF", "#D4D4D4FF", "#D5D5D5FF", "#D6D6D6FF", "#D7D7D7FF", "#D8D8D8FF", "#D9D9D9FF", "#DADADAFF", "#DBDBDBFF", "#DCDCDCFF", "#DDDDDDFF", "#DEDEDEFF", "#DEDEDEFF", "#DFDFDFFF", "#E0E0E0FF", "#E0E0E0FF", "#E1E1E1FF", "#E1E1E1FF", "#E2E2E2FF", "#E2E2E2FF", "#E2E2E2FF")
## alpha = 0.5
## cex = 1
## itemLabels = TRUE
## labelCol = #000000B3
## measureLabels = FALSE
## precision = 3
## layout = NULL
## layoutParams = list()
## arrowSize = 0.5
## engine = igraph
## plot = TRUE
## plot_options = list()
## max = 100
## verbose = FALSE