association rules

Data Retrieval

the data set for this activity consist of the .csv file Online Retail, downloaded from the UCI repository. the data set contains transactions of online shopping including variables such as item descriptions, price, invoice number and others. a lot of this data was unnecessary so it was trimmed and cleaned by removing such variables and punctuation symbols.

library(readxl)

## Warning: package 'readxl' was built under R version 4.0.3

library(plyr) 
library(arules)

## Loading required package: Matrix

## Warning: package 'Matrix' was built under R version 4.0.3

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

library(arulesViz)

## Loading required package: grid

#Read in the data
data <- read_xlsx("Online Retail.xlsx")
#Convert to dataframe
data = data.frame(data)

head(data)

##   InvoiceNo StockCode                               lower
## 1    536365    85123A  white hanging heart t-light holder
## 2    536365     71053                 white metal lantern
## 3    536365    84406B      cream cupid hearts coat hanger
## 4    536365    84029G knitted union flag hot water bottle
## 5    536365    84029E      red woolly hottie white heart.
## 6    536365     22752        set 7 babushka nesting boxes
##                           Description Quantity         InvoiceDate UnitPrice
## 1  WHITE HANGING HEART T-LIGHT HOLDER        6 2010-12-01 08:26:00      2.55
## 2                 WHITE METAL LANTERN        6 2010-12-01 08:26:00      3.39
## 3      CREAM CUPID HEARTS COAT HANGER        8 2010-12-01 08:26:00      2.75
## 4 KNITTED UNION FLAG HOT WATER BOTTLE        6 2010-12-01 08:26:00      3.39
## 5      RED WOOLLY HOTTIE WHITE HEART.        6 2010-12-01 08:26:00      3.39
## 6        SET 7 BABUSHKA NESTING BOXES        2 2010-12-01 08:26:00      7.65
##   CustomerID        Country
## 1      17850 United Kingdom
## 2      17850 United Kingdom
## 3      17850 United Kingdom
## 4      17850 United Kingdom
## 5      17850 United Kingdom
## 6      17850 United Kingdom

#Remove irrelevant items below one dollar
data2 <- data[which(data$UnitPrice > 0),]

#only keep sale transactions(no refunds, etc.)
data2 <- subset(data2, StockCode!="D" & StockCode!="DOT" & StockCode!="S" & StockCode!="POST" & StockCode!="M" & StockCode!="C2" & StockCode!="AMAZONFEE" & StockCode!="B" & StockCode!="BANK CHARGES" & StockCode!="CRUK" & StockCode!="m" & StockCode!="PADS")

#Removing gift cards
#find rows starting with 'gift' and place in vector
giftcardrows <- as.vector(subset(data2, subset = grepl(glob2rx("gift*"), StockCode))[,2])
#Remove the gift rows data with vector above
data2 <- data2[-which(data2$StockCode %in% giftcardrows),]

#remove commas and place in new df
data2[,'Description'] <- gsub(",","",data2[,'Description'])

#Remove more irelevant columns
data2 <- data2[,-c(5,6,7,8)]

# Format new csv file

#takes items in description variable for eace invoice and joins items by commas
data_basket <- ddply(data2,"InvoiceNo", function(df1)paste(df1$Description, collapse = ","))

#Write new formatted file to csv
write.csv(data_basket,"ItemList.csv", quote = FALSE, row.names = TRUE)

#Read in transaction files
#Turns off warnings
options(warn=-1)
txn = read.transactions(file="ItemList.csv", format="basket",sep=",", cols=1)

#Have chosen to leave duplicate rows in as it is entirely possible for two separate customers to purchase identical baskets

#Removes quotes
txn@itemInfo$labels <- gsub("\"","",txn@itemInfo$labels)

#Turns warnings back on
options(warn=0)

summary(txn)

## transactions as itemMatrix in sparse format with
##  23196 rows (elements/itemsets/transactions) and
##  31915 columns (items) and a density of 0.0005927446 
## 
## most frequent items:
## WHITE HANGING HEART T-LIGHT HOLDER           REGENCY CAKESTAND 3 TIER 
##                               1965                               1882 
##            JUMBO BAG RED RETROSPOT                      PARTY BUNTING 
##                               1733                               1463 
##            LUNCH BAG RED RETROSPOT                            (Other) 
##                               1382                             430384 
## 
## element (itemset/transaction) length distribution:
## sizes
##    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17 
## 3510 1662 1199  962  884  789  731  701  688  632  630  544  550  545  557  534 
##   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32   33 
##  476  468  497  430  383  324  335  264  251  244  238  236  237  211  165  166 
##   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48   49 
##  133  151  144  104  116  114  119   94  100   89   83   85   64   76   64   71 
##   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64   65 
##   53   58   59   51   52   54   54   33   42   37   30   30   29   15   28   35 
##   66   67   68   69   70   71   72   73   74   75   76   77   78   79   80   81 
##   24   27   27   26   19   17   20   20   17   17   11   16   12   16   16   11 
##   82   83   84   85   86   87   88   89   90   91   92   93   94   95   96   97 
##   17   12    9   14   15   16   13    8   12    8   10   12    5    7    5    8 
##   98   99  100  101  102  103  104  105  106  107  108  109  110  111  112  113 
##    9    7    7    6    3    3    4    6    3    6    9    7    4    4    8    1 
##  114  115  116  117  118  119  120  121  122  123  124  125  126  127  129  130 
##    3    1    4    5    2    4    3    8    4    2    4    1    7    3    3    6 
##  131  132  133  134  135  136  137  138  139  140  141  142  143  144  145  146 
##    2    3    5    1    5    2    2    3    3    5    2    4    1    5    4    5 
##  147  148  149  150  151  152  153  154  155  156  157  158  159  160  161  162 
##    5    1    3    2    5    4    2    7    2    2    5    3    3    1    1    1 
##  163  164  165  166  167  168  169  170  171  172  173  174  175  176  177  178 
##    1    1    3    3    2    3    2    4    3    1    4    1    2    2    4    1 
##  179  180  181  183  184  186  188  190  192  193  194  195  196  197  198  199 
##    1    2    3    3    1    2    1    2    2    2    1    3    1    1    1    1 
##  201  202  203  204  205  206  207  208  209  215  216  217  218  220  221  223 
##    1    1    2    2    1    1    2    2    1    4    1    1    1    1    1    2 
##  224  225  226  227  228  231  232  234  236  237  238  241  247  248  250  254 
##    1    1    2    1    1    2    1    1    1    1    1    1    1    3    2    2 
##  255  256  258  263  266  268  271  273  277  282  290  297  298  300  301  308 
##    1    1    1    2    1    1    1    1    1    2    1    1    1    1    2    1 
##  309  312  313  317  320  323  324  328  333  334  339  342  343  346  354  359 
##    1    1    1    1    1    1    1    1    1    2    1    1    1    1    1    1 
##  362  363  368  374  376  389  390  393  395  397  410  415  417  424  430  439 
##    1    1    1    1    1    1    1    1    1    2    1    1    1    1    1    1 
##  443  456  464  470  506  510  528  583  637 
##    1    1    1    2    1    1    1    1    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    4.00   11.00   18.92   23.00  637.00 
## 
## includes extended item information - examples:
##                    labels
## 1   *Boombox Ipod Classic
## 2 *USB Office Mirror Ball
## 3  10 COLOUR SPACEBOY PEN
## 
## includes extended transaction information - examples:
##   transactionID
## 1              
## 2             1
## 3             2

Formatting Data

As seen above, the data must then be formatted for further analysis. To use the data effectively, the items under the description variable are grouped together separated by commas and this new format is saved as a new .csv file, ItemList.csv.Below, the 8 most purchased items are identified and the apriori process begins. the support is set to .001 so we look at items purchased at least 23 times(23,196 transactions x .001 =23), confidence is set to .8.

#plot 8 most purchased products
x = 8
itemFrequencyPlot(txn, topN = x, main=bquote(paste("Top ",.(x)," Most Frequently Purchased Items")))

#Run apriori with support of .001 and conf 0.8
basket_rules <- apriori(txn,parameter = list(sup = 0.001, conf = 0.8, target="rules"))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 23 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[31915 item(s), 23196 transaction(s)] done [0.30s].
## sorting and recoding items ... [2522 item(s)] done [0.02s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10

## Warning in apriori(txn, parameter = list(sup = 0.001, conf = 0.8, target =
## "rules")): Mining stopped (maxlen reached). Only patterns up to a length of 10
## returned!

##  done [1.92s].
## writing ... [724571 rule(s)] done [0.24s].
## creating S4 object  ... done [0.48s].

#Summary statistics
summary(basket_rules)

## set of 724571 rules
## 
## rule length distribution (lhs + rhs):sizes
##      2      3      4      5      6      7      8      9     10 
##     52   8419  63160 146816 198592 170263  95256  34387   7626 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    5.00    6.00    6.27    7.00   10.00 
## 
## summary of quality measures:
##     support           confidence        coverage             lift        
##  Min.   :0.001035   Min.   :0.8000   Min.   :0.001035   Min.   :  9.444  
##  1st Qu.:0.001035   1st Qu.:0.8710   1st Qu.:0.001078   1st Qu.: 23.127  
##  Median :0.001078   Median :0.9310   Median :0.001207   Median : 32.795  
##  Mean   :0.001160   Mean   :0.9275   Mean   :0.001260   Mean   : 38.716  
##  3rd Qu.:0.001164   3rd Qu.:1.0000   3rd Qu.:0.001293   3rd Qu.: 45.816  
##  Max.   :0.019529   Max.   :1.0000   Max.   :0.022935   Max.   :533.241  
##      count       
##  Min.   : 24.00  
##  1st Qu.: 24.00  
##  Median : 25.00  
##  Mean   : 26.91  
##  3rd Qu.: 27.00  
##  Max.   :453.00  
## 
## mining info:
##  data ntransactions support confidence
##   txn         23196   0.001        0.8

inspect(head(basket_rules))

##     lhs                                  rhs                                      support confidence    coverage      lift count
## [1] {MIRRORED WALL ART LADIES}        => {MIRRORED WALL ART GENTS}            0.001034661  0.8275862 0.001250216 533.24138    24
## [2] {BLUE FELT EASTER EGG BASKET}     => {CREAM FELT EASTER EGG BASKET}       0.002457320  0.8636364 0.002845318 183.78816    57
## [3] {VINTAGE RED ENAMEL TRIM PLATE}   => {VINTAGE RED TRIM ENAMEL BOWL}       0.002586653  0.8000000 0.003233316 195.33474    60
## [4] {BLUE POLKADOT BEAKER}            => {RED POLKADOT BEAKER}                0.003060873  0.8352941 0.003664425 156.25389    71
## [5] {MIXED NUTS LIGHT GREEN BOWL}     => {SMALL DOLLY MIX DESIGN ORANGE BOWL} 0.001595103  0.8604651 0.001853768  53.79878    37
## [6] {PARTY PIZZA DISH GREEN POLKADOT} => {PARTY PIZZA DISH BLUE POLKADOT}     0.001638213  0.8085106 0.002026211 329.02128    38

#Find minimum lift
min(basket_rules@quality$lift)

## [1] 9.443664

length(basket_rules)

## [1] 724571

Rules analysis

the output of the algorithm gave us 724,571 rules with a min lift of 9.4 and max of 533.2. looking at the rules, there is an 82 percent chance mirrored wall art gents is purchases when mirrored wall art ladies is purchased. a customer who purchases the gents is 533 times more likely to also purchase the ladies. inspecting the confidence of the rules below, items on the rhs are purchased when items on the lhs are purchased. a confidence rating of 1 means they will be purchased together 100 percent of the time. their confidence are ranked in decreasing order. The rules are plotted below to visualize the support and confidence. they are plotted with x as the support and y as the confidence.

#Inspecting rules in order of confidence
rules_conf <- sort (basket_rules, by="confidence", decreasing=TRUE) # 'high-confidence' rules.
#Summary Statistics
inspect(head(rules_conf))

##     lhs                                      rhs                                      support confidence    coverage      lift count
## [1] {CHRISTMAS TREE PAINTED ZINC,                                                                                                   
##      WOODEN STAR CHRISTMAS SCANDINAVIAN}  => {WOODEN TREE CHRISTMAS SCANDINAVIAN} 0.001077772          1 0.001077772  92.78400    25
## [2] {CHRISTMAS TREE PAINTED ZINC,                                                                                                   
##      WOODEN HEART CHRISTMAS SCANDINAVIAN} => {WOODEN TREE CHRISTMAS SCANDINAVIAN} 0.001120883          1 0.001120883  92.78400    26
## [3] {CHILDS GARDEN RAKE BLUE,                                                                                                       
##      CHILDS GARDEN SPADE PINK}            => {CHILDS GARDEN SPADE BLUE}           0.001250216          1 0.001250216 257.73333    29
## [4] {MIXED NUTS LIGHT GREEN BOWL,                                                                                                   
##      SMALL CHOCOLATES PINK BOWL}          => {SMALL DOLLY MIX DESIGN ORANGE BOWL} 0.001207105          1 0.001207105  62.52291    28
## [5] {HERB MARKER ROSEMARY,                                                                                                          
##      IVORY GIANT GARDEN THERMOMETER}      => {HERB MARKER MINT}                   0.001163994          1 0.001163994 111.51923    27
## [6] {FELTCRAFT DOLL ROSIE,                                                                                                          
##      FELTCRAFT GIRL NICOLE KIT}           => {FELTCRAFT GIRL AMELIE KIT}          0.001077772          1 0.001077772 100.85217    25

summary(rules_conf)

## set of 724571 rules
## 
## rule length distribution (lhs + rhs):sizes
##      2      3      4      5      6      7      8      9     10 
##     52   8419  63160 146816 198592 170263  95256  34387   7626 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    5.00    6.00    6.27    7.00   10.00 
## 
## summary of quality measures:
##     support           confidence        coverage             lift        
##  Min.   :0.001035   Min.   :0.8000   Min.   :0.001035   Min.   :  9.444  
##  1st Qu.:0.001035   1st Qu.:0.8710   1st Qu.:0.001078   1st Qu.: 23.127  
##  Median :0.001078   Median :0.9310   Median :0.001207   Median : 32.795  
##  Mean   :0.001160   Mean   :0.9275   Mean   :0.001260   Mean   : 38.716  
##  3rd Qu.:0.001164   3rd Qu.:1.0000   3rd Qu.:0.001293   3rd Qu.: 45.816  
##  Max.   :0.019529   Max.   :1.0000   Max.   :0.022935   Max.   :533.241  
##      count       
##  Min.   : 24.00  
##  1st Qu.: 24.00  
##  Median : 25.00  
##  Mean   : 26.91  
##  3rd Qu.: 27.00  
##  Max.   :453.00  
## 
## mining info:
##  data ntransactions support confidence
##   txn         23196   0.001        0.8

#Plotting all basket rules
plot(basket_rules)

## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

plot(basket_rules,method="two-key plot")

## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

#Plot of top 10 rules
top10Rules <- head(basket_rules, n = 10, by = "confidence")
plot(top10Rules, method="paracoord")

plot(top10Rules, method="graph", control=list(type="itemsets"), itemLabels=TRUE)

## Warning: Unknown control parameters: type

## Available control parameters (with default values):
## main  =  Graph for 10 rules
## nodeColors    =  c("#66CC6680", "#9999CC80")
## nodeCol   =  c("#EE0000FF", "#EE0303FF", "#EE0606FF", "#EE0909FF", "#EE0C0CFF", "#EE0F0FFF", "#EE1212FF", "#EE1515FF", "#EE1818FF", "#EE1B1BFF", "#EE1E1EFF", "#EE2222FF", "#EE2525FF", "#EE2828FF", "#EE2B2BFF", "#EE2E2EFF", "#EE3131FF", "#EE3434FF", "#EE3737FF", "#EE3A3AFF", "#EE3D3DFF", "#EE4040FF", "#EE4444FF", "#EE4747FF", "#EE4A4AFF", "#EE4D4DFF", "#EE5050FF", "#EE5353FF", "#EE5656FF", "#EE5959FF", "#EE5C5CFF", "#EE5F5FFF", "#EE6262FF", "#EE6666FF", "#EE6969FF", "#EE6C6CFF", "#EE6F6FFF", "#EE7272FF", "#EE7575FF",  "#EE7878FF", "#EE7B7BFF", "#EE7E7EFF", "#EE8181FF", "#EE8484FF", "#EE8888FF", "#EE8B8BFF", "#EE8E8EFF", "#EE9191FF", "#EE9494FF", "#EE9797FF", "#EE9999FF", "#EE9B9BFF", "#EE9D9DFF", "#EE9F9FFF", "#EEA0A0FF", "#EEA2A2FF", "#EEA4A4FF", "#EEA5A5FF", "#EEA7A7FF", "#EEA9A9FF", "#EEABABFF", "#EEACACFF", "#EEAEAEFF", "#EEB0B0FF", "#EEB1B1FF", "#EEB3B3FF", "#EEB5B5FF", "#EEB7B7FF", "#EEB8B8FF", "#EEBABAFF", "#EEBCBCFF", "#EEBDBDFF", "#EEBFBFFF", "#EEC1C1FF", "#EEC3C3FF", "#EEC4C4FF", "#EEC6C6FF", "#EEC8C8FF",  "#EEC9C9FF", "#EECBCBFF", "#EECDCDFF", "#EECFCFFF", "#EED0D0FF", "#EED2D2FF", "#EED4D4FF", "#EED5D5FF", "#EED7D7FF", "#EED9D9FF", "#EEDBDBFF", "#EEDCDCFF", "#EEDEDEFF", "#EEE0E0FF", "#EEE1E1FF", "#EEE3E3FF", "#EEE5E5FF", "#EEE7E7FF", "#EEE8E8FF", "#EEEAEAFF", "#EEECECFF", "#EEEEEEFF")
## edgeCol   =  c("#474747FF", "#494949FF", "#4B4B4BFF", "#4D4D4DFF", "#4F4F4FFF", "#515151FF", "#535353FF", "#555555FF", "#575757FF", "#595959FF", "#5B5B5BFF", "#5E5E5EFF", "#606060FF", "#626262FF", "#646464FF", "#666666FF", "#686868FF", "#6A6A6AFF", "#6C6C6CFF", "#6E6E6EFF", "#707070FF", "#727272FF", "#747474FF", "#767676FF", "#787878FF", "#7A7A7AFF", "#7C7C7CFF", "#7E7E7EFF", "#808080FF", "#828282FF", "#848484FF", "#868686FF", "#888888FF", "#8A8A8AFF", "#8C8C8CFF", "#8D8D8DFF", "#8F8F8FFF", "#919191FF", "#939393FF",  "#959595FF", "#979797FF", "#999999FF", "#9A9A9AFF", "#9C9C9CFF", "#9E9E9EFF", "#A0A0A0FF", "#A2A2A2FF", "#A3A3A3FF", "#A5A5A5FF", "#A7A7A7FF", "#A9A9A9FF", "#AAAAAAFF", "#ACACACFF", "#AEAEAEFF", "#AFAFAFFF", "#B1B1B1FF", "#B3B3B3FF", "#B4B4B4FF", "#B6B6B6FF", "#B7B7B7FF", "#B9B9B9FF", "#BBBBBBFF", "#BCBCBCFF", "#BEBEBEFF", "#BFBFBFFF", "#C1C1C1FF", "#C2C2C2FF", "#C3C3C4FF", "#C5C5C5FF", "#C6C6C6FF", "#C8C8C8FF", "#C9C9C9FF", "#CACACAFF", "#CCCCCCFF", "#CDCDCDFF", "#CECECEFF", "#CFCFCFFF", "#D1D1D1FF",  "#D2D2D2FF", "#D3D3D3FF", "#D4D4D4FF", "#D5D5D5FF", "#D6D6D6FF", "#D7D7D7FF", "#D8D8D8FF", "#D9D9D9FF", "#DADADAFF", "#DBDBDBFF", "#DCDCDCFF", "#DDDDDDFF", "#DEDEDEFF", "#DEDEDEFF", "#DFDFDFFF", "#E0E0E0FF", "#E0E0E0FF", "#E1E1E1FF", "#E1E1E1FF", "#E2E2E2FF", "#E2E2E2FF", "#E2E2E2FF")
## alpha     =  0.5
## cex   =  1
## itemLabels    =  TRUE
## labelCol  =  #000000B3
## measureLabels     =  FALSE
## precision     =  3
## layout    =  NULL
## layoutParams  =  list()
## arrowSize     =  0.5
## engine    =  igraph
## plot  =  TRUE
## plot_options  =  list()
## max   =  100
## verbose   =  FALSE

association rules

Uriel Serna Haro partner: Joe Samyn

10/13/2020

Data Retrieval

Formatting Data

Rules analysis