INTRODUCTION

1.Creating an Environment

rm(list=ls())
library(readr)     # as_date
library(lubridate) # read_csv2
library(arules)    # Apriori Algo
setwd("/Users/Mughundhan/Projects/market basket analysis")
retail <- read_csv("Online Retail.csv")

2.Data Exploration

1.Number of Unique Invoice No.s

#Data exploration
#No. of unique orders in the dataset
head(retail)
## # A tibble: 6 <U+00D7> 8
##   InvoiceNo StockCode                         Description Quantity
##       <chr>     <chr>                               <chr>    <int>
## 1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6
## 2    536365     71053                 WHITE METAL LANTERN        6
## 3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8
## 4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6
## 5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6
## 6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2
## # ... with 4 more variables: InvoiceDate <chr>, UnitPrice <dbl>,
## #   CustomerID <int>, Country <chr>
order_count <- length(unique(retail$InvoiceNo))
order_count
## [1] 25900
#Transaction dates #Eight months data
order_max_date <- max(retail$InvoiceDate)
order_max_date
## [1] "31-10-2011 17:19"
order_min_date <- min(retail$InvoiceDate)
order_min_date
## [1] "01-02-2011 08:23"
#Unique Countries 
unique(retail$Country)
##  [1] "United Kingdom"       "France"               "Australia"           
##  [4] "Netherlands"          "Germany"              "Norway"              
##  [7] "EIRE"                 "Switzerland"          "Spain"               
## [10] "Poland"               "Portugal"             "Italy"               
## [13] "Belgium"              "Lithuania"            "Japan"               
## [16] "Iceland"              "Channel Islands"      "Denmark"             
## [19] "Cyprus"               "Sweden"               "Austria"             
## [22] "Israel"               "Finland"              "Bahrain"             
## [25] "Greece"               "Hong Kong"            "Singapore"           
## [28] "Lebanon"              "United Arab Emirates" "Saudi Arabia"        
## [31] "Czech Republic"       "Canada"               "Unspecified"         
## [34] "Brazil"               "USA"                  "European Community"  
## [37] "Malta"                "RSA"
country_trans <- aggregate(retail$InvoiceNo,list(retail$Country),length) #almost like group by and sum
country_trans
##                 Group.1      x
## 1             Australia   1259
## 2               Austria    401
## 3               Bahrain     19
## 4               Belgium   2069
## 5                Brazil     32
## 6                Canada    151
## 7       Channel Islands    758
## 8                Cyprus    622
## 9        Czech Republic     30
## 10              Denmark    389
## 11                 EIRE   8196
## 12   European Community     61
## 13              Finland    695
## 14               France   8557
## 15              Germany   9495
## 16               Greece    146
## 17            Hong Kong    288
## 18              Iceland    182
## 19               Israel    297
## 20                Italy    803
## 21                Japan    358
## 22              Lebanon     45
## 23            Lithuania     35
## 24                Malta    127
## 25          Netherlands   2371
## 26               Norway   1086
## 27               Poland    341
## 28             Portugal   1519
## 29                  RSA     58
## 30         Saudi Arabia     10
## 31            Singapore    229
## 32                Spain   2533
## 33               Sweden    462
## 34          Switzerland   2002
## 35                  USA    291
## 36 United Arab Emirates     68
## 37       United Kingdom 495478
## 38          Unspecified    446
head(country_trans)
##     Group.1    x
## 1 Australia 1259
## 2   Austria  401
## 3   Bahrain   19
## 4   Belgium 2069
## 5    Brazil   32
## 6    Canada  151
country_trans[which.max(country_trans$x),]
##           Group.1      x
## 37 United Kingdom 495478
#since UK has maximum number of transactions - MBA is done for United Kingdom

#4070 unique Products in total
length(unique(retail$StockCode))
## [1] 4070
oDesc_stock<-aggregate(retail$StockCode,list(retail$Description),length) #almost like group by and sum
nrow(oDesc_stock)
## [1] 4211
head(oDesc_stock)
##                      Group.1   x
## 1      *Boombox Ipod Classic   1
## 2    *USB Office Mirror Ball   2
## 3     10 COLOUR SPACEBOY PEN 327
## 4 12 COLOURED PARTY BALLOONS 170
## 5  12 DAISY PEGS IN WOOD BOX  84
## 6  12 EGG HOUSE PAINTED WOOD 100

3.Data Cleaning

# Since we are considering only UK
retail_uk <- subset(retail, retail$Country=="United Kingdom")

# Remove the columns that are not required
colnames(retail_uk)
## [1] "InvoiceNo"   "StockCode"   "Description" "Quantity"    "InvoiceDate"
## [6] "UnitPrice"   "CustomerID"  "Country"
drops <- "Country"
retail_uk <- retail_uk[ ,!(names(retail_uk) %in% drops)]
colnames(retail_uk)
## [1] "InvoiceNo"   "StockCode"   "Description" "Quantity"    "InvoiceDate"
## [6] "UnitPrice"   "CustomerID"
total_entries <- length(unique(retail_uk$InvoiceNo))

# Remove the canceled orders 
# NOTE: Canceled orders begin with 'C' or has some textual characters
retail_valid <- retail_uk[-grep("[A-Z]",retail_uk$InvoiceNo),]
retail_valid <- retail_uk[-grep("[A-Z]",retail_uk$StockCode),]
Desc_stock <- aggregate(retail_valid$StockCode,list(retail_valid$Description),length)
head(Desc_stock)
##                      Group.1   x
## 1      *Boombox Ipod Classic   1
## 2    *USB Office Mirror Ball   2
## 3     10 COLOUR SPACEBOY PEN 282
## 4 12 COLOURED PARTY BALLOONS 156
## 5  12 DAISY PEGS IN WOOD BOX  80
## 6  12 EGG HOUSE PAINTED WOOD  89
# Remove rows with invalid product description 
# NOTE: Invalid descriptions has lower case characters - removing those rows 
retail_valid_1 <- retail_valid[-grep("[a-z]",retail_valid$Description),]
retail_valid_2 <- retail_valid_1[-grep("\\?",retail_valid_1$Description),]
valid_entries <- length(unique(retail_valid_2$InvoiceNo))
invalid_entries <- total_entries - valid_entries
c(invalid_entries, valid_entries)
## [1]  1854 21640

4.Frequent Item Set Analysis

##Converting data to transactions
transaction_detail <- aggregate(retail_valid_2$Description ~ retail_valid_2$InvoiceNo,
                              FUN=paste,collapse=',')
head(transaction_detail)
##   retail_valid_2$InvoiceNo
## 1                   536365
## 2                   536366
## 3                   536367
## 4                   536368
## 5                   536369
## 6                   536371
##                                                                                                                                                                                                                                                                                                                    retail_valid_2$Description
## 1                                                                                                                                                                                                                                                          WHITE METAL LANTERN,SET 7 BABUSHKA NESTING BOXES,GLASS STAR FROSTED T-LIGHT HOLDER
## 2                                                                                                                                                                                                                                                                                            HAND WARMER UNION JACK,HAND WARMER RED POLKA DOT
## 3 ASSORTED COLOUR BIRD ORNAMENT,POPPY'S PLAYHOUSE BEDROOM,POPPY'S PLAYHOUSE KITCHEN,FELTCRAFT PRINCESS CHARLOTTE DOLL,IVORY KNITTED MUG COSY,BOX OF 6 ASSORTED COLOUR TEASPOONS,BOX OF VINTAGE JIGSAW BLOCKS,BOX OF VINTAGE ALPHABET BLOCKS,HOME BUILDING BLOCK WORD,LOVE BUILDING BLOCK WORD,RECIPE BOX WITH METAL HEART,DOORMAT NEW ENGLAND
## 4                                                                                                                                                                                                                            JAM MAKING SET WITH JARS,RED COAT RACK PARIS FASHION,YELLOW COAT RACK PARIS FASHION,BLUE COAT RACK PARIS FASHION
## 5                                                                                                                                                                                                                                                                                                                    BATH BUILDING BLOCK WORD
## 6                                                                                                                                                                                                                                                                                                              PAPER CHAIN KIT 50'S CHRISTMAS
# install.packages("arules") - only transaction, 
itemsets<-transaction_detail[,-1]
head(itemsets)
## [1] "WHITE METAL LANTERN,SET 7 BABUSHKA NESTING BOXES,GLASS STAR FROSTED T-LIGHT HOLDER"                                                                                                                                                                                                                                                         
## [2] "HAND WARMER UNION JACK,HAND WARMER RED POLKA DOT"                                                                                                                                                                                                                                                                                           
## [3] "ASSORTED COLOUR BIRD ORNAMENT,POPPY'S PLAYHOUSE BEDROOM,POPPY'S PLAYHOUSE KITCHEN,FELTCRAFT PRINCESS CHARLOTTE DOLL,IVORY KNITTED MUG COSY,BOX OF 6 ASSORTED COLOUR TEASPOONS,BOX OF VINTAGE JIGSAW BLOCKS,BOX OF VINTAGE ALPHABET BLOCKS,HOME BUILDING BLOCK WORD,LOVE BUILDING BLOCK WORD,RECIPE BOX WITH METAL HEART,DOORMAT NEW ENGLAND"
## [4] "JAM MAKING SET WITH JARS,RED COAT RACK PARIS FASHION,YELLOW COAT RACK PARIS FASHION,BLUE COAT RACK PARIS FASHION"                                                                                                                                                                                                                           
## [5] "BATH BUILDING BLOCK WORD"                                                                                                                                                                                                                                                                                                                   
## [6] "PAPER CHAIN KIT 50'S CHRISTMAS"
# converting data to transaction object
write(itemsets,"itemsets2.csv")
itemsets_txn<-read.transactions("itemsets2.csv",format="basket",rm.duplicates=TRUE,sep=",")
## distribution of transactions with duplicates:
## items
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 1038  476  281  178  110   90   50   47   30   30    8   14   13    5    8 
##   16   17   18   19   20   21   22   23   24   37 
##    8    9    3    1    1    2    2    2    2    3

5.Apriori Algorithm and Rules Generation

  • Support : Probability that a set of items occur together
  • Confidence : Probability that a customer purchases item X, given that the person buys item Y
  • Strong Rule : Rules that satisfy both Minimum Support and Minimum Confidence
  • Apriori Algo: Performs a Breadth First Search (first single itemsets, then 2-item sets ..,). Calculates support for single-item item-sets if(supp < minimumSupp) then ignores the item-set.
  • Lift : Lift > 1 indicates a rule that is useful in finding consequent items sets. (i.e., more useful than just selecting transactions randomly)
rules<-apriori(itemsets_txn,parameter=list(supp=0.005,conf=0.7,maxlen=2)) 
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.7    0.1    1 none FALSE            TRUE       5   0.005      1
##  maxlen target   ext
##       2  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 102 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7344 item(s), 20594 transaction(s)] done [0.04s].
## sorting and recoding items ... [1056 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 done [0.02s].
## writing ... [79 rule(s)] done [0.00s].
## creating S4 object  ... done [0.01s].
# head(inspect(rules))

rules_df<- as(rules,"data.frame")

head(rules_df)
##                                                                          rules
## 1                   {FELTCRAFT GIRL NICOLE KIT} => {FELTCRAFT GIRL AMELIE KIT}
## 2                     {COFFEE MUG PEARS  DESIGN} => {COFFEE MUG APPLES DESIGN}
## 3                                                 {GARAGE DESIGN} => {KEY FOB}
## 4               {CHILDRENS CUTLERY DOLLY GIRL} => {CHILDRENS CUTLERY SPACEBOY}
## 5 {SET/10 BLUE POLKADOT PARTY CANDLES} => {SET/10 PINK POLKADOT PARTY CANDLES}
## 6                                {GREEN POLKADOT BOWL} => {RED RETROSPOT BOWL}
##       support confidence     lift
## 1 0.005389919  0.7551020 73.69939
## 2 0.005438477  0.7516779 88.45745
## 3 0.005632709  1.0000000 48.68558
## 4 0.005778382  0.7300613 72.28309
## 5 0.006263960  0.7914110 65.71903
## 6 0.005001457  0.7054795 44.70352
rules_df$InverseConfidence<-(rules_df$support * rules_df$lift)/rules_df$confidence

rules_final<-subset(rules_df,rules_df$confidence > rules_df$InverseConfidence)


write.csv(rules_final,file="onlineretail_recommender2.csv") 
final_recomm <- read_csv("onlineretail_recommender2.csv")
nrow(final_recomm)
## [1] 50

6.Data Exploration on resultant item sets

  • Displaying records with Max Confidence. That is, if X is bought then Y will be purchased.
## # A tibble: 7 <U+00D7> 6
##      X1                            rules     support confidence      lift
##   <int>                            <chr>       <dbl>      <dbl>     <dbl>
## 1     3     {GARAGE DESIGN} => {KEY FOB} 0.005632709          1  48.68558
## 2     7    {ELEPHANT} => {BIRTHDAY CARD} 0.006069729          1  70.52740
## 3    12       {FRONT  DOOR} => {KEY FOB} 0.007769253          1  48.68558
## 4    58         {BACK DOOR} => {KEY FOB} 0.012042342          1  48.68558
## 5    70 {AIRLINE LOUNGE} => {METAL SIGN} 0.008643294          1 113.77901
## 6    73              {SHED} => {KEY FOB} 0.014227445          1  48.68558
## 7    75  {RETRO SPOT} => {BIRTHDAY CARD} 0.008934641          1  70.52740
## # ... with 1 more variables: InverseConfidence <dbl>