rm(list=ls())
library(readr) # as_date
library(lubridate) # read_csv2
library(arules) # Apriori Algo
setwd("/Users/Mughundhan/Projects/market basket analysis")
retail <- read_csv("Online Retail.csv")
1.Number of Unique Invoice No.s
#Data exploration
#No. of unique orders in the dataset
head(retail)
## # A tibble: 6 <U+00D7> 8
## InvoiceNo StockCode Description Quantity
## <chr> <chr> <chr> <int>
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
## 2 536365 71053 WHITE METAL LANTERN 6
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2
## # ... with 4 more variables: InvoiceDate <chr>, UnitPrice <dbl>,
## # CustomerID <int>, Country <chr>
order_count <- length(unique(retail$InvoiceNo))
order_count
## [1] 25900
#Transaction dates #Eight months data
order_max_date <- max(retail$InvoiceDate)
order_max_date
## [1] "31-10-2011 17:19"
order_min_date <- min(retail$InvoiceDate)
order_min_date
## [1] "01-02-2011 08:23"
#Unique Countries
unique(retail$Country)
## [1] "United Kingdom" "France" "Australia"
## [4] "Netherlands" "Germany" "Norway"
## [7] "EIRE" "Switzerland" "Spain"
## [10] "Poland" "Portugal" "Italy"
## [13] "Belgium" "Lithuania" "Japan"
## [16] "Iceland" "Channel Islands" "Denmark"
## [19] "Cyprus" "Sweden" "Austria"
## [22] "Israel" "Finland" "Bahrain"
## [25] "Greece" "Hong Kong" "Singapore"
## [28] "Lebanon" "United Arab Emirates" "Saudi Arabia"
## [31] "Czech Republic" "Canada" "Unspecified"
## [34] "Brazil" "USA" "European Community"
## [37] "Malta" "RSA"
country_trans <- aggregate(retail$InvoiceNo,list(retail$Country),length) #almost like group by and sum
country_trans
## Group.1 x
## 1 Australia 1259
## 2 Austria 401
## 3 Bahrain 19
## 4 Belgium 2069
## 5 Brazil 32
## 6 Canada 151
## 7 Channel Islands 758
## 8 Cyprus 622
## 9 Czech Republic 30
## 10 Denmark 389
## 11 EIRE 8196
## 12 European Community 61
## 13 Finland 695
## 14 France 8557
## 15 Germany 9495
## 16 Greece 146
## 17 Hong Kong 288
## 18 Iceland 182
## 19 Israel 297
## 20 Italy 803
## 21 Japan 358
## 22 Lebanon 45
## 23 Lithuania 35
## 24 Malta 127
## 25 Netherlands 2371
## 26 Norway 1086
## 27 Poland 341
## 28 Portugal 1519
## 29 RSA 58
## 30 Saudi Arabia 10
## 31 Singapore 229
## 32 Spain 2533
## 33 Sweden 462
## 34 Switzerland 2002
## 35 USA 291
## 36 United Arab Emirates 68
## 37 United Kingdom 495478
## 38 Unspecified 446
head(country_trans)
## Group.1 x
## 1 Australia 1259
## 2 Austria 401
## 3 Bahrain 19
## 4 Belgium 2069
## 5 Brazil 32
## 6 Canada 151
country_trans[which.max(country_trans$x),]
## Group.1 x
## 37 United Kingdom 495478
#since UK has maximum number of transactions - MBA is done for United Kingdom
#4070 unique Products in total
length(unique(retail$StockCode))
## [1] 4070
oDesc_stock<-aggregate(retail$StockCode,list(retail$Description),length) #almost like group by and sum
nrow(oDesc_stock)
## [1] 4211
head(oDesc_stock)
## Group.1 x
## 1 *Boombox Ipod Classic 1
## 2 *USB Office Mirror Ball 2
## 3 10 COLOUR SPACEBOY PEN 327
## 4 12 COLOURED PARTY BALLOONS 170
## 5 12 DAISY PEGS IN WOOD BOX 84
## 6 12 EGG HOUSE PAINTED WOOD 100
# Since we are considering only UK
retail_uk <- subset(retail, retail$Country=="United Kingdom")
# Remove the columns that are not required
colnames(retail_uk)
## [1] "InvoiceNo" "StockCode" "Description" "Quantity" "InvoiceDate"
## [6] "UnitPrice" "CustomerID" "Country"
drops <- "Country"
retail_uk <- retail_uk[ ,!(names(retail_uk) %in% drops)]
colnames(retail_uk)
## [1] "InvoiceNo" "StockCode" "Description" "Quantity" "InvoiceDate"
## [6] "UnitPrice" "CustomerID"
total_entries <- length(unique(retail_uk$InvoiceNo))
# Remove the canceled orders
# NOTE: Canceled orders begin with 'C' or has some textual characters
retail_valid <- retail_uk[-grep("[A-Z]",retail_uk$InvoiceNo),]
retail_valid <- retail_uk[-grep("[A-Z]",retail_uk$StockCode),]
Desc_stock <- aggregate(retail_valid$StockCode,list(retail_valid$Description),length)
head(Desc_stock)
## Group.1 x
## 1 *Boombox Ipod Classic 1
## 2 *USB Office Mirror Ball 2
## 3 10 COLOUR SPACEBOY PEN 282
## 4 12 COLOURED PARTY BALLOONS 156
## 5 12 DAISY PEGS IN WOOD BOX 80
## 6 12 EGG HOUSE PAINTED WOOD 89
# Remove rows with invalid product description
# NOTE: Invalid descriptions has lower case characters - removing those rows
retail_valid_1 <- retail_valid[-grep("[a-z]",retail_valid$Description),]
retail_valid_2 <- retail_valid_1[-grep("\\?",retail_valid_1$Description),]
valid_entries <- length(unique(retail_valid_2$InvoiceNo))
invalid_entries <- total_entries - valid_entries
c(invalid_entries, valid_entries)
## [1] 1854 21640
##Converting data to transactions
transaction_detail <- aggregate(retail_valid_2$Description ~ retail_valid_2$InvoiceNo,
FUN=paste,collapse=',')
head(transaction_detail)
## retail_valid_2$InvoiceNo
## 1 536365
## 2 536366
## 3 536367
## 4 536368
## 5 536369
## 6 536371
## retail_valid_2$Description
## 1 WHITE METAL LANTERN,SET 7 BABUSHKA NESTING BOXES,GLASS STAR FROSTED T-LIGHT HOLDER
## 2 HAND WARMER UNION JACK,HAND WARMER RED POLKA DOT
## 3 ASSORTED COLOUR BIRD ORNAMENT,POPPY'S PLAYHOUSE BEDROOM,POPPY'S PLAYHOUSE KITCHEN,FELTCRAFT PRINCESS CHARLOTTE DOLL,IVORY KNITTED MUG COSY,BOX OF 6 ASSORTED COLOUR TEASPOONS,BOX OF VINTAGE JIGSAW BLOCKS,BOX OF VINTAGE ALPHABET BLOCKS,HOME BUILDING BLOCK WORD,LOVE BUILDING BLOCK WORD,RECIPE BOX WITH METAL HEART,DOORMAT NEW ENGLAND
## 4 JAM MAKING SET WITH JARS,RED COAT RACK PARIS FASHION,YELLOW COAT RACK PARIS FASHION,BLUE COAT RACK PARIS FASHION
## 5 BATH BUILDING BLOCK WORD
## 6 PAPER CHAIN KIT 50'S CHRISTMAS
# install.packages("arules") - only transaction,
itemsets<-transaction_detail[,-1]
head(itemsets)
## [1] "WHITE METAL LANTERN,SET 7 BABUSHKA NESTING BOXES,GLASS STAR FROSTED T-LIGHT HOLDER"
## [2] "HAND WARMER UNION JACK,HAND WARMER RED POLKA DOT"
## [3] "ASSORTED COLOUR BIRD ORNAMENT,POPPY'S PLAYHOUSE BEDROOM,POPPY'S PLAYHOUSE KITCHEN,FELTCRAFT PRINCESS CHARLOTTE DOLL,IVORY KNITTED MUG COSY,BOX OF 6 ASSORTED COLOUR TEASPOONS,BOX OF VINTAGE JIGSAW BLOCKS,BOX OF VINTAGE ALPHABET BLOCKS,HOME BUILDING BLOCK WORD,LOVE BUILDING BLOCK WORD,RECIPE BOX WITH METAL HEART,DOORMAT NEW ENGLAND"
## [4] "JAM MAKING SET WITH JARS,RED COAT RACK PARIS FASHION,YELLOW COAT RACK PARIS FASHION,BLUE COAT RACK PARIS FASHION"
## [5] "BATH BUILDING BLOCK WORD"
## [6] "PAPER CHAIN KIT 50'S CHRISTMAS"
# converting data to transaction object
write(itemsets,"itemsets2.csv")
itemsets_txn<-read.transactions("itemsets2.csv",format="basket",rm.duplicates=TRUE,sep=",")
## distribution of transactions with duplicates:
## items
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1038 476 281 178 110 90 50 47 30 30 8 14 13 5 8
## 16 17 18 19 20 21 22 23 24 37
## 8 9 3 1 1 2 2 2 2 3
rules<-apriori(itemsets_txn,parameter=list(supp=0.005,conf=0.7,maxlen=2))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.7 0.1 1 none FALSE TRUE 5 0.005 1
## maxlen target ext
## 2 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 102
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7344 item(s), 20594 transaction(s)] done [0.04s].
## sorting and recoding items ... [1056 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 done [0.02s].
## writing ... [79 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
# head(inspect(rules))
rules_df<- as(rules,"data.frame")
head(rules_df)
## rules
## 1 {FELTCRAFT GIRL NICOLE KIT} => {FELTCRAFT GIRL AMELIE KIT}
## 2 {COFFEE MUG PEARS DESIGN} => {COFFEE MUG APPLES DESIGN}
## 3 {GARAGE DESIGN} => {KEY FOB}
## 4 {CHILDRENS CUTLERY DOLLY GIRL} => {CHILDRENS CUTLERY SPACEBOY}
## 5 {SET/10 BLUE POLKADOT PARTY CANDLES} => {SET/10 PINK POLKADOT PARTY CANDLES}
## 6 {GREEN POLKADOT BOWL} => {RED RETROSPOT BOWL}
## support confidence lift
## 1 0.005389919 0.7551020 73.69939
## 2 0.005438477 0.7516779 88.45745
## 3 0.005632709 1.0000000 48.68558
## 4 0.005778382 0.7300613 72.28309
## 5 0.006263960 0.7914110 65.71903
## 6 0.005001457 0.7054795 44.70352
rules_df$InverseConfidence<-(rules_df$support * rules_df$lift)/rules_df$confidence
rules_final<-subset(rules_df,rules_df$confidence > rules_df$InverseConfidence)
write.csv(rules_final,file="onlineretail_recommender2.csv")
final_recomm <- read_csv("onlineretail_recommender2.csv")
nrow(final_recomm)
## [1] 50
## # A tibble: 7 <U+00D7> 6
## X1 rules support confidence lift
## <int> <chr> <dbl> <dbl> <dbl>
## 1 3 {GARAGE DESIGN} => {KEY FOB} 0.005632709 1 48.68558
## 2 7 {ELEPHANT} => {BIRTHDAY CARD} 0.006069729 1 70.52740
## 3 12 {FRONT DOOR} => {KEY FOB} 0.007769253 1 48.68558
## 4 58 {BACK DOOR} => {KEY FOB} 0.012042342 1 48.68558
## 5 70 {AIRLINE LOUNGE} => {METAL SIGN} 0.008643294 1 113.77901
## 6 73 {SHED} => {KEY FOB} 0.014227445 1 48.68558
## 7 75 {RETRO SPOT} => {BIRTHDAY CARD} 0.008934641 1 70.52740
## # ... with 1 more variables: InverseConfidence <dbl>