# for data management and exploratory data analysis or descriptive analysis
library("tidyverse")
library("lubridate")
library("ggplot2")
library("RColorBrewer")
# for Association Rules
library("arules")
library("arulesViz")
# for LDA (experimental method for market basket analysis)
library("topicmodels")
library("tidytext")
# Load the cafe line-item data from its RDS file, print it, and then show a
# per-column summary.
dream_cafe <- read_rds("dream_cafe.rds")
dream_cafe
summary(object = dream_cafe)
## trans_id date time item
## Min. :1.101e+11 Min. :2018-05-02 Length:85242 Length:85242
## 1st Qu.:1.329e+11 1st Qu.:2018-05-24 Class1:hms Class :character
## Median :1.529e+11 Median :2018-06-15 Class2:difftime Mode :character
## Mean :1.512e+11 Mean :2018-06-14 Mode :numeric
## 3rd Qu.:1.626e+11 3rd Qu.:2018-07-07
## Max. :1.511e+12 Max. :2018-07-30
## size qty store period
## Length:85242 Min. :1.000 Length:85242 morning :20299
## Class :character 1st Qu.:1.000 Class :character afternoon:36857
## Mode :character Median :1.000 Mode :character evening :28086
## Mean :1.062
## 3rd Qu.:1.000
## Max. :6.000
## period2
## early morning : 9789
## midday :10510
## early afternoon:12166
## late afternoon :24691
## early evening :13476
## late evening :14610
# Number of distinct transaction ids recorded for each store.
dream_cafe %>%
  group_by(store) %>%
  summarise(`no. of transactions` = n_distinct(trans_id))
Average number of transactions per hour
# Two-stage aggregation:
#   1. count the distinct transactions for every (date, hour) pair;
#   2. average those counts across dates for each hour, rounded to 2 decimals.
avg_trans <- dream_cafe %>%
  mutate(hour = hour(time)) %>%
  group_by(date, hour) %>%
  summarise(trans_number = n_distinct(trans_id)) %>%
  group_by(hour) %>%
  summarise(
    `avg no. of transactions` = round(mean(trans_number, na.rm = TRUE), 2)
  )
avg_trans
# Bar chart of the average number of transactions for each hour of the day,
# with one axis break per hour from 7 to 22.
avg_trans %>%
  ggplot(mapping = aes(x = hour, y = `avg no. of transactions`)) +
  geom_col() +
  scale_x_continuous(breaks = seq(7, 22)) +
  theme_minimal()
Based on the results above, most transactions happen in the afternoon, especially the late afternoon, most likely because that is when students, employees, and others are finishing their classes or work. Many of us stop by a favorite shop to buy a favorite drink before going home, and this cafe is no different. Also, Store 1 in BGC is the most popular branch, as it has the highest number of transactions.
# Read all baskets from "data-complete.txt" in arules "single" format (one
# transaction/item pair per line, with a header row), then print the object
# and its summary.
# cols = c(1, 2) means that column 1 is the transaction label and column 2 is
# the item label. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
complete_trans <- read.transactions(
  file = "data-complete.txt",
  format = "single",
  header = TRUE,
  sep = " ",
  cols = c(1, 2)
)
complete_trans
## transactions in sparse format with
## 17147 transactions (rows) and
## 44 items (columns)
summary(complete_trans)
## transactions as itemMatrix in sparse format with
## 17147 rows (elements/itemsets/transactions) and
## 44 columns (items) and a density of 0.1083373
##
## most frequent items:
## VCFRP ICMAC WCFRP DCCAKE ICL (Other)
## 4643 4035 3831 3501 3497 62230
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 67 4064 1659 3065 2306 2426 1423 956 475 271 131 91 73 52 40 21
## 17 18 19 20
## 13 9 4 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 4.000 4.767 6.000 20.000
##
## includes extended item information - examples:
## labels
## 1 BANPIE
## 2 BBM
## 3 BC
##
## includes extended transaction information - examples:
## transactionID
## 1 110120180606
## 2 110120180613
## 3 110120180620
# Relative frequency of the ten most frequent items across all transactions.
itemFrequencyPlot(
  complete_trans,
  topN = 10,
  type = "relative",
  col = brewer.pal(n = 8, name = "Pastel2"),
  main = "Relative Item Frequency Plot"
)
# Mine association rules containing 2 to 5 items, with minimum support 0.01
# and minimum confidence 0.4.
complete_apriori <- apriori(
  data = complete_trans,
  parameter = list(
    support = 0.01,
    confidence = 0.4,
    minlen = 2,
    maxlen = 5
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 5 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 171
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[44 item(s), 17147 transaction(s)] done [0.00s].
## sorting and recoding items ... [44 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [13 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Show the first six rules after sorting all rules by lift.
complete_apriori %>%
  sort(by = "lift") %>%
  head() %>%
  inspect()
## lhs rhs support confidence coverage lift count
## [1] {BANPIE, NYCAKE} => {VCFRP} 0.01259696 0.4716157 0.02671021 1.741718 216
## [2] {CPES, DCCAKE} => {VCFRP} 0.01108066 0.4578313 0.02420248 1.690811 190
## [3] {BANPIE, DCCAKE} => {VCFRP} 0.01516300 0.4561404 0.03324197 1.684566 260
## [4] {CCCAKE, DCCAKE} => {VCFRP} 0.01545460 0.4553265 0.03394180 1.681560 265
## [5] {BANPIE, CCCAKE} => {VCFRP} 0.01382166 0.4429907 0.03120079 1.636003 237
## [6] {CCCAKE, NYCAKE} => {VCFRP} 0.01166385 0.4255319 0.02741004 1.571526 200
# Draw the mined rules as a graph using the htmlwidget engine.
plot(
  x = complete_apriori,
  method = "graph",
  engine = "htmlwidget"
)
In this cafe, Vanilla Cafe Frappe is the best-selling product, followed by Iced Caramel Macchiato. It is interesting to see that customers mostly order 2-7 items per transaction, with 2 items being the most common transaction size. This implies that customers are likely to buy side dishes to pair with their drinks rather than ordering just one item, which can be seen from the low frequency of 1-item transactions compared to those with 2-7 items. Or maybe they are ordering not only for themselves but also for their friends, family, or colleagues, since as many as 20 items were ordered in a single transaction.
# Mine rules over all transactions whose right-hand side is restricted to
# VCFRP. Same support/confidence/minlen thresholds as before, but no maxlen is
# given here, so the package default applies.
vcfrp_apriori <- apriori(
  data = complete_trans,
  parameter = list(
    support = 0.01,
    confidence = 0.4,
    minlen = 2
  ),
  appearance = list(rhs = "VCFRP")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 171
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[44 item(s), 17147 transaction(s)] done [0.00s].
## sorting and recoding items ... [44 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [13 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Show the first six VCFRP-targeted rules after sorting by lift. This inspects
# vcfrp_apriori (the rule set mined just above); the earlier version inspected
# complete_apriori again, so the VCFRP-restricted rules were never displayed.
inspect(head(sort(vcfrp_apriori, by = "lift")))
## lhs rhs support confidence coverage lift count
## [1] {BANPIE, NYCAKE} => {VCFRP} 0.01259696 0.4716157 0.02671021 1.741718 216
## [2] {CPES, DCCAKE} => {VCFRP} 0.01108066 0.4578313 0.02420248 1.690811 190
## [3] {BANPIE, DCCAKE} => {VCFRP} 0.01516300 0.4561404 0.03324197 1.684566 260
## [4] {CCCAKE, DCCAKE} => {VCFRP} 0.01545460 0.4553265 0.03394180 1.681560 265
## [5] {BANPIE, CCCAKE} => {VCFRP} 0.01382166 0.4429907 0.03120079 1.636003 237
## [6] {CCCAKE, NYCAKE} => {VCFRP} 0.01166385 0.4255319 0.02741004 1.571526 200
It seems that Vanilla Cafe Frappe pairs best with Banoffee Pie and New York Cheesecake: aside from having the highest lift, this rule also has the highest confidence, meaning that customers who buy these two items together are the most likely to also buy a Vanilla Cafe Frappe. However, the differences from the other combinations are small, so Vanilla Cafe Frappe tends to be bought alongside any pair of Banoffee Pie, Dayap Cheesecake, Classic Chocolate Cake, and New York Cheesecake. In short, the cakes in the cafe are mostly bought together with Vanilla Cafe Frappe, and these good dessert combinations may help explain why the afternoon period has the highest number of transactions.
# Read the morning baskets from "data-morning.txt" in arules "single" format
# (column 1 = transaction label, column 2 = item label, with a header row),
# then print the object and its summary. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
morning_trans <- read.transactions(
  file = "data-morning.txt",
  format = "single",
  header = TRUE,
  sep = " ",
  cols = c(1, 2)
)
morning_trans
## transactions in sparse format with
## 4272 transactions (rows) and
## 44 items (columns)
summary(morning_trans)
## transactions as itemMatrix in sparse format with
## 4272 rows (elements/itemsets/transactions) and
## 44 columns (items) and a density of 0.1034751
##
## most frequent items:
## CA CMAC CL LONPAN ICL (Other)
## 1070 1067 1030 871 856 14556
##
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 1195 412 765 593 550 279 189 114 73 39 21 21 7 6 4 1
## 18 19
## 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 4.000 4.553 6.000 19.000
##
## includes extended item information - examples:
## labels
## 1 BANPIE
## 2 BBM
## 3 BC
##
## includes extended transaction information - examples:
## transactionID
## 1 111120180502
## 2 111120180503
## 3 111120180504
# Relative frequency of the ten most frequent items in morning transactions.
itemFrequencyPlot(
  morning_trans,
  topN = 10,
  type = "relative",
  col = brewer.pal(n = 8, name = "Pastel2"),
  main = "Relative Item Frequency Plot"
)
It seems that coffees dominated sales in the morning period, with Caffe Americano having the highest frequency, followed by Caramel Macchiato and Caffe Latte. These items are to be expected in the morning, when people want a coffee before going to work. This is reflected in the earlier graph, where 7:00 - 8:00 am has the highest average number of transactions, and the average drops dramatically, to about half, by 9 am. As in the other periods, the most common order contains 2 items, maybe because the cafe is a good place to buy breakfast and have a short chat with others before going to work or school. Aside from coffee, a bread, pastry, or sandwich is a must-have too, as can be seen from frequent items like Longganisa Pandesal, Cream Cheese on Toast, and Spanish Tuna Pandesal, which come right after the espresso drinks.
# Show the first few morning transactions with their item sets and ids.
inspect(head(morning_trans))
## items transactionID
## [1] {CBPAN, CCFRP, CCT, CEN, CL, ICA} 111120180502
## [2] {CCT, CL, CMAC, CMFRP, LONPAN} 111120180503
## [3] {BC, CA, CAP, CCT, CL, CMAC, ICL, LONPAN, STPAN} 111120180504
## [4] {BC, BCCM, CEN, CL, CLAS} 111120180506
## [5] {CA, CCFRP, CCT, CMAC, ICA, LONPAN, VCFRP} 111120180507
## [6] {CL, CPES, DCCAKE, ICL, ICMOC} 111120180508
# Mine morning association rules containing 2 to 5 items, with minimum
# support 0.01 and minimum confidence 0.4.
morning_apriori <- apriori(
  data = morning_trans,
  parameter = list(
    support = 0.01,
    confidence = 0.4,
    minlen = 2,
    maxlen = 5
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 5 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 42
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[44 item(s), 4272 transaction(s)] done [0.00s].
## sorting and recoding items ... [44 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [33 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Show the first six morning rules after sorting by confidence, then lift.
# The measure name is written out in full ("confidence") rather than relying
# on the abbreviation "conf" being partially matched, which also keeps this
# call consistent with the afternoon and evening sections.
inspect(head(sort(morning_apriori, by = c("confidence", "lift"))))
## lhs rhs support confidence coverage lift count
## [1] {BCESW, LONPAN} => {CL} 0.01170412 0.5000000 0.02340824 2.073786 50
## [2] {HCC, ICA} => {CMAC} 0.01029963 0.4943820 0.02083333 1.979381 44
## [3] {CCT, LONPAN} => {CA} 0.02013109 0.4648649 0.04330524 1.855984 86
## [4] {MACC} => {CA} 0.02808989 0.4545455 0.06179775 1.814783 120
## [5] {CAP, CCT} => {CA} 0.01521536 0.4482759 0.03394195 1.789752 65
## [6] {BCESW, CA} => {CL} 0.01170412 0.4464286 0.02621723 1.851595 50
It is interesting to see that the rules are mostly combinations of a bread/pastry/sandwich and a coffee/espresso drink. With the highest confidence and lift, the Bacon, Cheddar and Egg Sandwich and the Longganisa Pandesal are most often bought alongside a Caffe Latte. This makes sense: if we think about what people usually eat for breakfast, the first thing that comes to mind, aside from fried rice, is the popular pandesal, which goes especially well with a hot coffee. This is why Longganisa Pandesal is popular together with the Bacon Sandwich, as both longganisa and bacon are popular fillings for bread.
# Read the afternoon baskets from "data-afternoon.txt" in arules "single"
# format (column 1 = transaction label, column 2 = item label, with a header
# row), then print the object and its summary. TRUE is spelled out because T
# is an ordinary variable that can be reassigned.
afternoon_trans <- read.transactions(
  file = "data-afternoon.txt",
  format = "single",
  header = TRUE,
  sep = " ",
  cols = c(1, 2)
)
afternoon_trans
## transactions in sparse format with
## 7593 transactions (rows) and
## 44 items (columns)
summary(afternoon_trans)
## transactions as itemMatrix in sparse format with
## 7593 rows (elements/itemsets/transactions) and
## 44 columns (items) and a density of 0.1056446
##
## most frequent items:
## VCFRP ICMAC WCFRP DCCAKE ICL (Other)
## 2339 2144 1992 1774 1575 25471
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 90 1990 648 1261 1004 1113 630 390 195 92 54 40 26 22 17 8
## 17 18 19
## 6 4 3
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 4.000 4.648 6.000 19.000
##
## includes extended item information - examples:
## labels
## 1 BANPIE
## 2 BBM
## 3 BC
##
## includes extended transaction information - examples:
## transactionID
## 1 110120180606
## 2 110120180613
## 3 110120180620
# Relative frequency of the ten most frequent items in afternoon transactions.
itemFrequencyPlot(
  afternoon_trans,
  topN = 10,
  type = "relative",
  col = brewer.pal(n = 8, name = "Pastel2"),
  main = "Relative Item Frequency Plot"
)
The results are similar to those for all transactions above, mainly because this is the period in which most transactions happen. Iced/blended drinks and cakes are the most frequently sold items, which is not surprising in the afternoon, when people want a dessert and these are good choices. Vanilla Caffe Frappe is the best seller, followed by Iced Caramel Macchiato and White Chocolate Mocha Frappe. Cakes such as New York Cheesecake and Classic Chocolate Cake are also in the top 10, after the frappes and iced drinks.
# Show the first few afternoon transactions with their item sets and ids.
inspect(head(afternoon_trans))
## items transactionID
## [1] {CCFRP, CFRP} 110120180606
## [2] {CFRP} 110120180613
## [3] {CCFRP, EFRP} 110120180620
## [4] {CBPAN, CCFRP} 110120180627
## [5] {CFRP, EFRP} 110220180606
## [6] {CFRP, EFRP} 110220180613
# Mine afternoon association rules containing 2 to 5 items, with minimum
# support 0.01 and minimum confidence 0.4.
afternoon_apriori <- apriori(
  data = afternoon_trans,
  parameter = list(
    support = 0.01,
    confidence = 0.4,
    minlen = 2,
    maxlen = 5
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 5 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 75
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[44 item(s), 7593 transaction(s)] done [0.00s].
## sorting and recoding items ... [43 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [52 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Show the first six afternoon rules after sorting by confidence, then lift.
afternoon_apriori %>%
  sort(by = c("confidence", "lift")) %>%
  head() %>%
  inspect()
## lhs rhs support confidence coverage lift count
## [1] {CSROLL, DCCAKE} => {VCFRP} 0.01290662 0.5632184 0.02291584 1.828353 98
## [2] {BANPIE, NYCAKE} => {VCFRP} 0.01475043 0.5137615 0.02871065 1.667803 112
## [3] {BANPIE, DCCAKE} => {VCFRP} 0.01817463 0.5000000 0.03634927 1.623130 138
## [4] {CCCAKE, DCCAKE} => {VCFRP} 0.01738443 0.4748201 0.03661267 1.541389 132
## [5] {ICMOC, WCFRP} => {VCFRP} 0.01093112 0.4585635 0.02383775 1.488616 83
## [6] {CPES, DCCAKE} => {VCFRP} 0.01158962 0.4583333 0.02528645 1.487869 88
# Show the first six afternoon rules after sorting by coverage.
afternoon_apriori %>%
  sort(by = "coverage") %>%
  head() %>%
  inspect()
## lhs rhs support confidence coverage lift count
## [1] {CSROLL} => {VCFRP} 0.03845647 0.4319527 0.08902937 1.402230 292
## [2] {DCCAKE, ICMAC} => {VCFRP} 0.03068616 0.4102113 0.07480574 1.331652 233
## [3] {DCCAKE, WCFRP} => {VCFRP} 0.02950086 0.4242424 0.06953773 1.377201 224
## [4] {BANPIE, ICMAC} => {VCFRP} 0.02423285 0.4088889 0.05926511 1.327359 184
## [5] {DCCAKE, DCMFRP} => {VCFRP} 0.02199394 0.4304124 0.05109970 1.397230 167
## [6] {ICMAC, SCFRP} => {VCFRP} 0.02041354 0.4046997 0.05044120 1.313760 155
Based on the results above, a Frappe is present in every rule shown, whether it is Vanilla or White Chocolate Mocha. The rule with the highest confidence and lift combines the Classic Sausage Roll and Dayap Cheesecake with Vanilla Caffe Frappe, which means that more than half (about 56%) of the customers who bought a Classic Sausage Roll and a Dayap Cheesecake also bought a Vanilla Caffe Frappe. However, even with this high score in the association rules, the Classic Sausage Roll is not that popular on its own, as it did not appear in the top 20. Meanwhile, Dayap Cheesecake and Vanilla Caffe Frappe seem to be the most popular combination, as they appear most often alongside other cakes in rules with high confidence and lift values.
# Read the evening baskets from "data-evening.txt" in arules "single" format
# (column 1 = transaction label, column 2 = item label, with a header row),
# then print the object and its summary. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
evening_trans <- read.transactions(
  file = "data-evening.txt",
  format = "single",
  header = TRUE,
  sep = " ",
  cols = c(1, 2)
)
evening_trans
## transactions in sparse format with
## 5353 transactions (rows) and
## 44 items (columns)
summary(evening_trans)
## transactions as itemMatrix in sparse format with
## 5353 rows (elements/itemsets/transactions) and
## 44 columns (items) and a density of 0.1146086
##
## most frequent items:
## VCFRP DCCAKE ICMAC CCCAKE WCFRP (Other)
## 1563 1326 1281 1174 1132 20518
##
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 931 612 1042 715 753 511 372 163 104 38 29 24 23 17 9 6
## 18 20
## 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 5.000 5.043 6.000 20.000
##
## includes extended item information - examples:
## labels
## 1 BANPIE
## 2 BBM
## 3 BC
##
## includes extended transaction information - examples:
## transactionID
## 1 161120180502
## 2 161120180503
## 3 161120180504
# Relative frequency of the ten most frequent items in evening transactions.
itemFrequencyPlot(
  evening_trans,
  topN = 10,
  type = "relative",
  col = brewer.pal(n = 8, name = "Pastel2"),
  main = "Relative Item Frequency Plot"
)
The results for the evening period show how popular the Vanilla Caffe Frappe and Dayap Cheesecake are, as they also appear in the top 4 for all transactions and for the afternoon period. VCFRP is a notable exception in that it also appears in the morning top 10 list, which would otherwise be dominated by coffees and sandwiches. This also shows that item frequency changes with the time of day: in the morning we mostly see sandwiches and coffees, but in the afternoon and evening they are gone from the top of the list. Another thing to notice is that the most common number of items per transaction is now 4, compared to 2 in the morning and afternoon. This may be because the evening is when people sit down for dinner with family or friends and have more time to stay, unlike earlier periods, when a visit might only fit into a break or a vacant period.
# Show the first few evening transactions with their item sets and ids.
inspect(head(evening_trans))
## items transactionID
## [1] {CAP,
## CBFB,
## CCFRP,
## CMFRP,
## CPPIE,
## CSALSW,
## ICL} 161120180502
## [2] {BCESW,
## CCFRP,
## CEN,
## CMAC,
## DCCAKE,
## EFRP,
## HC} 161120180503
## [3] {ICL,
## NYCAKE,
## VCFRP} 161120180504
## [4] {CCCAKE,
## CFRP,
## CMAC,
## HGD} 161120180505
## [5] {CCCAKE,
## ICL,
## MCFRP} 161120180506
## [6] {BC,
## CCFRP,
## CL,
## DCCAKE,
## EFRP,
## ICL,
## ICMAC,
## MCFRP,
## NYCAKE,
## VCFRP} 161120180507
# Mine evening association rules containing 2 to 5 items, with minimum
# support 0.01 and minimum confidence 0.4.
evening_apriori <- apriori(
  data = evening_trans,
  parameter = list(
    support = 0.01,
    confidence = 0.4,
    minlen = 2,
    maxlen = 5
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 5 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 53
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[44 item(s), 5353 transaction(s)] done [0.00s].
## sorting and recoding items ... [43 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [14 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Show the first six evening rules after sorting by confidence, then lift.
evening_apriori %>%
  sort(by = c("confidence", "lift")) %>%
  head() %>%
  inspect()
## lhs rhs support confidence coverage lift count
## [1] {CBFB, DCCAKE} => {VCFRP} 0.01083505 0.4957265 0.02185690 1.697776 58
## [2] {CYAKSW, WCFRP} => {VCFRP} 0.01158229 0.4428571 0.02615356 1.516708 62
## [3] {BANPIE, NYCAKE} => {VCFRP} 0.01737344 0.4407583 0.03941715 1.509520 93
## [4] {CPES, DCCAKE} => {VCFRP} 0.01419765 0.4393064 0.03231833 1.504547 76
## [5] {BANPIE, CCCAKE} => {VCFRP} 0.02073604 0.4269231 0.04857089 1.462136 111
## [6] {CCCAKE, DCCAKE} => {VCFRP} 0.01942836 0.4227642 0.04595554 1.447893 104
# Mine evening rules whose right-hand side is restricted to VCFRP. No maxlen
# is given here, so the package default applies.
vcfrp_evening_apriori <- apriori(
  data = evening_trans,
  parameter = list(
    support = 0.01,
    confidence = 0.4,
    minlen = 2
  ),
  appearance = list(rhs = "VCFRP")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 53
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[44 item(s), 5353 transaction(s)] done [0.00s].
## sorting and recoding items ... [43 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [11 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Show the first six VCFRP-targeted evening rules after sorting by
# confidence, then lift.
vcfrp_evening_apriori %>%
  sort(by = c("confidence", "lift")) %>%
  head() %>%
  inspect()
## lhs rhs support confidence coverage lift count
## [1] {CBFB, DCCAKE} => {VCFRP} 0.01083505 0.4957265 0.02185690 1.697776 58
## [2] {CYAKSW, WCFRP} => {VCFRP} 0.01158229 0.4428571 0.02615356 1.516708 62
## [3] {BANPIE, NYCAKE} => {VCFRP} 0.01737344 0.4407583 0.03941715 1.509520 93
## [4] {CPES, DCCAKE} => {VCFRP} 0.01419765 0.4393064 0.03231833 1.504547 76
## [5] {BANPIE, CCCAKE} => {VCFRP} 0.02073604 0.4269231 0.04857089 1.462136 111
## [6] {CCCAKE, DCCAKE} => {VCFRP} 0.01942836 0.4227642 0.04595554 1.447893 104
As in the afternoon period, the rules are very similar: combinations of frappe drinks and cakes are always present in the evening. The most popular combination is still Vanilla Caffe Frappe with Dayap Cheesecake, which is expected because they appear in many of the rules and are the top 2 items by frequency, with high confidence and lift values. Lastly, it can be concluded that when customers buy a cake, whether it is Dayap Cheesecake or New York Cheesecake, they tend to buy it together with a Vanilla Caffe Frappe, which makes this the most popular combination in almost all of the time periods.
# Preview the three columns used for topic modelling: transaction id, item,
# and quantity.
select(dream_cafe, trans_id, item, qty)
set.seed(10102020)
# Randomly assign 80% of the transactions (not of the individual line items)
# to the training set. Sampling rows directly would scatter the items of one
# basket across both sides of the split, and the trans_id anti join below
# would then drop every transaction that has even one row in train_cafe.
train_cafe <- dream_cafe %>%
  semi_join(
    y = dream_cafe %>%
      distinct(trans_id) %>%
      sample_frac(size = 0.8, replace = FALSE),
    by = "trans_id"
  ) %>%
  select(trans_id, item, qty)
# train_cafe now holds whole transactions, so the anti join on trans_id keeps
# exactly the remaining transactions for test_cafe
test_cafe <- dream_cafe %>%
  anti_join(y = train_cafe, by = "trans_id") %>%
  select(trans_id, item, qty)
# Build one document-term matrix per split: documents are transactions, terms
# are items, and cell values are quantities. cast_dtm() expects one row per
# document-term pair, so quantities are first summed per (trans_id, item);
# an item listed on several rows of the same transaction then contributes its
# full quantity to a single cell.
train_dtm <- train_cafe %>%
  count(trans_id, item, wt = qty, name = "qty") %>%
  cast_dtm(document = trans_id, term = item, value = qty)
test_dtm <- test_cafe %>%
  count(trans_id, item, wt = qty, name = "qty") %>%
  cast_dtm(document = trans_id, term = item, value = qty)
I chose k=12 because it has the lowest perplexity score.
# Fit a 12-topic LDA model on the training document-term matrix with Gibbs
# sampling, then print the fitted model.
set.seed(201803075)
topic_model <- LDA(
  x = train_dtm,
  k = 12,
  method = "Gibbs",
  control = list(
    burnin = 200,
    iter = 1000,
    keep = 10,
    alpha = 0.25
  )
)
topic_model
## A LDA_Gibbs topic model with 12 topics.
# Tidy the fitted model into one row per topic/term with its beta value.
td_beta <- tidy(topic_model)
# For every topic keep the three terms with the largest beta, then draw one
# horizontal bar panel per topic with the terms ordered within each panel.
td_beta %>%
  group_by(topic) %>%
  top_n(n = 3, wt = beta) %>%
  ungroup() %>%
  mutate(topic = paste0("Topic ", topic)) %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(mapping = aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  scale_x_reordered() +
  coord_flip()
Based on the plot above, these topics are really helpful: even a customer who has no idea what topic modelling is can easily understand what they imply, because each topic can be read as a popular combination of items within a transaction. They are also helpful for the cafe owner, because he/she can get an idea of which items to suggest to a customer, especially one who is having a hard time choosing, which is very relatable when there are many options or when you are with friends and all anyone can say is “kahit ano.” Moreover, the topics can suggest bundles or combo meals. In topic 1, for example, a customer buying a Banoffee Pie and/or Classic Chocolate Cake together with a Vanilla Caffe Frappe could save some money by buying them all together, and this might change the mind of someone who originally planned to buy just a cheesecake or just a frappe but, on seeing the combo/bundle, ends up buying that instead. Lastly, it is much easier for a customer to buy when there is already a recommended set of items. The Vanilla Caffe Frappe, for instance, is present in almost every topic and can be advertised as the best drink to buy alongside the side dishes or desserts.