Packages

# for data management and exploratory data analysis or descriptive analysis
  library("tidyverse") 
  library("lubridate") 
  library("ggplot2")
  library("RColorBrewer")

# for Association Rules
  library("arules")
  library("arulesViz")

# for LDA (experimental method for market basket analysis)
  library("topicmodels")
  library("tidytext")

Preparing Transactions Objects for arules

# Load the cafe sales data from the serialized RDS file, print it,
# and then show a per-column summary.
dream_cafe <- read_rds(file = "dream_cafe.rds")
dream_cafe

dream_cafe %>%
  summary()

##     trans_id              date                time              item          
##  Min.   :1.101e+11   Min.   :2018-05-02   Length:85242      Length:85242      
##  1st Qu.:1.329e+11   1st Qu.:2018-05-24   Class1:hms        Class :character  
##  Median :1.529e+11   Median :2018-06-15   Class2:difftime   Mode  :character  
##  Mean   :1.512e+11   Mean   :2018-06-14   Mode  :numeric                      
##  3rd Qu.:1.626e+11   3rd Qu.:2018-07-07                                       
##  Max.   :1.511e+12   Max.   :2018-07-30                                       
##      size                qty           store                 period     
##  Length:85242       Min.   :1.000   Length:85242       morning  :20299  
##  Class :character   1st Qu.:1.000   Class :character   afternoon:36857  
##  Mode  :character   Median :1.000   Mode  :character   evening  :28086  
##                     Mean   :1.062                                       
##                     3rd Qu.:1.000                                       
##                     Max.   :6.000                                       
##             period2     
##  early morning  : 9789  
##  midday         :10510  
##  early afternoon:12166  
##  late afternoon :24691  
##  early evening  :13476  
##  late evening   :14610

# Count the distinct transaction IDs recorded for each store.
summarise(
  group_by(dream_cafe, store),
  `no. of transactions` = n_distinct(trans_id)
)

Average number of transactions per hour

# Average number of transactions per hour of the day.
# Step 1: count distinct transactions for every (date, hour) pair.
# Step 2: average those counts over dates within each hour, rounded to
#         two decimal places.
avg_trans <- dream_cafe %>%
  mutate(hour = hour(time)) %>%
  group_by(date, hour) %>%
  summarise(n_trans = n_distinct(trans_id)) %>%
  group_by(hour) %>%
  summarise(
    `avg no. of transactions` = round(mean(n_trans, na.rm = TRUE), 2)
  )

avg_trans

# Bar chart of the hourly averages; bar heights are taken directly from
# the `avg no. of transactions` column, with x-axis breaks at hours 7 to 22.
ggplot(
  data = avg_trans,
  mapping = aes(x = hour, y = `avg no. of transactions`)
) +
  geom_col() +
  scale_x_continuous(breaks = 7:22) +
  theme_minimal()

Based on the information above, most transactions happen in the afternoon, especially the late afternoon, mainly because this is when students, employees, and others are mostly done with their classes or work. It is not new to us that, before going home, we sometimes stop by our favorite shop to buy our favorite drink, and this cafe is no different. Also, Store 1 in BGC is the most popular branch, as it has the highest number of transactions.

All Transactions

# Read all transactions from a space-separated file in "single" format
# (one transaction/item pair per line) into an arules transactions object.
# `header = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
complete_trans <- read.transactions(
  file = "data-complete.txt",
  format = "single",
  header = TRUE,
  sep = " ",
  # c(1, 2): column 1 is the transaction label, column 2 is the item label
  cols = c(1, 2)
)
complete_trans

## transactions in sparse format with
##  17147 transactions (rows) and
##  44 items (columns)

# Summarise the transactions object built from data-complete.txt.
summary(object = complete_trans)

## transactions as itemMatrix in sparse format with
##  17147 rows (elements/itemsets/transactions) and
##  44 columns (items) and a density of 0.1083373 
## 
## most frequent items:
##   VCFRP   ICMAC   WCFRP  DCCAKE     ICL (Other) 
##    4643    4035    3831    3501    3497   62230 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##   67 4064 1659 3065 2306 2426 1423  956  475  271  131   91   73   52   40   21 
##   17   18   19   20 
##   13    9    4    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   4.000   4.767   6.000  20.000 
## 
## includes extended item information - examples:
##   labels
## 1 BANPIE
## 2    BBM
## 3     BC
## 
## includes extended transaction information - examples:
##   transactionID
## 1  110120180606
## 2  110120180613
## 3  110120180620

# Relative frequency of the 10 most frequent items across all transactions.
itemFrequencyPlot(
  complete_trans,
  type = "relative",
  topN = 10,
  col = brewer.pal(n = 8, name = "Pastel2"),
  main = "Relative Item Frequency Plot"
)

# Mine association rules over all transactions: minimum support 0.01,
# minimum confidence 0.4, and rule length between 2 and 5 items.
complete_apriori <- apriori(
  data = complete_trans,
  parameter = list(
    support = 0.01,
    confidence = 0.4,
    minlen = 2,
    maxlen = 5
  )
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target  ext
##       5  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 171 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[44 item(s), 17147 transaction(s)] done [0.00s].
## sorting and recoding items ... [44 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [13 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Show the first six rules after sorting the complete rule set by lift.
inspect(
  head(
    sort(complete_apriori, by = "lift"),
    n = 6
  )
)

##     lhs                 rhs     support    confidence coverage   lift     count
## [1] {BANPIE, NYCAKE} => {VCFRP} 0.01259696 0.4716157  0.02671021 1.741718 216  
## [2] {CPES, DCCAKE}   => {VCFRP} 0.01108066 0.4578313  0.02420248 1.690811 190  
## [3] {BANPIE, DCCAKE} => {VCFRP} 0.01516300 0.4561404  0.03324197 1.684566 260  
## [4] {CCCAKE, DCCAKE} => {VCFRP} 0.01545460 0.4553265  0.03394180 1.681560 265  
## [5] {BANPIE, CCCAKE} => {VCFRP} 0.01382166 0.4429907  0.03120079 1.636003 237  
## [6] {CCCAKE, NYCAKE} => {VCFRP} 0.01166385 0.4255319  0.02741004 1.571526 200

# Draw the rules as a graph using the htmlwidget engine.
plot(
  complete_apriori,
  method = "graph",
  engine = "htmlwidget"
)

In this cafe, Vanilla Cafe Frappe is the best-selling product, followed by Iced Caramel Macchiato. It is interesting to see that customers mostly order 2-7 items per transaction, with 2 items being the most common transaction size. This implies that customers are likely to buy side dishes to pair with their drinks rather than ordering just one item, which can be seen from the low frequency of 1-item transactions compared with 2-7 items. Or maybe they are ordering not only for themselves but also for their friends, family, or colleagues, since up to 20 items are ordered in a single transaction.

# Mine rules over all transactions with the same support/confidence/minlen
# thresholds, passing `appearance` with VCFRP named for the right-hand side.
# `maxlen` is not set here.
vcfrp_apriori <- apriori(
  data = complete_trans,
  parameter = list(support = 0.01, confidence = 0.4, minlen = 2),
  appearance = list(rhs = "VCFRP")
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 171 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[44 item(s), 17147 transaction(s)] done [0.00s].
## sorting and recoding items ... [44 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [13 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Show the first six VCFRP-targeted rules, sorted by lift.
# Inspect `vcfrp_apriori` (the rule set mined just above), not
# `complete_apriori`, which was already inspected earlier.
inspect(head(sort(vcfrp_apriori, by = "lift"), n = 6))

##     lhs                 rhs     support    confidence coverage   lift     count
## [1] {BANPIE, NYCAKE} => {VCFRP} 0.01259696 0.4716157  0.02671021 1.741718 216  
## [2] {CPES, DCCAKE}   => {VCFRP} 0.01108066 0.4578313  0.02420248 1.690811 190  
## [3] {BANPIE, DCCAKE} => {VCFRP} 0.01516300 0.4561404  0.03324197 1.684566 260  
## [4] {CCCAKE, DCCAKE} => {VCFRP} 0.01545460 0.4553265  0.03394180 1.681560 265  
## [5] {BANPIE, CCCAKE} => {VCFRP} 0.01382166 0.4429907  0.03120079 1.636003 237  
## [6] {CCCAKE, NYCAKE} => {VCFRP} 0.01166385 0.4255319  0.02741004 1.571526 200

It seems that Vanilla Cafe Frappe is best when paired with Banoffee Pie and New York Cheesecake because, aside from having the highest lift, this rule also has the highest confidence, meaning that this combination of food and drink is sold together more frequently than the others. However, the difference from the other combinations is small, so Vanilla Cafe Frappe tends to be bought alongside any combination of two among Banoffee Pie, Dayap Cheesecake, Classic Chocolate Cake, and New York Cheesecake. In short, the cakes in the cafe are mostly bought together with Vanilla Cafe Frappe, which may explain why the afternoon period has the most transactions: these desserts pair well together.

Morning Transactions

# Read the morning transactions from a space-separated file in "single"
# format; column 1 is the transaction label and column 2 is the item label.
# `header = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
morning_trans <- read.transactions(
  file = "data-morning.txt",
  format = "single",
  header = TRUE,
  sep = " ",
  cols = c(1, 2)
)
morning_trans

## transactions in sparse format with
##  4272 transactions (rows) and
##  44 items (columns)

# Summarise the morning transactions object.
summary(object = morning_trans)

## transactions as itemMatrix in sparse format with
##  4272 rows (elements/itemsets/transactions) and
##  44 columns (items) and a density of 0.1034751 
## 
## most frequent items:
##      CA    CMAC      CL  LONPAN     ICL (Other) 
##    1070    1067    1030     871     856   14556 
## 
## element (itemset/transaction) length distribution:
## sizes
##    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17 
## 1195  412  765  593  550  279  189  114   73   39   21   21    7    6    4    1 
##   18   19 
##    2    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   4.000   4.553   6.000  19.000 
## 
## includes extended item information - examples:
##   labels
## 1 BANPIE
## 2    BBM
## 3     BC
## 
## includes extended transaction information - examples:
##   transactionID
## 1  111120180502
## 2  111120180503
## 3  111120180504

# Relative frequency of the 10 most frequent items in morning transactions.
itemFrequencyPlot(
  morning_trans,
  type = "relative",
  topN = 10,
  col = brewer.pal(n = 8, name = "Pastel2"),
  main = "Relative Item Frequency Plot"
)

It seems that coffees dominate sales in the morning period, with Caffe Americano having the highest frequency, followed by Caramel Macchiato and Caffe Latte. These items are expected, especially in the morning, when people want to sip a coffee before going to work; this is reflected in the hourly graph, where 7:00 - 8:00 am has the highest average number of transactions, which drops dramatically to about half by 9 am. As with all transactions combined, the most common order has 2 items, maybe because it is a good place to buy breakfast and have a little talk with others before going to work/school. Aside from a coffee, a bread/pastry/sandwich is a must-have too, which can be seen from frequent items like Longganisa Pandesal, Cream Cheese on Toast, and Spanish Tuna Pandesal, next to the espresso drinks.

# Print the first six morning transactions with their item sets.
inspect(
  head(morning_trans, n = 6)
)

##     items                                            transactionID
## [1] {CBPAN, CCFRP, CCT, CEN, CL, ICA}                111120180502 
## [2] {CCT, CL, CMAC, CMFRP, LONPAN}                   111120180503 
## [3] {BC, CA, CAP, CCT, CL, CMAC, ICL, LONPAN, STPAN} 111120180504 
## [4] {BC, BCCM, CEN, CL, CLAS}                        111120180506 
## [5] {CA, CCFRP, CCT, CMAC, ICA, LONPAN, VCFRP}       111120180507 
## [6] {CL, CPES, DCCAKE, ICL, ICMOC}                   111120180508

# Mine association rules over the morning transactions: minimum support
# 0.01, minimum confidence 0.4, and rule length between 2 and 5 items.
morning_apriori <- apriori(
  data = morning_trans,
  parameter = list(
    support = 0.01,
    confidence = 0.4,
    minlen = 2,
    maxlen = 5
  )
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target  ext
##       5  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 42 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[44 item(s), 4272 transaction(s)] done [0.00s].
## sorting and recoding items ... [44 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [33 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Show the first six morning rules, sorted by confidence and then lift.
# The measure name is written in full ("confidence") rather than the
# abbreviation "conf", matching the afternoon and evening sections.
inspect(head(sort(morning_apriori, by = c("confidence", "lift")), n = 6))

##     lhs                rhs    support    confidence coverage   lift     count
## [1] {BCESW, LONPAN} => {CL}   0.01170412 0.5000000  0.02340824 2.073786  50  
## [2] {HCC, ICA}      => {CMAC} 0.01029963 0.4943820  0.02083333 1.979381  44  
## [3] {CCT, LONPAN}   => {CA}   0.02013109 0.4648649  0.04330524 1.855984  86  
## [4] {MACC}          => {CA}   0.02808989 0.4545455  0.06179775 1.814783 120  
## [5] {CAP, CCT}      => {CA}   0.01521536 0.4482759  0.03394195 1.789752  65  
## [6] {BCESW, CA}     => {CL}   0.01170412 0.4464286  0.02621723 1.851595  50

It is interesting to see that the rules are mostly a combination of a bread/pastry/sandwich and a coffee/espresso drink. Having the highest confidence and lift, Bacon, Cheddar and Egg Sandwich and Longganisa Pandesal are mostly bought together with Caffe Latte. This is mainly because, if we think about what people usually eat for breakfast, the first thing that comes to mind aside from fried rice is the popular pandesal, which is especially good with a hot coffee as its partner. This is why Longganisa Pandesal is popular together with the Bacon Sandwich, as both are popular fillings for bread.

Afternoon Transactions

# Read the afternoon transactions from a space-separated file in "single"
# format; column 1 is the transaction label and column 2 is the item label.
# `header = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
afternoon_trans <- read.transactions(
  file = "data-afternoon.txt",
  format = "single",
  header = TRUE,
  sep = " ",
  cols = c(1, 2)
)
afternoon_trans

## transactions in sparse format with
##  7593 transactions (rows) and
##  44 items (columns)

# Summarise the afternoon transactions object.
summary(object = afternoon_trans)

## transactions as itemMatrix in sparse format with
##  7593 rows (elements/itemsets/transactions) and
##  44 columns (items) and a density of 0.1056446 
## 
## most frequent items:
##   VCFRP   ICMAC   WCFRP  DCCAKE     ICL (Other) 
##    2339    2144    1992    1774    1575   25471 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##   90 1990  648 1261 1004 1113  630  390  195   92   54   40   26   22   17    8 
##   17   18   19 
##    6    4    3 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   4.648   6.000  19.000 
## 
## includes extended item information - examples:
##   labels
## 1 BANPIE
## 2    BBM
## 3     BC
## 
## includes extended transaction information - examples:
##   transactionID
## 1  110120180606
## 2  110120180613
## 3  110120180620

# Relative frequency of the 10 most frequent items in afternoon transactions.
itemFrequencyPlot(
  afternoon_trans,
  type = "relative",
  topN = 10,
  col = brewer.pal(n = 8, name = "Pastel2"),
  main = "Relative Item Frequency Plot"
)

The results are similar to those for all transactions above, mainly because this is the period when most transactions happen. Iced/blended drinks and cakes are the most frequently sold items, which is not surprising, especially in the afternoon, when people want a dessert and these are good choices. Vanilla Caffe Frappe is the best seller, followed by Iced Caramel Macchiato and White Chocolate Mocha Frappe. Meanwhile, cakes such as New York Cheesecake and Classic Chocolate Cake are in the top 10, next to the frappe/iced drinks.

# Print the first six afternoon transactions with their item sets.
inspect(
  head(afternoon_trans, n = 6)
)

##     items          transactionID
## [1] {CCFRP, CFRP}  110120180606 
## [2] {CFRP}         110120180613 
## [3] {CCFRP, EFRP}  110120180620 
## [4] {CBPAN, CCFRP} 110120180627 
## [5] {CFRP, EFRP}   110220180606 
## [6] {CFRP, EFRP}   110220180613

# Mine association rules over the afternoon transactions: minimum support
# 0.01, minimum confidence 0.4, and rule length between 2 and 5 items.
afternoon_apriori <- apriori(
  data = afternoon_trans,
  parameter = list(
    support = 0.01,
    confidence = 0.4,
    minlen = 2,
    maxlen = 5
  )
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target  ext
##       5  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 75 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[44 item(s), 7593 transaction(s)] done [0.00s].
## sorting and recoding items ... [43 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [52 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Show the first six afternoon rules, sorted by confidence and then lift.
inspect(
  head(
    sort(afternoon_apriori, by = c("confidence", "lift")),
    n = 6
  )
)

##     lhs                 rhs     support    confidence coverage   lift     count
## [1] {CSROLL, DCCAKE} => {VCFRP} 0.01290662 0.5632184  0.02291584 1.828353  98  
## [2] {BANPIE, NYCAKE} => {VCFRP} 0.01475043 0.5137615  0.02871065 1.667803 112  
## [3] {BANPIE, DCCAKE} => {VCFRP} 0.01817463 0.5000000  0.03634927 1.623130 138  
## [4] {CCCAKE, DCCAKE} => {VCFRP} 0.01738443 0.4748201  0.03661267 1.541389 132  
## [5] {ICMOC, WCFRP}   => {VCFRP} 0.01093112 0.4585635  0.02383775 1.488616  83  
## [6] {CPES, DCCAKE}   => {VCFRP} 0.01158962 0.4583333  0.02528645 1.487869  88

# Show the first six afternoon rules, sorted by coverage.
inspect(
  head(
    sort(afternoon_apriori, by = "coverage"),
    n = 6
  )
)

##     lhs                 rhs     support    confidence coverage   lift     count
## [1] {CSROLL}         => {VCFRP} 0.03845647 0.4319527  0.08902937 1.402230 292  
## [2] {DCCAKE, ICMAC}  => {VCFRP} 0.03068616 0.4102113  0.07480574 1.331652 233  
## [3] {DCCAKE, WCFRP}  => {VCFRP} 0.02950086 0.4242424  0.06953773 1.377201 224  
## [4] {BANPIE, ICMAC}  => {VCFRP} 0.02423285 0.4088889  0.05926511 1.327359 184  
## [5] {DCCAKE, DCMFRP} => {VCFRP} 0.02199394 0.4304124  0.05109970 1.397230 167  
## [6] {ICMAC, SCFRP}   => {VCFRP} 0.02041354 0.4046997  0.05044120 1.313760 155

Based on the results above, it seems that in all of the rules a Frappe is always present, whether it is Vanilla or White Chocolate Mocha. The rule with the highest confidence and lift is Classic Sausage Roll and Dayap Cheesecake with Vanilla Caffe Frappe; its confidence of about 0.56 means that more than half of the transactions containing Classic Sausage Roll and Dayap Cheesecake also contain Vanilla Caffe Frappe. However, even with a high score in the association rules, Classic Sausage Roll is not that popular, as it did not appear in the top 20 items. Meanwhile, Dayap Cheesecake and Vanilla Caffe Frappe seem to be the most popular combination, as they mostly appear together with other cakes in rules with high confidence and lift values.

Evening Transactions

# Read the evening transactions from a space-separated file in "single"
# format; column 1 is the transaction label and column 2 is the item label.
# `header = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
evening_trans <- read.transactions(
  file = "data-evening.txt",
  format = "single",
  header = TRUE,
  sep = " ",
  cols = c(1, 2)
)
evening_trans

## transactions in sparse format with
##  5353 transactions (rows) and
##  44 items (columns)

# Summarise the evening transactions object.
summary(object = evening_trans)

## transactions as itemMatrix in sparse format with
##  5353 rows (elements/itemsets/transactions) and
##  44 columns (items) and a density of 0.1146086 
## 
## most frequent items:
##   VCFRP  DCCAKE   ICMAC  CCCAKE   WCFRP (Other) 
##    1563    1326    1281    1174    1132   20518 
## 
## element (itemset/transaction) length distribution:
## sizes
##    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17 
##  931  612 1042  715  753  511  372  163  104   38   29   24   23   17    9    6 
##   18   20 
##    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   5.000   5.043   6.000  20.000 
## 
## includes extended item information - examples:
##   labels
## 1 BANPIE
## 2    BBM
## 3     BC
## 
## includes extended transaction information - examples:
##   transactionID
## 1  161120180502
## 2  161120180503
## 3  161120180504

# Relative frequency of the 10 most frequent items in evening transactions.
itemFrequencyPlot(
  evening_trans,
  type = "relative",
  topN = 10,
  col = brewer.pal(n = 8, name = "Pastel2"),
  main = "Relative Item Frequency Plot"
)

Results in the evening period show how popular Vanilla Caffe Frappe and Dayap Cheesecake are, as they also appeared in the top 4 of all transactions and of the afternoon period, with VCFRP being an exception in that it also appeared in the morning top 10 list, which is otherwise dominated by coffees and sandwiches. This also shows that item frequency changes with the time of day: in the morning we mostly see sandwiches and coffees, while in the afternoon and evening periods they are gone. Another thing to notice is that the most common number of items ordered in a transaction is now 4, compared with 2 in the morning and afternoon. This may be because it is the time when people have dinner together with their family or friends and have more time to stay, unlike in earlier periods, which may only be a break or vacant period.

# Print the first six evening transactions with their item sets.
inspect(
  head(evening_trans, n = 6)
)

##     items     transactionID
## [1] {CAP,                  
##      CBFB,                 
##      CCFRP,                
##      CMFRP,                
##      CPPIE,                
##      CSALSW,               
##      ICL}      161120180502
## [2] {BCESW,                
##      CCFRP,                
##      CEN,                  
##      CMAC,                 
##      DCCAKE,               
##      EFRP,                 
##      HC}       161120180503
## [3] {ICL,                  
##      NYCAKE,               
##      VCFRP}    161120180504
## [4] {CCCAKE,               
##      CFRP,                 
##      CMAC,                 
##      HGD}      161120180505
## [5] {CCCAKE,               
##      ICL,                  
##      MCFRP}    161120180506
## [6] {BC,                   
##      CCFRP,                
##      CL,                   
##      DCCAKE,               
##      EFRP,                 
##      ICL,                  
##      ICMAC,                
##      MCFRP,                
##      NYCAKE,               
##      VCFRP}    161120180507

# Mine association rules over the evening transactions: minimum support
# 0.01, minimum confidence 0.4, and rule length between 2 and 5 items.
evening_apriori <- apriori(
  data = evening_trans,
  parameter = list(
    support = 0.01,
    confidence = 0.4,
    minlen = 2,
    maxlen = 5
  )
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target  ext
##       5  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 53 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[44 item(s), 5353 transaction(s)] done [0.00s].
## sorting and recoding items ... [43 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [14 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Show the first six evening rules, sorted by confidence and then lift.
inspect(
  head(
    sort(evening_apriori, by = c("confidence", "lift")),
    n = 6
  )
)

##     lhs                 rhs     support    confidence coverage   lift     count
## [1] {CBFB, DCCAKE}   => {VCFRP} 0.01083505 0.4957265  0.02185690 1.697776  58  
## [2] {CYAKSW, WCFRP}  => {VCFRP} 0.01158229 0.4428571  0.02615356 1.516708  62  
## [3] {BANPIE, NYCAKE} => {VCFRP} 0.01737344 0.4407583  0.03941715 1.509520  93  
## [4] {CPES, DCCAKE}   => {VCFRP} 0.01419765 0.4393064  0.03231833 1.504547  76  
## [5] {BANPIE, CCCAKE} => {VCFRP} 0.02073604 0.4269231  0.04857089 1.462136 111  
## [6] {CCCAKE, DCCAKE} => {VCFRP} 0.01942836 0.4227642  0.04595554 1.447893 104

# Mine rules over the evening transactions with the same
# support/confidence/minlen thresholds, passing `appearance` with VCFRP
# named for the right-hand side. `maxlen` is not set here.
vcfrp_evening_apriori <- apriori(
  data = evening_trans,
  parameter = list(support = 0.01, confidence = 0.4, minlen = 2),
  appearance = list(rhs = "VCFRP")
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 53 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[44 item(s), 5353 transaction(s)] done [0.00s].
## sorting and recoding items ... [43 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [11 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Show the first six VCFRP-targeted evening rules, sorted by confidence
# and then lift.
inspect(
  head(
    sort(vcfrp_evening_apriori, by = c("confidence", "lift")),
    n = 6
  )
)

##     lhs                 rhs     support    confidence coverage   lift     count
## [1] {CBFB, DCCAKE}   => {VCFRP} 0.01083505 0.4957265  0.02185690 1.697776  58  
## [2] {CYAKSW, WCFRP}  => {VCFRP} 0.01158229 0.4428571  0.02615356 1.516708  62  
## [3] {BANPIE, NYCAKE} => {VCFRP} 0.01737344 0.4407583  0.03941715 1.509520  93  
## [4] {CPES, DCCAKE}   => {VCFRP} 0.01419765 0.4393064  0.03231833 1.504547  76  
## [5] {BANPIE, CCCAKE} => {VCFRP} 0.02073604 0.4269231  0.04857089 1.462136 111  
## [6] {CCCAKE, DCCAKE} => {VCFRP} 0.01942836 0.4227642  0.04595554 1.447893 104

As in the afternoon period, the rules are very similar: combinations of frappe drinks and cakes are always present in the evening. Also, the most popular combination is still Vanilla Caffe Frappe and Dayap Cheesecake, which is expected because they appear repeatedly in the rules and are the top 2 in item frequency, with high confidence and lift values. Lastly, it can be concluded that when customers buy a cake, whether it is Dayap Cheesecake or New York Cheesecake, they tend to buy it together with Vanilla Caffe Frappe, which makes it the most popular combination in almost all of the time periods.

Preparation for LDA

# Preview the three columns used for the LDA input: transaction ID, item,
# and quantity.
select(dream_cafe, trans_id, item, qty)

Separating into Test and Training Data Set

set.seed(10102020)
# Split at the transaction level: randomly pick 80% of the distinct
# transaction IDs for training. Sampling individual rows instead would
# scatter the items of one transaction across both sets, and the
# `trans_id`-based anti join below would then drop every transaction that
# has at least one row in the training sample.
train_ids <- dream_cafe %>%
  distinct(trans_id) %>%
  sample_frac(size = 0.8, replace = FALSE)

# semi join keeps every row whose transaction ID was selected for training
train_cafe <- dream_cafe %>%
  semi_join(y = train_ids, by = "trans_id") %>%
  select(trans_id, item, qty)

# anti join keeps every row whose transaction ID was NOT selected, so the
# two sets share no transactions
test_cafe <- dream_cafe %>%
  anti_join(y = train_ids, by = "trans_id") %>%
  select(trans_id, item, qty)

# Cast each set to a document-term matrix: one document per transaction,
# one term per item, with the quantity as the cell value.
train_dtm <- train_cafe %>%
  cast_dtm(document = trans_id, term = item, value = qty)
test_dtm <- test_cafe %>%
  cast_dtm(document = trans_id, term = item, value = qty)

Selecting k-number of topics

I chose k=12 because it has the lowest perplexity score.

# Fit a 12-topic LDA model on the training document-term matrix using
# Gibbs sampling, with the control settings listed below.
set.seed(201803075)
topic_model <- LDA(
  x = train_dtm,
  k = 12,
  method = "Gibbs",
  control = list(
    burnin = 200,
    iter = 1000,
    keep = 10,
    alpha = 0.25
  )
)
topic_model

## A LDA_Gibbs topic model with 12 topics.

Plotting The Top 3 Items For Each Topic

# Tidy the fitted model into one row per (topic, term) with its beta value.
td_beta <- tidy(topic_model, matrix = "beta")

# Plot the three highest-beta terms of every topic, one facet per topic.
td_beta %>%
  group_by(topic) %>%
  top_n(3, beta) %>%
  ungroup() %>%
  mutate(
    # Use a factor with numerically ordered levels so the facets read
    # "Topic 1", "Topic 2", ..., instead of sorting alphabetically
    # ("Topic 1", "Topic 10", "Topic 11", ...).
    topic = factor(
      paste0("Topic ", topic),
      levels = paste0("Topic ", sort(unique(topic)))
    ),
    # Order the terms by beta separately within each topic.
    term = reorder_within(term, beta, topic)
  ) %>%
  ggplot(aes(x = term, y = beta, fill = topic)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  scale_x_reordered()

Based on the plot above, these topics are really helpful: even if a customer has no idea what a topic is about, they can easily understand what it implies, because each topic can be read as a best or popular combination of items per transaction. They are also helpful for the cafe owner, because he/she can get an idea of which items to suggest to a customer, especially to those who have a hard time choosing, which is really relatable when there are many choices or when you are with your friends and all you can say is “kahit ano” (“anything will do”). Moreover, this can give ideas for bundles or combo meals. For example, in topic 1, a customer buying a Banoffee Pie and/or Classic Chocolate Cake together with Vanilla Caffe Frappe can save some money by buying them as a bundle; it may also change their mind when they originally planned to buy just a cheesecake or just a frappe but, upon seeing the combo/bundle, end up buying that instead. Lastly, it is much easier for a customer to buy when there is already a recommended set of items, such as Vanilla Caffe Frappe, which is present in almost every topic and can be advertised as the best drink to buy alongside side dishes or desserts.

Simple Analytics Project on Apriori Algorithm

Mania, Dexter James U.