#install.packages("arules")

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
#ABC <- read.csv("/cloud/project/Data/groceries.txt", header=FALSE)
# Parse the comma-separated basket file into a sparse transactions object:
# one row per transaction, one column per distinct item.
groceries <- read.transactions(
  file = "/cloud/project/Data/groceries.csv",
  sep = ","
)
## Warning in readLines(file, encoding = encoding): incomplete final line found on
## '/cloud/project/Data/groceries.csv'
# Overview: dimensions, density, most frequent items and basket sizes.
summary(groceries)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics
# List the items contained in each of the first five transactions.
inspect(groceries[seq_len(5)])
##     items                     
## [1] {citrus fruit,            
##      margarine,               
##      ready soups,             
##      semi-finished bread}     
## [2] {coffee,                  
##      tropical fruit,          
##      yogurt}                  
## [3] {whole milk}              
## [4] {cream cheese,            
##      meat spreads,            
##      pip fruit,               
##      yogurt}                  
## [5] {condensed milk,          
##      long life bakery product,
##      other vegetables,        
##      whole milk}
# Item columns are kept in alphabetical order, so this reports the relative
# frequency (support) of the first three items alphabetically.
itemFrequency(groceries[, seq_len(3)])
## abrasive cleaner artif. sweetener   baby cosmetics 
##     0.0035587189     0.0032536858     0.0006100661

# Bar chart limited to items with a support of at least 10%.
itemFrequencyPlot(groceries, support = 0.1)

# Bar chart of the 20 most frequent items.
itemFrequencyPlot(groceries, topN = 20)

# Open the help page for the plotting function.
help("itemFrequencyPlot")

# Visualise the sparse item matrix for the first 150 transactions.
image(groceries[seq_len(150)])

# Mine association rules with the Apriori algorithm.
#   support    = 0.006  the rule's items must occur together in at least 0.6%
#                       of all transactions (about 59 of 9,835)
#   confidence = 0.25   the rule must hold in at least 25% of the transactions
#                       that contain its left-hand side
#   minlen     = 2      a rule must contain at least two items (lhs + rhs),
#                       which excludes rules with an empty left-hand side
groceriesrules <- apriori(
  groceries,
  parameter = list(support = 0.006, confidence = 0.25, minlen = 2)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.25    0.1    1 none FALSE            TRUE       5   0.006      2
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 59 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [109 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [463 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Auto-print the rule set; this only reports how many rules were mined.
groceriesrules
## set of 463 rules
# Rule-length distribution, summary statistics of the quality measures
# (support, confidence, coverage, lift, count) and the mining parameters.
summary(groceriesrules)
## set of 463 rules
## 
## rule length distribution (lhs + rhs):sizes
##   2   3   4 
## 150 297  16 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   3.000   2.711   3.000   4.000 
## 
## summary of quality measures:
##     support           confidence        coverage             lift       
##  Min.   :0.006101   Min.   :0.2500   Min.   :0.009964   Min.   :0.9932  
##  1st Qu.:0.007117   1st Qu.:0.2971   1st Qu.:0.018709   1st Qu.:1.6229  
##  Median :0.008744   Median :0.3554   Median :0.024809   Median :1.9332  
##  Mean   :0.011539   Mean   :0.3786   Mean   :0.032608   Mean   :2.0351  
##  3rd Qu.:0.012303   3rd Qu.:0.4495   3rd Qu.:0.035892   3rd Qu.:2.3565  
##  Max.   :0.074835   Max.   :0.6600   Max.   :0.255516   Max.   :3.9565  
##      count      
##  Min.   : 60.0  
##  1st Qu.: 70.0  
##  Median : 86.0  
##  Mean   :113.5  
##  3rd Qu.:121.0  
##  Max.   :736.0  
## 
## mining info:
##       data ntransactions support confidence
##  groceries          9835   0.006       0.25
# Preview the first five rules in the order apriori() returned them.
# head() is used instead of `[1:5]` so the preview still works, showing
# whatever is available, when fewer than five rules were mined.
inspect(head(groceriesrules, n = 5))
##     lhs             rhs                support     confidence coverage  
## [1] {pot plants} => {whole milk}       0.006914082 0.4000000  0.01728521
## [2] {pasta}      => {whole milk}       0.006100661 0.4054054  0.01504830
## [3] {herbs}      => {root vegetables}  0.007015760 0.4312500  0.01626843
## [4] {herbs}      => {other vegetables} 0.007727504 0.4750000  0.01626843
## [5] {herbs}      => {whole milk}       0.007727504 0.4750000  0.01626843
##     lift     count
## [1] 1.565460 68   
## [2] 1.586614 60   
## [3] 3.956477 69   
## [4] 2.454874 76   
## [5] 1.858983 76

The first rule can be read in plain language as “if a customer buys potted plants, they will also buy whole milk.” With a support of about 0.007 and a confidence of 0.400, we can determine that this rule covers about 0.7 percent of all transactions (68 of 9,835) and is correct in 40 percent of the purchases that include potted plants.

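The lift value tells us how much more likely a customer is to buy whole milk, relative to the average customer, given that they bought a potted plant. A lift above 1 means the two items appear together more often than would be expected if they were purchased independently.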

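Lift is the rule's confidence divided by the support of its right-hand side: lift(X => Y) = confidence(X => Y) / support(Y). Since about 25.6 percent of transactions contain whole milk (the support of whole milk, 2,513 of 9,835), while 40 percent of the transactions containing a potted plant also contain whole milk (the confidence), we can compute the lift as 0.40 / 0.256 ≈ 1.56, which matches the value shown.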

# Show the five rules with the highest lift.
# head(..., by = ) selects the top rules directly (decreasing order by
# default) instead of sorting the whole rule set and then indexing `[1:5]`,
# and it does not fail when fewer than five rules exist.
inspect(head(groceriesrules, n = 5, by = "lift"))
##     lhs                   rhs                      support confidence   coverage     lift count
## [1] {herbs}            => {root vegetables}    0.007015760  0.4312500 0.01626843 3.956477    69
## [2] {berries}          => {whipped/sour cream} 0.009049314  0.2721713 0.03324860 3.796886    89
## [3] {other vegetables,                                                                         
##      tropical fruit,                                                                           
##      whole milk}       => {root vegetables}    0.007015760  0.4107143 0.01708185 3.768074    69
## [4] {beef,                                                                                     
##      other vegetables} => {root vegetables}    0.007930859  0.4020619 0.01972547 3.688692    78
## [5] {other vegetables,                                                                         
##      tropical fruit}   => {pip fruit}          0.009456024  0.2634561 0.03589222 3.482649    93
# Keep only the rules that mention the item "berries" (exact label match on
# either side of the rule), then print them.
berryrules <- subset(
  groceriesrules,
  subset = items %in% "berries"
)
inspect(berryrules)
##     lhs          rhs                  support     confidence coverage  lift    
## [1] {berries} => {whipped/sour cream} 0.009049314 0.2721713  0.0332486 3.796886
## [2] {berries} => {yogurt}             0.010574479 0.3180428  0.0332486 2.279848
## [3] {berries} => {other vegetables}   0.010269446 0.3088685  0.0332486 1.596280
## [4] {berries} => {whole milk}         0.011794611 0.3547401  0.0332486 1.388328
##     count
## [1]  89  
## [2] 104  
## [3] 101  
## [4] 116
# Partial matching: every rule with an item whose label contains "fruit",
# on either the lhs or the rhs, is selected.
ABC <- subset(
  groceriesrules,
  subset = items %pin% "fruit"
)
# inspect(ABC)  # left disabled because the listing is rather long

# Complete matching: only rules that involve both "berries" and "yogurt".
ABC_1 <- subset(
  groceriesrules,
  subset = items %ain% c("berries", "yogurt")
)
inspect(ABC_1)
##     lhs          rhs      support    confidence coverage  lift     count
## [1] {berries} => {yogurt} 0.01057448 0.3180428  0.0332486 2.279848 104
# Save the mined rules as a quoted, comma-separated file without row names.
# arules masks base::write, so this call is handled by the arules version.
write(
  groceriesrules,
  file = "groceryrules.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE
)

# Convert the rules to a plain data frame (one row per rule) and check
# the resulting column types.
groceryrules_df <- as(groceriesrules, "data.frame")
str(groceryrules_df)
## 'data.frame':    463 obs. of  6 variables:
##  $ rules     : chr  "{pot plants} => {whole milk}" "{pasta} => {whole milk}" "{herbs} => {root vegetables}" "{herbs} => {other vegetables}" ...
##  $ support   : num  0.00691 0.0061 0.00702 0.00773 0.00773 ...
##  $ confidence: num  0.4 0.405 0.431 0.475 0.475 ...
##  $ coverage  : num  0.0173 0.015 0.0163 0.0163 0.0163 ...
##  $ lift      : num  1.57 1.59 3.96 2.45 1.86 ...
##  $ count     : int  68 60 69 76 76 69 70 67 63 88 ...