#install.packages("arules")
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Disabled alternative: reads the .txt variant as a plain data frame instead.
#ABC <- read.csv("/cloud/project/Data/groceries.txt", header=FALSE)

# Load the comma-separated basket file as an arules transactions object,
# which is stored as a sparse item matrix.
groceries <- read.transactions(
  file = "/cloud/project/Data/groceries.csv",
  sep = ","
)
## Warning in readLines(file, encoding = encoding): incomplete final line found on
## '/cloud/project/Data/groceries.csv'

# Overview of the transactions: dimensions, density, most frequent items and
# the distribution of transaction sizes.
summary(groceries)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46
## 17 18 19 20 21 22 23 24 26 27 28 29 32
## 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
# Print the items of the first five transactions.
inspect(groceries[seq_len(5)])
## items
## [1] {citrus fruit,
## margarine,
## ready soups,
## semi-finished bread}
## [2] {coffee,
## tropical fruit,
## yogurt}
## [3] {whole milk}
## [4] {cream cheese,
## meat spreads,
## pip fruit,
## yogurt}
## [5] {condensed milk,
## long life bakery product,
## other vegetables,
## whole milk}
# Relative frequency (support) of the first three items; the item columns
# are stored in alphabetical order.
itemFrequency(groceries[, 1:3])
## abrasive cleaner artif. sweetener baby cosmetics
## 0.0035587189 0.0032536858 0.0006100661

# Bar chart of the items that reach at least 10% support.
itemFrequencyPlot(groceries, support = 0.1)

# Bar chart of the 20 most frequent items.
itemFrequencyPlot(groceries, topN = 20)

# Open the help page only in an interactive session, so that batch runs
# (Rscript, knitting) are not interrupted by a help lookup.
if (interactive()) {
  help("itemFrequencyPlot")
}

# Plot the sparse item matrix for the first 150 transactions.
image(groceries[1:150])
# Mine association rules with the Apriori algorithm.
#   support    = 0.006: the rule's items must appear together in at least
#                       0.6% of all transactions
#   confidence = 0.25 : the rule must hold in at least 25% of the
#                       transactions that contain its left-hand side
#   minlen     = 2    : a rule must contain at least two items (lhs + rhs),
#                       which excludes rules with an empty left-hand side
groceriesrules <- apriori(
  groceries,
  parameter = list(
    support = 0.006,
    confidence = 0.25,
    minlen = 2
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.25 0.1 1 none FALSE TRUE 5 0.006 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 59
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [109 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [463 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Auto-print the rule set; this reports how many rules were mined.
groceriesrules
## set of 463 rules
# Summarise the rule set: rule length distribution, quality measures
# (support, confidence, coverage, lift, count) and the mining settings.
summary(groceriesrules)
## set of 463 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 150 297 16
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 3.000 2.711 3.000 4.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.006101 Min. :0.2500 Min. :0.009964 Min. :0.9932
## 1st Qu.:0.007117 1st Qu.:0.2971 1st Qu.:0.018709 1st Qu.:1.6229
## Median :0.008744 Median :0.3554 Median :0.024809 Median :1.9332
## Mean :0.011539 Mean :0.3786 Mean :0.032608 Mean :2.0351
## 3rd Qu.:0.012303 3rd Qu.:0.4495 3rd Qu.:0.035892 3rd Qu.:2.3565
## Max. :0.074835 Max. :0.6600 Max. :0.255516 Max. :3.9565
## count
## Min. : 60.0
## 1st Qu.: 70.0
## Median : 86.0
## Mean :113.5
## 3rd Qu.:121.0
## Max. :736.0
##
## mining info:
## data ntransactions support confidence
## groceries 9835 0.006 0.25
# Print the first five rules in their mined order, with quality measures.
inspect(groceriesrules[seq_len(5)])
## lhs rhs support confidence coverage
## [1] {pot plants} => {whole milk} 0.006914082 0.4000000 0.01728521
## [2] {pasta} => {whole milk} 0.006100661 0.4054054 0.01504830
## [3] {herbs} => {root vegetables} 0.007015760 0.4312500 0.01626843
## [4] {herbs} => {other vegetables} 0.007727504 0.4750000 0.01626843
## [5] {herbs} => {whole milk} 0.007727504 0.4750000 0.01626843
## lift count
## [1] 1.565460 68
## [2] 1.586614 60
## [3] 3.956477 69
## [4] 2.454874 76
## [5] 1.858983 76
# The first rule can be read in plain language as "if a customer buys potted plants, they will also buy whole milk." With a support of about 0.007 and a confidence of 0.400, we can determine that this rule covers about 0.7 percent of transactions and is correct in 40 percent of purchases involving potted plants.
# The lift value tells us how much more likely a customer is to buy whole milk relative to the average customer, given that they bought a potted plant.
# Since we know that about 25.6 percent of customers bought whole milk (the support of whole milk), while 40 percent of customers buying a potted plant bought whole milk (the confidence), we can compute the lift as 0.40 / 0.256 = 1.56, which matches the value shown.
# Sort the rules by lift (highest first) and print the top five.
inspect(head(sort(groceriesrules, by = "lift"), n = 5))
## lhs rhs support confidence coverage lift count
## [1] {herbs} => {root vegetables} 0.007015760 0.4312500 0.01626843 3.956477 69
## [2] {berries} => {whipped/sour cream} 0.009049314 0.2721713 0.03324860 3.796886 89
## [3] {other vegetables,
## tropical fruit,
## whole milk} => {root vegetables} 0.007015760 0.4107143 0.01708185 3.768074 69
## [4] {beef,
## other vegetables} => {root vegetables} 0.007930859 0.4020619 0.01972547 3.688692 78
## [5] {other vegetables,
## tropical fruit} => {pip fruit} 0.009456024 0.2634561 0.03589222 3.482649 93
# Keep only the rules that contain the item "berries" on either side,
# then print them.
berryrules <- subset(groceriesrules, subset = items %in% "berries")
inspect(berryrules)
## lhs rhs support confidence coverage lift
## [1] {berries} => {whipped/sour cream} 0.009049314 0.2721713 0.0332486 3.796886
## [2] {berries} => {yogurt} 0.010574479 0.3180428 0.0332486 2.279848
## [3] {berries} => {other vegetables} 0.010269446 0.3088685 0.0332486 1.596280
## [4] {berries} => {whole milk} 0.011794611 0.3547401 0.0332486 1.388328
## count
## [1] 89
## [2] 104
## [3] 101
## [4] 116
# %pin% matches partially: keep every rule that has an item whose name
# contains "fruit", whether on the lhs or the rhs.
ABC <- subset(groceriesrules, subset = items %pin% "fruit")
#inspect(ABC) # disabled because the output is rather long

# %ain% requires all listed items: keep only the rules that contain both
# "berries" and "yogurt".
ABC_1 <- subset(groceriesrules, subset = items %ain% c("berries", "yogurt"))
inspect(ABC_1)
## lhs rhs support confidence coverage lift count
## [1] {berries} => {yogurt} 0.01057448 0.3180428 0.0332486 2.279848 104
# Export the rules to a CSV file (path is relative to the working directory).
write(
  groceriesrules,
  file = "groceryrules.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE
)

# Coerce the rules to a data frame and show its structure.
groceryrules_df <- as(object = groceriesrules, Class = "data.frame")
str(groceryrules_df)
## 'data.frame': 463 obs. of 6 variables:
## $ rules : chr "{pot plants} => {whole milk}" "{pasta} => {whole milk}" "{herbs} => {root vegetables}" "{herbs} => {other vegetables}" ...
## $ support : num 0.00691 0.0061 0.00702 0.00773 0.00773 ...
## $ confidence: num 0.4 0.405 0.431 0.475 0.475 ...
## $ coverage : num 0.0173 0.015 0.0163 0.0163 0.0163 ...
## $ lift : num 1.57 1.59 3.96 2.45 1.86 ...
## $ count : int 68 60 69 76 76 69 70 67 63 88 ...