library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Load the retail data set as an arules transactions object. The file is
# read in "basket" format, i.e. each line of the file is one transaction.
grd <- read.transactions(
  file = "http://fimi.ua.ac.be/data/retail.dat",
  format = "basket"
)
# Draw one item-frequency bar chart per minimum-support threshold, from
# the loosest (0.1) to the strictest (0.5). `invisible()` discards the
# list that `lapply()` returns so that only the plots are produced.
invisible(lapply(
  c(.1, .2, .3, .5),
  function(min_support) itemFrequencyPlot(grd, support = min_support)
))
# Overview of the transaction set: dimensions, density, most frequent
# items and the distribution of transaction sizes.
summary(object = grd)
## transactions as itemMatrix in sparse format with
## 88162 rows (elements/itemsets/transactions) and
## 16470 columns (items) and a density of 0.0006257289
##
## most frequent items:
## 39 48 38 32 41 (Other)
## 50675 42135 15596 15167 14945 770058
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 3016 5516 6919 7210 6814 6163 5746 5143 4660 4086 3751 3285 2866 2620 2310
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 2115 1874 1645 1469 1290 1205 981 887 819 684 586 582 472 480 355
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 310 303 272 234 194 136 153 123 115 112 76 66 71 60 50
## 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 44 37 37 33 22 24 21 21 10 11 10 9 11 4 9
## 61 62 63 64 65 66 67 68 71 73 74 76
## 7 4 5 2 2 5 3 3 1 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 4.00 8.00 10.31 14.00 76.00
##
## includes extended item information - examples:
## labels
## 1 0
## 2 1
## 3 10
# Mine association rules with a minimum support of 0.05 and a minimum
# confidence of 0.5. The parameter names are written out in full instead
# of relying on partial matching of the abbreviations "supp" and "conf".
grdar <- apriori(grd, parameter = list(support = 0.05, confidence = 0.5))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.05 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 4408
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[16470 item(s), 88162 transaction(s)] done [0.16s].
## sorting and recoding items ... [6 item(s)] done [0.00s].
## creating transaction tree ... done [0.03s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
# Print every mined rule with its support, confidence and lift.
inspect(x = grdar)
## lhs rhs support confidence lift
## [1] {} => {39} 0.57479413 0.5747941 1.0000000
## [2] {38} => {48} 0.09010685 0.5093614 1.0657723
## [3] {38} => {39} 0.11734080 0.6633111 1.1539977
## [4] {32} => {48} 0.09112770 0.5297026 1.1083338
## [5] {32} => {39} 0.09590300 0.5574603 0.9698434
## [6] {41} => {48} 0.10228897 0.6034125 1.2625621
## [7] {41} => {39} 0.12946621 0.7637337 1.3287082
## [8] {48} => {39} 0.33055058 0.6916340 1.2032726
## [9] {39} => {48} 0.33055058 0.5750765 1.2032726
## [10] {38,48} => {39} 0.06921349 0.7681269 1.3363513
## [11] {38,39} => {48} 0.06921349 0.5898502 1.2341847
## [12] {32,48} => {39} 0.06127356 0.6723923 1.1697968
## [13] {32,39} => {48} 0.06127356 0.6389119 1.3368399
## [14] {41,48} => {39} 0.08355074 0.8168108 1.4210493
## [15] {39,41} => {48} 0.08355074 0.6453478 1.3503063
# Rank the mined rules by each quality measure, highest value first.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the sort direction.
grdar_st_su <- sort(grdar, decreasing = TRUE, by = "support")
grdar_st_cf <- sort(grdar, decreasing = TRUE, by = "confidence")
grdar_st_lf <- sort(grdar, decreasing = TRUE, by = "lift")
# Multi-key ordering: support first, with confidence and then lift used
# to order rules whose earlier keys are equal.
grdar_st_all <- sort(
  grdar,
  decreasing = TRUE,
  by = c("support", "confidence", "lift")
)
# Rules ordered by support, largest first.
inspect(x = grdar_st_su)
## lhs rhs support confidence lift
## [1] {} => {39} 0.57479413 0.5747941 1.0000000
## [2] {48} => {39} 0.33055058 0.6916340 1.2032726
## [3] {39} => {48} 0.33055058 0.5750765 1.2032726
## [4] {41} => {39} 0.12946621 0.7637337 1.3287082
## [5] {38} => {39} 0.11734080 0.6633111 1.1539977
## [6] {41} => {48} 0.10228897 0.6034125 1.2625621
## [7] {32} => {39} 0.09590300 0.5574603 0.9698434
## [8] {32} => {48} 0.09112770 0.5297026 1.1083338
## [9] {38} => {48} 0.09010685 0.5093614 1.0657723
## [10] {41,48} => {39} 0.08355074 0.8168108 1.4210493
## [11] {39,41} => {48} 0.08355074 0.6453478 1.3503063
## [12] {38,48} => {39} 0.06921349 0.7681269 1.3363513
## [13] {38,39} => {48} 0.06921349 0.5898502 1.2341847
## [14] {32,48} => {39} 0.06127356 0.6723923 1.1697968
## [15] {32,39} => {48} 0.06127356 0.6389119 1.3368399
# Rules ordered by confidence, largest first.
inspect(x = grdar_st_cf)
## lhs rhs support confidence lift
## [1] {41,48} => {39} 0.08355074 0.8168108 1.4210493
## [2] {38,48} => {39} 0.06921349 0.7681269 1.3363513
## [3] {41} => {39} 0.12946621 0.7637337 1.3287082
## [4] {48} => {39} 0.33055058 0.6916340 1.2032726
## [5] {32,48} => {39} 0.06127356 0.6723923 1.1697968
## [6] {38} => {39} 0.11734080 0.6633111 1.1539977
## [7] {39,41} => {48} 0.08355074 0.6453478 1.3503063
## [8] {32,39} => {48} 0.06127356 0.6389119 1.3368399
## [9] {41} => {48} 0.10228897 0.6034125 1.2625621
## [10] {38,39} => {48} 0.06921349 0.5898502 1.2341847
## [11] {39} => {48} 0.33055058 0.5750765 1.2032726
## [12] {} => {39} 0.57479413 0.5747941 1.0000000
## [13] {32} => {39} 0.09590300 0.5574603 0.9698434
## [14] {32} => {48} 0.09112770 0.5297026 1.1083338
## [15] {38} => {48} 0.09010685 0.5093614 1.0657723
# Rules ordered by lift, largest first.
inspect(x = grdar_st_lf)
## lhs rhs support confidence lift
## [1] {41,48} => {39} 0.08355074 0.8168108 1.4210493
## [2] {39,41} => {48} 0.08355074 0.6453478 1.3503063
## [3] {32,39} => {48} 0.06127356 0.6389119 1.3368399
## [4] {38,48} => {39} 0.06921349 0.7681269 1.3363513
## [5] {41} => {39} 0.12946621 0.7637337 1.3287082
## [6] {41} => {48} 0.10228897 0.6034125 1.2625621
## [7] {38,39} => {48} 0.06921349 0.5898502 1.2341847
## [8] {39} => {48} 0.33055058 0.5750765 1.2032726
## [9] {48} => {39} 0.33055058 0.6916340 1.2032726
## [10] {32,48} => {39} 0.06127356 0.6723923 1.1697968
## [11] {38} => {39} 0.11734080 0.6633111 1.1539977
## [12] {32} => {48} 0.09112770 0.5297026 1.1083338
## [13] {38} => {48} 0.09010685 0.5093614 1.0657723
## [14] {} => {39} 0.57479413 0.5747941 1.0000000
## [15] {32} => {39} 0.09590300 0.5574603 0.9698434
# Rules ordered by support, then confidence, then lift (all descending).
inspect(x = grdar_st_all)
## lhs rhs support confidence lift
## [1] {} => {39} 0.57479413 0.5747941 1.0000000
## [2] {48} => {39} 0.33055058 0.6916340 1.2032726
## [3] {39} => {48} 0.33055058 0.5750765 1.2032726
## [4] {41} => {39} 0.12946621 0.7637337 1.3287082
## [5] {38} => {39} 0.11734080 0.6633111 1.1539977
## [6] {41} => {48} 0.10228897 0.6034125 1.2625621
## [7] {32} => {39} 0.09590300 0.5574603 0.9698434
## [8] {32} => {48} 0.09112770 0.5297026 1.1083338
## [9] {38} => {48} 0.09010685 0.5093614 1.0657723
## [10] {41,48} => {39} 0.08355074 0.8168108 1.4210493
## [11] {39,41} => {48} 0.08355074 0.6453478 1.3503063
## [12] {38,48} => {39} 0.06921349 0.7681269 1.3363513
## [13] {38,39} => {48} 0.06921349 0.5898502 1.2341847
## [14] {32,48} => {39} 0.06127356 0.6723923 1.1697968
## [15] {32,39} => {48} 0.06127356 0.6389119 1.3368399
The largest support belongs to the rule with an empty lhs and a rhs of 39. This means that item 39 appears in about 57% of all transactions (support 0.57), regardless of what else is in the basket.
The largest confidence is about 0.82: when 41 and 48 are both in the basket, the probability of also buying 39 is 0.82. The support of this rule is small (only 0.08, ranking 10th), but its lift is the largest of all the rules. So this rule does not apply often, but when it does, we can predict that 39 will be bought with a confidence of 0.82.
Sorting by support, then confidence, then lift shows which rules apply most often, with ties in support broken by the other two measures. For example, when people buy 48 together with 39 (a popular combination, with a support of 0.33), the confidence of 39 given 48 is 0.69, which gives a lift of 1.2.
The hypothesis I want to test is: there is a high probability of buying 39 if 41 and 48 are already in the basket.
It would be interesting to also have sales data (prices or revenue per item), which would help us reach conclusions that make more financial sense.
Predict the probability of buying 39 if 48 is in the basket.
Which movies are people most likely to watch next if they have just watched several Tom Cruise action movies in a row?
Which major will students choose for their college studies, depending on the distribution of their SAT scores?