INSPECT DATASET

library(arules)

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

# Load the retail transactions from the FIMI repository URL, read in the
# arules "basket" format.
grd <- read.transactions(
  file = "http://fimi.ua.ac.be/data/retail.dat",
  format = "basket"
)

# Draw one item-frequency plot per minimum-support cut-off
# (0.1, 0.2, 0.3 and 0.5), in that order.
invisible(lapply(
  c(0.1, 0.2, 0.3, 0.5),
  function(min_support) itemFrequencyPlot(grd, support = min_support)
))

# Summarise the transactions object: dimensions, density, most frequent
# items and the distribution of basket sizes.
summary(grd)

## transactions as itemMatrix in sparse format with
##  88162 rows (elements/itemsets/transactions) and
##  16470 columns (items) and a density of 0.0006257289 
## 
## most frequent items:
##      39      48      38      32      41 (Other) 
##   50675   42135   15596   15167   14945  770058 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 3016 5516 6919 7210 6814 6163 5746 5143 4660 4086 3751 3285 2866 2620 2310 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
## 2115 1874 1645 1469 1290 1205  981  887  819  684  586  582  472  480  355 
##   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45 
##  310  303  272  234  194  136  153  123  115  112   76   66   71   60   50 
##   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60 
##   44   37   37   33   22   24   21   21   10   11   10    9   11    4    9 
##   61   62   63   64   65   66   67   68   71   73   74   76 
##    7    4    5    2    2    5    3    3    1    1    1    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    4.00    8.00   10.31   14.00   76.00 
## 
## includes extended item information - examples:
##   labels
## 1      0
## 2      1
## 3     10

RULES AND ANALYSIS

# Mine association rules with apriori, keeping only rules with a support of
# at least 0.05 and a confidence of at least 0.5.
grdar <- apriori(
  grd,
  parameter = list(support = 0.05, confidence = 0.5)
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5    0.05      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 4408 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[16470 item(s), 88162 transaction(s)] done [0.16s].
## sorting and recoding items ... [6 item(s)] done [0.00s].
## creating transaction tree ... done [0.03s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object  ... done [0.01s].

# Display every mined rule with its support, confidence and lift.
inspect(grdar)

##      lhs        rhs  support    confidence lift     
## [1]  {}      => {39} 0.57479413 0.5747941  1.0000000
## [2]  {38}    => {48} 0.09010685 0.5093614  1.0657723
## [3]  {38}    => {39} 0.11734080 0.6633111  1.1539977
## [4]  {32}    => {48} 0.09112770 0.5297026  1.1083338
## [5]  {32}    => {39} 0.09590300 0.5574603  0.9698434
## [6]  {41}    => {48} 0.10228897 0.6034125  1.2625621
## [7]  {41}    => {39} 0.12946621 0.7637337  1.3287082
## [8]  {48}    => {39} 0.33055058 0.6916340  1.2032726
## [9]  {39}    => {48} 0.33055058 0.5750765  1.2032726
## [10] {38,48} => {39} 0.06921349 0.7681269  1.3363513
## [11] {38,39} => {48} 0.06921349 0.5898502  1.2341847
## [12] {32,48} => {39} 0.06127356 0.6723923  1.1697968
## [13] {32,39} => {48} 0.06127356 0.6389119  1.3368399
## [14] {41,48} => {39} 0.08355074 0.8168108  1.4210493
## [15] {39,41} => {48} 0.08355074 0.6453478  1.3503063

# Rank the mined rules from highest to lowest on each quality measure.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
grdar_st_su <- sort(grdar, decreasing = TRUE, by = "support")
grdar_st_cf <- sort(grdar, decreasing = TRUE, by = "confidence")
grdar_st_lf <- sort(grdar, decreasing = TRUE, by = "lift")
# Combined ranking: support is listed first in `by`, followed by confidence
# and then lift.
grdar_st_all <- sort(
  grdar,
  decreasing = TRUE,
  by = c("support", "confidence", "lift")
)

# Show the rules ordered by descending support.
inspect(grdar_st_su)

##      lhs        rhs  support    confidence lift     
## [1]  {}      => {39} 0.57479413 0.5747941  1.0000000
## [2]  {48}    => {39} 0.33055058 0.6916340  1.2032726
## [3]  {39}    => {48} 0.33055058 0.5750765  1.2032726
## [4]  {41}    => {39} 0.12946621 0.7637337  1.3287082
## [5]  {38}    => {39} 0.11734080 0.6633111  1.1539977
## [6]  {41}    => {48} 0.10228897 0.6034125  1.2625621
## [7]  {32}    => {39} 0.09590300 0.5574603  0.9698434
## [8]  {32}    => {48} 0.09112770 0.5297026  1.1083338
## [9]  {38}    => {48} 0.09010685 0.5093614  1.0657723
## [10] {41,48} => {39} 0.08355074 0.8168108  1.4210493
## [11] {39,41} => {48} 0.08355074 0.6453478  1.3503063
## [12] {38,48} => {39} 0.06921349 0.7681269  1.3363513
## [13] {38,39} => {48} 0.06921349 0.5898502  1.2341847
## [14] {32,48} => {39} 0.06127356 0.6723923  1.1697968
## [15] {32,39} => {48} 0.06127356 0.6389119  1.3368399

# Show the rules ordered by descending confidence.
inspect(grdar_st_cf)

##      lhs        rhs  support    confidence lift     
## [1]  {41,48} => {39} 0.08355074 0.8168108  1.4210493
## [2]  {38,48} => {39} 0.06921349 0.7681269  1.3363513
## [3]  {41}    => {39} 0.12946621 0.7637337  1.3287082
## [4]  {48}    => {39} 0.33055058 0.6916340  1.2032726
## [5]  {32,48} => {39} 0.06127356 0.6723923  1.1697968
## [6]  {38}    => {39} 0.11734080 0.6633111  1.1539977
## [7]  {39,41} => {48} 0.08355074 0.6453478  1.3503063
## [8]  {32,39} => {48} 0.06127356 0.6389119  1.3368399
## [9]  {41}    => {48} 0.10228897 0.6034125  1.2625621
## [10] {38,39} => {48} 0.06921349 0.5898502  1.2341847
## [11] {39}    => {48} 0.33055058 0.5750765  1.2032726
## [12] {}      => {39} 0.57479413 0.5747941  1.0000000
## [13] {32}    => {39} 0.09590300 0.5574603  0.9698434
## [14] {32}    => {48} 0.09112770 0.5297026  1.1083338
## [15] {38}    => {48} 0.09010685 0.5093614  1.0657723

# Show the rules ordered by descending lift.
inspect(grdar_st_lf)

##      lhs        rhs  support    confidence lift     
## [1]  {41,48} => {39} 0.08355074 0.8168108  1.4210493
## [2]  {39,41} => {48} 0.08355074 0.6453478  1.3503063
## [3]  {32,39} => {48} 0.06127356 0.6389119  1.3368399
## [4]  {38,48} => {39} 0.06921349 0.7681269  1.3363513
## [5]  {41}    => {39} 0.12946621 0.7637337  1.3287082
## [6]  {41}    => {48} 0.10228897 0.6034125  1.2625621
## [7]  {38,39} => {48} 0.06921349 0.5898502  1.2341847
## [8]  {39}    => {48} 0.33055058 0.5750765  1.2032726
## [9]  {48}    => {39} 0.33055058 0.6916340  1.2032726
## [10] {32,48} => {39} 0.06127356 0.6723923  1.1697968
## [11] {38}    => {39} 0.11734080 0.6633111  1.1539977
## [12] {32}    => {48} 0.09112770 0.5297026  1.1083338
## [13] {38}    => {48} 0.09010685 0.5093614  1.0657723
## [14] {}      => {39} 0.57479413 0.5747941  1.0000000
## [15] {32}    => {39} 0.09590300 0.5574603  0.9698434

# Show the rules ordered by the combined support, confidence and lift sort.
inspect(grdar_st_all)

##      lhs        rhs  support    confidence lift     
## [1]  {}      => {39} 0.57479413 0.5747941  1.0000000
## [2]  {48}    => {39} 0.33055058 0.6916340  1.2032726
## [3]  {39}    => {48} 0.33055058 0.5750765  1.2032726
## [4]  {41}    => {39} 0.12946621 0.7637337  1.3287082
## [5]  {38}    => {39} 0.11734080 0.6633111  1.1539977
## [6]  {41}    => {48} 0.10228897 0.6034125  1.2625621
## [7]  {32}    => {39} 0.09590300 0.5574603  0.9698434
## [8]  {32}    => {48} 0.09112770 0.5297026  1.1083338
## [9]  {38}    => {48} 0.09010685 0.5093614  1.0657723
## [10] {41,48} => {39} 0.08355074 0.8168108  1.4210493
## [11] {39,41} => {48} 0.08355074 0.6453478  1.3503063
## [12] {38,48} => {39} 0.06921349 0.7681269  1.3363513
## [13] {38,39} => {48} 0.06921349 0.5898502  1.2341847
## [14] {32,48} => {39} 0.06127356 0.6723923  1.1697968
## [15] {32,39} => {48} 0.06127356 0.6389119  1.3368399

The largest support belongs to the rule with an empty lhs and a rhs of 39, which means that item 39 appears in about 57% of all transactions (a probability of 0.57) regardless of what else is in the basket.

The highest confidence is about 0.82: when 41 and 48 are in the basket, the probability of also buying 39 is 0.82. The support of this rule is small (only 0.08, ranking 10th), but its lift is the largest of all the rules (1.42). So this rule does not occur often, but when 41 and 48 do appear together, we can predict that 39 will be bought with a confidence of 0.82.

Sorting by support, confidence and lift together shows which rules are most likely to occur, starting with those that have the largest support. It shows that when people buy 48, a popular product (the rule {48} => {39} has a support of 0.33), they will also buy 39 with a confidence of 0.69, which gives a fairly high lift of 1.2.

Do Next

The hypothesis I want to test is that there is a high probability of buying 39 if 41 and 48 are in the basket.
It would be interesting to have sales data, which could help us reach a more financially meaningful result.
Predict the probability of buying 39 if 48 is in the basket.

Association Analysis in Application

Which movies are people most likely to watch next if they have just watched a few Tom Cruise action movies in a row?
Which major will students choose for college study, depending on the distribution of their SAT scores?

ANLY510_90_Assign5_Wen_He_197626

Wen He

June 11, 2017

INSPECT DATASET

RULES AND ANALYSIS

Do Next

Association Analysis in Application