Association rules

연관룰

making rules

# Load the saved R data file from the remote URL; the printed structure that
# follows shows it provides the data frame `titanic.raw`.
# The connection is bound to a name so it can be closed explicitly once
# load() has finished, instead of being left for the garbage collector.
con <- url("http://dl.dropbox.com/u/8686172/titanic.raw.rdata")
load(con)
close(con)


# Show the structure of the loaded data frame (its columns and factor levels).
str(titanic.raw)
## 'data.frame':    2201 obs. of  4 variables:
##  $ Class   : Factor w/ 4 levels "1st","2nd","3rd",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Sex     : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Age     : Factor w/ 2 levels "Adult","Child": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...

# Attach the arules package, which provides apriori().
library(arules)

# Mine association rules with apriori()'s default settings; the parameter dump
# printed after the call reports them (support 0.1, confidence 0.8, maxlen 10).
rules <- apriori(titanic.raw)
## 
## parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.8    0.1    1 none FALSE            TRUE     0.1      1     10
##  target   ext
##   rules FALSE
## 
## algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10 item(s), 2201 transaction(s)] done [0.00s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [27 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Print every mined rule together with its support, confidence and lift.
inspect(rules)
##    lhs               rhs           support confidence   lift
## 1  {}             => {Age=Adult}    0.9505     0.9505 1.0000
## 2  {Class=2nd}    => {Age=Adult}    0.1186     0.9158 0.9635
## 3  {Class=1st}    => {Age=Adult}    0.1449     0.9815 1.0327
## 4  {Sex=Female}   => {Age=Adult}    0.1931     0.9043 0.9514
## 5  {Class=3rd}    => {Age=Adult}    0.2849     0.8881 0.9344
## 6  {Survived=Yes} => {Age=Adult}    0.2971     0.9198 0.9678
## 7  {Class=Crew}   => {Sex=Male}     0.3916     0.9740 1.2385
## 8  {Class=Crew}   => {Age=Adult}    0.4021     1.0000 1.0521
## 9  {Survived=No}  => {Sex=Male}     0.6197     0.9154 1.1640
## 10 {Survived=No}  => {Age=Adult}    0.6533     0.9651 1.0154
## 11 {Sex=Male}     => {Age=Adult}    0.7574     0.9630 1.0132
## 12 {Sex=Female,                                             
##     Survived=Yes} => {Age=Adult}    0.1436     0.9186 0.9665
## 13 {Class=3rd,                                              
##     Sex=Male}     => {Survived=No}  0.1917     0.8275 1.2223
## 14 {Class=3rd,                                              
##     Survived=No}  => {Age=Adult}    0.2163     0.9015 0.9485
## 15 {Class=3rd,                                              
##     Sex=Male}     => {Age=Adult}    0.2099     0.9059 0.9531
## 16 {Sex=Male,                                               
##     Survived=Yes} => {Age=Adult}    0.1536     0.9210 0.9690
## 17 {Class=Crew,                                             
##     Survived=No}  => {Sex=Male}     0.3044     0.9955 1.2659
## 18 {Class=Crew,                                             
##     Survived=No}  => {Age=Adult}    0.3058     1.0000 1.0521
## 19 {Class=Crew,                                             
##     Sex=Male}     => {Age=Adult}    0.3916     1.0000 1.0521
## 20 {Class=Crew,                                             
##     Age=Adult}    => {Sex=Male}     0.3916     0.9740 1.2385
## 21 {Sex=Male,                                               
##     Survived=No}  => {Age=Adult}    0.6038     0.9743 1.0251
## 22 {Age=Adult,                                              
##     Survived=No}  => {Sex=Male}     0.6038     0.9242 1.1751
## 23 {Class=3rd,                                              
##     Sex=Male,                                               
##     Survived=No}  => {Age=Adult}    0.1758     0.9171 0.9648
## 24 {Class=3rd,                                              
##     Age=Adult,                                              
##     Survived=No}  => {Sex=Male}     0.1758     0.8130 1.0338
## 25 {Class=3rd,                                              
##     Sex=Male,                                               
##     Age=Adult}    => {Survived=No}  0.1758     0.8377 1.2374
## 26 {Class=Crew,                                             
##     Sex=Male,                                               
##     Survived=No}  => {Age=Adult}    0.3044     1.0000 1.0521
## 27 {Class=Crew,                                             
##     Age=Adult,                                              
##     Survived=No}  => {Sex=Male}     0.3044     0.9955 1.2659

Next, mine only the rules whose right-hand side is the survival outcome:

# Rules with rhs containing 'Survived' only: the rhs is restricted to the two
# 'Survived' items, while every other item may appear on the lhs
# (default = "lhs"). minlen = 2 requires at least two items per rule, so rules
# with an empty lhs are excluded.
rules <- apriori(
  titanic.raw,
  parameter = list(minlen = 2, support = 0.005, confidence = 0.8),
  appearance = list(rhs = c("Survived=No", "Survived=Yes"), default = "lhs"),
  control = list(verbose = FALSE)
)

# Order the rules by lift (the listing that follows runs from the highest
# lift to the lowest) and print them.
rules.sorted <- sort(rules, by = "lift")
inspect(rules.sorted)
##    lhs             rhs             support confidence  lift
## 1  {Class=2nd,                                             
##     Age=Child}  => {Survived=Yes} 0.010904     1.0000 3.096
## 2  {Class=2nd,                                             
##     Sex=Female,                                            
##     Age=Child}  => {Survived=Yes} 0.005906     1.0000 3.096
## 3  {Class=1st,                                             
##     Sex=Female} => {Survived=Yes} 0.064062     0.9724 3.010
## 4  {Class=1st,                                             
##     Sex=Female,                                            
##     Age=Adult}  => {Survived=Yes} 0.063607     0.9722 3.010
## 5  {Class=2nd,                                             
##     Sex=Female} => {Survived=Yes} 0.042254     0.8774 2.716
## 6  {Class=Crew,                                            
##     Sex=Female} => {Survived=Yes} 0.009087     0.8696 2.692
## 7  {Class=Crew,                                            
##     Sex=Female,                                            
##     Age=Adult}  => {Survived=Yes} 0.009087     0.8696 2.692
## 8  {Class=2nd,                                             
##     Sex=Female,                                            
##     Age=Adult}  => {Survived=Yes} 0.036347     0.8602 2.663
## 9  {Class=2nd,                                             
##     Sex=Male,                                              
##     Age=Adult}  => {Survived=No}  0.069968     0.9167 1.354
## 10 {Class=2nd,                                             
##     Sex=Male}   => {Survived=No}  0.069968     0.8603 1.271
## 11 {Class=3rd,                                             
##     Sex=Male,                                              
##     Age=Adult}  => {Survived=No}  0.175829     0.8377 1.237
## 12 {Class=3rd,                                             
##     Sex=Male}   => {Survived=No}  0.191731     0.8275 1.222

Pruning Redundant Rules

이미 최소 신뢰도 0.8을 기준으로 출력한 결과이며, 상위 룰과 하위 룰의 차이는 거의 없거나 하위 룰이 조금 낮은 값을 가질 것이다. 이러한 하위 룰을 제거하는 작업을 한다.

# Find redundant rules. Entry [i, j] of the subset matrix is TRUE when rule i
# is a subset of rule j, so a TRUE above the diagonal means that a rule ranked
# earlier in rules.sorted is contained in rule j.
subset.matrix <- is.subset(rules.sorted, rules.sorted)
# Mask the diagonal and the lower triangle so that each rule is compared only
# with the rules ranked before it. FALSE is assigned instead of NA so that the
# step also works when is.subset() returns a sparse logical matrix, which
# cannot store NA; on a dense matrix the column sums are unchanged.
subset.matrix[lower.tri(subset.matrix, diag = TRUE)] <- FALSE

# A rule is redundant when at least one earlier-ranked rule is a subset of it.
redundant <- colSums(subset.matrix) >= 1
which(redundant)
## [1] 2 4 7 8
# remove redundant rules
rules.pruned <- rules.sorted[!redundant]
inspect(rules.pruned)
##   lhs             rhs             support confidence  lift
## 1 {Class=2nd,                                             
##    Age=Child}  => {Survived=Yes} 0.010904     1.0000 3.096
## 2 {Class=1st,                                             
##    Sex=Female} => {Survived=Yes} 0.064062     0.9724 3.010
## 3 {Class=2nd,                                             
##    Sex=Female} => {Survived=Yes} 0.042254     0.8774 2.716
## 4 {Class=Crew,                                            
##    Sex=Female} => {Survived=Yes} 0.009087     0.8696 2.692
## 5 {Class=2nd,                                             
##    Sex=Male,                                              
##    Age=Adult}  => {Survived=No}  0.069968     0.9167 1.354
## 6 {Class=2nd,                                             
##    Sex=Male}   => {Survived=No}  0.069968     0.8603 1.271
## 7 {Class=3rd,                                             
##    Sex=Male,                                              
##    Age=Adult}  => {Survived=No}  0.175829     0.8377 1.237
## 8 {Class=3rd,                                             
##    Sex=Male}   => {Survived=No}  0.191731     0.8275 1.222

Interpreting rules

우리가 아래 룰을 상대적으로 얼마나 신뢰할 수 있는지 살펴볼 필요가 있다.

{Class=2nd, Age=Child} => {Survived=Yes}

이등석 아이들은 모두 생존했다는 정보를 알 수 있을 뿐이고 다른 객석 등급의 아이들의 생존과는 상대적인 비교를 할 수 없다. 이들에 대한 정보는 이미 지지도와 신뢰도를 기반으로 필터링된 결과들이어서 이들을 살펴볼 수 있는 해석 과정이 필요하다.


# Compare survival across passenger classes and age groups. The support and
# confidence thresholds are lowered so that weaker class/age combinations are
# not filtered out. default = "none" restricts the rules to the listed items:
# "Survived=Yes" on the rhs and the Class/Age items on the lhs.
rules <- apriori(
  titanic.raw,
  parameter = list(minlen = 3, support = 0.002, confidence = 0.2),
  appearance = list(
    rhs = c("Survived=Yes"),
    lhs = c("Class=1st", "Class=2nd", "Class=3rd", "Age=Child", "Age=Adult"),
    default = "none"
  ),
  control = list(verbose = FALSE)
)
# Order by confidence so the class/age groups can be compared directly.
rules.sorted <- sort(rules, by = "confidence")
inspect(rules.sorted)
##   lhs            rhs             support confidence   lift
## 1 {Class=2nd,                                             
##    Age=Child} => {Survived=Yes} 0.010904     1.0000 3.0956
## 2 {Class=1st,                                             
##    Age=Child} => {Survived=Yes} 0.002726     1.0000 3.0956
## 3 {Class=1st,                                             
##    Age=Adult} => {Survived=Yes} 0.089505     0.6176 1.9117
## 4 {Class=2nd,                                             
##    Age=Adult} => {Survived=Yes} 0.042708     0.3602 1.1149
## 5 {Class=3rd,                                             
##    Age=Child} => {Survived=Yes} 0.012267     0.3418 1.0580
## 6 {Class=3rd,                                             
##    Age=Adult} => {Survived=Yes} 0.068605     0.2408 0.7455

Visualization

scatter plot

# Attach arulesViz before plotting the rule set.
library(arulesViz)
# Plot `rules` with the default method (labelled a scatter plot in this
# document). Note that this is the unsorted `rules` object, not rules.sorted.
plot(rules)

plot of chunk visualization1

Balloon plot

# Draw the same rule set with the "grouped" plotting method.
plot(rules, method = "grouped")

plot of chunk visualization2

Graph plot

# Draw the same rule set with the "graph" plotting method.
plot(rules, method = "graph")

plot of chunk visualization3

Graph plot for items

# Graph plot again, this time passing type = "items" through the control list.
# NOTE(review): presumably this draws a graph of items rather than of rules;
# confirm that the installed arulesViz version still accepts this option.
plot(rules, method = "graph", control = list(type = "items"))

plot of chunk visualization4

Reference