연관룰
# Load the `titanic.raw` data frame from a remote .rdata file.
# The connection is kept in a variable so it can be closed explicitly;
# an anonymous url() connection would be left for the garbage collector,
# which produces "closing unused connection" warnings.
# NOTE(review): this is an old Dropbox public-folder link -- confirm it still
# resolves, or point it at a local copy of titanic.raw.rdata.
titanic_con <- url("http://dl.dropbox.com/u/8686172/titanic.raw.rdata")
load(titanic_con)
close(titanic_con)
# Show the structure of the loaded data frame.
str(titanic.raw)
## 'data.frame': 2201 obs. of 4 variables:
## $ Class : Factor w/ 4 levels "1st","2nd","3rd",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Age : Factor w/ 2 levels "Adult","Child": 2 2 2 2 2 2 2 2 2 2 ...
## $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
library(arules)
# Mine association rules over all items. The thresholds are written out
# explicitly; they are the same values the parameter echo reports for the
# bare call (support 0.1, confidence 0.8, rule length 1 to 10).
rules <- apriori(
  titanic.raw,
  parameter = list(support = 0.1, confidence = 0.8, minlen = 1, maxlen = 10)
)
##
## parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 0.8 0.1 1 none FALSE TRUE 0.1 1 10
## target ext
## rules FALSE
##
## algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## apriori - find association rules with the apriori algorithm
## version 4.21 (2004.05.09) (c) 1996-2004 Christian Borgelt
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10 item(s), 2201 transaction(s)] done [0.00s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [27 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Print every mined rule (lhs => rhs) with its support, confidence and lift.
inspect(rules)
## lhs rhs support confidence lift
## 1 {} => {Age=Adult} 0.9505 0.9505 1.0000
## 2 {Class=2nd} => {Age=Adult} 0.1186 0.9158 0.9635
## 3 {Class=1st} => {Age=Adult} 0.1449 0.9815 1.0327
## 4 {Sex=Female} => {Age=Adult} 0.1931 0.9043 0.9514
## 5 {Class=3rd} => {Age=Adult} 0.2849 0.8881 0.9344
## 6 {Survived=Yes} => {Age=Adult} 0.2971 0.9198 0.9678
## 7 {Class=Crew} => {Sex=Male} 0.3916 0.9740 1.2385
## 8 {Class=Crew} => {Age=Adult} 0.4021 1.0000 1.0521
## 9 {Survived=No} => {Sex=Male} 0.6197 0.9154 1.1640
## 10 {Survived=No} => {Age=Adult} 0.6533 0.9651 1.0154
## 11 {Sex=Male} => {Age=Adult} 0.7574 0.9630 1.0132
## 12 {Sex=Female,
## Survived=Yes} => {Age=Adult} 0.1436 0.9186 0.9665
## 13 {Class=3rd,
## Sex=Male} => {Survived=No} 0.1917 0.8275 1.2223
## 14 {Class=3rd,
## Survived=No} => {Age=Adult} 0.2163 0.9015 0.9485
## 15 {Class=3rd,
## Sex=Male} => {Age=Adult} 0.2099 0.9059 0.9531
## 16 {Sex=Male,
## Survived=Yes} => {Age=Adult} 0.1536 0.9210 0.9690
## 17 {Class=Crew,
## Survived=No} => {Sex=Male} 0.3044 0.9955 1.2659
## 18 {Class=Crew,
## Survived=No} => {Age=Adult} 0.3058 1.0000 1.0521
## 19 {Class=Crew,
## Sex=Male} => {Age=Adult} 0.3916 1.0000 1.0521
## 20 {Class=Crew,
## Age=Adult} => {Sex=Male} 0.3916 0.9740 1.2385
## 21 {Sex=Male,
## Survived=No} => {Age=Adult} 0.6038 0.9743 1.0251
## 22 {Age=Adult,
## Survived=No} => {Sex=Male} 0.6038 0.9242 1.1751
## 23 {Class=3rd,
## Sex=Male,
## Survived=No} => {Age=Adult} 0.1758 0.9171 0.9648
## 24 {Class=3rd,
## Age=Adult,
## Survived=No} => {Sex=Male} 0.1758 0.8130 1.0338
## 25 {Class=3rd,
## Sex=Male,
## Age=Adult} => {Survived=No} 0.1758 0.8377 1.2374
## 26 {Class=Crew,
## Sex=Male,
## Survived=No} => {Age=Adult} 0.3044 1.0000 1.0521
## 27 {Class=Crew,
## Age=Adult,
## Survived=No} => {Sex=Male} 0.3044 0.9955 1.2659
이번에는 결과(rhs)에 생존 여부(Survived=No, Survived=Yes)만 나타나도록 제한하여 룰을 다시 생성하고, 향상도(lift) 순으로 정렬한다.
# Rules whose rhs contains 'Survived' only.
# `appearance` restricts the consequent to the two survival items and lets
# every other item appear on the lhs (default = "lhs").
# Parameter names are spelled out in full (support / confidence) rather than
# relying on partial matching, and FALSE is used instead of the reassignable F.
rules <- apriori(
  titanic.raw,
  parameter = list(minlen = 2, support = 0.005, confidence = 0.8),
  appearance = list(rhs = c("Survived=No", "Survived=Yes"), default = "lhs"),
  control = list(verbose = FALSE)
)
# Order the rules by lift and print them.
rules.sorted <- sort(rules, by = "lift")
inspect(rules.sorted)
## lhs rhs support confidence lift
## 1 {Class=2nd,
## Age=Child} => {Survived=Yes} 0.010904 1.0000 3.096
## 2 {Class=2nd,
## Sex=Female,
## Age=Child} => {Survived=Yes} 0.005906 1.0000 3.096
## 3 {Class=1st,
## Sex=Female} => {Survived=Yes} 0.064062 0.9724 3.010
## 4 {Class=1st,
## Sex=Female,
## Age=Adult} => {Survived=Yes} 0.063607 0.9722 3.010
## 5 {Class=2nd,
## Sex=Female} => {Survived=Yes} 0.042254 0.8774 2.716
## 6 {Class=Crew,
## Sex=Female} => {Survived=Yes} 0.009087 0.8696 2.692
## 7 {Class=Crew,
## Sex=Female,
## Age=Adult} => {Survived=Yes} 0.009087 0.8696 2.692
## 8 {Class=2nd,
## Sex=Female,
## Age=Adult} => {Survived=Yes} 0.036347 0.8602 2.663
## 9 {Class=2nd,
## Sex=Male,
## Age=Adult} => {Survived=No} 0.069968 0.9167 1.354
## 10 {Class=2nd,
## Sex=Male} => {Survived=No} 0.069968 0.8603 1.271
## 11 {Class=3rd,
## Sex=Male,
## Age=Adult} => {Survived=No} 0.175829 0.8377 1.237
## 12 {Class=3rd,
## Sex=Male} => {Survived=No} 0.191731 0.8275 1.222
위 결과는 이미 신뢰도 0.8 이상인 룰만 출력한 것이다. 상위룰(예: {Class=2nd, Age=Child})에 조건을 덧붙인 하위룰(예: {Class=2nd, Sex=Female, Age=Child})은 상위룰과 향상도가 같거나 조금 낮아 새로운 정보를 주지 못하는 경우가 많다. 이러한 중복 하위룰을 제거하는 작업을 한다.
# find redundant rules
# Pairwise subset relation between the lift-sorted rules. Recent arules
# versions return a sparse matrix here, which does not accept the NA masking
# below, so the result is coerced to a dense logical matrix first (a no-op
# when it is already dense).
subset.matrix <- as.matrix(is.subset(rules.sorted, rules.sorted))
# Keep only the strict upper triangle: comparisons of a rule against rules
# ranked above it. The diagonal (a rule vs. itself) is masked out too.
subset.matrix[lower.tri(subset.matrix, diag = TRUE)] <- NA
# A rule is flagged when at least one higher-ranked rule is a subset of it.
redundant <- colSums(subset.matrix, na.rm = TRUE) >= 1
which(redundant)
## [1] 2 4 7 8
# Keep only the rules that were not flagged as redundant, then print them.
is_kept <- !redundant
rules.pruned <- rules.sorted[is_kept]
inspect(rules.pruned)
## lhs rhs support confidence lift
## 1 {Class=2nd,
## Age=Child} => {Survived=Yes} 0.010904 1.0000 3.096
## 2 {Class=1st,
## Sex=Female} => {Survived=Yes} 0.064062 0.9724 3.010
## 3 {Class=2nd,
## Sex=Female} => {Survived=Yes} 0.042254 0.8774 2.716
## 4 {Class=Crew,
## Sex=Female} => {Survived=Yes} 0.009087 0.8696 2.692
## 5 {Class=2nd,
## Sex=Male,
## Age=Adult} => {Survived=No} 0.069968 0.9167 1.354
## 6 {Class=2nd,
## Sex=Male} => {Survived=No} 0.069968 0.8603 1.271
## 7 {Class=3rd,
## Sex=Male,
## Age=Adult} => {Survived=No} 0.175829 0.8377 1.237
## 8 {Class=3rd,
## Sex=Male} => {Survived=No} 0.191731 0.8275 1.222
우리가 아래 룰을 상대적으로 얼마나 신뢰할 수 있는지 살펴볼 필요가 있다.
{Class=2nd, Age=Child} => {Survived=Yes}
이 룰로는 이등석 아이들이 모두 생존했다는 사실만 알 수 있을 뿐, 다른 객석 등급 아이들의 생존율과 상대적으로 비교할 수는 없다. 앞의 결과는 이미 지지도와 신뢰도를 기준으로 필터링된 것이어서 다른 등급의 룰이 빠져 있으므로, 기준을 낮추어 등급별·연령별 룰을 함께 살펴보는 해석 과정이 필요하다.
# Compare survival across passenger class and age group.
# Thresholds are lowered (support 0.002, confidence 0.2) so that rules for
# every class/age combination are produced, not just the strongest ones.
# Only the listed class and age items may appear on the lhs, and only
# "Survived=Yes" on the rhs (default = "none" excludes everything else).
# Parameter names are written in full and FALSE replaces the reassignable F.
rules <- apriori(
  titanic.raw,
  parameter = list(minlen = 3, support = 0.002, confidence = 0.2),
  appearance = list(
    rhs = c("Survived=Yes"),
    lhs = c("Class=1st", "Class=2nd", "Class=3rd", "Age=Child", "Age=Adult"),
    default = "none"
  ),
  control = list(verbose = FALSE)
)
# Order by confidence so the survival rates can be read top-down.
rules.sorted <- sort(rules, by = "confidence")
inspect(rules.sorted)
## lhs rhs support confidence lift
## 1 {Class=2nd,
## Age=Child} => {Survived=Yes} 0.010904 1.0000 3.0956
## 2 {Class=1st,
## Age=Child} => {Survived=Yes} 0.002726 1.0000 3.0956
## 3 {Class=1st,
## Age=Adult} => {Survived=Yes} 0.089505 0.6176 1.9117
## 4 {Class=2nd,
## Age=Adult} => {Survived=Yes} 0.042708 0.3602 1.1149
## 5 {Class=3rd,
## Age=Child} => {Survived=Yes} 0.012267 0.3418 1.0580
## 6 {Class=3rd,
## Age=Adult} => {Survived=Yes} 0.068605 0.2408 0.7455
scatter plot
library(arulesViz)
# Scatter plot of the mined rules (the method arulesViz uses when none is given).
plot(rules, method = "scatterplot")
Balloon plot
# Draw the rules with the "grouped" plotting method.
plot(rules, method = "grouped")
Graph plot
# Draw the rules with the "graph" plotting method.
plot(rules, method = "graph")
Graph plot for items
# Graph plot again, passing type = "items" through the control list.
# NOTE(review): the accepted control options differ between arulesViz
# versions -- confirm "type" is still honoured by the installed release.
plot(rules, method = "graph", control = list(type = "items"))