arules has Apriori and ECLAT. arulesViz has method to visualize rules.
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
Let’s test a minimal case.
items.df = data.frame(item1=c(T,F,F,T,T,F,T,T,T), item2=c(T,T,T,T,F,T,F,T,T), item3=c(F,F,T,F,T,T,T,T,T),item4=c(F,T,F,T,F,F,F,F,F), item5=c(T,F,F,F,F,F,F,T,F))
str(items.df)
## 'data.frame': 9 obs. of 5 variables:
## $ item1: logi TRUE FALSE FALSE TRUE TRUE FALSE ...
## $ item2: logi TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ item3: logi FALSE FALSE TRUE FALSE TRUE TRUE ...
## $ item4: logi FALSE TRUE FALSE TRUE FALSE FALSE ...
## $ item5: logi TRUE FALSE FALSE FALSE FALSE FALSE ...
rules <- apriori(items.df,parameter = list(minlen=3, supp=2/9, conf=0.1))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support
## 0.1 0.1 1 none FALSE TRUE 5 0.2222222
## minlen maxlen target ext
## 3 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 2
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[5 item(s), 9 transaction(s)] done [0.00s].
## sorting and recoding items ... [5 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [6 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspect(rules)
## lhs rhs support confidence lift
## [1] {item1,item5} => {item2} 0.2222222 1.0 1.2857143
## [2] {item2,item5} => {item1} 0.2222222 1.0 1.5000000
## [3] {item1,item2} => {item5} 0.2222222 0.5 2.2500000
## [4] {item1,item3} => {item2} 0.2222222 0.5 0.6428571
## [5] {item2,item3} => {item1} 0.2222222 0.5 0.7500000
## [6] {item1,item2} => {item3} 0.2222222 0.5 0.7500000
Download the dataset, if it is not on the disk. Load the titanic dataset.
if (!file.exists("titanic.raw.rdata")) {
library(httr)
resp <- GET("http://www.rdatamining.com/data/titanic.raw.rdata")
writeBin(content(resp, 'raw'), "titanic.raw.rdata")
}
load("titanic.raw.rdata")
Use str and head to inspect the data frame.
str(titanic.raw)
## 'data.frame': 2201 obs. of 4 variables:
## $ Class : Factor w/ 4 levels "1st","2nd","3rd",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Age : Factor w/ 2 levels "Adult","Child": 2 2 2 2 2 2 2 2 2 2 ...
## $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
head(titanic.raw)
## Class Sex Age Survived
## 1 3rd Male Child No
## 2 3rd Male Child No
## 3 3rd Male Child No
## 4 3rd Male Child No
## 5 3rd Male Child No
## 6 3rd Male Child No
Train the rules with custom parameters, and use inspect to inspect the rules.
rules <- apriori(titanic.raw, parameter = list(minlen=2, supp=0.005, conf=0.8), appearance = list(rhs=c("Survived=No", "Survived=Yes"), default="lhs"), control = list(verbose=F))
rules.sorted <- sort(rules, by="lift")
inspect(rules.sorted)
## lhs rhs support
## [1] {Class=2nd,Age=Child} => {Survived=Yes} 0.010904134
## [2] {Class=2nd,Sex=Female,Age=Child} => {Survived=Yes} 0.005906406
## [3] {Class=1st,Sex=Female} => {Survived=Yes} 0.064061790
## [4] {Class=1st,Sex=Female,Age=Adult} => {Survived=Yes} 0.063607451
## [5] {Class=2nd,Sex=Female} => {Survived=Yes} 0.042253521
## [6] {Class=Crew,Sex=Female} => {Survived=Yes} 0.009086779
## [7] {Class=Crew,Sex=Female,Age=Adult} => {Survived=Yes} 0.009086779
## [8] {Class=2nd,Sex=Female,Age=Adult} => {Survived=Yes} 0.036347115
## [9] {Class=2nd,Sex=Male,Age=Adult} => {Survived=No} 0.069968196
## [10] {Class=2nd,Sex=Male} => {Survived=No} 0.069968196
## [11] {Class=3rd,Sex=Male,Age=Adult} => {Survived=No} 0.175829169
## [12] {Class=3rd,Sex=Male} => {Survived=No} 0.191731031
## confidence lift
## [1] 1.0000000 3.095640
## [2] 1.0000000 3.095640
## [3] 0.9724138 3.010243
## [4] 0.9722222 3.009650
## [5] 0.8773585 2.715986
## [6] 0.8695652 2.691861
## [7] 0.8695652 2.691861
## [8] 0.8602151 2.662916
## [9] 0.9166667 1.354083
## [10] 0.8603352 1.270871
## [11] 0.8376623 1.237379
## [12] 0.8274510 1.222295
There are rules that are subset of other rules. They can be pruned.
subset.matrix <- is.subset(rules.sorted,rules.sorted)
subset.matrix
## 12 x 12 sparse Matrix of class "ngCMatrix"
## [[ suppressing 12 column names '{Class=2nd,Age=Child,Survived=Yes}', '{Class=2nd,Sex=Female,Age=Child,Survived=Yes}', '{Class=1st,Sex=Female,Survived=Yes}' ... ]]
##
## {Class=2nd,Age=Child,Survived=Yes} | | . . . . . . . . . .
## {Class=2nd,Sex=Female,Age=Child,Survived=Yes} . | . . . . . . . . . .
## {Class=1st,Sex=Female,Survived=Yes} . . | | . . . . . . . .
## {Class=1st,Sex=Female,Age=Adult,Survived=Yes} . . . | . . . . . . . .
## {Class=2nd,Sex=Female,Survived=Yes} . | . . | . . | . . . .
## {Class=Crew,Sex=Female,Survived=Yes} . . . . . | | . . . . .
## {Class=Crew,Sex=Female,Age=Adult,Survived=Yes} . . . . . . | . . . . .
## {Class=2nd,Sex=Female,Age=Adult,Survived=Yes} . . . . . . . | . . . .
## {Class=2nd,Sex=Male,Age=Adult,Survived=No} . . . . . . . . | . . .
## {Class=2nd,Sex=Male,Survived=No} . . . . . . . . | | . .
## {Class=3rd,Sex=Male,Age=Adult,Survived=No} . . . . . . . . . . | .
## {Class=3rd,Sex=Male,Survived=No} . . . . . . . . . . | |
subset.matrix[lower.tri(subset.matrix, diag=T)] <- F
subset.matrix
## 12 x 12 sparse Matrix of class "ngCMatrix"
## [[ suppressing 12 column names '{Class=2nd,Age=Child,Survived=Yes}', '{Class=2nd,Sex=Female,Age=Child,Survived=Yes}', '{Class=1st,Sex=Female,Survived=Yes}' ... ]]
##
## {Class=2nd,Age=Child,Survived=Yes} . | . . . . . . . . . .
## {Class=2nd,Sex=Female,Age=Child,Survived=Yes} . . . . . . . . . . . .
## {Class=1st,Sex=Female,Survived=Yes} . . . | . . . . . . . .
## {Class=1st,Sex=Female,Age=Adult,Survived=Yes} . . . . . . . . . . . .
## {Class=2nd,Sex=Female,Survived=Yes} . . . . . . . | . . . .
## {Class=Crew,Sex=Female,Survived=Yes} . . . . . . | . . . . .
## {Class=Crew,Sex=Female,Age=Adult,Survived=Yes} . . . . . . . . . . . .
## {Class=2nd,Sex=Female,Age=Adult,Survived=Yes} . . . . . . . . . . . .
## {Class=2nd,Sex=Male,Age=Adult,Survived=No} . . . . . . . . . . . .
## {Class=2nd,Sex=Male,Survived=No} . . . . . . . . . . . .
## {Class=3rd,Sex=Male,Age=Adult,Survived=No} . . . . . . . . . . . .
## {Class=3rd,Sex=Male,Survived=No} . . . . . . . . . . . .
redundant <- apply(subset.matrix, 2, any)
redundant
## {Class=2nd,Age=Child,Survived=Yes}
## FALSE
## {Class=2nd,Sex=Female,Age=Child,Survived=Yes}
## TRUE
## {Class=1st,Sex=Female,Survived=Yes}
## FALSE
## {Class=1st,Sex=Female,Age=Adult,Survived=Yes}
## TRUE
## {Class=2nd,Sex=Female,Survived=Yes}
## FALSE
## {Class=Crew,Sex=Female,Survived=Yes}
## FALSE
## {Class=Crew,Sex=Female,Age=Adult,Survived=Yes}
## TRUE
## {Class=2nd,Sex=Female,Age=Adult,Survived=Yes}
## TRUE
## {Class=2nd,Sex=Male,Age=Adult,Survived=No}
## FALSE
## {Class=2nd,Sex=Male,Survived=No}
## FALSE
## {Class=3rd,Sex=Male,Age=Adult,Survived=No}
## FALSE
## {Class=3rd,Sex=Male,Survived=No}
## FALSE
rules.pruned <- rules.sorted[!redundant]
inspect(rules.pruned)
## lhs rhs support
## [1] {Class=2nd,Age=Child} => {Survived=Yes} 0.010904134
## [2] {Class=1st,Sex=Female} => {Survived=Yes} 0.064061790
## [3] {Class=2nd,Sex=Female} => {Survived=Yes} 0.042253521
## [4] {Class=Crew,Sex=Female} => {Survived=Yes} 0.009086779
## [5] {Class=2nd,Sex=Male,Age=Adult} => {Survived=No} 0.069968196
## [6] {Class=2nd,Sex=Male} => {Survived=No} 0.069968196
## [7] {Class=3rd,Sex=Male,Age=Adult} => {Survived=No} 0.175829169
## [8] {Class=3rd,Sex=Male} => {Survived=No} 0.191731031
## confidence lift
## [1] 1.0000000 3.095640
## [2] 0.9724138 3.010243
## [3] 0.8773585 2.715986
## [4] 0.8695652 2.691861
## [5] 0.9166667 1.354083
## [6] 0.8603352 1.270871
## [7] 0.8376623 1.237379
## [8] 0.8274510 1.222295
interestMeasure(rules.pruned, c("support", "chiSquare", "confidence", "conviction", "cosine", "coverage", "leverage", "lift", "oddsRatio"), titanic.raw)
## support chiSquared confidence conviction cosine coverage
## 1 0.010904134 0.09241223 1.0000000 NA 0.1837261 0.01090413
## 2 0.064061790 0.54398300 0.9724138 24.539982 0.4391373 0.06587915
## 3 0.042253521 0.28437580 0.8773585 5.519869 0.3387624 0.04815993
## 4 0.009086779 0.05769558 0.8695652 5.190065 0.1563980 0.01044980
## 5 0.069968196 0.08684767 0.9166667 3.876420 0.3078031 0.07632894
## 6 0.069968196 0.05444694 0.8603352 2.312930 0.2981955 0.08132667
## 7 0.175829169 0.12548849 0.8376623 1.989896 0.4664411 0.20990459
## 8 0.191731031 0.12492886 0.8274510 1.872135 0.4840990 0.23171286
## leverage lift oddsRatio
## 1 0.007381718 3.095640 NA
## 2 0.042780521 3.010243 91.897368
## 3 0.026696180 2.715986 17.097461
## 4 0.005711129 2.691861 14.346358
## 5 0.018296173 1.354083 5.738772
## 6 0.014912886 1.270871 3.162994
## 7 0.033731105 1.237379 2.975304
## 8 0.034869533 1.222295 2.797348
library(arulesViz)
## Loading required package: grid
plot(rules.pruned)
plot(rules.pruned, method="grouped")
plot(rules.pruned, method="graph")
data("Adult")
str(Adult)
## Formal class 'transactions' [package "arules"] with 3 slots
## ..@ data :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
## .. .. ..@ i : int [1:612200] 1 10 25 32 35 50 59 61 63 65 ...
## .. .. ..@ p : int [1:48843] 0 13 26 39 52 65 78 91 104 117 ...
## .. .. ..@ Dim : int [1:2] 115 48842
## .. .. ..@ Dimnames:List of 2
## .. .. .. ..$ : NULL
## .. .. .. ..$ : NULL
## .. .. ..@ factors : list()
## ..@ itemInfo :'data.frame': 115 obs. of 3 variables:
## .. ..$ labels : chr [1:115] "age=Young" "age=Middle-aged" "age=Senior" "age=Old" ...
## .. ..$ variables: Factor w/ 13 levels "age","capital-gain",..: 1 1 1 1 13 13 13 13 13 13 ...
## .. ..$ levels : Factor w/ 112 levels "10th","11th",..: 111 63 92 69 30 54 65 82 90 91 ...
## ..@ itemsetInfo:'data.frame': 48842 obs. of 1 variable:
## .. ..$ transactionID: chr [1:48842] "1" "2" "3" "4" ...
itemsets <- eclat(Adult, parameter = list(supp = 0.1, maxlen = 15))
## Eclat
##
## parameter specification:
## tidLists support minlen maxlen target ext
## FALSE 0.1 1 15 frequent itemsets FALSE
##
## algorithmic control:
## sparse sort verbose
## 7 -2 TRUE
##
## Absolute minimum support count: 4884
##
## create itemset ...
## set transactions ...[115 item(s), 48842 transaction(s)] done [0.04s].
## sorting and recoding items ... [31 item(s)] done [0.01s].
## creating bit matrix ... [31 row(s), 48842 column(s)] done [0.00s].
## writing ... [2616 set(s)] done [0.01s].
## Creating S4 object ... done [0.00s].
itemsets.sorted <- sort(itemsets)
inspect(itemsets.sorted[1:5])
## items support
## [1] {capital-loss=None} 0.9532779
## [2] {capital-gain=None} 0.9173867
## [3] {native-country=United-States} 0.8974243
## [4] {capital-gain=None,capital-loss=None} 0.8706646
## [5] {race=White} 0.8550428
head(as(items(itemsets.sorted), "list"))
## [[1]]
## [1] "capital-loss=None"
##
## [[2]]
## [1] "capital-gain=None"
##
## [[3]]
## [1] "native-country=United-States"
##
## [[4]]
## [1] "capital-gain=None" "capital-loss=None"
##
## [[5]]
## [1] "race=White"
##
## [[6]]
## [1] "capital-loss=None" "native-country=United-States"