library(arules)
library(arulesViz)
# Load the AdultUCI data set, report its dimensions and show the first two rows.
data(AdultUCI)
dim(AdultUCI)
## [1] 48842 15
head(AdultUCI, n = 2)
## Drop the attributes that are not used for rule mining.
AdultUCI[c("fnlwgt", "education-num")] <- list(NULL)

## Map the metric attributes to ordered categories.
## Helper: bin `values` at `breaks` and relabel the bins as an ordered factor.
to_ordered_bins <- function(values, breaks, labels) {
  ordered(cut(values, breaks), labels = labels)
}

## Helper: cut points for a capital column -- one bin up to 0, and the
## positive values split at their median.
capital_breaks <- function(values) {
  c(-Inf, 0, median(values[values > 0]), Inf)
}

AdultUCI[["age"]] <- to_ordered_bins(
  AdultUCI[["age"]],
  c(15, 25, 45, 65, 100),
  c("Young", "Middle-aged", "Senior", "Old")
)
AdultUCI[["hours-per-week"]] <- to_ordered_bins(
  AdultUCI[["hours-per-week"]],
  c(0, 25, 40, 60, 168),
  c("Part-time", "Full-time", "Over-time", "Workaholic")
)
AdultUCI[["capital-gain"]] <- to_ordered_bins(
  AdultUCI[["capital-gain"]],
  capital_breaks(AdultUCI[["capital-gain"]]),
  c("None", "Low", "High")
)
AdultUCI[["capital-loss"]] <- to_ordered_bins(
  AdultUCI[["capital-loss"]],
  capital_breaks(AdultUCI[["capital-loss"]]),
  c("None", "Low", "High")
)

## Coerce the fully categorical data frame to a transactions object.
Adult <- as(AdultUCI, "transactions")
This is not a traditional market-basket data set, in which each column is an item coded 1 or 0 for purchased or not purchased. Here the columns are survey questions, and each answer category of a question becomes an item. For example, income has two categories, small and large, while hours per week has four: part-time, full-time, over-time, and workaholic. The number of items a variable contributes therefore depends on how many answer categories it has, and many variables have more than two.
# Plot the item frequencies, restricted to items with support of at least 10%.
itemFrequencyPlot(Adult, support = 0.10)

# Mine rules with at least two items, minimum support 1% and
# minimum confidence 60%.
m1 <- apriori(
  Adult,
  parameter = list(support = 0.01, confidence = 0.6, minlen = 2)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 488
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[115 item(s), 48842 transaction(s)] done [0.01s].
## sorting and recoding items ... [67 item(s)] done [0.02s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10 done [0.85s].
## writing ... [276437 rule(s)] done [0.06s].
## creating S4 object ... done [0.28s].
# Summarise the full rule set: size, rule-length distribution and quality measures.
summary(m1)
## set of 276437 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6 7 8 9 10
## 432 4981 22127 52669 75104 67198 38094 13244 2588
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 5.000 6.000 6.289 7.000 10.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.01001 Min. :0.6000 Min. : 0.7171 Min. : 489
## 1st Qu.:0.01253 1st Qu.:0.7691 1st Qu.: 1.0100 1st Qu.: 612
## Median :0.01701 Median :0.9051 Median : 1.0554 Median : 831
## Mean :0.02677 Mean :0.8600 Mean : 1.3110 Mean : 1308
## 3rd Qu.:0.02741 3rd Qu.:0.9542 3rd Qu.: 1.2980 3rd Qu.: 1339
## Max. :0.87066 Max. :1.0000 Max. :20.6826 Max. :42525
##
## mining info:
## data ntransactions support confidence
## Adult 48842 0.01 0.6
There are 276,437 rules at a minimum support of 1% and a minimum confidence of 60%.
# Rules with high income as the antecedent: the lhs is fixed to income=large
# and every other item may appear only on the rhs.
m1_high_income <- apriori(
  Adult,
  parameter = list(support = 0.01, confidence = 0.6, minlen = 2),
  appearance = list(default = "rhs", lhs = "income=large")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 488
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[115 item(s), 48842 transaction(s)] done [0.03s].
## sorting and recoding items ... [67 item(s)] done [0.00s].
## creating transaction tree ... done [0.05s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [8 rule(s)] done [0.00s].
## creating S4 object ... done [0.02s].
# Summarise the rules whose antecedent is income=large.
summary(m1_high_income)
## set of 8 rules
##
## rule length distribution (lhs + rhs):sizes
## 2
## 8
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
##
## summary of quality measures:
## support confidence lift count
## Min. :0.1016 Min. :0.6330 Min. :0.8569 Min. :4963
## 1st Qu.:0.1249 1st Qu.:0.7783 1st Qu.:0.9371 1st Qu.:6102
## Median :0.1367 Median :0.8515 Median :1.0403 Median :6677
## Mean :0.1325 Mean :0.8251 Mean :1.2248 Mean :6469
## 3rd Qu.:0.1450 3rd Qu.:0.9030 3rd Qu.:1.4189 3rd Qu.:7080
## Max. :0.1468 Max. :0.9146 Max. :1.8697 Max. :7171
##
## mining info:
## data ntransactions support confidence
## Adult 48842 0.01 0.6
For high income, there is a set of 8 rules.
# Rules with low income as the antecedent: the lhs is fixed to income=small
# and every other item may appear only on the rhs.
m1_low_income <- apriori(
  Adult,
  parameter = list(support = 0.01, confidence = 0.6, minlen = 2),
  appearance = list(default = "rhs", lhs = "income=small")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 488
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[115 item(s), 48842 transaction(s)] done [0.03s].
## sorting and recoding items ... [67 item(s)] done [0.00s].
## creating transaction tree ... done [0.03s].
## checking subsets of size 1 2 done [0.01s].
## writing ... [7 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Summarise the rules whose antecedent is income=small.
summary(m1_low_income)
## set of 7 rules
##
## rule length distribution (lhs + rhs):sizes
## 2
## 7
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
##
## summary of quality measures:
## support confidence lift count
## Min. :0.3097 Min. :0.6120 Min. :0.9155 Min. :15128
## 1st Qu.:0.3383 1st Qu.:0.6684 1st Qu.:0.9855 1st Qu.:16522
## Median :0.4238 Median :0.8373 Median :1.0174 Median :20699
## Mean :0.4052 Mean :0.8006 Mean :1.0057 Mean :19790
## 3rd Qu.:0.4677 3rd Qu.:0.9240 3rd Qu.:1.0389 3rd Qu.:22842
## Max. :0.4908 Max. :0.9698 Max. :1.0586 Max. :23974
##
## mining info:
## data ntransactions support confidence
## Adult 48842 0.01 0.6
For low income, there is a set of 7 rules.
# Check for redundant rules: tabulate the is.redundant() flags of the high-income rules.
summary(is.redundant(m1_high_income))
## Mode FALSE
## logical 8
# Keep only the non-redundant rules. The filtered set is assigned back so that
# the later inspect() and plot() calls work on the pruned rules; the bare name
# then prints the result.
m1_high_income <- m1_high_income[!is.redundant(m1_high_income)]
m1_high_income
## set of 8 rules
For high income, there are no redundant rules.
# Check for redundant rules: tabulate the is.redundant() flags of the low-income rules.
summary(is.redundant(m1_low_income))
## Mode FALSE
## logical 7
# Keep only the non-redundant rules. The filtered set is assigned back so that
# the later inspect() and plot() calls work on the pruned rules; the bare name
# then prints the result.
m1_low_income <- m1_low_income[!is.redundant(m1_low_income)]
m1_low_income
## set of 7 rules
For low income, there are no redundant rules.
# Show the three low-income rules with the highest lift.
top_low_income <- head(sort(m1_low_income, by = "lift"), n = 3)
inspect(top_low_income)
## lhs rhs support confidence
## [1] {income=small} => {hours-per-week=Full-time} 0.3134802 0.6193770
## [2] {income=small} => {capital-gain=None} 0.4849310 0.9581311
## [3] {income=small} => {workclass=Private} 0.3630687 0.7173544
## lift count
## [1] 1.058600 15311
## [2] 1.044414 23685
## [3] 1.033358 17733
For low income, {income=small} => {hours-per-week=Full-time} has the highest lift.
# Show the three high-income rules with the highest lift.
top_high_income <- head(sort(m1_high_income, by = "lift"), n = 3)
inspect(top_high_income)
## lhs rhs support confidence lift count
## [1] {income=large} => {relationship=Husband} 0.1211662 0.7547507 1.869727 5918
## [2] {income=large} => {marital-status=Married-civ-spouse} 0.1370132 0.8534626 1.862676 6692
## [3] {income=large} => {sex=Male} 0.1363990 0.8496365 1.270994 6662
For high income, {income=large} => {relationship=Husband} has the highest lift.
# Graph-based visualisation of the low-income rules.
plot(m1_low_income, method = "graph")
High Income
# Graph-based visualisation of the high-income rules.
plot(m1_high_income, method = "graph")