library(arules)
## Warning: package 'arules' was built under R version 3.5.3
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 3.5.3
## Loading required package: grid
# Dataset: AdultUCI Description
data("AdultUCI")
str(AdultUCI)
## 'data.frame': 48842 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : Ord.factor w/ 16 levels "Preschool"<"1st-4th"<..: 14 14 9 7 14 15 5 9 15 14 ...
## $ education-num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital-status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capital-gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital-loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours-per-week: int 40 13 40 40 40 40 16 45 50 40 ...
## $ native-country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
## $ income : Ord.factor w/ 2 levels "small"<"large": 1 1 1 1 1 1 1 2 2 2 ...
# Data Pre-processing: Discretization
var_to_discretize <- c("age", "hours-per-week")
AdultUCI$age_grp <- discretize(AdultUCI$age, method = "frequency", breaks = 3,
labels = c("low", "medium", "high"), order = T)
boxplot(age ~ age_grp, data = AdultUCI)

# Approach 1: Run Apriori Against Record Dataset Directly
rules_record <- apriori(AdultUCI[, sapply(AdultUCI, is.factor)],
parameter = list(support = 0.1, confidence = 0.5, minlen = 3))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.1 3
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 4884
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[104 item(s), 48842 transaction(s)] done [0.02s].
## sorting and recoding items ... [26 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [1021 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspect(head(rules_record, 5))
## lhs rhs support confidence lift count
## [1] {occupation=Exec-managerial,
## race=White} => {native-country=United-States} 0.1059539 0.9453782 1.053435 5175
## [2] {occupation=Exec-managerial,
## native-country=United-States} => {race=White} 0.1059539 0.9231181 1.079616 5175
## [3] {occupation=Craft-repair,
## sex=Male} => {race=White} 0.1076532 0.9082743 1.062256 5258
## [4] {occupation=Craft-repair,
## race=White} => {sex=Male} 0.1076532 0.9553052 1.429066 5258
## [5] {occupation=Craft-repair,
## sex=Male} => {native-country=United-States} 0.1068138 0.9011919 1.004198 5217
# Approach 2: Run Apriori Against Transactional Dataset
fac_var <- sapply(AdultUCI, is.factor)
adult <- as(AdultUCI[, fac_var], "transactions")
rules_transaction <- apriori(adult,
parameter = list(support = 0.1, confidence = 0.5, minlen = 3))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.1 3
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 4884
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[104 item(s), 48842 transaction(s)] done [0.02s].
## sorting and recoding items ... [26 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [1021 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Check and Visualize the Most Frequent Items
frequent_items <- eclat(adult, parameter = list(support = 0.7, minlen = 2))
## Eclat
##
## parameter specification:
## tidLists support minlen maxlen target ext
## FALSE 0.7 2 10 frequent itemsets FALSE
##
## algorithmic control:
## sparse sort verbose
## 7 -2 TRUE
##
## Absolute minimum support count: 34189
##
## create itemset ...
## set transactions ...[104 item(s), 48842 transaction(s)] done [0.02s].
## sorting and recoding items ... [2 item(s)] done [0.00s].
## creating bit matrix ... [2 row(s), 48842 column(s)] done [0.00s].
## writing ... [1 set(s)] done [0.00s].
## Creating S4 object ... done [0.00s].
inspect(head(frequent_items, 2))
## items support count
## [1] {race=White,native-country=United-States} 0.7881127 38493
itemFrequencyPlot(adult, topN = 10, type = "absolute", main = "Item frequency")

# Sort Association Rules by Performance Metrics
rules <- apriori(adult, parameter = list(support = 0.1, confidence = 0.5))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 4884
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[104 item(s), 48842 transaction(s)] done [0.02s].
## sorting and recoding items ... [26 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [1127 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
quality(head(rules, 3))
## support confidence lift count
## 1 0.5061218 0.5061218 1 24720
## 2 0.6684820 0.6684820 1 32650
## 3 0.6941976 0.6941976 1 33906
inspect(head(sort(rules, by = "lift", decreasing = T), 8))
## lhs rhs support confidence lift count
## [1] {marital-status=Never-married,
## native-country=United-States,
## age_grp=low} => {relationship=Own-child} 0.1123828 0.5414817 3.488596 5489
## [2] {marital-status=Never-married,
## race=White,
## age_grp=low} => {relationship=Own-child} 0.1029442 0.5327964 3.432640 5028
## [3] {marital-status=Never-married,
## age_grp=low} => {relationship=Own-child} 0.1209410 0.5233918 3.372049 5907
## [4] {relationship=Own-child,
## race=White,
## age_grp=low} => {marital-status=Never-married} 0.1029442 0.9501134 2.879285 5028
## [5] {relationship=Own-child,
## native-country=United-States,
## age_grp=low} => {marital-status=Never-married} 0.1123828 0.9498183 2.878391 5489
## [6] {relationship=Own-child,
## age_grp=low} => {marital-status=Never-married} 0.1209410 0.9470899 2.870123 5907
## [7] {marital-status=Never-married,
## relationship=Own-child,
## race=White} => {age_grp=low} 0.1029442 0.8811777 2.725162 5028
## [8] {relationship=Own-child,
## race=White,
## native-country=United-States} => {marital-status=Never-married} 0.1106630 0.8976914 2.720422 5405
# Remove Redundant Rules
subset_rules <- which(colSums(is.subset(rules, rules)) > 1)
rules <- sort(rules[-subset_rules], by = "lift", descreasing = T)
inspect(head(rules, 5))
## lhs rhs support confidence lift count
## [1] {relationship=Own-child} => {marital-status=Never-married} 0.1382007 0.8903839 2.698277 6750
## [2] {relationship=Own-child} => {age_grp=low} 0.1276975 0.8227147 2.544357 6237
## [3] {income=large} => {relationship=Husband} 0.1211662 0.7547507 1.869727 5918
## [4] {income=large} => {marital-status=Married-civ-spouse} 0.1370132 0.8534626 1.862676 6692
## [5] {relationship=Not-in-family} => {marital-status=Never-married} 0.1456533 0.5653660 1.713322 7114
# Use Association Rule Mining as a Supervised Learning Method
rules <- apriori(data = adult, parameter = list(supp = 0.01, conf = 0.5),
appearance = list(default = "lhs", rhs = c("income=small", "income=large")),
control = list(verbose = F))
inspect(head(sort(rules, by = "lift", descreasing = T), 3))
## lhs rhs support confidence lift count
## [1] {education=Bachelors,
## occupation=Exec-managerial,
## relationship=Husband} => {income=large} 0.01250973 0.5431111 3.383068 611
## [2] {education=Bachelors,
## marital-status=Married-civ-spouse,
## occupation=Exec-managerial,
## relationship=Husband} => {income=large} 0.01250973 0.5431111 3.383068 611
## [3] {education=Bachelors,
## occupation=Exec-managerial,
## relationship=Husband,
## sex=Male} => {income=large} 0.01250973 0.5431111 3.383068 611
# Plot Association Rules on Selected Metrics Dimensions
plot(rules, measure = c("support", "lift"), shading = "confidence")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
