Week-3-Notes.R

library(arules)

## Warning: package 'arules' was built under R version 3.5.3

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

library(arulesViz)

## Warning: package 'arulesViz' was built under R version 3.5.3

## Loading required package: grid

# Dataset: AdultUCI Description
data("AdultUCI")
str(AdultUCI)

## 'data.frame':    48842 obs. of  15 variables:
##  $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass     : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
##  $ fnlwgt        : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education     : Ord.factor w/ 16 levels "Preschool"<"1st-4th"<..: 14 14 9 7 14 15 5 9 15 14 ...
##  $ education-num : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital-status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
##  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
##  $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
##  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ capital-gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital-loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours-per-week: int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native-country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
##  $ income        : Ord.factor w/ 2 levels "small"<"large": 1 1 1 1 1 1 1 2 2 2 ...

# Data Pre-processing: Discretization
var_to_discretize <- c("age", "hours-per-week")
AdultUCI$age_grp <- discretize(AdultUCI$age, method = "frequency", breaks = 3, 
                               labels = c("low", "medium", "high"), order = T)
boxplot(age ~ age_grp, data = AdultUCI)

# Approach 1: Run Apriori Against Record Dataset Directly
rules_record <- apriori(AdultUCI[, sapply(AdultUCI, is.factor)], 
                        parameter = list(support = 0.1, confidence = 0.5, minlen = 3))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.1      3
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 4884 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[104 item(s), 48842 transaction(s)] done [0.02s].
## sorting and recoding items ... [26 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [1021 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

inspect(head(rules_record, 5))

##     lhs                               rhs                              support confidence     lift count
## [1] {occupation=Exec-managerial,                                                                        
##      race=White}                   => {native-country=United-States} 0.1059539  0.9453782 1.053435  5175
## [2] {occupation=Exec-managerial,                                                                        
##      native-country=United-States} => {race=White}                   0.1059539  0.9231181 1.079616  5175
## [3] {occupation=Craft-repair,                                                                           
##      sex=Male}                     => {race=White}                   0.1076532  0.9082743 1.062256  5258
## [4] {occupation=Craft-repair,                                                                           
##      race=White}                   => {sex=Male}                     0.1076532  0.9553052 1.429066  5258
## [5] {occupation=Craft-repair,                                                                           
##      sex=Male}                     => {native-country=United-States} 0.1068138  0.9011919 1.004198  5217

# Approach 2: Run Apriori Against Transactional Dataset
fac_var <- sapply(AdultUCI, is.factor)
adult <- as(AdultUCI[, fac_var], "transactions")
rules_transaction <- apriori(adult, 
                             parameter = list(support = 0.1, confidence = 0.5, minlen = 3))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.1      3
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 4884 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[104 item(s), 48842 transaction(s)] done [0.02s].
## sorting and recoding items ... [26 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [1021 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Check and Visualize the Most Frequent Items
frequent_items <- eclat(adult, parameter = list(support = 0.7, minlen = 2))

## Eclat
## 
## parameter specification:
##  tidLists support minlen maxlen            target   ext
##     FALSE     0.7      2     10 frequent itemsets FALSE
## 
## algorithmic control:
##  sparse sort verbose
##       7   -2    TRUE
## 
## Absolute minimum support count: 34189 
## 
## create itemset ... 
## set transactions ...[104 item(s), 48842 transaction(s)] done [0.02s].
## sorting and recoding items ... [2 item(s)] done [0.00s].
## creating bit matrix ... [2 row(s), 48842 column(s)] done [0.00s].
## writing  ... [1 set(s)] done [0.00s].
## Creating S4 object  ... done [0.00s].

inspect(head(frequent_items, 2))

##     items                                     support   count
## [1] {race=White,native-country=United-States} 0.7881127 38493

itemFrequencyPlot(adult, topN = 10, type = "absolute", main = "Item frequency")

# Sort Association Rules by Performance Metrics
rules <- apriori(adult, parameter = list(support = 0.1, confidence = 0.5))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 4884 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[104 item(s), 48842 transaction(s)] done [0.02s].
## sorting and recoding items ... [26 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [1127 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

quality(head(rules, 3))

##     support confidence lift count
## 1 0.5061218  0.5061218    1 24720
## 2 0.6684820  0.6684820    1 32650
## 3 0.6941976  0.6941976    1 33906

inspect(head(sort(rules, by = "lift", decreasing = T), 8))

##     lhs                               rhs                              support confidence     lift count
## [1] {marital-status=Never-married,                                                                      
##      native-country=United-States,                                                                      
##      age_grp=low}                  => {relationship=Own-child}       0.1123828  0.5414817 3.488596  5489
## [2] {marital-status=Never-married,                                                                      
##      race=White,                                                                                        
##      age_grp=low}                  => {relationship=Own-child}       0.1029442  0.5327964 3.432640  5028
## [3] {marital-status=Never-married,                                                                      
##      age_grp=low}                  => {relationship=Own-child}       0.1209410  0.5233918 3.372049  5907
## [4] {relationship=Own-child,                                                                            
##      race=White,                                                                                        
##      age_grp=low}                  => {marital-status=Never-married} 0.1029442  0.9501134 2.879285  5028
## [5] {relationship=Own-child,                                                                            
##      native-country=United-States,                                                                      
##      age_grp=low}                  => {marital-status=Never-married} 0.1123828  0.9498183 2.878391  5489
## [6] {relationship=Own-child,                                                                            
##      age_grp=low}                  => {marital-status=Never-married} 0.1209410  0.9470899 2.870123  5907
## [7] {marital-status=Never-married,                                                                      
##      relationship=Own-child,                                                                            
##      race=White}                   => {age_grp=low}                  0.1029442  0.8811777 2.725162  5028
## [8] {relationship=Own-child,                                                                            
##      race=White,                                                                                        
##      native-country=United-States} => {marital-status=Never-married} 0.1106630  0.8976914 2.720422  5405

# Remove Redundant Rules
subset_rules <- which(colSums(is.subset(rules, rules)) > 1)
rules <- sort(rules[-subset_rules], by = "lift", descreasing = T)
inspect(head(rules, 5))

##     lhs                             rhs                                   support confidence     lift count
## [1] {relationship=Own-child}     => {marital-status=Never-married}      0.1382007  0.8903839 2.698277  6750
## [2] {relationship=Own-child}     => {age_grp=low}                       0.1276975  0.8227147 2.544357  6237
## [3] {income=large}               => {relationship=Husband}              0.1211662  0.7547507 1.869727  5918
## [4] {income=large}               => {marital-status=Married-civ-spouse} 0.1370132  0.8534626 1.862676  6692
## [5] {relationship=Not-in-family} => {marital-status=Never-married}      0.1456533  0.5653660 1.713322  7114

# Use Association Rule Mining as a Supervised Learning Method
rules <- apriori(data = adult, parameter = list(supp = 0.01, conf = 0.5),
                 appearance = list(default = "lhs", rhs = c("income=small", "income=large")),
                 control = list(verbose = F))
inspect(head(sort(rules, by = "lift", descreasing = T), 3))

##     lhs                                    rhs               support confidence     lift count
## [1] {education=Bachelors,                                                                     
##      occupation=Exec-managerial,                                                              
##      relationship=Husband}              => {income=large} 0.01250973  0.5431111 3.383068   611
## [2] {education=Bachelors,                                                                     
##      marital-status=Married-civ-spouse,                                                       
##      occupation=Exec-managerial,                                                              
##      relationship=Husband}              => {income=large} 0.01250973  0.5431111 3.383068   611
## [3] {education=Bachelors,                                                                     
##      occupation=Exec-managerial,                                                              
##      relationship=Husband,                                                                    
##      sex=Male}                          => {income=large} 0.01250973  0.5431111 3.383068   611

# Plot Association Rules on Selected Metrics Dimensions
plot(rules, measure = c("support", "lift"), shading = "confidence")

## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

Week-3-Notes.R

shibo

2019-09-20