Introduction

This study aims to investigate the relationships between different food preferences using association rule mining. Association rule mining is a data mining technique that discovers interesting relationships or dependencies among items within a dataset. By analyzing the “Food Preferences” dataset available on Kaggle, I aim to uncover hidden patterns and associations between different food categories.

necessary libraries

## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## Loading required package: lattice
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write

data cleaning and preparation

##     Gender       Nationality       Age                      Food    
##  Female:165   Indian   :241   Min.   : 8.00   Traditional food:238  
##  Male  :119   Malaysian: 10   1st Qu.:24.00   Western Food    : 50  
##  NA's  :  4   Indonesia:  7   Median :28.00                         
##               Pakistani:  3   Mean   :30.60                         
##               Japan    :  2   3rd Qu.:36.25                         
##               Maldivian:  2   Max.   :80.00                         
##               (Other)  : 23                                         
##                Juice      Dessert   
##  Carbonated drinks: 32   Maybe:122  
##  Fresh Juice      :256   No   : 52  
##                          Yes  :114  
##                                     
##                                     
##                                     
## 
cat("Number of observations in the dataset:", nrow(data))
## Number of observations in the dataset: 288
there was no mising values in this dataset
# checking for missing values
colSums(is.na(data))
##      Gender Nationality         Age        Food       Juice     Dessert 
##           4           0           0           0           0           0
summary(data)
##     Gender       Nationality       Age                      Food    
##  Female:165   Indian   :241   Min.   : 8.00   Traditional food:238  
##  Male  :119   Malaysian: 10   1st Qu.:24.00   Western Food    : 50  
##  NA's  :  4   Indonesia:  7   Median :28.00                         
##               Pakistani:  3   Mean   :30.60                         
##               Japan    :  2   3rd Qu.:36.25                         
##               Maldivian:  2   Max.   :80.00                         
##               (Other)  : 23                                         
##                Juice      Dessert   
##  Carbonated drinks: 32   Maybe:122  
##  Fresh Juice      :256   No   : 52  
##                          Yes  :114  
##                                     
##                                     
##                                     
## 

histogram for transactions

item frequency

library(dplyr)

item_frequencies <- data %>%
  group_by(Food) %>%
  summarize(Frequency = n()) %>%
  arrange(desc(Frequency))

library(ggplot2)

ggplot(item_frequencies, aes(x = reorder(Food, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Item Frequency Plot",
       x = "Food Items",
       y = "Frequency") +
  coord_flip()  

item_frequencies_2 <- data %>%
  group_by(Juice) %>%
  summarize(Frequency = n()) %>%
  arrange(desc(Frequency))

ggplot(item_frequencies_2, aes(x = reorder(Juice, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Item Frequency Plot",
       x = "Juice Items",
       y = "Frequency") +
  coord_flip()  

item_frequencies_3 <- data %>%
  group_by(Dessert) %>%
  summarize(Frequency = n()) %>%
  arrange(desc(Frequency))

ggplot(item_frequencies_3 , aes(x = reorder(Dessert, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Item Frequency Plot",
       x = "Juice Items",
       y = "Frequency") +
  coord_flip() 

The Apriori algorithm

# Convert the relevant columns into a transaction format
transactions <- as(split(data$Gender, data$Age, data$Food, data$Juice, data$Dessert), "transactions")
## Warning in asMethod(object): removing duplicated items in transactions
#  Apriori algorithm
rules <- apriori(transactions)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 4 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2 item(s), 49 transaction(s)] done [0.00s].
## sorting and recoding items ... [2 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [1 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the results
inspect(rules)
##     lhs    rhs      support   confidence coverage lift count
## [1] {}  => {Female} 0.8367347 0.8367347  1        1    41
rules<-apriori(data , parameter=list(supp=0.1, conf=0.65)) 
## Warning: Column(s) 3 not logical or factor. Applying default discretization
## (see '? discretizeDF').
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.65    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 28 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[39 item(s), 288 transaction(s)] done [0.00s].
## sorting and recoding items ... [13 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [278 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

visualize the association rules generated by the Apriori

set.seed(240)

# Ploting the rules using the graph method
plot(rules, method = "graph", measure = "support", shading = "lift", main = "Graph of Rules")
## Warning: Unknown control parameters: main
## Available control parameters (with default values):
## layout    =  stress
## circular  =  FALSE
## ggraphdots    =  NULL
## edges     =  <environment>
## nodes     =  <environment>
## nodetext  =  <environment>
## colors    =  c("#EE0000FF", "#EEEEEEFF")
## engine    =  ggplot2
## max   =  100
## verbose   =  FALSE
## Warning: Too many rules supplied. Only plotting the best 100 using 'lift'
## (change control parameter max if needed).

confidence

rules.by.conf<-sort(rules, by="confidence", decreasing=TRUE) 
inspect(head(rules.by.conf))
##     lhs                         rhs                    support confidence  coverage     lift count
## [1] {Dessert=No}             => {Juice=Fresh Juice}  0.1805556          1 0.1805556 1.125000    52
## [2] {Age=[33,80],                                                                                 
##      Dessert=No}             => {Nationality=Indian} 0.1076389          1 0.1076389 1.195021    31
## [3] {Age=[33,80],                                                                                 
##      Dessert=No}             => {Juice=Fresh Juice}  0.1076389          1 0.1076389 1.125000    31
## [4] {Gender=Female,                                                                               
##      Dessert=No}             => {Juice=Fresh Juice}  0.1215278          1 0.1215278 1.125000    35
## [5] {Food=Traditional food,                                                                       
##      Dessert=No}             => {Juice=Fresh Juice}  0.1701389          1 0.1701389 1.125000    49
## [6] {Nationality=Indian,                                                                          
##      Dessert=No}             => {Juice=Fresh Juice}  0.1631944          1 0.1631944 1.125000    47
from data above I underestood, the maximum confidence value in your dataset is 1. This means that for the rules listed, the confidence is 100%, indicating that whenever the left-hand side (LHS) of the rule occurs, the right-hand side (RHS) also occurs with certainty

Lift

rules.by.lift<-sort(rules, by="lift", decreasing=TRUE) 
inspect(head(rules.by.lift))
##     lhs                         rhs               support confidence  coverage     lift count
## [1] {Nationality=Indian,                                                                     
##      Food=Traditional food,                                                                  
##      Dessert=No}             => {Age=[33,80]}   0.1041667  0.6666667 0.1562500 1.900990    30
## [2] {Nationality=Indian,                                                                     
##      Food=Traditional food,                                                                  
##      Juice=Fresh Juice,                                                                      
##      Dessert=No}             => {Age=[33,80]}   0.1041667  0.6666667 0.1562500 1.900990    30
## [3] {Nationality=Indian,                                                                     
##      Dessert=No}             => {Age=[33,80]}   0.1076389  0.6595745 0.1631944 1.880767    31
## [4] {Nationality=Indian,                                                                     
##      Juice=Fresh Juice,                                                                      
##      Dessert=No}             => {Age=[33,80]}   0.1076389  0.6595745 0.1631944 1.880767    31
## [5] {Age=[33,80],                                                                            
##      Food=Traditional food,                                                                  
##      Dessert=Maybe}          => {Gender=Female} 0.1076389  0.8611111 0.1250000 1.503030    31
## [6] {Nationality=Indian,                                                                     
##      Age=[33,80],                                                                            
##      Food=Traditional food,                                                                  
##      Dessert=Maybe}          => {Gender=Female} 0.1041667  0.8571429 0.1215278 1.496104    30

Support

rules.by.supp<-sort(rules, by="support", decreasing=TRUE) 
inspect(head(rules.by.supp))
##     lhs                        rhs                     support   confidence
## [1] {}                      => {Juice=Fresh Juice}     0.8888889 0.8888889 
## [2] {}                      => {Nationality=Indian}    0.8368056 0.8368056 
## [3] {}                      => {Food=Traditional food} 0.8263889 0.8263889 
## [4] {Food=Traditional food} => {Juice=Fresh Juice}     0.7638889 0.9243697 
## [5] {Juice=Fresh Juice}     => {Food=Traditional food} 0.7638889 0.8593750 
## [6] {Nationality=Indian}    => {Juice=Fresh Juice}     0.7534722 0.9004149 
##     coverage  lift     count
## [1] 1.0000000 1.000000 256  
## [2] 1.0000000 1.000000 241  
## [3] 1.0000000 1.000000 238  
## [4] 0.8263889 1.039916 220  
## [5] 0.8888889 1.039916 220  
## [6] 0.8368056 1.012967 217

reviewing the results from confidence, support and lift, I underestood the most meaningful rule are Rules with high lift, support and confidence values. they are the most meaningful because they represent strong associations that are not due to random chance.

Parallel plot

plot(rules, method="paracoord", control=list(reorder=TRUE))

I wanted to focus on the strongest association rules between transactions, so I decided to filter the transactions by lift and then by support.

meaningful rules by lift

# Sorting the rules by lift in descending order
sorted_rules <- sort(rules, by = "lift", decreasing = TRUE)

# Selecting the top 10 rules with the highest lift
meaningful_rules <- head(sorted_rules, 10)

# Ploting the rules using parallel coordinates
plot(meaningful_rules, method = "paracoord", control = list(reorder = TRUE))

From the plot above, I could underestand:

Dessert=No is frequently associated with Juice=Fresh Juice. This means that people who do not prefer dessert often prefer fresh juice.

Food=Traditional food is often associated with Nationality=Indian. This suggests that individuals who prefer traditional food are likely to be Indian.

Age=[33,80] is connected to Gender=Female in some rules, indicating that older individuals in this age group are more likely to be female.

meaningful rules by support

# Sorting the rules by lift in descending order
sorted_rules_1 <- sort(rules, by = "support", decreasing = TRUE)

# Selecting the top 10 rules with the highest lift
meaningful_rules_1 <- head(sorted_rules_1, 10)

# Ploting the rules using parallel coordinates
plot(meaningful_rules_1, method = "paracoord", control = list(reorder = TRUE))

From plot above I underestood:

Food=Traditional food is frequently associated with Juice=Fresh Juice. This means that people who prefer traditional food often prefer fresh juice.

Nationality=Indian is often associated with Food=Traditional food. This suggests that individuals who are Indian are likely to prefer traditional food.

The strong connections between these items indicate that they frequently appear together in the dataset.

Individual rule representation

checking genders and the association rules in food preference among them.

#data =na.omit(data)
pie(sort(table(data$Gender)),main = "Respondent's gender")

here we can see the number of women in the dataset is more that men.

Females

# View the available items
rules.f <- apriori(
    data = transactions,
    parameter = list(supp = 0.1, conf = 0.5),
    appearance = list(default = "lhs", rhs = "Gender=Female"),
    control = list(verbose = F)
)



rules.f.byconf<-sort(rules.f, by="support", decreasing=TRUE)

inspect(head(rules.f.byconf))
##     lhs                         rhs               support confidence  coverage     lift count
## [1] {}                       => {Gender=Female} 0.5729167  0.5729167 1.0000000 1.000000   165
## [2] {Juice=Fresh Juice}      => {Gender=Female} 0.5277778  0.5937500 0.8888889 1.036364   152
## [3] {Nationality=Indian}     => {Gender=Female} 0.5173611  0.6182573 0.8368056 1.079140   149
## [4] {Food=Traditional food}  => {Gender=Female} 0.5000000  0.6050420 0.8263889 1.056073   144
## [5] {Food=Traditional food,                                                                  
##      Juice=Fresh Juice}      => {Gender=Female} 0.4756944  0.6227273 0.7638889 1.086942   137
## [6] {Nationality=Indian,                                                                     
##      Juice=Fresh Juice}      => {Gender=Female} 0.4756944  0.6313364 0.7534722 1.101969   137
From the result above we can underestand:

Females in the Dataset: Females make up 57.29% of the dataset (Rule 1).

Associations with Females: Females are slightly more likely to prefer fresh juice (Rule 2) and traditional food (Rule 4). Females are moderately more likely to be Indian (Rule 3). The strongest associations are when females are both Indian and prefer fresh juice (Rule 6) or when they prefer both traditional food and fresh juice (Rule 5).

# Selecting the top 10 rules with the highest lift
meaningful_rules.f <- head(rules.f.byconf, 10)
plot(meaningful_rules.f, method="graph")

#####Females in dataset are more likely to: Prefer fresh juice and traditional food. Be of Indian nationality.

Males

rules.m <- apriori(
    data = transactions,
    parameter = list(supp = 0.1, conf = 0.5),
    appearance = list(default = "lhs", rhs = "Gender=Male"),
    control = list(verbose = F)
)

rules.m.byconf<-sort(rules.m, by="support", decreasing=TRUE)

inspect(head(rules.m.byconf))
##     lhs                         rhs             support confidence  coverage     lift count
## [1] {Age=[8,25)}             => {Gender=Male} 0.1840278  0.6022727 0.3055556 1.457601    53
## [2] {Age=[8,25),                                                                           
##      Juice=Fresh Juice}      => {Gender=Male} 0.1388889  0.5555556 0.2500000 1.344538    40
## [3] {Age=[8,25),                                                                           
##      Food=Traditional food}  => {Gender=Male} 0.1180556  0.5666667 0.2083333 1.371429    34
## [4] {Nationality=Indian,                                                                   
##      Age=[8,25)}             => {Gender=Male} 0.1041667  0.5357143 0.1944444 1.296519    30
## [5] {Food=Western Food}      => {Gender=Male} 0.1006944  0.5800000 0.1736111 1.403697    29
## [6] {Age=[8,25),                                                                           
##      Food=Traditional food,                                                                
##      Juice=Fresh Juice}      => {Gender=Male} 0.1006944  0.5370370 0.1875000 1.299720    29

from the result above we can underestand:

Age Group [8,25): Males are strongly associated with the age group [8,25) (Rule 1). This association becomes slightly weaker when combined with preferences like fresh juice (Rule 2) or traditional food (Rule 3).

Nationality and Age: Indian males in the age group [8,25) are moderately associated with being male (Rule 4).

Food Preferences: Males are strongly associated with preferring Western food (Rule 5). Males are moderately associated with preferring traditional food and fresh juice in the age group [8,25) (Rule 6).

meaningful_rules.m <- head(rules.m.byconf, 10)
plot(meaningful_rules.m , method="graph")

Males in dataset are more likely to:

Be in the age group [8,25). Prefer Western food. Be Indian and in the age group [8,25).

Rules focusing on traditional food

# Additional analysis: Rules focusing on traditional food in the RHS
rules.traditional <- apriori(data = transactions, 
                             parameter = list(supp = 0.001, conf = 0.08), 
                             appearance = list(default = "lhs", rhs = "Food=Traditional food"), 
                             control = list(verbose = FALSE))
rules.traditional.byconf <- sort(rules.traditional, by = "confidence", decreasing = TRUE)
inspect(head(rules.traditional.byconf))
##     lhs                          rhs                     support     confidence
## [1] {Nationality=Muslim}      => {Food=Traditional food} 0.003472222 1         
## [2] {Nationality=Mauritian}   => {Food=Traditional food} 0.003472222 1         
## [3] {Nationality=China}       => {Food=Traditional food} 0.003472222 1         
## [4] {Nationality=Seychellois} => {Food=Traditional food} 0.003472222 1         
## [5] {Nationality=Algerian}    => {Food=Traditional food} 0.003472222 1         
## [6] {Nationality=Nigerian}    => {Food=Traditional food} 0.003472222 1         
##     coverage    lift     count
## [1] 0.003472222 1.210084 1    
## [2] 0.003472222 1.210084 1    
## [3] 0.003472222 1.210084 1    
## [4] 0.003472222 1.210084 1    
## [5] 0.003472222 1.210084 1    
## [6] 0.003472222 1.210084 1
meaningful_rules.tf <- head(rules.traditional.byconf, 10)
plot(meaningful_rules.tf , method="graph")

Summary Observations:

Strong Association:

Individuals from specific nationalities (Muslim, Mauritian, Chinese, Seychellois, Algerian, and Nigerian) shows a perfect association with a preference for Traditional food.