##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## Loading required package: lattice
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Gender Nationality Age Food
## Female:165 Indian :241 Min. : 8.00 Traditional food:238
## Male :119 Malaysian: 10 1st Qu.:24.00 Western Food : 50
## NA's : 4 Indonesia: 7 Median :28.00
## Pakistani: 3 Mean :30.60
## Japan : 2 3rd Qu.:36.25
## Maldivian: 2 Max. :80.00
## (Other) : 23
## Juice Dessert
## Carbonated drinks: 32 Maybe:122
## Fresh Juice :256 No : 52
## Yes :114
##
##
##
##
cat("Number of observations in the dataset:", nrow(data))
## Number of observations in the dataset: 288
# checking for missing values
colSums(is.na(data))
## Gender Nationality Age Food Juice Dessert
## 4 0 0 0 0 0
summary(data)
## Gender Nationality Age Food
## Female:165 Indian :241 Min. : 8.00 Traditional food:238
## Male :119 Malaysian: 10 1st Qu.:24.00 Western Food : 50
## NA's : 4 Indonesia: 7 Median :28.00
## Pakistani: 3 Mean :30.60
## Japan : 2 3rd Qu.:36.25
## Maldivian: 2 Max. :80.00
## (Other) : 23
## Juice Dessert
## Carbonated drinks: 32 Maybe:122
## Fresh Juice :256 No : 52
## Yes :114
##
##
##
##
library(dplyr)
item_frequencies <- data %>%
group_by(Food) %>%
summarize(Frequency = n()) %>%
arrange(desc(Frequency))
library(ggplot2)
ggplot(item_frequencies, aes(x = reorder(Food, -Frequency), y = Frequency)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title = "Item Frequency Plot",
x = "Food Items",
y = "Frequency") +
coord_flip()
item_frequencies_2 <- data %>%
group_by(Juice) %>%
summarize(Frequency = n()) %>%
arrange(desc(Frequency))
ggplot(item_frequencies_2, aes(x = reorder(Juice, -Frequency), y = Frequency)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title = "Item Frequency Plot",
x = "Juice Items",
y = "Frequency") +
coord_flip()
item_frequencies_3 <- data %>%
group_by(Dessert) %>%
summarize(Frequency = n()) %>%
arrange(desc(Frequency))
ggplot(item_frequencies_3 , aes(x = reorder(Dessert, -Frequency), y = Frequency)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title = "Item Frequency Plot",
x = "Juice Items",
y = "Frequency") +
coord_flip()
# Convert the relevant columns into a transaction format
transactions <- as(split(data$Gender, data$Age, data$Food, data$Juice, data$Dessert), "transactions")
## Warning in asMethod(object): removing duplicated items in transactions
# Apriori algorithm
rules <- apriori(transactions)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 4
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2 item(s), 49 transaction(s)] done [0.00s].
## sorting and recoding items ... [2 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [1 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Inspect the results
inspect(rules)
## lhs rhs support confidence coverage lift count
## [1] {} => {Female} 0.8367347 0.8367347 1 1 41
rules<-apriori(data , parameter=list(supp=0.1, conf=0.65))
## Warning: Column(s) 3 not logical or factor. Applying default discretization
## (see '? discretizeDF').
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.65 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 28
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[39 item(s), 288 transaction(s)] done [0.00s].
## sorting and recoding items ... [13 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [278 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
set.seed(240)
# Ploting the rules using the graph method
plot(rules, method = "graph", measure = "support", shading = "lift", main = "Graph of Rules")
## Warning: Unknown control parameters: main
## Available control parameters (with default values):
## layout = stress
## circular = FALSE
## ggraphdots = NULL
## edges = <environment>
## nodes = <environment>
## nodetext = <environment>
## colors = c("#EE0000FF", "#EEEEEEFF")
## engine = ggplot2
## max = 100
## verbose = FALSE
## Warning: Too many rules supplied. Only plotting the best 100 using 'lift'
## (change control parameter max if needed).
rules.by.conf<-sort(rules, by="confidence", decreasing=TRUE)
inspect(head(rules.by.conf))
## lhs rhs support confidence coverage lift count
## [1] {Dessert=No} => {Juice=Fresh Juice} 0.1805556 1 0.1805556 1.125000 52
## [2] {Age=[33,80],
## Dessert=No} => {Nationality=Indian} 0.1076389 1 0.1076389 1.195021 31
## [3] {Age=[33,80],
## Dessert=No} => {Juice=Fresh Juice} 0.1076389 1 0.1076389 1.125000 31
## [4] {Gender=Female,
## Dessert=No} => {Juice=Fresh Juice} 0.1215278 1 0.1215278 1.125000 35
## [5] {Food=Traditional food,
## Dessert=No} => {Juice=Fresh Juice} 0.1701389 1 0.1701389 1.125000 49
## [6] {Nationality=Indian,
## Dessert=No} => {Juice=Fresh Juice} 0.1631944 1 0.1631944 1.125000 47
rules.by.lift<-sort(rules, by="lift", decreasing=TRUE)
inspect(head(rules.by.lift))
## lhs rhs support confidence coverage lift count
## [1] {Nationality=Indian,
## Food=Traditional food,
## Dessert=No} => {Age=[33,80]} 0.1041667 0.6666667 0.1562500 1.900990 30
## [2] {Nationality=Indian,
## Food=Traditional food,
## Juice=Fresh Juice,
## Dessert=No} => {Age=[33,80]} 0.1041667 0.6666667 0.1562500 1.900990 30
## [3] {Nationality=Indian,
## Dessert=No} => {Age=[33,80]} 0.1076389 0.6595745 0.1631944 1.880767 31
## [4] {Nationality=Indian,
## Juice=Fresh Juice,
## Dessert=No} => {Age=[33,80]} 0.1076389 0.6595745 0.1631944 1.880767 31
## [5] {Age=[33,80],
## Food=Traditional food,
## Dessert=Maybe} => {Gender=Female} 0.1076389 0.8611111 0.1250000 1.503030 31
## [6] {Nationality=Indian,
## Age=[33,80],
## Food=Traditional food,
## Dessert=Maybe} => {Gender=Female} 0.1041667 0.8571429 0.1215278 1.496104 30
rules.by.supp<-sort(rules, by="support", decreasing=TRUE)
inspect(head(rules.by.supp))
## lhs rhs support confidence
## [1] {} => {Juice=Fresh Juice} 0.8888889 0.8888889
## [2] {} => {Nationality=Indian} 0.8368056 0.8368056
## [3] {} => {Food=Traditional food} 0.8263889 0.8263889
## [4] {Food=Traditional food} => {Juice=Fresh Juice} 0.7638889 0.9243697
## [5] {Juice=Fresh Juice} => {Food=Traditional food} 0.7638889 0.8593750
## [6] {Nationality=Indian} => {Juice=Fresh Juice} 0.7534722 0.9004149
## coverage lift count
## [1] 1.0000000 1.000000 256
## [2] 1.0000000 1.000000 241
## [3] 1.0000000 1.000000 238
## [4] 0.8263889 1.039916 220
## [5] 0.8888889 1.039916 220
## [6] 0.8368056 1.012967 217
reviewing the results from confidence, support and lift, I underestood the most meaningful rule are Rules with high lift, support and confidence values. they are the most meaningful because they represent strong associations that are not due to random chance.
plot(rules, method="paracoord", control=list(reorder=TRUE))
I wanted to focus on the strongest association rules between transactions, so I decided to filter the transactions by lift and then by support.
# Sorting the rules by lift in descending order
sorted_rules <- sort(rules, by = "lift", decreasing = TRUE)
# Selecting the top 10 rules with the highest lift
meaningful_rules <- head(sorted_rules, 10)
# Ploting the rules using parallel coordinates
plot(meaningful_rules, method = "paracoord", control = list(reorder = TRUE))
Dessert=No is frequently associated with Juice=Fresh Juice. This means that people who do not prefer dessert often prefer fresh juice.
Food=Traditional food is often associated with Nationality=Indian. This suggests that individuals who prefer traditional food are likely to be Indian.
Age=[33,80] is connected to Gender=Female in some rules, indicating that older individuals in this age group are more likely to be female.
# Sorting the rules by lift in descending order
sorted_rules_1 <- sort(rules, by = "support", decreasing = TRUE)
# Selecting the top 10 rules with the highest lift
meaningful_rules_1 <- head(sorted_rules_1, 10)
# Ploting the rules using parallel coordinates
plot(meaningful_rules_1, method = "paracoord", control = list(reorder = TRUE))
From plot above I underestood:
Food=Traditional food is frequently associated with Juice=Fresh Juice. This means that people who prefer traditional food often prefer fresh juice.
Nationality=Indian is often associated with Food=Traditional food. This suggests that individuals who are Indian are likely to prefer traditional food.
The strong connections between these items indicate that they frequently appear together in the dataset.
checking genders and the association rules in food preference among them.
#data =na.omit(data)
pie(sort(table(data$Gender)),main = "Respondent's gender")
here we can see the number of women in the dataset is more that men.
# View the available items
rules.f <- apriori(
data = transactions,
parameter = list(supp = 0.1, conf = 0.5),
appearance = list(default = "lhs", rhs = "Gender=Female"),
control = list(verbose = F)
)
rules.f.byconf<-sort(rules.f, by="support", decreasing=TRUE)
inspect(head(rules.f.byconf))
## lhs rhs support confidence coverage lift count
## [1] {} => {Gender=Female} 0.5729167 0.5729167 1.0000000 1.000000 165
## [2] {Juice=Fresh Juice} => {Gender=Female} 0.5277778 0.5937500 0.8888889 1.036364 152
## [3] {Nationality=Indian} => {Gender=Female} 0.5173611 0.6182573 0.8368056 1.079140 149
## [4] {Food=Traditional food} => {Gender=Female} 0.5000000 0.6050420 0.8263889 1.056073 144
## [5] {Food=Traditional food,
## Juice=Fresh Juice} => {Gender=Female} 0.4756944 0.6227273 0.7638889 1.086942 137
## [6] {Nationality=Indian,
## Juice=Fresh Juice} => {Gender=Female} 0.4756944 0.6313364 0.7534722 1.101969 137
Females in the Dataset: Females make up 57.29% of the dataset (Rule 1).
Associations with Females: Females are slightly more likely to prefer fresh juice (Rule 2) and traditional food (Rule 4). Females are moderately more likely to be Indian (Rule 3). The strongest associations are when females are both Indian and prefer fresh juice (Rule 6) or when they prefer both traditional food and fresh juice (Rule 5).
# Selecting the top 10 rules with the highest lift
meaningful_rules.f <- head(rules.f.byconf, 10)
plot(meaningful_rules.f, method="graph")
#####Females in dataset are more likely to: Prefer fresh juice and traditional food. Be of Indian nationality.
rules.m <- apriori(
data = transactions,
parameter = list(supp = 0.1, conf = 0.5),
appearance = list(default = "lhs", rhs = "Gender=Male"),
control = list(verbose = F)
)
rules.m.byconf<-sort(rules.m, by="support", decreasing=TRUE)
inspect(head(rules.m.byconf))
## lhs rhs support confidence coverage lift count
## [1] {Age=[8,25)} => {Gender=Male} 0.1840278 0.6022727 0.3055556 1.457601 53
## [2] {Age=[8,25),
## Juice=Fresh Juice} => {Gender=Male} 0.1388889 0.5555556 0.2500000 1.344538 40
## [3] {Age=[8,25),
## Food=Traditional food} => {Gender=Male} 0.1180556 0.5666667 0.2083333 1.371429 34
## [4] {Nationality=Indian,
## Age=[8,25)} => {Gender=Male} 0.1041667 0.5357143 0.1944444 1.296519 30
## [5] {Food=Western Food} => {Gender=Male} 0.1006944 0.5800000 0.1736111 1.403697 29
## [6] {Age=[8,25),
## Food=Traditional food,
## Juice=Fresh Juice} => {Gender=Male} 0.1006944 0.5370370 0.1875000 1.299720 29
from the result above we can underestand:
Age Group [8,25): Males are strongly associated with the age group [8,25) (Rule 1). This association becomes slightly weaker when combined with preferences like fresh juice (Rule 2) or traditional food (Rule 3).
Nationality and Age: Indian males in the age group [8,25) are moderately associated with being male (Rule 4).
Food Preferences: Males are strongly associated with preferring Western food (Rule 5). Males are moderately associated with preferring traditional food and fresh juice in the age group [8,25) (Rule 6).
meaningful_rules.m <- head(rules.m.byconf, 10)
plot(meaningful_rules.m , method="graph")
Be in the age group [8,25). Prefer Western food. Be Indian and in the age group [8,25).
# Additional analysis: Rules focusing on traditional food in the RHS
rules.traditional <- apriori(data = transactions,
parameter = list(supp = 0.001, conf = 0.08),
appearance = list(default = "lhs", rhs = "Food=Traditional food"),
control = list(verbose = FALSE))
rules.traditional.byconf <- sort(rules.traditional, by = "confidence", decreasing = TRUE)
inspect(head(rules.traditional.byconf))
## lhs rhs support confidence
## [1] {Nationality=Muslim} => {Food=Traditional food} 0.003472222 1
## [2] {Nationality=Mauritian} => {Food=Traditional food} 0.003472222 1
## [3] {Nationality=China} => {Food=Traditional food} 0.003472222 1
## [4] {Nationality=Seychellois} => {Food=Traditional food} 0.003472222 1
## [5] {Nationality=Algerian} => {Food=Traditional food} 0.003472222 1
## [6] {Nationality=Nigerian} => {Food=Traditional food} 0.003472222 1
## coverage lift count
## [1] 0.003472222 1.210084 1
## [2] 0.003472222 1.210084 1
## [3] 0.003472222 1.210084 1
## [4] 0.003472222 1.210084 1
## [5] 0.003472222 1.210084 1
## [6] 0.003472222 1.210084 1
meaningful_rules.tf <- head(rules.traditional.byconf, 10)
plot(meaningful_rules.tf , method="graph")
Strong Association:
Individuals from specific nationalities (Muslim, Mauritian, Chinese, Seychellois, Algerian, and Nigerian) shows a perfect association with a preference for Traditional food.