This analysis explores association rules in the European Social
Survey (Round 11) data, with a focus on predicting feelings about
household income (hincfel) based on other sociodemographic
variables.
# Load the data
data <- read.csv("/Users/bayu/Desktop/STUDY/1st Year/Unsupervised Learning/Homework/Association Rules/ESS11e04_1/ESS11e04_1.csv", stringsAsFactors = FALSE)
# Display dataset dimensions
cat("Dataset dimensions:", dim(data)[1], "rows,", dim(data)[2], "columns\n")## Dataset dimensions: 50116 rows, 691 columns
# Select relevant variables for analysis
# Including sociodemographic and attitudinal variables
selected_vars <- data %>%
select(
cntry, # Country - for geographical coverage
hincfel, # Feeling about household income (TARGET)
gndr, # Gender
agea, # Age
eduyrs, # Years of education
emplrel, # Employment relation
hinctnta, # Household income decile
health, # Subjective general health
happy, # How happy are you
stflife, # How satisfied with life
stfeco, # How satisfied with state of economy
stfgov # How satisfied with government
) %>%
# Remove rows with missing target variable
filter(!is.na(hincfel))
# Check geographical coverage
cat("\nCountries in dataset:\n")##
## Countries in dataset:
##
## AT BE BG CH CY DE EE ES FI FR GB GR HR HU IE IL
## 2354 1594 2239 1384 685 2420 1293 1844 1563 1771 1684 2757 1563 2118 2017 906
## IS IT LT LV ME NL NO PL PT RS SE SI SK UA
## 842 2865 1365 1252 1609 1695 1337 1442 1373 1563 1230 1248 1442 2661
##
## Number of countries: 30
# Examine the distribution of hincfel
cat("Distribution of 'hincfel' (Feeling about household income):\n")## Distribution of 'hincfel' (Feeling about household income):
##
## 1 2 3 4 7 8 9
## 15585 22011 8597 3218 488 198 19
# Visualize distribution
ggplot(selected_vars %>% filter(!is.na(hincfel)),
aes(x = factor(hincfel))) +
geom_bar(fill = "steelblue", alpha = 0.8) +
labs(title = "Distribution of Feelings about Household Income",
x = "Income Feeling (1=Living comfortably, 4=Very difficult)",
y = "Count") +
theme_minimal()# Create categorical bins for continuous variables and label all variables
rules_data <- selected_vars %>%
mutate(
# Categorize age
age_group = cut(agea,
breaks = c(0, 30, 50, 70, Inf),
labels = c("Young", "Middle", "Senior", "Elderly"),
include.lowest = TRUE),
# Categorize education years
education = cut(eduyrs,
breaks = c(0, 10, 15, Inf),
labels = c("Low_edu", "Medium_edu", "High_edu"),
include.lowest = TRUE),
# Categorize satisfaction variables
life_satisfaction = cut(stflife,
breaks = c(0, 5, 7, 10),
labels = c("Low_stflife", "Medium_stflife", "High_stflife"),
include.lowest = TRUE),
economy_satisfaction = cut(stfeco,
breaks = c(0, 5, 7, 10),
labels = c("Low_stfeco", "Medium_stfeco", "High_stfeco"),
include.lowest = TRUE),
# Label target variable clearly
income_feeling = case_when(
hincfel == 1 ~ "Comfortable",
hincfel == 2 ~ "Coping",
hincfel == 3 ~ "Difficult",
hincfel == 4 ~ "Very_difficult",
TRUE ~ NA_character_
),
# Label gender
gender = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
TRUE ~ NA_character_
),
# Label health
health_status = case_when(
health <= 2 ~ "Good_health",
health == 3 ~ "Fair_health",
health >= 4 ~ "Poor_health",
TRUE ~ NA_character_
),
# Categorize happiness
happiness = case_when(
happy <= 5 ~ "Low_happy",
happy <= 7 ~ "Medium_happy",
happy > 7 ~ "High_happy",
TRUE ~ NA_character_
)
) %>%
select(income_feeling, age_group, education, gender, health_status,
happiness, life_satisfaction, economy_satisfaction) %>%
na.omit() # Remove rows with any missing values
cat("Prepared data dimensions:", nrow(rules_data), "rows\n")## Prepared data dimensions: 48316 rows
## First few rows:
# Convert to transaction format for arules
transactions <- as(rules_data, "transactions")
# Inspect transactions
cat("Transaction summary:\n")## Transaction summary:
## transactions as itemMatrix in sparse format with
## 48316 rows (elements/itemsets/transactions) and
## 25 columns (items) and a density of 0.32
##
## most frequent items:
## economy_satisfaction=Low_stfeco health_status=Good_health
## 31918 31797
## happiness=High_happy gender=Female
## 26438 25931
## education=Medium_edu (Other)
## 25361 245083
##
## element (itemset/transaction) length distribution:
## sizes
## 8
## 48316
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8 8 8 8 8 8
##
## includes extended item information - examples:
## labels variables levels
## 1 income_feeling=Comfortable income_feeling Comfortable
## 2 income_feeling=Coping income_feeling Coping
## 3 income_feeling=Difficult income_feeling Difficult
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
# Visualize item frequencies
itemFrequencyPlot(transactions, topN = 15, type = "absolute",
col = "steelblue",
main = "Top 15 Most Frequent Items")# Generate rules with income_feeling as consequent (right-hand side)
rules <- apriori(transactions,
parameter = list(
supp = 0.01, # Minimum support: 1%
conf = 0.5, # Minimum confidence: 50%
minlen = 2, # Minimum rule length
maxlen = 5 # Maximum rule length
),
appearance = list(
rhs = c("income_feeling=Comfortable",
"income_feeling=Coping",
"income_feeling=Difficult",
"income_feeling=Very_difficult"),
default = "lhs" # All other items on left-hand side
))## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 5 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 483
##
## set item appearances ...[4 item(s)] done [0.00s].
## set transactions ...[25 item(s), 48316 transaction(s)] done [0.01s].
## sorting and recoding items ... [25 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5
## done [0.00s].
## writing ... [484 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Number of rules generated: 484
# Sort rules by confidence
rules_sorted <- sort(rules, by = "confidence", decreasing = TRUE)
# Display top 20 rules
cat("\nTop 20 Rules by Confidence:\n")##
## Top 20 Rules by Confidence:
## lhs rhs support confidence coverage lift count
## [1] {education=High_edu,
## gender=Male,
## happiness=High_happy,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01057621 0.7187060 0.01471562 2.266645 511
## [2] {education=High_edu,
## gender=Male,
## life_satisfaction=High_stflife,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01053481 0.7169014 0.01469493 2.260954 509
## [3] {education=High_edu,
## health_status=Good_health,
## life_satisfaction=High_stflife,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01558490 0.7043966 0.02212518 2.221516 753
## [4] {education=High_edu,
## gender=Male,
## health_status=Good_health,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01012087 0.7015782 0.01442586 2.212627 489
## [5] {education=High_edu,
## health_status=Good_health,
## happiness=High_happy,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01541932 0.7015066 0.02198030 2.212402 745
## [6] {education=High_edu,
## happiness=High_happy,
## life_satisfaction=High_stflife,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01666115 0.6945643 0.02398791 2.190507 805
## [7] {age_group=Senior,
## education=High_edu,
## gender=Male,
## life_satisfaction=High_stflife} => {income_feeling=Comfortable} 0.01661975 0.6916451 0.02402931 2.181301 803
## [8] {education=High_edu,
## happiness=High_happy,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01777879 0.6844622 0.02597483 2.158647 859
## [9] {age_group=Senior,
## education=High_edu,
## life_satisfaction=High_stflife,
## economy_satisfaction=Medium_stfeco} => {income_feeling=Comfortable} 0.01239755 0.6837900 0.01813064 2.156527 599
## [10] {education=High_edu,
## life_satisfaction=High_stflife,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01782018 0.6827914 0.02609901 2.153378 861
## [11] {education=High_edu,
## gender=Male,
## life_satisfaction=High_stflife,
## economy_satisfaction=Medium_stfeco} => {income_feeling=Comfortable} 0.01823413 0.6824167 0.02671993 2.152196 881
## [12] {education=High_edu,
## gender=Male,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01192152 0.6784452 0.01757182 2.139671 576
## [13] {age_group=Senior,
## education=High_edu,
## happiness=High_happy,
## economy_satisfaction=Medium_stfeco} => {income_feeling=Comfortable} 0.01291498 0.6760563 0.01910340 2.132137 624
## [14] {education=High_edu,
## health_status=Good_health,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01713718 0.6715328 0.02551950 2.117871 828
## [15] {education=High_edu,
## gender=Male,
## happiness=High_happy,
## economy_satisfaction=Medium_stfeco} => {income_feeling=Comfortable} 0.01860667 0.6674091 0.02787896 2.104865 899
## [16] {age_group=Senior,
## education=High_edu,
## gender=Male,
## happiness=High_happy} => {income_feeling=Comfortable} 0.01724067 0.6669335 0.02585065 2.103366 833
## [17] {age_group=Elderly,
## education=High_edu,
## happiness=High_happy,
## life_satisfaction=High_stflife} => {income_feeling=Comfortable} 0.01057621 0.6636364 0.01593675 2.092967 511
## [18] {age_group=Senior,
## education=High_edu,
## health_status=Good_health,
## life_satisfaction=High_stflife} => {income_feeling=Comfortable} 0.02692690 0.6627611 0.04062836 2.090207 1301
## [19] {education=High_edu,
## gender=Male,
## health_status=Good_health,
## life_satisfaction=High_stflife} => {income_feeling=Comfortable} 0.04228413 0.6603103 0.06403676 2.082477 2043
## [20] {age_group=Senior,
## education=High_edu,
## health_status=Good_health,
## economy_satisfaction=Medium_stfeco} => {income_feeling=Comfortable} 0.01289428 0.6585624 0.01957944 2.076965 623
##
## Rule Quality Measures Summary:
## support confidence coverage lift
## Min. :0.01004 Min. :0.5000 Min. :0.01443 Min. :1.121
## 1st Qu.:0.01480 1st Qu.:0.5145 1st Qu.:0.02654 1st Qu.:1.163
## Median :0.02139 Median :0.5303 Median :0.03992 Median :1.224
## Mean :0.02860 Mean :0.5440 Mean :0.05321 Mean :1.425
## 3rd Qu.:0.03518 3rd Qu.:0.5563 3rd Qu.:0.06539 3rd Qu.:1.705
## Max. :0.16742 Max. :0.7187 Max. :0.32718 Max. :2.267
## count
## Min. : 485
## 1st Qu.: 715
## Median :1034
## Mean :1382
## 3rd Qu.:1700
## Max. :8089
# Rules predicting "Comfortable"
rules_comfortable <- subset(rules, subset = rhs %in% "income_feeling=Comfortable")
cat("\nTop 10 Rules for 'Comfortable' Income Feeling:\n")##
## Top 10 Rules for 'Comfortable' Income Feeling:
## lhs rhs support confidence coverage lift count
## [1] {education=High_edu,
## gender=Male,
## happiness=High_happy,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01057621 0.7187060 0.01471562 2.266645 511
## [2] {education=High_edu,
## gender=Male,
## life_satisfaction=High_stflife,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01053481 0.7169014 0.01469493 2.260954 509
## [3] {education=High_edu,
## health_status=Good_health,
## life_satisfaction=High_stflife,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01558490 0.7043966 0.02212518 2.221516 753
## [4] {education=High_edu,
## gender=Male,
## health_status=Good_health,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01012087 0.7015782 0.01442586 2.212627 489
## [5] {education=High_edu,
## health_status=Good_health,
## happiness=High_happy,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01541932 0.7015066 0.02198030 2.212402 745
## [6] {education=High_edu,
## happiness=High_happy,
## life_satisfaction=High_stflife,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01666115 0.6945643 0.02398791 2.190507 805
## [7] {age_group=Senior,
## education=High_edu,
## gender=Male,
## life_satisfaction=High_stflife} => {income_feeling=Comfortable} 0.01661975 0.6916451 0.02402931 2.181301 803
## [8] {education=High_edu,
## happiness=High_happy,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01777879 0.6844622 0.02597483 2.158647 859
## [9] {age_group=Senior,
## education=High_edu,
## life_satisfaction=High_stflife,
## economy_satisfaction=Medium_stfeco} => {income_feeling=Comfortable} 0.01239755 0.6837900 0.01813064 2.156527 599
## [10] {education=High_edu,
## life_satisfaction=High_stflife,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01782018 0.6827914 0.02609901 2.153378 861
# Rules predicting "Difficult" or "Very_difficult"
rules_difficult <- subset(rules, subset = rhs %in% c("income_feeling=Difficult",
"income_feeling=Very_difficult"))
cat("\nTop 10 Rules for 'Difficult' Income Feelings:\n")##
## Top 10 Rules for 'Difficult' Income Feelings:
# Extract top rules for interpretation
top_comfortable <- head(sort(rules_comfortable, by = "lift"), 5)
top_difficult <- head(sort(rules_difficult, by = "lift"), 5)
cat("\n=== KEY FINDINGS ===\n\n")##
## === KEY FINDINGS ===
## Strong Predictors of COMFORTABLE Income Feeling:
if(length(top_comfortable) > 0) {
inspect(top_comfortable)
} else {
cat("No strong rules found.\n")
}## lhs rhs support confidence coverage lift count
## [1] {education=High_edu,
## gender=Male,
## happiness=High_happy,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01057621 0.7187060 0.01471562 2.266645 511
## [2] {education=High_edu,
## gender=Male,
## life_satisfaction=High_stflife,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01053481 0.7169014 0.01469493 2.260954 509
## [3] {education=High_edu,
## health_status=Good_health,
## life_satisfaction=High_stflife,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01558490 0.7043966 0.02212518 2.221516 753
## [4] {education=High_edu,
## gender=Male,
## health_status=Good_health,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01012087 0.7015782 0.01442586 2.212627 489
## [5] {education=High_edu,
## health_status=Good_health,
## happiness=High_happy,
## economy_satisfaction=High_stfeco} => {income_feeling=Comfortable} 0.01541932 0.7015066 0.02198030 2.212402 745
##
##
## Strong Predictors of DIFFICULT Income Feeling:
## No strong rules found.
plot(rules,
measure = c("support", "confidence"),
shading = "lift",
main = "Association Rules: Support vs Confidence (colored by Lift)")# Visualize top 30 rules as network
if(length(rules) >= 30) {
plot(head(rules_sorted, 30),
method = "graph",
main = "Network Graph: Top 30 Association Rules",
control = list(type = "items"))
} else {
plot(rules_sorted,
method = "graph",
main = "Network Graph: All Association Rules",
control = list(type = "items"))
}## Available control parameters (with default values):
## layout = stress
## circular = FALSE
## ggraphdots = NULL
## edges = <environment>
## nodes = <environment>
## nodetext = <environment>
## colors = c("#EE0000FF", "#EEEEEEFF")
## engine = ggplot2
## max = 100
## verbose = FALSE
# Group rules and visualize as matrix
if(length(rules) > 0) {
plot(rules_sorted,
method = "grouped",
main = "Grouped Matrix of Association Rules")
}## Available control parameters (with default values):
## k = 20
## aggr.fun = function (x, ...) UseMethod("mean")
## rhs_max = 10
## lhs_label_items = 2
## col = c("#EE0000FF", "#EEEEEEFF")
## groups = NULL
## engine = ggplot2
## verbose = FALSE
##
## === ANALYSIS SUMMARY ===
## Total respondents analyzed: 48316
## Total association rules found: 484
## Rules predicting 'Comfortable': 191
## Rules predicting 'Difficult' feelings: 0
## Key Metrics:
## - Average Support: 0.0286
## - Average Confidence: 0.544
## - Average Lift: 1.4253
The association rules reveal patterns between sociodemographic factors and feelings about household income. High confidence rules indicate strong predictive relationships. Rules with high lift show combinations of factors that are particularly associated with specific income feelings, beyond what would be expected by chance.