library(arules)
## Warning: package 'arules' was built under R version 4.5.2
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- read.csv("ESS11e04_1/ESS11e04_1.csv")
print(head(data$hincfel))
## [1] 1 2 1 2 2 1
print(unique(data$hincfel))
## [1] 1 2 4 3 8 7 9
Description of variable hincfel from the official data source:
Feeling about household’s income nowadays CARD 52 Which of the descriptions on this card comes closest to how you feel about your household’s income nowadays? Value Category 1 Living comfortably on present income 2 Coping on present income 3 Difficult on present income 4 Very difficult on present income 7 Refusal 8 Don’t know 9 No answer ) Missing value
data <- data %>%
select(hincfel, health, gndr, cntry, eduyrs, domicil) %>%
filter(hincfel <= 4 & health <= 5 & domicil <= 5) %>%
mutate(
Income_Feeling = dplyr::recode(hincfel, `1`="Comfortable", `2`="Coping",
`3`="Difficult", `4`="Very_Difficult"),
Health_Status = dplyr::recode(health, `1`="Very_Good", `2`="Good",
`3`="Fair", `4`="Bad", `5`="Very_Bad"),
Gender = dplyr::recode(gndr, `1`="Male", `2`="Female"),
Edu_Level = case_when(
eduyrs < 12 ~ "Low_Edu",
eduyrs >= 12 & eduyrs <= 16 ~ "Mid_Edu",
eduyrs > 16 ~ "High_Edu"
),
Area = dplyr::recode(domicil, `1`="Big_City", `2`="City_Suburb",
`3`="Town/Small_City", `4`="Country_Village", `5`="Farm/Home_in_Country")
) %>%
select(cntry, Gender, Health_Status, Edu_Level, Area, Income_Feeling)
trans <- as(data, "transactions")
## Warning: Column(s) 1, 2, 3, 4, 5, 6 not logical or factor. Applying default
## discretization (see '? discretizeDF').
feeling_counts <- prop.table(table(data$Income_Feeling))
barplot(feeling_counts,
main = "Frequency of income feelings",
col = "steelblue",
ylab = "Relative Frequency",
cex.names = 0.8)
Let’s find out what are the profiles of people who feel difficult or very difficult with their current income.
rules <- apriori(trans,
parameter = list(supp=0.001, conf=0.05, minlen=2),
appearance = list(default="lhs",
rhs=c("Income_Feeling=Difficult",
"Income_Feeling=Very_Difficult")))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.05 0.1 1 none FALSE TRUE 5 0.001 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 49
##
## set item appearances ...[2 item(s)] done [0.00s].
## set transactions ...[49 item(s), 49282 transaction(s)] done [0.01s].
## sorting and recoding items ... [49 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 done [0.00s].
## writing ... [1168 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
rules_sorted <- sort(rules, by="lift", decreasing=TRUE)
inspect(head(rules_sorted, 10))
## lhs rhs support confidence coverage lift count
## [1] {cntry=UA,
## Health_Status=Bad,
## Area=Country_Village} => {Income_Feeling=Very_Difficult} 0.002029138 0.5952381 0.003408953 9.147030 100
## [2] {cntry=UA,
## Gender=Female,
## Health_Status=Bad,
## Area=Country_Village} => {Income_Feeling=Very_Difficult} 0.001237774 0.5754717 0.002150887 8.843279 61
## [3] {cntry=UA,
## Health_Status=Bad,
## Edu_Level=Low_Edu,
## Area=Country_Village} => {Income_Feeling=Very_Difficult} 0.001197192 0.5619048 0.002130595 8.634796 59
## [4] {cntry=UA,
## Gender=Female,
## Edu_Level=Low_Edu,
## Area=Town/Small_City} => {Income_Feeling=Very_Difficult} 0.001116026 0.5612245 0.001988556 8.624342 55
## [5] {cntry=UA,
## Gender=Female,
## Health_Status=Bad,
## Edu_Level=Low_Edu} => {Income_Feeling=Very_Difficult} 0.001318940 0.5603448 0.002353801 8.610824 65
## [6] {cntry=UA,
## Health_Status=Bad,
## Edu_Level=Low_Edu} => {Income_Feeling=Very_Difficult} 0.001988556 0.5600000 0.003550992 8.605525 98
## [7] {cntry=UA,
## Gender=Female,
## Health_Status=Bad} => {Income_Feeling=Very_Difficult} 0.002516132 0.5321888 0.004727893 8.178151 124
## [8] {cntry=UA,
## Health_Status=Bad} => {Income_Feeling=Very_Difficult} 0.003916237 0.5316804 0.007365772 8.170338 193
## [9] {cntry=UA,
## Gender=Male,
## Health_Status=Bad} => {Income_Feeling=Very_Difficult} 0.001400106 0.5307692 0.002637880 8.156336 69
## [10] {cntry=UA,
## Health_Status=Bad,
## Edu_Level=Mid_Edu} => {Income_Feeling=Very_Difficult} 0.001785642 0.5238095 0.003408953 8.049386 88
The profile most likely to be struggling is a Female from Ukraine with Bad health. The Lift is 8.18, which is very high — it means they are over 8 times more likely to be in the ‘Very Difficult’ category than the average person in the survey.
Right after Ukraine goes Bulgaria. Persons in Bulgaria with bad health, report difficult and very difficult income feeling with lift 7.21.
One big takeaway is that Health Status is in almost every single top rule. Even if you ignore the country, Rule [10] shows that just having ‘Very Bad’ health alone makes you about 4.6 times more likely to report income as ‘Very Difficult.’ Basically, if your health is bad, your feeling about your income is usually bad too, no matter where you live.
plot(rules, method = "scatterplot", engine = "ggplot2")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
plot(head(rules_sorted, 10), method = "graph", engine = "htmlwidget")
plot(head(rules_sorted, 10), method = "paracoord")