In this project, the European Social Survey (ESS) dataset (Round 11)
was analyzed using association rule mining to uncover patterns related
to people’s feelings about their household income (variable
hincfel). The first step involved setting the working
directory, loading the necessary libraries (arules,
arulesViz, and dplyr), and loading the ESS
dataset into R. After inspecting the data, variables such as
cntry (country), agea (age), and
hincfel (feeling about household income) were converted
into appropriate formats, with categorical variables like
hincfel being transformed into a factor for easier
analysis.
To ensure data integrity, a check was performed for missing values in
important columns (agea, dweight, and
pspwght). Subsequently, continuous variables such as age,
weight, and poverty status were categorized into bins, creating new
variables like age_group, dweight_cat, and
pspwght_cat. These categorical variables were then selected
for analysis. The dataset was converted into a transaction format, which
is a prerequisite for running the Apriori algorithm, a popular method
for discovering association rules.
The Apriori algorithm was applied with a minimum support of 0.02 (2%)
and a minimum confidence of 0.5 (50%), focusing on discovering rules
where hincfel (feeling about income) was the consequent
(right-hand side) of the rules. The algorithm produced association rules
linking feelings about income to other socio-demographic factors. To
interpret the results, item frequencies were examined, and the top rules
were sorted by lift, a measure of association strength.
For visualization, several types of plots were used, including an interactive graph that allows for dynamic exploration of item associations, a grouped matrix visualization to display how items are related, and an item frequency plot that highlights the most common items in the dataset. These visualizations helped in understanding the relationships between people’s feelings about their household income and other factors like age, country, and socio-economic status. The overall analysis provides insights into how certain demographic factors influence perceptions of household income, which can be useful for policymakers and researchers studying socio-economic disparities.
{r install.packages("arules") install.packages("arulesViz") install.packages("dplyr")
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:arules':
##
## intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the ESS11 Dataset
dat <- read.csv("ESS11.csv")
# Convert necessary variables to appropriate formats
dat$cntry <- as.factor(dat$cntry)
dat$agea <- as.numeric(dat$agea)
dat$hincfel <- as.factor(dat$hincfel)
# Check If Columns Have Valid Values
if (all(is.na(dat$agea))) stop("Column 'agea' contains only NA values!")
if (all(is.na(dat$dweight))) stop("Column 'dweight' contains only NA values!")
if (all(is.na(dat$pspwght))) stop("Column 'pspwght' contains only NA values!")
# Check if 'agea' exists, otherwise try another column
colnames(dat)
## [1] "name" "essround" "edition" "proddate" "idno" "cntry"
## [7] "dweight" "pspwght" "pweight" "anweight" "nwspol" "netusoft"
## [13] "netustm" "ppltrst" "pplfair" "pplhlp" "polintr" "psppsgva"
## [19] "actrolga" "psppipla" "cptppola" "trstprl" "trstlgl" "trstplc"
## [25] "trstplt" "trstprt" "trstep" "trstun" "vote" "prtvtdat"
## [31] "prtvtebe" "prtvtchr" "prtvtccy" "prtvtffi" "prtvtffr" "prtvgde1"
## [37] "prtvgde2" "prtvtegr" "prtvthhu" "prtvteis" "prtvteie" "prtvteit"
## [43] "prtvclt1" "prtvclt2" "prtvclt3" "prtvtinl" "prtvtcno" "prtvtfpl"
## [49] "prtvtept" "prtvtbrs" "prtvtesk" "prtvtgsi" "prtvtges" "prtvtdse"
## [55] "prtvthch" "prtvtdgb" "contplt" "donprty" "badge" "sgnptit"
## [61] "pbldmna" "bctprd" "pstplonl" "volunfp" "clsprty" "prtcleat"
## [67] "prtclebe" "prtclbhr" "prtclccy" "prtclgfi" "prtclgfr" "prtclgde"
## [73] "prtclegr" "prtclihu" "prtcleis" "prtclfie" "prtclfit" "prtclclt"
## [79] "prtclhnl" "prtclcno" "prtcljpl" "prtclgpt" "prtclbrs" "prtclesk"
## [85] "prtclgsi" "prtclhes" "prtcldse" "prtclhch" "prtcldgb" "prtdgcl"
## [91] "lrscale" "stflife" "stfeco" "stfgov" "stfdem" "stfedu"
## [97] "stfhlth" "gincdif" "freehms" "hmsfmlsh" "hmsacld" "euftf"
## [103] "lrnobed" "loylead" "imsmetn" "imdfetn" "impcntr" "imbgeco"
## [109] "imueclt" "imwbcnt" "happy" "sclmeet" "inprdsc" "sclact"
## [115] "crmvct" "aesfdrk" "health" "hlthhmp" "atchctr" "atcherp"
## [121] "rlgblg" "rlgdnm" "rlgdnbat" "rlgdnacy" "rlgdnafi" "rlgdnade"
## [127] "rlgdnagr" "rlgdnhu" "rlgdnais" "rlgdnie" "rlgdnlt" "rlgdnanl"
## [133] "rlgdnno" "rlgdnapl" "rlgdnapt" "rlgdnrs" "rlgdnask" "rlgdnase"
## [139] "rlgdnach" "rlgdngb" "rlgblge" "rlgdnme" "rlgdebat" "rlgdeacy"
## [145] "rlgdeafi" "rlgdeade" "rlgdeagr" "rlgdehu" "rlgdeais" "rlgdeie"
## [151] "rlgdelt" "rlgdeanl" "rlgdeno" "rlgdeapl" "rlgdeapt" "rlgders"
## [157] "rlgdeask" "rlgdease" "rlgdeach" "rlgdegb" "rlgdgr" "rlgatnd"
## [163] "pray" "dscrgrp" "dscrrce" "dscrntn" "dscrrlg" "dscrlng"
## [169] "dscretn" "dscrage" "dscrgnd" "dscrsex" "dscrdsb" "dscroth"
## [175] "dscrdk" "dscrref" "dscrnap" "dscrna" "ctzcntr" "brncntr"
## [181] "cntbrthd" "livecnta" "lnghom1" "lnghom2" "feethngr" "facntr"
## [187] "fbrncntc" "mocntr" "mbrncntc" "ccnthum" "ccrdprs" "wrclmch"
## [193] "admrclc" "testjc34" "testjc35" "testjc36" "testjc37" "testjc38"
## [199] "testjc39" "testjc40" "testjc41" "testjc42" "vteurmmb" "vteubcmb"
## [205] "ctrlife" "etfruit" "eatveg" "dosprt" "cgtsmok" "alcfreq"
## [211] "alcwkdy" "alcwknd" "icgndra" "alcbnge" "height" "weighta"
## [217] "dshltgp" "dshltms" "dshltnt" "dshltref" "dshltdk" "dshltna"
## [223] "medtrun" "medtrnp" "medtrnt" "medtroc" "medtrnl" "medtrwl"
## [229] "medtrnaa" "medtroth" "medtrnap" "medtrref" "medtrdk" "medtrna"
## [235] "medtrnu" "hlpfmly" "hlpfmhr" "trhltacu" "trhltacp" "trhltcm"
## [241] "trhltch" "trhltos" "trhltho" "trhltht" "trhlthy" "trhltmt"
## [247] "trhltpt" "trhltre" "trhltsh" "trhltnt" "trhltref" "trhltdk"
## [253] "trhltna" "fltdpr" "flteeff" "slprl" "wrhpp" "fltlnl"
## [259] "enjlf" "fltsd" "cldgng" "hltprhc" "hltprhb" "hltprbp"
## [265] "hltpral" "hltprbn" "hltprpa" "hltprpf" "hltprsd" "hltprsc"
## [271] "hltprsh" "hltprdi" "hltprnt" "hltprref" "hltprdk" "hltprna"
## [277] "hltphhc" "hltphhb" "hltphbp" "hltphal" "hltphbn" "hltphpa"
## [283] "hltphpf" "hltphsd" "hltphsc" "hltphsh" "hltphdi" "hltphnt"
## [289] "hltphnap" "hltphref" "hltphdk" "hltphna" "hltprca" "cancfre"
## [295] "cnfpplh" "fnsdfml" "jbexpvi" "jbexpti" "jbexpml" "jbexpmc"
## [301] "jbexpnt" "jbexpnap" "jbexpref" "jbexpdk" "jbexpna" "jbexevl"
## [307] "jbexevh" "jbexevc" "jbexera" "jbexecp" "jbexebs" "jbexent"
## [313] "jbexenap" "jbexeref" "jbexedk" "jbexena" "nobingnd" "likrisk"
## [319] "liklead" "sothnds" "actcomp" "mascfel" "femifel" "impbemw"
## [325] "trmedmw" "trwrkmw" "trplcmw" "trmdcnt" "trwkcnt" "trplcnt"
## [331] "eqwrkbg" "eqpolbg" "eqmgmbg" "eqpaybg" "eqparep" "eqparlv"
## [337] "freinsw" "fineqpy" "wsekpwr" "weasoff" "wlespdm" "wexashr"
## [343] "wprtbym" "wbrgwrm" "hhmmb" "gndr" "gndr2" "gndr3"
## [349] "gndr4" "gndr5" "gndr6" "gndr7" "gndr8" "gndr9"
## [355] "gndr10" "gndr11" "gndr12" "yrbrn" "agea" "yrbrn2"
## [361] "yrbrn3" "yrbrn4" "yrbrn5" "yrbrn6" "yrbrn7" "yrbrn8"
## [367] "yrbrn9" "yrbrn10" "yrbrn11" "yrbrn12" "rshipa2" "rshipa3"
## [373] "rshipa4" "rshipa5" "rshipa6" "rshipa7" "rshipa8" "rshipa9"
## [379] "rshipa10" "rshipa11" "rshipa12" "rshpsts" "rshpsgb" "lvgptnea"
## [385] "dvrcdeva" "marsts" "marstgb" "maritalb" "chldhhe" "domicil"
## [391] "paccmoro" "paccdwlr" "pacclift" "paccnbsh" "paccocrw" "paccxhoc"
## [397] "paccnois" "paccinro" "paccnt" "paccref" "paccdk" "paccna"
## [403] "edulvlb" "eisced" "edlveat" "edlvebe" "edlvehr" "edlvgcy"
## [409] "edlvdfi" "edlvdfr" "edudde1" "educde2" "edlvegr" "edlvdahu"
## [415] "edlvdis" "edlvdie" "edlvfit" "edlvdlt" "edlvenl" "edlveno"
## [421] "edlvipl" "edlvept" "edlvdrs" "edlvdsk" "edlvesi" "edlvies"
## [427] "edlvdse" "edlvdch" "educgb1" "edubgb2" "edagegb" "eduyrs"
## [433] "pdwrk" "edctn" "uempla" "uempli" "dsbld" "rtrd"
## [439] "cmsrv" "hswrk" "dngoth" "dngref" "dngdk" "dngna"
## [445] "mainact" "mnactic" "crpdwk" "pdjobev" "pdjobyr" "emplrel"
## [451] "emplno" "wrkctra" "estsz" "jbspv" "njbspv" "wkdcorga"
## [457] "iorgact" "wkhct" "wkhtot" "nacer2" "tporgwk" "isco08"
## [463] "wrkac6m" "uemp3m" "uemp12m" "uemp5yr" "mbtru" "hincsrca"
## [469] "hinctnta" "hincfel" "edulvlpb" "eiscedp" "edlvpfat" "edlvpebe"
## [475] "edlvpehr" "edlvpgcy" "edlvpdfi" "edlvpdfr" "edupdde1" "edupcde2"
## [481] "edlvpegr" "edlvpdahu" "edlvpdis" "edlvpdie" "edlvpfit" "edlvpdlt"
## [487] "edlvpenl" "edlvpeno" "edlvphpl" "edlvpept" "edlvpdrs" "edlvpdsk"
## [493] "edlvpesi" "edlvphes" "edlvpdse" "edlvpdch" "edupcgb1" "edupbgb2"
## [499] "edagepgb" "pdwrkp" "edctnp" "uemplap" "uemplip" "dsbldp"
## [505] "rtrdp" "cmsrvp" "hswrkp" "dngothp" "dngdkp" "dngnapp"
## [511] "dngrefp" "dngnap" "mnactp" "crpdwkp" "isco08p" "emprelp"
## [517] "wkhtotp" "edulvlfb" "eiscedf" "edlvfeat" "edlvfebe" "edlvfehr"
## [523] "edlvfgcy" "edlvfdfi" "edlvfdfr" "edufcde1" "edufbde2" "edlvfegr"
## [529] "edlvfdahu" "edlvfdis" "edlvfdie" "edlvffit" "edlvfdlt" "edlvfenl"
## [535] "edlvfeno" "edlvfgpl" "edlvfept" "edlvfdrs" "edlvfdsk" "edlvfesi"
## [541] "edlvfges" "edlvfdse" "edlvfdch" "edufcgb1" "edufbgb2" "edagefgb"
## [547] "emprf14" "occf14b" "edulvlmb" "eiscedm" "edlvmeat" "edlvmebe"
## [553] "edlvmehr" "edlvmgcy" "edlvmdfi" "edlvmdfr" "edumcde1" "edumbde2"
## [559] "edlvmegr" "edlvmdahu" "edlvmdis" "edlvmdie" "edlvmfit" "edlvmdlt"
## [565] "edlvmenl" "edlvmeno" "edlvmgpl" "edlvmept" "edlvmdrs" "edlvmdsk"
## [571] "edlvmesi" "edlvmges" "edlvmdse" "edlvmdch" "edumcgb1" "edumbgb2"
## [577] "edagemgb" "emprm14" "occm14b" "atncrse" "anctrya1" "anctrya2"
## [583] "regunit" "region" "ipcrtiva" "impricha" "ipeqopta" "ipshabta"
## [589] "impsafea" "impdiffa" "ipfrulea" "ipudrsta" "ipmodsta" "ipgdtima"
## [595] "impfreea" "iphlppla" "ipsucesa" "ipstrgva" "ipadvnta" "ipbhprpa"
## [601] "iprspota" "iplylfra" "impenva" "imptrada" "impfuna" "testji1"
## [607] "testji2" "testji3" "testji4" "testji5" "testji6" "testji7"
## [613] "testji8" "testji9" "respc19a" "symtc19" "symtnc19" "vacc19"
## [619] "recon" "inwds" "ainws" "ainwe" "binwe" "cinwe"
## [625] "dinwe" "einwe" "finwe" "hinwe" "iinwe" "kinwe"
## [631] "rinwe" "inwde" "jinws" "jinwe" "inwtm" "mode"
## [637] "domain" "prob" "stratum" "psu"
# Convert Continuous Variables to Categories (Binning)
dat$age_group <- cut(dat$agea, breaks = c(15, 30, 50, 70, 100), labels = c("Young", "Adult", "Middle-aged", "Senior"))
dat$dweight_cat <- cut(dat$dweight, breaks = 3, labels = c("Low", "Medium", "High"))
dat$pspwght_cat <- cut(dat$pspwght, breaks = 3, labels = c("Low", "Medium", "High"))
# Select relevant columns for association rule mining
dat_selected <- dat[, c("age_group", "cntry", "hincfel", "dweight_cat", "pspwght_cat")]
#Convert Data into Transaction Format
dat_trans <- as(dat_selected, "transactions")
#Apply Apriori Algorithm
rules <- apriori(dat_trans,
parameter = list(support = 0.02, confidence = 0.5),
appearance = list(rhs = c("hincfel=1", "hincfel=2", "hincfel=3", "hincfel=4"), default = "lhs"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.02 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 803
##
## set item appearances ...[4 item(s)] done [0.00s].
## set transactions ...[41 item(s), 40156 transaction(s)] done [0.01s].
## sorting and recoding items ... [36 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [23 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## R Markdown
Rule 1:
{cntry=SE} => {hincfel=1}
Support: 0.02079 (2.08% of transactions)
Confidence: 0.6789 (67.89% of people from Sweden feel they are living comfortably on their current income)
Coverage: 0.03063 (3.06% of the total dataset consists of people from Sweden)
Interpretation: Among people from Sweden, 67.89% report feeling comfortable with their household income. The rule indicates that being from Sweden is a strong predictor of this feeling.
Rule 2:
{cntry=SE, dweight_cat=Low} => {hincfel=1}
Support: 0.02079 (Same as the previous rule)
Confidence: 0.6789 (67.89% of people from Sweden with low weight feel comfortable with their income)
Coverage: 0.03063 (3.06% of the dataset consists of people from Sweden with low weight)
Interpretation: This rule tells us that people from Sweden who have a low weight category (possibly linked to socio-economic status or health) still report high comfort with their income.
Rule 3:
{cntry=NL} => {hincfel=1}
Support: 0.02749 (2.75% of transactions)
Confidence: 0.6513 (65.13% of people from the Netherlands report feeling comfortable with their income)
Coverage: 0.04221 (4.22% of the dataset consists of people from the Netherlands)
Interpretation: A substantial portion of Dutch people (65.13%) also feel comfortable with their household income. This indicates a somewhat positive financial situation in the Netherlands.
Rule 4:
{cntry=PL} => {hincfel=2}
Support: 0.0252 (2.52% of transactions)
Confidence: 0.7018 (70.18% of people from Poland report feeling that they are coping with their income)
Coverage: 0.03591 (3.59% of the dataset consists of people from Poland)
Interpretation: People from Poland appear to feel more often that they are coping with their income, with 70.18% of Poles reporting a feeling of financial adequacy.
Rule 5:
{cntry=PL, pspwght_cat=Low} => {hincfel=2}
Support: 0.0235 (2.35% of transactions)
Confidence: 0.7026 (70.26% of Polish people with a low poverty weight category report feeling they are coping with their income)
Coverage: 0.03349 (3.35% of the dataset consists of Polish people with low poverty weight)
Interpretation: This rule suggests that Polish people who are categorized as having a low poverty weight are still more likely to report coping with their income. The financial situation of this group is fairly stable, even when considering socio-economic factors.
#Analyze & Visualize Results
itemFrequency(dat_trans)
## age_group=Young age_group=Adult age_group=Middle-aged
## 0.1590297838 0.3003038151 0.3491134575
## age_group=Senior cntry=AT cntry=BE
## 0.1821645582 0.0586213766 0.0396951888
## cntry=CH cntry=CY cntry=DE
## 0.0344655842 0.0170584720 0.0602649666
## cntry=ES cntry=FI cntry=FR
## 0.0459209085 0.0389231995 0.0441029983
## cntry=GB cntry=GR cntry=HR
## 0.0419364479 0.0686572368 0.0389231995
## cntry=HU cntry=IE cntry=IS
## 0.0527442972 0.0502291065 0.0209682239
## cntry=IT cntry=LT cntry=NL
## 0.0713467477 0.0339924295 0.0422103795
## cntry=NO cntry=PL cntry=PT
## 0.0332951489 0.0359099512 0.0341916526
## cntry=RS cntry=SE cntry=SI
## 0.0389231995 0.0306305409 0.0310787927
## cntry=SK hincfel=1 hincfel=2
## 0.0359099512 0.3563352924 0.4509413288
## hincfel=3 hincfel=4 hincfel=7
## 0.1459806754 0.0355115051 0.0074459608
## hincfel=8 hincfel=9 dweight_cat=Low
## 0.0033120829 0.0004731547 0.9261131587
## dweight_cat=Medium dweight_cat=High pspwght_cat=Low
## 0.0700517980 0.0038350433 0.8767805558
## pspwght_cat=Medium pspwght_cat=High
## 0.1027990836 0.0204203606
inspect(head(sort(rules, by = "lift"), 10))
## lhs rhs support confidence coverage lift count
## [1] {cntry=SE} => {hincfel=1} 0.02079390 0.6788618 0.03063054 1.905121 835
## [2] {cntry=SE,
## dweight_cat=Low} => {hincfel=1} 0.02079390 0.6788618 0.03063054 1.905121 835
## [3] {cntry=NL} => {hincfel=1} 0.02749278 0.6513274 0.04221038 1.827850 1104
## [4] {cntry=NL,
## dweight_cat=Low} => {hincfel=1} 0.02749278 0.6513274 0.04221038 1.827850 1104
## [5] {cntry=NL,
## pspwght_cat=Low} => {hincfel=1} 0.02652157 0.6497865 0.04081582 1.823525 1065
## [6] {cntry=NL,
## dweight_cat=Low,
## pspwght_cat=Low} => {hincfel=1} 0.02652157 0.6497865 0.04081582 1.823525 1065
## [7] {cntry=PL,
## pspwght_cat=Low} => {hincfel=2} 0.02353322 0.7026022 0.03349437 1.558079 945
## [8] {cntry=PL,
## dweight_cat=Low,
## pspwght_cat=Low} => {hincfel=2} 0.02353322 0.7026022 0.03349437 1.558079 945
## [9] {cntry=PL} => {hincfel=2} 0.02520171 0.7018031 0.03590995 1.556307 1012
## [10] {cntry=PL,
## dweight_cat=Low} => {hincfel=2} 0.02520171 0.7018031 0.03590995 1.556307 1012
#Visualization
plot(rules, method = "graph", engine = "interactive") # Interactive graph
plot(rules, method = "grouped") # Grouped matrix visualization
plot(rules, method = "graph", control = list(type = "items"))
## Warning: Unknown control parameters: type
## Available control parameters (with default values):
## layout = stress
## circular = FALSE
## ggraphdots = NULL
## edges = <environment>
## nodes = <environment>
## nodetext = <environment>
## colors = c("#EE0000FF", "#EEEEEEFF")
## engine = ggplot2
## max = 100
## verbose = FALSE
itemFrequencyPlot(dat_trans, topN = 10)