rm(list=ls()); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 526536 28.2 1169674 62.5 NA 669428 35.8
## Vcells 971729 7.5 8388608 64.0 16384 1851813 14.2
# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "MASS", "aod", "interactions", "kableExtra", "flextable", "scales") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
##
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
##
## Loading required package: grid
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loading required package: survival
##
##
## Attaching package: 'survey'
##
##
## The following object is masked from 'package:graphics':
##
## dotchart
##
##
##
## Attaching package: 'MASS'
##
##
## The following object is masked from 'package:dplyr':
##
## select
##
##
##
## Attaching package: 'aod'
##
##
## The following object is masked from 'package:survival':
##
## rats
##
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
##
##
##
## Attaching package: 'flextable'
##
##
## The following objects are masked from 'package:kableExtra':
##
## as_image, footnote
##
##
## The following object is masked from 'package:purrr':
##
## compose
##
##
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "modelsummary" "fst" "infer" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "effects" "carData" "modelsummary" "fst" "infer"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "survey" "survival" "Matrix" "grid" "effects"
## [6] "carData" "modelsummary" "fst" "infer" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[7]]
## [1] "MASS" "survey" "survival" "Matrix" "grid"
## [6] "effects" "carData" "modelsummary" "fst" "infer"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
##
## [[8]]
## [1] "aod" "MASS" "survey" "survival" "Matrix"
## [6] "grid" "effects" "carData" "modelsummary" "fst"
## [11] "infer" "lubridate" "forcats" "stringr" "dplyr"
## [16] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [21] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [26] "datasets" "methods" "base"
##
## [[9]]
## [1] "interactions" "aod" "MASS" "survey" "survival"
## [6] "Matrix" "grid" "effects" "carData" "modelsummary"
## [11] "fst" "infer" "lubridate" "forcats" "stringr"
## [16] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [21] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [26] "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "kableExtra" "interactions" "aod" "MASS" "survey"
## [6] "survival" "Matrix" "grid" "effects" "carData"
## [11] "modelsummary" "fst" "infer" "lubridate" "forcats"
## [16] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [21] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [26] "grDevices" "utils" "datasets" "methods" "base"
##
## [[11]]
## [1] "flextable" "kableExtra" "interactions" "aod" "MASS"
## [6] "survey" "survival" "Matrix" "grid" "effects"
## [11] "carData" "modelsummary" "fst" "infer" "lubridate"
## [16] "forcats" "stringr" "dplyr" "purrr" "readr"
## [21] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [26] "graphics" "grDevices" "utils" "datasets" "methods"
## [31] "base"
##
## [[12]]
## [1] "scales" "flextable" "kableExtra" "interactions" "aod"
## [6] "MASS" "survey" "survival" "Matrix" "grid"
## [11] "effects" "carData" "modelsummary" "fst" "infer"
## [16] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [21] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [26] "stats" "graphics" "grDevices" "utils" "datasets"
## [31] "methods" "base"
Use data for Germany and the variable coding for model3 and model4 from the tutorial (i.e., both for the cleaning & recode, as well as the linear model formulas). Do a modelsummary table displaying both model outputs. Interpret the coefficients for the MLR without the interaction (i.e., the first displayed model), and interpret model fit metrics for both models.
germany_data <- read_fst("germany_data.fst")
df <- germany_data
df <- df %>%
mutate(behave = ipbhprp,
secure = impsafe,
safety = ipstrgv,
tradition = imptrad,
rules = ipfrule) %>%
mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
# Apply the reverse coding
mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))
# Now you can calculate 'schwartzauth' after the NA recoding
df$auth <- scales::rescale(df$behave +
df$secure +
df$safety +
df$tradition +
df$rules, to=c(0,100), na.rm=TRUE)
df <- df %>% filter(!is.na(auth))
df <- df %>%
mutate(
polID = case_when(
lrscale %in% 0:3 ~ "Left",
lrscale %in% 7:10 ~ "Right",
lrscale %in% 4:6 ~ "Moderate",
lrscale %in% c(77, 88, 99) ~ NA_character_
),
religious = case_when(
rlgdgr %in% c(77, 88, 99) ~ NA_real_,
TRUE ~ rlgdgr
)
)
model3 <- lm(auth ~ polID + religious, data = df, weights = weight)
model4 <- lm(auth ~ polID + religious + polID*religious, data = df, weights = weight)
modelsummary(
list(model3, model4),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL)
| (1) | (2) | |
|---|---|---|
| (Intercept) | 57.5 (0.7)*** | 59.6 (0.9)*** |
| polIDModerate | 5.3 (0.7)*** | 2.6 (1.1)* |
| polIDRight | 9.3 (1.1)*** | 3.4 (1.9)+ |
| religious | 0.3 (0.1)** | −0.3 (0.2)+ |
| polIDModerate × religious | 0.8 (0.2)** | |
| polIDRight × religious | 1.5 (0.4)*** | |
| Num.Obs. | 2855 | 2855 |
| R2 | 0.035 | 0.041 |
| R2 Adj. | 0.034 | 0.039 |
| AIC | 24680.0 | 24665.5 |
| BIC | 24709.7 | 24707.2 |
| Log.Lik. | −12334.978 | −12325.764 |
| RMSE | 17.13 | 17.13 |
Interpretation for the MLR without the interaction:
Interpret model fit metrics for both models:
Model 3: R-Squared (R² = 0.035): The R-squared value of 0.035 means that 3.5% of the variability in the dependent variable is explained by the model. Adjusted R-Squared for model 3 (Adj. R² = 0.034): The adjusted R-squared value of 0.034 means that the ‘polID’ variable explain approximately 3.4% of the variance in the authoritarian values scale for Germany, after accounting for the number of predictors. While statistically significant, this suggests that the model explains only a small portion of the variability in authoritarian values, pointing to other factors outside generational differences that may play critical roles in shaping these attitudes.
Model 4: polIDModerate (2.6): This coefficient suggests that, comparing individuals with the same religiousness (i.e., what some term “holding constant”), the average outcome for individuals identifying as politically moderate are 2.6 units higher compared to the reference category (in our case, identifying ideologically as on the ‘left’). The “baseline group” refers to the category that is omitted, and is thus interpreted as part of our intercept. polIDRight (3.4): This coefficient indicates that, comparing individuals with the same religiousness, individuals identifying as politically right are expected to be, on average, 3.4 units higher than the reference category (i.e., identifying ideologically as on the ‘left’) on the authoritarian values scale. religious (-0.3): For the ‘religious’ predictor, a coefficient of -0.3 means that with every one-unit increase in religiousness, there is an associated expected average decrease of 0.3 units on the authoritarian values scale, assuming that we are comparing people with the same ideological categorization. R-Squared (R² = 0.041): The R-squared value of 0.041 means that 4.1% of the variability in the dependent variable is explained by the model. Adjusted R-Squared for model 4 (Adj. R² = 0.039): The adjusted R-squared value of 0.039 means that the ‘polID’ variable explain approximately 3.9% of the variance in the authoritarian values scale for Germany, after accounting for the number of predictors. While statistically significant, this suggests that the model explains only a small portion of the variability in authoritarian values, pointing to other factors outside generational differences that may play critical roles in shaping these attitudes.
What we note here is that the R-squared and adjusted R-squared are noticeably higher for the second model – put differently, the first model does not explain much variance in the outcome. Also, the AIC and BIC are higher for the first model, highlighting that the increase in variance explained is not justified (there is greater complexity or potential underlying issues that the AIC and BIC do not like, comparatively). In sum: it suggests issues with the first model.
polIDModerate × religious (0.8): This coefficient suggests that the average outcome of the intersection of “Moderate” and religious is 0.8 units higher compared to the reference category (in our case, identifying ideologically as on the ‘left’).
polIDRight × religious (1.5): This coefficient suggests that the average outcome of the intersection of “Right” and religious is 1.5 units higher compared to the reference category (in our case, identifying ideologically as on the ‘left’).
Now generate the model4 interaction plot that we did in the tutorial, but again using the German data instead of the French. Interpret.
interaction_plot <- effect("polID*religious", model4, na.rm=TRUE)
plot(interaction_plot,
main="Interaction effect",
xlab="Religiousness",
ylab="Authoritarian attitudes scale")
interaction_plot
##
## polID*religious effect
## religious
## polID 0 2 5 8 10
## Left 59.61010 58.93398 57.91980 56.90562 56.22950
## Moderate 62.21299 63.14317 64.53843 65.93370 66.86388
## Right 63.00456 65.26702 68.66072 72.05442 74.31689
Interpretation: in the interaction plot of France, lines are not parallel which means that there is an interaction effect. There is intersection between the authoritarian scale, religiousness, and political ideology. Also, both “Moderate” and “Right” show increased slopes while “Left” shows a decreased slope. “Moderate” increases from 62.2 to 66.9; “Right” increases from 63 to 74.3; “Left” decreases from 59.6 to 56.2. Hence, for “Right”, higher levels of religiousness have the most significant effect while for “Left” has the least effect. It shows that the effect of religiousness on the outcome varies depending on the people’s political ideology.
Use data for the Netherlands and the variable coding for model5 and model6 from the tutorial (i.e., both for the cleaning & recode, as well as the linear model formulas). Do a modelsummary table displaying both model outputs. Interpret the coefficients for the MLR without the interaction (i.e., the first displayed model), and interpret model fit metrics for both models.
netherlands_data <- read_fst("netherlands_data.fst")
netherlands_data <- netherlands_data
netherlands_data <- netherlands_data %>%
mutate(
cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
# Recoding generational cohorts based on the year of birth (yrbrn).
# The year of birth is categorized into different generational cohorts.
# Interwar (1900-1945), Baby Boomers (1946-1964), Gen X (1965-1979), Millennials (1980-1996).
# The 'TRUE' line is a catch-all that keeps the original year of birth for those not in these ranges.
gen = case_when(
yrbrn %in% 1900:1945 ~ "1",
yrbrn %in% 1946:1964 ~ "2",
yrbrn %in% 1965:1979 ~ "3",
yrbrn %in% 1980:1996 ~ "4",
TRUE ~ as.character(yrbrn)
),
# After recoding, the gen variable is converted into a factor with labels for clearer interpretation.
# Factors are used in R to handle categorical variables.
gen = factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
)
table(netherlands_data$gen)
##
## Interwar Baby Boomers Gen X Millennials
## 3967 6359 4684 2803
netherlands_data <- netherlands_data %>%
mutate(behave = ipbhprp,
secure = impsafe,
safety = ipstrgv,
tradition = imptrad,
rules = ipfrule) %>%
mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
# Apply the reverse coding
mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))
# Now you can calculate 'schwartzauth' after the NA recoding
netherlands_data$auth <- scales::rescale(netherlands_data$behave +
netherlands_data$secure +
netherlands_data$safety +
netherlands_data$tradition +
netherlands_data$rules, to=c(0,100), na.rm=TRUE)
netherlands_data <- netherlands_data %>% filter(!is.na(auth))
netherlands_data <- netherlands_data %>%
mutate(religion = case_when(
rlgblg == 2 ~ "No",
rlgblg == 1 ~ "Yes",
rlgblg %in% c(7, 8, 9) ~ NA_character_,
TRUE ~ as.character(rlgblg)
))
# check
table(netherlands_data$religion)
##
## No Yes
## 10913 6750
netherlands_data <- netherlands_data %>%
mutate(ID = case_when(
lrscale >= 0 & lrscale <= 4 ~ "Left",
lrscale >= 6 & lrscale <= 10 ~ "Right",
lrscale > 10 ~ NA_character_, # Set values above 10 as NA
TRUE ~ NA_character_ # Ensure value 5 and any other unexpected values are set as NA
))
table(netherlands_data$ID)
##
## Left Right
## 5449 7010
model5 <- lm(auth ~ religion + ID + gen, data = netherlands_data, weights = weight)
model6 <- lm(auth ~ religion + ID + gen + religion*gen, data = netherlands_data, weights = weight)
modelsummary(
list(model5, model6),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL,
coef_omit = "Intercept")
| (1) | (2) | |
|---|---|---|
| religionYes | 8.2 (0.9)*** | 7.5 (2.1)*** |
| IDRight | 3.4 (0.9)*** | 3.4 (0.9)*** |
| genBaby Boomers | −5.8 (1.3)*** | −6.5 (1.7)*** |
| genGen X | −7.3 (1.4)*** | −7.6 (1.8)*** |
| genMillennials | −8.6 (1.5)*** | −8.6 (1.9)*** |
| religionYes × genBaby Boomers | 1.6 (2.6) | |
| religionYes × genGen X | 0.5 (2.8) | |
| religionYes × genMillennials | −0.6 (3.2) | |
| Num.Obs. | 1147 | 1147 |
| R2 | 0.126 | 0.127 |
| R2 Adj. | 0.123 | 0.121 |
| AIC | 9479.3 | 9484.5 |
| BIC | 9514.7 | 9535.0 |
| Log.Lik. | −4732.669 | −4732.251 |
| RMSE | 14.94 | 14.94 |
Interpretation for the MLR without the interaction:
Interpret model fit metrics for both models:
Model 5: R-Squared (R² = 0.126): The R-squared value of 0.126 means that 12.6% of the variability in the dependent variable is explained by the model. Adjusted R-Squared (Adj. R² = 0.123): The adjusted R-squared value of 0.123 means that the ‘gen’ variable (with our four coded generations) explain approximately 12.3% of the variance in the authoritarian values scale for Netherlands, after accounting for the number of predictors (just one in this case). While statistically significant, this suggests that the model explains only a certain portion of the variability in authoritarian values, pointing to other factors outside generational differences that may play critical roles in shaping these attitudes.
Model 6: genBaby Boomers (-6.5): Compared to the Interwar generation (reference category that is omitted), Baby Boomers exhibit scores that are, on average, 6.5 points lower on the authoritarian values scale (0-100). The triple asterisks ( *** ) denote statistical significance at a very low p-value (under our alpha threshold of 0.05) genGen X (-7.6): Generation X shows an even more pronounced decrease, with scores 7.6 points lower than the Interwar generation on the authoritarian values scale. Again, the triple asterisks ( *** ) denote statistical significance at a very low p-value (under our alpha threshold of 0.05) genMillennials (-8.6): Millennials exhibit the most considerable decrease in authoritarian values, with their scores being 8.6 points lower than those of the Interwar generation. This marked difference is statistically significant, indicating that Millennials diverge most strongly from the Interwar generation in comparing their authoritarian values. R-Squared (R² = 0.127):The R-squared value of 0.127 means that 12.7% of the variability in the dependent variable is explained by the model. Adjusted R-Squared (Adj. R² = 0.121): The adjusted R-squared value of 0.121 means that the ‘gen’ variable (with our four coded generations) explain approximately 12.1% of the variance in the authoritarian values scale for Netherlands, after accounting for the number of predictors (just one in this case). While statistically significant, this suggests that the model explains only a certain portion of the variability in authoritarian values, pointing to other factors outside generational differences that may play critical roles in shaping these attitudes.
What we note here is that the R-squared is higher for the second model, and adjusted R-squared is higher for the first model – put differently, the first model does not explain much variance in the outcome at first but does better at last. Also, the AIC and BIC are higher for the second model, highlighting that the increase in variance explained is not justified (there is greater complexity or potential underlying issues that the AIC and BIC do not like, comparatively). In sum: it suggests issues with the second model.
religionYes × genBaby Boomers (1.6): Compared to the Interwar generation (reference category that is omitted), the intersection of having a religion and Baby Boomers exhibit scores that are, on average, 1.6 points higher on the authoritarian values scale (0-100).
religionYes × genGen X (0.5): Compared to the Interwar generation (reference category that is omitted), the intersection of having a religion and Generation X exhibit scores that are, on average, 0.5 points higher on the authoritarian values scale (0-100).
religionYes × genMillennials (-0.6): Compared to the Interwar generation (reference category that is omitted), the intersection of having a religion and Millennials exhibit scores that are, on average, 0.6 points lower on the authoritarian values scale (0-100).
Produce the model7 interaction plot from the tutorial, but again using the Netherlands data. Interpret.
netherlands_data <- netherlands_data %>%
mutate(
polID = case_when(
lrscale %in% 0:3 ~ "Left",
lrscale %in% 7:10 ~ "Right",
lrscale %in% 4:6 ~ "Moderate",
lrscale %in% c(77, 88, 99) ~ NA_character_
),
religious = case_when(
rlgdgr %in% c(77, 88, 99) ~ NA_real_,
TRUE ~ rlgdgr
)
)
model7 <- lm(auth ~ + cohort + polID + cohort*polID, data = netherlands_data, weights = weight)
interact_plot(model7, pred = cohort, modx = polID, jnplot = TRUE)
Interpretation: in the interaction plot of Netherlands, although “Left” and “Moderate” are parallel, “Right” intersects between them. Hence, there’s likely an interaction between the authoritarian scale, cohort, “Left”, “Moderate”, and “Right.” On the other hand, comparing their sloped, “Right” is the most tilted, and “Moderate” is more tilted than “Left.” “Right” decreases from over 70 to under 60; “Moderate” decreases from between 65 and 70 to almost 55; “Left” decreases from just over 60 to almost 55. It shows that the authoritarian scale of “Right” changes the most and “Left” changes the least.
Produce the model8 interaction plot from the tutorial, but again using the Netherlands data. Interpret.
netherlands_data <- netherlands_data
model8 <- lm(auth ~ religious + ID + religious*ID, data = netherlands_data, weights = weight)
interact_plot(model8, pred = religious, modx = ID, jnplot = TRUE)
Interpretation: in the interaction plot of Netherlands, “Left” and “Right” are parallel. It shows that they don’t have ant interaction effect between the authoritarian scale, religious, “Left”, and “Right.” Besides, comparing their slope, they have similar slopes. They most likely tilt and increase the same. “Left” increases from around 55 to almost 70, and “Right” increases from around 60 to over 70.