# List of packages
packages <- c("tidyverse", "broom", "infer", "fst", "modelsummary", "effects", "survey", "interactions") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
##
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
##
## Loading required package: grid
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loading required package: survival
##
##
## Attaching package: 'survey'
##
##
## The following object is masked from 'package:graphics':
##
## dotchart
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "broom" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "infer" "broom" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "fst" "infer" "broom" "lubridate" "forcats" "stringr"
## [7] "dplyr" "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [13] "tidyverse" "stats" "graphics" "grDevices" "utils" "datasets"
## [19] "methods" "base"
##
## [[5]]
## [1] "modelsummary" "fst" "infer" "broom" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[6]]
## [1] "effects" "carData" "modelsummary" "fst" "infer"
## [6] "broom" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[7]]
## [1] "survey" "survival" "Matrix" "grid" "effects"
## [6] "carData" "modelsummary" "fst" "infer" "broom"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
##
## [[8]]
## [1] "interactions" "survey" "survival" "Matrix" "grid"
## [6] "effects" "carData" "modelsummary" "fst" "infer"
## [11] "broom" "lubridate" "forcats" "stringr" "dplyr"
## [16] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [21] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [26] "datasets" "methods" "base"
Graph the histogram distribution using the populist scale (as we did in the tutorial) for the country of Sweden. What do you note?
sweden_data <- read.fst("sweden_data.fst")
# Setting values greater than 10 to NA
sweden_data$trstplt <- ifelse(sweden_data$trstplt > 10, NA, sweden_data$trstplt)
sweden_data$trstprl <- ifelse(sweden_data$trstprl > 10, NA, sweden_data$trstprl)
sweden_data$trstprt <- ifelse(sweden_data$trstprt > 10, NA, sweden_data$trstprt)
# Creating and rescaling the trust variable
sweden_data$trust <- scales::rescale(sweden_data$trstplt + sweden_data$trstprl + sweden_data$trstprt, na.rm = TRUE, to = c(0, 100))
# Here's how you would "flip it" because populist is conceived by N + I as 'anti-trust'
# As Schafer explains there are some issues with that conceptualization, but N + I is also widely applied
sweden_data$populist <- scales::rescale(sweden_data$trust, na.rm = TRUE, to=c(100,0))
#visualize
ggplot(sweden_data, aes(x = populist)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
theme_minimal() +
labs(title = "Distribution of Populist Scale for Hungary",
x = "Populist Scale",
y = "Count")
## Warning: Removed 2503 rows containing non-finite values (`stat_bin()`).
sweden_data <- sweden_data %>%
mutate(
ID = case_when(
lrscale %in% 0:4 ~ "Left", # This time: Values 0 to 3 in 'lrscale' are categorized as "Left"
lrscale %in% 6:10 ~ "Right", # This time: Values 6 to 10 in 'lrscale' are categorized as "Right"
lrscale %in% c(77, 88, 99) ~ NA_character_, # Values 77, 88, 99 in 'lrscale' are set as NA
TRUE ~ NA_character_ # Any other values not explicitly matched above
)
)
table(sweden_data$ID)
##
## Left Right
## 6188 7406
sweden_pop_model <- lm(populist ~ ID, data = sweden_data)
coefficients_sweden <- coef(sweden_pop_model)
print(coefficients_sweden)
## (Intercept) IDRight
## 48.3188406 -0.7149274
ANSWER: Intercept (48.3):
This value represents the average score on the populist scale for the reference group in Sweden, which is the “Left” group. In simpler terms, the expected average populist scale score would be 48.3 for the left, since it is the reference category. This score indicates a certain level of populism, where higher scores suggest stronger populist attitudes.
IDRight Coefficient (-0.71):
The negative coefficient associated with the “Right” group indicates that, on average, individuals who identify as ideologically right in Sweden have populist scale scores that are 0.71 points lower than those who identify as left.Comparing right-leaning individuals to left-leaning ones in Sweden, we expect to see the average populist scale score to be 0.71 points lower for the right-leaning group. This suggests that left-leaning individuals exhibit stronger populist attitudes than their right-leaning counterparts.
Run a linear regression with populist attitudes as the outcome, educational attainment (recoded as a binary “BA or more” and “No BA”), and using the Swedish data. Print the intercept and coefficients only and interpret. Then do a tidy model table displaying the p-value (with digits = 3) and interpret.
sweden_data <- sweden_data %>%
mutate(
# Recoding education
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more",
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
# Handle NAs for education levels
edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
# Explicitly making 'No BA' the reference category
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
)
sweden_lm <- lm(populist ~ educ.ba, data = sweden_data)
coefficients_sweden <- coef(sweden_lm)
print(coefficients_sweden)
## (Intercept) educ.baBA or more
## 51.358680 -7.868121
Interpretation: The intercept value of 51.3 shows the average populist score for individuals who do not have a Bachelor’s degree. The coefficient for individuals with BA or more being -7.9 having a Bachelor’s degree or higher is associated with 7.9 decrease in the populist attitudes. Individuals with a bachelors degree tend to have populist attitudes that are 7.9 lower than individuals without a Bachelors.
tidy(sweden_lm)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 51.4 0.181 283. 0
## 2 educ.baBA or more -7.87 0.350 -22.5 2.58e-110
tidy(sweden_lm) |>
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 51.359 | 0.181 | 283.363 | 0 |
| educ.baBA or more | -7.868 | 0.350 | -22.494 | 0 |
Interpretation: The intercept p-value of 0 shows that it is statistically significant, suggesting that there is a difference in the expected average score on the populist scale for individuals with a bachelor’s degree compared to those without one The p-value for the coefficient associated with “No BA” is also 0, representing the difference in the expected average score on the populist scale between people without a bachelor’s degree and the reference category which is statistically significant
Now using the Italian dataset and the authoritarian values scale (as we did in the tutorial), graph the average by survey year. Interpret.
italy_data <- read.fst("italy_data.fst")
italy_data <- italy_data %>%
mutate(behave = ipbhprp,
secure = impsafe,
safety = ipstrgv,
tradition = imptrad,
rules = ipfrule) %>%
mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
# Apply the reverse coding
mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))
# Now you can calculate 'schwartzauth' after the NA recoding
italy_data$auth <- scales::rescale(italy_data$behave +
italy_data$secure +
italy_data$safety +
italy_data$tradition +
italy_data$rules, to=c(0,100), na.rm=TRUE)
italy_data <- italy_data %>% filter(!is.na(auth))
table(italy_data$secure)
##
## 1 2 3 4 5 6
## 54 139 626 1856 2876 2881
table(italy_data$auth)
##
## 0 12 16 20 24 28 32 36 40 44 48 52 56 60 64 68 72 76 80 84
## 7 4 4 9 13 20 27 57 113 155 163 323 367 596 647 789 823 897 992 733
## 88 92 96 100
## 632 465 299 297
italy_data$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
italy_data$year[italy_data$essround == i] <- replacements[i]
}
auth_avg <- italy_data %>%
group_by(year) %>%
summarize(auth_avg = mean(auth, na.rm = TRUE))
plot_auth <- ggplot(auth_avg, aes(x = year, y = auth_avg)) +
geom_point(aes(color = auth_avg), alpha = 1.0) +
geom_line(aes(group = 1), color = "blue", linetype = "dashed") +
labs(title = "Authoritarian Scale Average by Year (Italy)",
x = "Survey Year",
y = "Authoritarian Scale Average") +
theme_minimal() +
theme(legend.position = "none") +
scale_y_continuous(limits = c(0, 100))
print(plot_auth)
Interpretation: The authoritarian average in Italy shows small changes
over the years, remaining slightly stable around 75. From 2002 to 2020,
the average score does not vary by a significant amount, indicating a
consistent level of authoritarian attitude among the population.
Overall, the tendency suggests a consistent level of authoritarian value
among Italians, with the average ranging from 72 to 75.
Now run a linear regression model, using the modelsummary package, with authoritarian values as the outcome, and generations as the predictor/potential explanatory variable, and using the Italian data again. Rename the coefficients using the coef_rename function. Interpret the coefficients, whether they are statistically significant, and the adjusted R-squared.
model_data <- read.fst("italy_data.fst")
model_data <- model_data %>%
mutate(
# Recoding cohort variable, setting years before 1930 and after 2000 to NA
cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
# Recoding generational cohorts based on year of birth
gen = case_when(
yrbrn %in% 1900:1945 ~ "1",
yrbrn %in% 1946:1964 ~ "2",
yrbrn %in% 1965:1979 ~ "3",
yrbrn %in% 1980:1996 ~ "4",
TRUE ~ as.character(cohort) # Keeping other values as character if they do not fit the ranges
),
# Converting cohort to a factor with labels
gen = factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
)
table(model_data$gen)
##
## Interwar Baby Boomers Gen X Millennials
## 1470 3179 2636 2031
model_data <- model_data %>%
mutate(behave = ipbhprp,
secure = impsafe,
safety = ipstrgv,
tradition = imptrad,
rules = ipfrule) %>%
mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))
model_data$auth <- scales::rescale(model_data$behave +
model_data$secure +
model_data$safety +
model_data$tradition +
model_data$rules, to=c(0,100), na.rm=TRUE)
model_data <- model_data %>% filter(!is.na(auth))
table(model_data$secure)
##
## 1 2 3 4 5 6
## 54 139 626 1856 2876 2881
model <- lm(auth ~ gen, data = model_data)
modelsummary(
list(model),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}"),
statistic = NULL,
coef_rename = c("genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials")
)
| (1) | |
|---|---|
| (Intercept) | 75.3 (0.5)*** |
| Baby Boomers | −1.1 (0.6)* |
| Gen X | −2.1 (0.6)*** |
| Millennials | −3.6 (0.6)*** |
| Num.Obs. | 7631 |
| R2 | 0.006 |
| R2 Adj. | 0.006 |
| AIC | 62934.0 |
| BIC | 62968.7 |
| Log.Lik. | −31462.009 |
| RMSE | 14.94 |
tidy(model)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 75.3 0.467 162. 0
## 2 genBaby Boomers -1.12 0.552 -2.04 4.14e- 2
## 3 genGen X -2.08 0.564 -3.69 2.29e- 4
## 4 genMillennials -3.59 0.584 -6.15 8.01e-10
Interpretation: The intercept of 75.3 shows the expected average authoritarian scale score when all other predictor variables are zero. When comparing ton the reference category, Baby Boomers on average are 1.1 points lower on the authoritarian values scale. The * indicates statistical significance at a lower p-value than the threshold of 0.05. Secondly, the Gen X shows a decrease with a score of 2.1 points below the reference category on the authoritarian values scale. The Three *** show statistical significance at a lower p value than the threshold of 0.05. Lastly, Millennials have the largest decrease in authoritarian values, with a score of 3.6 lower than the interwar generation. As for the R-squared adjusted value of 0.006, this means that the gen varaible explains 0.6% of the variance in authoritarian values scale. Therefore, the model only explains a small portion of the variability in authoritarian values.
Now visualize, using the modelplot() function from the modelsummary package, coefficient estimates and 95% confidence intervals for the following three models:
Model 1 = populist attitudes scale as the outcome; left/right as the single predictor (omitting moderates as we did in the tutorial), and using data for Greece.
Model 2 = populist attitudes scale as the outcome; gender as the single predictor, and using data for Greece.
Model 3 = populist attitudes scale as the outcome; generations as the single predictor (using the four generational category recode we did in the tutorial), and using data for Greece. Interpret.
greece_data <- read.fst("greece_data.fst")
model_data <- greece_data %>%
mutate(
ID = case_when(
lrscale %in% 0:4 ~ "Left",
lrscale %in% 6:10 ~ "Right",
lrscale %in% c(77, 88, 99) ~ NA_character_,
TRUE ~ NA_character_,
),
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
gndr == 9 ~ NA_character_,
TRUE ~ as.character(gndr)
),
age = ifelse(agea == 999, NA, agea),
cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
gen = case_when(
yrbrn %in% 1900:1945 ~ "1",
yrbrn %in% 1946:1964 ~ "2",
yrbrn %in% 1965:1979 ~ "3",
yrbrn %in% 1980:1996 ~ "4",
TRUE ~ as.character(cohort)
),
gen = factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
)
model_data$trstplt <- ifelse(model_data$trstplt > 10, NA, model_data$trstplt)
model_data$trstprl <- ifelse(model_data$trstprl > 10, NA, model_data$trstprl)
model_data$trstprt <- ifelse(model_data$trstprt > 10, NA, model_data$trstprt)
model_data$trust <- scales::rescale(model_data$trstplt + model_data$trstprl + model_data$trstprt, na.rm = TRUE, to = c(0, 100))
model_data$populist <- scales::rescale(model_data$trust, na.rm = TRUE, to=c(100,0))
table(model_data$ID)
##
## Left Right
## 2688 3522
table(model_data$gndr)
##
## Female Male
## 6929 5629
table(model_data$gen)
##
## Interwar Baby Boomers Gen X Millennials
## 2978 3484 3498 2330
model3 <- lm(populist ~ ID, data = model_data)
model4 <- lm(populist ~ gndr, data = model_data)
model5 <- lm(populist ~ gen, data = model_data)
models2 <- list(
"Model 1" = lm(populist ~ ID, data = model_data),
"Model 2" = lm(populist ~ gndr, data = model_data),
"Model 3" = lm(populist ~ gen, data = model_data))
modelsummary(models2, fmt = 1,
estimate = c("{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
coef_rename = c("genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials"),
statistic = NULL,
)
| Model 1 | Model 2 | Model 3 | |
|---|---|---|---|
| (Intercept) | 72.0 (0.5)*** | 70.0 (0.3)*** | 65.8 (0.5)*** |
| IDRight | −8.8 (0.6)*** | ||
| gndrMale | −1.0 (0.5)* | ||
| Baby Boomers | 3.5 (0.7)*** | ||
| Gen X | 5.6 (0.7)*** | ||
| Millennials | 5.8 (0.7)*** | ||
| Num.Obs. | 4936 | 9823 | 9562 |
| R2 | 0.037 | 0.000 | 0.009 |
| R2 Adj. | 0.036 | 0.000 | 0.009 |
| AIC | 44723.8 | 88873.2 | 86473.9 |
| BIC | 44743.3 | 88894.8 | 86509.7 |
| Log.Lik. | −22358.894 | −44433.590 | −43231.944 |
| RMSE | 22.44 | 22.30 | 22.25 |
tidy(model3)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 72.0 0.478 151. 0
## 2 IDRight -8.80 0.643 -13.7 6.33e-42
tidy(model4)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 70.0 0.305 230. 0
## 2 gndrMale -0.967 0.452 -2.14 0.0324
tidy(model5)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 65.8 0.508 130. 0
## 2 genBaby Boomers 3.55 0.661 5.37 7.97e- 8
## 3 genGen X 5.58 0.659 8.47 2.85e-17
## 4 genMillennials 5.79 0.706 8.21 2.51e-16
modelplot(models2, coef_omit = 'Interc', coef_rename = c("genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials"))
Coefficient estimates interpretation:
Millennials: Millennials have the largest increase in populist values, as their scores are 5.8 points higher than those of the Interwar generation (as the reference group). This is statistically significant, showing that Millennials, on average, possess the strongest populist attitudes among the four types of generations in the dataset
Gen X: Generation X presents a little less pronounced increase compared to Millennials, as their scores are 5.6 points higher than the Interwar generation (the reference group) on the populist values scale.
Baby Boomers: Baby Boomers incorporate scores that are on average, 3.6 points higher on the populist values scale
gndrMale: Male is statistically significant as their scores are 1 point lower than females on the populist values scale, with the p-value of 0.032. This shows that males possess slightly weaker populist attitudes than women in Greece.
IDRight: Compared to those who are politically left, those who are going politically right present scores that are 8.8 points lower on the populist values scale. This difference is statistically significant, representing that the right people possess weaker populist attitudes than those who lean left.
confint(model3, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 71.0653 72.938927
## IDRight -10.0591 -7.539754
confint(model4, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 69.416151 70.60995391
## gndrMale -1.852559 -0.08087043
confint(model5, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 64.777052 66.768087
## genBaby Boomers 2.254559 4.845308
## genGen X 4.288926 6.872488
## genMillennials 4.409253 7.175453
95% confidence interval interpretation:
IDRight: the confidence interval ranging from 61.0 (71.1-10.1) to 65.4 (72.9-7.5) of populist scores scale is expected to establish the true population parameter, across 95% of random samples.
gndrMale: confidence interval ranging from 67.5 (69.4-1.9) to 70.5 (70.6-0.1) of populist scores scale is expected to establish the true population parameter, across 95% of random samples.
genBaby Boomers: the confidence interval ranging from 67.0 (64.8+2.2) to 71.6 (66.8+4.8) of populist scores scale is expected to establish the true population parameter, across 95% of random samples.
genGen X: the confidence interval ranging from 69.1 (64.8+4.3) to 73.7 (66.8+6.9) of populist scores scale is expected to establish the true population parameter, across 95% of random samples.
genMillennials: confidence interval ranging from 69.2 (64.8+4.4) to 74.0 (66.8+7.2) of populist scores scale is expected to establish the true population parameter, across 95% of random samples.