# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "interactions") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'infer' was built under R version 4.3.3
## Warning: package 'effects' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## Warning: package 'survey' was built under R version 4.3.3
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: survival
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
## Warning: package 'interactions' was built under R version 4.3.3
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "modelsummary" "fst" "infer" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "effects" "carData" "modelsummary" "fst" "infer"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "survey" "survival" "Matrix" "grid" "effects"
## [6] "carData" "modelsummary" "fst" "infer" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[7]]
## [1] "interactions" "survey" "survival" "Matrix" "grid"
## [6] "effects" "carData" "modelsummary" "fst" "infer"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
Graph the histogram distribution using the populist scale (as we did in the tutorial) for the country of Sweden. What do you note?
Ans:
# Read in the Sweden dataset:
sweden_data <- read.fst("sweden_data.fst")
# Populist scale (Norris/Inglehart use low levels of trust in parliaments, politicians and parties as an indicator of populism)
# Setting values greater than 10 to NA
sweden_data$trstplt <- ifelse(sweden_data$trstplt > 10, NA, sweden_data$trstplt)
sweden_data$trstprl <- ifelse(sweden_data$trstprl > 10, NA, sweden_data$trstprl)
sweden_data$trstprt <- ifelse(sweden_data$trstprt > 10, NA, sweden_data$trstprt)
# Creating and rescaling the trust variable
sweden_data$trust <- scales::rescale(sweden_data$trstplt + sweden_data$trstprl + sweden_data$trstprt, na.rm = TRUE, to = c(0, 100))
# Here's how you would "flip it" because populist is conceived by N + I as 'anti-trust'
# As Schafer explains there are some issues with that conceptualization, but N + I is also widely applied
sweden_data$populist <- scales::rescale(sweden_data$trust, na.rm = TRUE, to=c(100,0))
# Graphing the histogram distribution using the populist scale:
ggplot(sweden_data, aes(x = populist)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
theme_minimal() +
labs(title = "Distribution of Populist Scale for Sweden",
x = "Populist Scale",
y = "Count")
## Warning: Removed 2503 rows containing non-finite values (`stat_bin()`).
What do i see: Compared to Hungary and Spain, which we saw in the
tutorial had two modes at slightly less than 50 and 100 on the populist
scale, I see that Sweden has one mode that is slightly less than 50.
Additionally, I notice that the graph appears to be slightly tilted to
the right, with high populist scale counts falling between 0 and 50 and
low populist scale counts exceeding 50. The Norway histogram that we saw
in the tutorial appears a lot like this one.
Run a linear regression with populist attitudes as the outcome, educational attainment (recoded as a binary “BA or more” and “No BA”), and using the Swedish data. Print the intercept and coefficients only and interpret. Then do a tidy model table displaying the p-value (with digits = 3) and interpret.
# Recode educational attainment as a binary
sweden_data <- sweden_data %>%
mutate(
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more",
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
# Handle NAs for education levels
edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
# Explicitly making 'No BA' the reference category
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
)
# Check that the recoding worked as expected:
table(sweden_data$educ.ba)
##
## No BA BA or more
## 13477 4739
# Run a linear regression
sweden_pop_model <- lm(populist ~ educ.ba, data = sweden_data)
# Print the intercept and coefficients only:
coefficients_sweden <- coef(sweden_pop_model)
print(coefficients_sweden)
## (Intercept) educ.baBA or more
## 51.358680 -7.868121
Thus, 51.36 would be the predicted average populist scale score in Sweden. According to this score, Swedish populist views are more robust and unbiased. The populist scale scores of those who identify as politically right in Sweden are, on average, 7.868121 points lower than those of those who identify as left, according to the negative coefficient linked to the “Right” category. This indicates that those who lean left in Sweden have more populist views than people who lean right. (“No BA” individuals exhibit stronger populist attitudes than the “BA or more” individuals.)
# Now do a tidy model table displaying p-value and interpret:
broom::tidy(sweden_pop_model) |>
knitr::kable(digits = 3)
## Registered S3 methods overwritten by 'broom':
## method from
## tidy.glht jtools
## tidy.summary.glht jtools
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 51.359 | 0.181 | 283.363 | 0 |
| educ.baBA or more | -7.868 | 0.350 | -22.494 | 0 |
summary(sweden_pop_model)
##
## Call:
## lm(formula = populist ~ educ.ba, data = sweden_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.359 -14.692 -1.359 11.975 56.509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.3587 0.1812 283.36 <2e-16 ***
## educ.baBA or more -7.8681 0.3498 -22.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.43 on 15711 degrees of freedom
## (2503 observations deleted due to missingness)
## Multiple R-squared: 0.0312, Adjusted R-squared: 0.03114
## F-statistic: 506 on 1 and 15711 DF, p-value: < 2.2e-16
p-value of a very small number close to 0 indicates that it is statistically significant and we can say that there is a statistically significant association between populist attitudes and educational attainment.
Now using the Italian dataset and the authoritarian values scale (as we did in the tutorial), graph the average by survey year. Interpret. Auth flipped non auth not flipped))
italy_data <- read.fst("italy_data.fst")
# Load necessary libraries
library(dplyr)
library(ggplot2)
# Read the Italian dataset
italy_data <- read.fst("italy_data.fst")
# Create authoritarian values scale based on human modules items
italy_data <- italy_data %>%
mutate(behave = ipbhprp,
secure = impsafe,
safety = ipstrgv,
tradition = imptrad,
rules = ipfrule) %>%
mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
# Apply reverse coding
mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))
# Now you can calculate 'schwartzauth' after the NA recoding
italy_data$auth <- scales::rescale(italy_data$behave +
italy_data$secure +
italy_data$safety +
italy_data$tradition +
italy_data$rules, to=c(0,100), na.rm=TRUE)
italy_data <- italy_data %>% filter(!is.na(auth))
table(italy_data$secure)
##
## 1 2 3 4 5 6
## 54 139 626 1856 2876 2881
italy_data$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
italy_data$year[italy_data$essround == i] <- replacements[i]
}
auth_avg <- italy_data %>%
group_by(year) %>%
summarize(auth_avg = mean(auth, na.rm = TRUE))
plot_auth <- ggplot(auth_avg, aes(x = year, y = auth_avg)) +
geom_point(aes(color = auth_avg), alpha = 1.0) +
geom_line(aes(group = 1), color = "blue", linetype = "dashed") +
labs(title = "Authoritarian Scale Average by Year (Italy)",
x = "Survey Year",
y = "Authoritarian Scale Average") +
theme_minimal() +
theme(legend.position = "none") +
scale_y_continuous(limits = c(0, 100))
print(plot_auth)
The visual shows small, but linear fluctuations in the authoritarian scale average by year. Starting from 75 in 2012, dropping to around 73-4 by 2016, rising slightly to about 74.5 btw 2018 - 2020.
Now run a linear regression model, using the modelsummary package, with authoritarian values as the outcome, and generations as the predictor/potential explanatory variable, and using the Italian data again. Rename the coefficients using the coef_rename function. Interpret the coefficients, whether they are statistically significant, and the adjusted R-squared.
model_data <- italy_data %>%
mutate(
# Recoding cohort variable, setting years before 1930 and after 2000 to NA
cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
# Recoding generational cohorts based on year of birth
gen = case_when(
yrbrn %in% 1900:1945 ~ "1",
yrbrn %in% 1946:1964 ~ "2",
yrbrn %in% 1965:1979 ~ "3",
yrbrn %in% 1980:1996 ~ "4",
TRUE ~ as.character(cohort) # Keeping other values as character if they do not fit the ranges
),
# Converting cohort to a factor with labels
gen = factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
)
table(model_data$gen)
##
## Interwar Baby Boomers Gen X Millennials
## 1026 2580 2214 1811
# Next, run a linear regression model and rename the coefficients:
model <- lm(auth ~ gen, data = model_data)
modelsummary(
list(model),coef_rename = c("genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials"))
| (1) | |
|---|---|
| (Intercept) | 75.345 |
| (0.467) | |
| Baby Boomers | −1.125 |
| (0.552) | |
| Gen X | −2.080 |
| (0.564) | |
| Millennials | −3.592 |
| (0.584) | |
| Num.Obs. | 7631 |
| R2 | 0.006 |
| R2 Adj. | 0.006 |
| AIC | 62934.0 |
| BIC | 62968.7 |
| Log.Lik. | −31462.009 |
| RMSE | 14.94 |
As a result, the negative coefficient shows that Baby Boomers are less authoritarian than the interwar generation. Comparing Gen X and Millennials to the interwar generation, there is a similar drop in authoritarian ideology.
When Gen X - Interwar generation are compared, Gen X exhibits scores that are, on average, 2.080 points lower on the authoritarian values scale. This is a larger decrease than was observed for the Baby Boomers generation compared to the Interwar generation. Again, there is no triple asterisks for this coefficient indicating that we do not have a very low p-value below our 0.05 alpha threshold, and therefore we cannot say that this is statistically significant.
The small value of the adjusted R squared (value is 0.006) means that
the ‘gen’ variable explains approximately 0.6% of the variance in
authoritarian values scales for Italy, after accounting for the number
of predictors which is 1 (taking into account number of predictors is
used in adjusted R squared but not R squared). This suggests that this
model only explains a very small portion of the variability in
authoritarian values and that there are likely other parameters outside
of generational differences that may play a significant role in shaping
these attitudes.
Now visualize, using the modelplot() function from the modelsummary package, coefficient estimates and 95% confidence intervals for the following three models:
Model 1 = populist attitudes scale as the outcome; left/right as the single predictor (omitting moderates as we did in the tutorial), and using data for Greece.
Model 2 = populist attitudes scale as the outcome; gender as the single predictor, and using data for Greece.
Model 3 = populist attitudes scale as the outcome; generations as the single predictor (using the four generational category recode we did in the tutorial), and using data for Greece. Interpret.
## Model #1
greece_data <- read.fst("greece_data.fst")
greece_data$trstplt <- ifelse(greece_data$trstplt > 10, NA, greece_data$trstplt)
greece_data$trstprl <- ifelse(greece_data$trstprl > 10, NA, greece_data$trstprl)
greece_data$trstprt <- ifelse(greece_data$trstprt > 10, NA, greece_data$trstprt)
greece_data$trust <- scales::rescale(greece_data$trstplt + greece_data$trstprl + greece_data$trstprt, na.rm = TRUE, to = c(0, 100))
greece_data$populist <- scales::rescale(greece_data$trust, na.rm = TRUE, to=c(100,0))
greece_data <- greece_data %>%
mutate(
ID = case_when(
lrscale %in% 0:4 ~ "Left",
lrscale %in% 6:10 ~ "Right",
lrscale %in% c(77, 88, 99) ~ NA_character_,
TRUE ~ NA_character_
)
)
greece_lr_model <- lm(populist ~ ID, data = greece_data)
greece_data <- greece_data %>%
mutate(
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
gndr == 9 ~ NA_character_,
TRUE ~ as.character(gndr)
))
greece_gndr_model <- lm(populist ~ gndr, data = greece_data)
greece_data <- greece_data %>%
mutate(
cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
gen = case_when(
yrbrn %in% 1900:1945 ~ "1",
yrbrn %in% 1946:1964 ~ "2",
yrbrn %in% 1965:1979 ~ "3",
yrbrn %in% 1980:1996 ~ "4",
TRUE ~ as.character(cohort) # Keeping other values as character if they do not fit the ranges
),
gen = factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
)
table(greece_data$gen)
##
## Interwar Baby Boomers Gen X Millennials
## 2978 3484 3498 2330
greece_gen_model <- lm(populist ~ gen, data = greece_data)
models <- list(
"Model 1" = lm(populist ~ ID, data = greece_data),
"Model 2" = lm(populist ~ gndr, data = greece_data),
"Model 3" = lm(populist ~ gen, data = greece_data))
modelsummary(models, fmt = 1,
estimate = c("{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL)
| Model 1 | Model 2 | Model 3 | |
|---|---|---|---|
| (Intercept) | 72.0 (0.5)*** | 70.0 (0.3)*** | 65.8 (0.5)*** |
| IDRight | −8.8 (0.6)*** | ||
| gndrMale | −1.0 (0.5)* | ||
| genBaby Boomers | 3.5 (0.7)*** | ||
| genGen X | 5.6 (0.7)*** | ||
| genMillennials | 5.8 (0.7)*** | ||
| Num.Obs. | 4936 | 9823 | 9562 |
| R2 | 0.037 | 0.000 | 0.009 |
| R2 Adj. | 0.036 | 0.000 | 0.009 |
| AIC | 44723.8 | 88873.2 | 86473.9 |
| BIC | 44743.3 | 88894.8 | 86509.7 |
| Log.Lik. | −22358.894 | −44433.590 | −43231.944 |
| RMSE | 22.44 | 22.30 | 22.25 |
modelplot(models, coef_omit = 'Interc')
The graph shows that three generations coefficients are above 0, with
the “Millenials” being the largest. The gender coefficient is negative
and closer to 0 than “IDRight” which is also negative but is still
further.