# List of packages
packages <- c("tidyverse","broom", "infer", "fst", "modelsummary", "effects", "survey", "interactions") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
##
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
##
## Loading required package: grid
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loading required package: survival
##
##
## Attaching package: 'survey'
##
##
## The following object is masked from 'package:graphics':
##
## dotchart
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "broom" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "infer" "broom" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "fst" "infer" "broom" "lubridate" "forcats" "stringr"
## [7] "dplyr" "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [13] "tidyverse" "stats" "graphics" "grDevices" "utils" "datasets"
## [19] "methods" "base"
##
## [[5]]
## [1] "modelsummary" "fst" "infer" "broom" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[6]]
## [1] "effects" "carData" "modelsummary" "fst" "infer"
## [6] "broom" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[7]]
## [1] "survey" "survival" "Matrix" "grid" "effects"
## [6] "carData" "modelsummary" "fst" "infer" "broom"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
##
## [[8]]
## [1] "interactions" "survey" "survival" "Matrix" "grid"
## [6] "effects" "carData" "modelsummary" "fst" "infer"
## [11] "broom" "lubridate" "forcats" "stringr" "dplyr"
## [16] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [21] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [26] "datasets" "methods" "base"
Graph the histogram distribution using the populist scale (as we did in the tutorial) for the country of Sweden. What do you note?
sweden_data <- read.fst("sweden_data.fst")
# Populist scale (Norris/Inglehart use low levels of trust in parliaments, politicians and parties as an indicator of populism)
# Note: Schafer does it 2 ways, we will focus now on the most straightforward
# Setting values greater than 10 to NA
sweden_data$trstplt <- ifelse(sweden_data$trstplt > 10, NA, sweden_data$trstplt)
sweden_data$trstprl <- ifelse(sweden_data$trstprl > 10, NA, sweden_data$trstprl)
sweden_data$trstprt <- ifelse(sweden_data$trstprt > 10, NA, sweden_data$trstprt)
# Creating and rescaling the trust variable
sweden_data$trust <- scales::rescale(sweden_data$trstplt + sweden_data$trstprl + sweden_data$trstprt, na.rm = TRUE, to = c(0, 100))
# Here's how you would "flip it" because populist is conceived by N + I as 'anti-trust'
# As Schafer explains there are some issues with that conceptualization, but N + I is also widely applied
sweden_data$populist <- scales::rescale(sweden_data$trust, na.rm = TRUE, to=c(100,0))
ggplot(sweden_data, aes(x = populist)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
theme_minimal() +
labs(title = "Distribution of Populist Scale for Hungary",
x = "Populist Scale",
y = "Count")
## Warning: Removed 2503 rows containing non-finite values (`stat_bin()`).
Interpretation: The graph is more rightly- skewed and is a concentration
at the values between 25-50 more. This means the people in Sweden do
have a high trust in parliment and is not a populist country.
Run a linear regression with populist attitudes as the outcome, educational attainment (recoded as a binary “BA or more” and “No BA”), and using the Swedish data. Print the intercept and coefficients only and interpret. Then do a tidy model table displaying the p-value (with digits = 3) and interpret.
sweden_data <- read.fst("sweden_data.fst")
# Populist scale (Norris/Inglehart use low levels of trust in parliaments, politicians and parties as an indicator of populism)
# Note: Schafer does it 2 ways, we will focus now on the most straightforward
# Setting values greater than 10 to NA
sweden_data$trstplt <- ifelse(sweden_data$trstplt > 10, NA, sweden_data$trstplt)
sweden_data$trstprl <- ifelse(sweden_data$trstprl > 10, NA, sweden_data$trstprl)
sweden_data$trstprt <- ifelse(sweden_data$trstprt > 10, NA, sweden_data$trstprt)
# Creating and rescaling the trust variable
sweden_data$trust <- scales::rescale(sweden_data$trstplt + sweden_data$trstprl + sweden_data$trstprt, na.rm = TRUE, to = c(0, 100))
# Here's how you would "flip it" because populist is conceived by N + I as 'anti-trust'
# As Schafer explains there are some issues with that conceptualization, but N + I is also widely applied
sweden_data$populist <- scales::rescale(sweden_data$trust, na.rm = TRUE, to=c(100,0))
sweden_data <- sweden_data %>%
mutate(
# Recoding education
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more",
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
# Handle NAs for education levels
edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
# Explicitly making 'No BA' the reference category
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more")),
)
sweden_pop_model <- lm(populist ~ educ.ba, data = sweden_data)
coefficients_sweden <- coef(sweden_pop_model)
print(coefficients_sweden)
## (Intercept) educ.baBA or more
## 51.358680 -7.868121
tidy(sweden_pop_model)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 51.4 0.181 283. 0
## 2 educ.baBA or more -7.87 0.350 -22.5 2.58e-110
tidy(sweden_pop_model) |>
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 51.359 | 0.181 | 283.363 | 0 |
| educ.baBA or more | -7.868 | 0.350 | -22.494 | 0 |
INTERPRETATION: People with no BA have an average score of a populist score of 51.4 which is the intercept. whereas, people with a BA or higher score on average 7.9 less than the average of 51.4 which is shown through the coefficient. overall, this means that having a BA or more is associated with lower populist scores. Lastly, a p value of 0 shows that there is a relationship between educational attainment and the populist scale in Sweden.
Now using the Italian dataset and the authoritarian values scale (as we did in the tutorial), graph the average by survey year. Interpret.
italy_data <- read.fst("italy_data.fst")
df <- read.fst("italy_data.fst")
## Creates an authoritarian values scale based on human modules items (can also do libertarian value scale)
df <- df %>%
mutate(behave = ipbhprp,
secure = impsafe,
safety = ipstrgv,
tradition = imptrad,
rules = ipfrule) %>%
mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
# Apply the reverse coding
mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))
# Now you can calculate 'schwartzauth' after the NA recoding
df$auth <- scales::rescale(df$behave +
df$secure +
df$safety +
df$tradition +
df$rules, to=c(0,100), na.rm=TRUE)
df <- df %>% filter(!is.na(auth))
table(df$secure)
##
## 1 2 3 4 5 6
## 54 139 626 1856 2876 2881
table(df$auth)
##
## 0 12 16 20 24 28 32 36 40 44 48 52 56 60 64 68 72 76 80 84
## 7 4 4 9 13 20 27 57 113 155 163 323 367 596 647 789 823 897 992 733
## 88 92 96 100
## 632 465 299 297
df$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
df$year[df$essround == i] <- replacements[i]
}
auth_avg <- df %>%
group_by(year) %>%
summarize(auth_avg = mean(auth, na.rm = TRUE))
plot_auth <- ggplot(auth_avg, aes(x = year, y = auth_avg)) +
geom_point(aes(color = auth_avg), alpha = 1.0) +
geom_line(aes(group = 1), color = "blue", linetype = "dashed") +
labs(title = "Authoritarian Scale Average by Year (Italy)",
x = "Survey Year",
y = "Authoritarian Scale Average") +
theme_minimal() +
theme(legend.position = "none") +
scale_y_continuous(limits = c(0, 100))
print(plot_auth)
INTERPRETATION: Within the graph there is not much difference for the
authoritan scale and it only drops a bit within the years of 2016 - 2018
but slightly. Overall, they have a consistent average of 75.
Now run a linear regression model, using the modelsummary package, with authoritarian values as the outcome, and generations as the predictor/potential explanatory variable, and using the Italian data again. Rename the coefficients using the coef_rename function. Interpret the coefficients, whether they are statistically significant, and the adjusted R-squared.
model_data <- df %>%
mutate(
# Recoding gender
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
gndr == 9 ~ NA_character_,
TRUE ~ as.character(gndr)
),
# Recoding education
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more",
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
# Handle NAs for education levels
edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
# Explicitly making 'No BA' the reference category
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more")),
# Recoding age, setting 999 to NA
age = ifelse(agea == 999, NA, agea),
# Recoding cohort variable, setting years before 1930 and after 2000 to NA
cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
# Recoding generational cohorts based on year of birth
gen = case_when(
yrbrn %in% 1900:1945 ~ "1",
yrbrn %in% 1946:1964 ~ "2",
yrbrn %in% 1965:1979 ~ "3",
yrbrn %in% 1980:1996 ~ "4",
TRUE ~ as.character(cohort) # Keeping other values as character if they do not fit the ranges
),
# Converting cohort to a factor with labels
gen = factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
)
table(model_data$gen)
##
## Interwar Baby Boomers Gen X Millennials
## 1026 2580 2214 1811
model1 <- lm(auth ~ gen, data = model_data)
model2 <- lm(auth ~ gndr, data = model_data)
model3 <- lm(auth ~ educ.ba, data = model_data)
models <- list(
"Model 1" = lm(auth ~ gen, data = model_data),
"Model 2" = lm(auth ~ gndr, data = model_data),
"Model 3" = lm(auth ~ educ.ba, data = model_data))
modelsummary(models, fmt = 1,
estimate = c("{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL)
| Model 1 | Model 2 | Model 3 | |
|---|---|---|---|
| (Intercept) | 75.3 (0.5)*** | 74.4 (0.2)*** | 73.3 (0.2)*** |
| genBaby Boomers | −1.1 (0.6)* | ||
| genGen X | −2.1 (0.6)*** | ||
| genMillennials | −3.6 (0.6)*** | ||
| gndrMale | −2.6 (0.3)*** | ||
| educ.baBA or more | −1.1 (0.4)* | ||
| Num.Obs. | 7631 | 8432 | 8432 |
| R2 | 0.006 | 0.007 | 0.001 |
| R2 Adj. | 0.006 | 0.007 | 0.001 |
| AIC | 62934.0 | 69523.1 | 69579.5 |
| BIC | 62968.7 | 69544.3 | 69600.6 |
| Log.Lik. | −31462.009 | −34758.566 | −34786.739 |
| RMSE | 14.94 | 14.93 | 14.98 |
INTERPRETATION (of co-efficients and adjusted R-square): When looking at the co-efficients in comparison to the inter-war generation, babyboomers exhibit a score 1.1 lower than the interwar average. Additionally, for genGenX they exhibit a score that is 2.1 lower and genMillenials exhbits a score 3.6 lower than inter-war generation. Furthermore, when we look at the adjusted R square which is 0.006 compared to the original R square which is also 0.006 we can see that it did not change. Additionally, that the ‘gen’ variable explains only about 0.6% of the variability in authoritarian values in Italy. This indicates that other factors beyond generational difference influence these attitudes as well.
Now visualize, using the modelplot() function from the modelsummary package, coefficient estimates and 95% confidence intervals for the following three models:
Model 1 = populist attitudes scale as the outcome; left/right as the single predictor (omitting moderates as we did in the tutorial), and using data for Greece.
Model 2 = populist attitudes scale as the outcome; gender as the single predictor, and using data for Greece.
Model 3 = populist attitudes scale as the outcome; generations as the single predictor (using the four generational category recode we did in the tutorial), and using data for Greece. Interpret.
greece_data <- read.fst("greece_data.fst")
# Populist scale (Norris/Inglehart use low levels of trust in parliaments, politicians and parties as an indicator of populism)
# Note: Schafer does it 2 ways, we will focus now on the most straightforward
# Setting values greater than 10 to NA
greece_data$trstplt <- ifelse(greece_data$trstplt > 10, NA, greece_data$trstplt)
greece_data$trstprl <- ifelse(greece_data$trstprl > 10, NA, greece_data$trstprl)
greece_data$trstprt <- ifelse(greece_data$trstprt > 10, NA, greece_data$trstprt)
# Creating and rescaling the trust variable
greece_data$trust <- scales::rescale(greece_data$trstplt + greece_data$trstprl + greece_data$trstprt, na.rm = TRUE, to = c(0, 100))
# Here's how you would "flip it" because populist is conceived by N + I as 'anti-trust'
# As Schafer explains there are some issues with that conceptualization, but N + I is also widely applied
greece_data$populist <- scales::rescale(greece_data$trust, na.rm = TRUE, to=c(100,0))
model_data <- greece_data %>%
mutate(
# Recoding gender
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
gndr == 9 ~ NA_character_,
TRUE ~ as.character(gndr)
),
lrscale = case_when(
lrscale %in% 0:4 ~ "Left", # This time: Values 0 to 3 in 'lrscale' are categorized as "Left"
lrscale %in% 6:10 ~ "Right", # This time: Values 6 to 10 in 'lrscale' are categorized as "Right"
lrscale %in% c(77, 88, 99) ~ NA_character_, # Values 77, 88, 99 in 'lrscale' are set as NA
TRUE ~ NA_character_ # Any other values not explicitly matched above
),
# Recoding cohort variable, setting years before 1930 and after 2000 to NA
cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
# Recoding generational cohorts based on year of birth
gen = case_when(
yrbrn %in% 1900:1945 ~ "1",
yrbrn %in% 1946:1964 ~ "2",
yrbrn %in% 1965:1979 ~ "3",
yrbrn %in% 1980:1996 ~ "4",
TRUE ~ as.character(cohort) # Keeping other values as character if they do not fit the ranges
),
# Converting cohort to a factor with labels
gen = factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
)
table(model_data$gen)
##
## Interwar Baby Boomers Gen X Millennials
## 2978 3484 3498 2330
model1 <- lm(populist ~ lrscale, data = model_data)
model2 <- lm(populist ~ gndr, data = model_data)
model3 <- lm(populist ~ gen, data = model_data)
models <- list(
"Model 1" = lm(populist ~ lrscale, data = model_data),
"Model 2" = lm(populist ~ gndr, data = model_data),
"Model 3" = lm(populist ~ gen, data = model_data))
modelsummary(models, fmt = 1,
estimate = c("{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL)
| Model 1 | Model 2 | Model 3 | |
|---|---|---|---|
| (Intercept) | 72.0 (0.5)*** | 70.0 (0.3)*** | 65.8 (0.5)*** |
| lrscaleRight | −8.8 (0.6)*** | ||
| gndrMale | −1.0 (0.5)* | ||
| genBaby Boomers | 3.5 (0.7)*** | ||
| genGen X | 5.6 (0.7)*** | ||
| genMillennials | 5.8 (0.7)*** | ||
| Num.Obs. | 4936 | 9823 | 9562 |
| R2 | 0.037 | 0.000 | 0.009 |
| R2 Adj. | 0.036 | 0.000 | 0.009 |
| AIC | 44723.8 | 88873.2 | 86473.9 |
| BIC | 44743.3 | 88894.8 | 86509.7 |
| Log.Lik. | −22358.894 | −44433.590 | −43231.944 |
| RMSE | 22.44 | 22.30 | 22.25 |
modelplot(models, coef_omit = 'Interc')
INTERPRETATION: when we interpret the coefficients and graph above we can see that Baby Boomers exhibit a 3.5-point increase, Generation X shows a 5.6-point increase, and Millennials display the most considerable rise at 5.8 points compared to the Interwar generation. Moreover, these generational differences explain only about 0.9% (adjusted R square) of the variance in authoritarian values, indicating the influence of other factors. furthermore, Across generations, there’s a decline in authoritarian values, with Baby Boomers scoring 3.9 points lower, Generation X 5.9 points lower, and Millennials 10.5 points lower than the Interwar generation. Confidence intervals provide ranges for true population parameters across 95% of random samples for each predictor variable. Lastly, male and female have no signifance to authoritan values as well.