Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "interactions") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'infer' was built under R version 4.3.3
## Warning: package 'effects' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## Warning: package 'survey' was built under R version 4.3.3
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: survival
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
## Warning: package 'interactions' was built under R version 4.3.3
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "modelsummary" "fst" "infer" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "effects" "carData" "modelsummary" "fst" "infer"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "survey" "survival" "Matrix" "grid" "effects"
## [6] "carData" "modelsummary" "fst" "infer" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[7]]
## [1] "interactions" "survey" "survival" "Matrix" "grid"
## [6] "effects" "carData" "modelsummary" "fst" "infer"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
Graph the histogram distribution using the populist scale (as we did in the tutorial) for the country of Sweden. What do you note?
Sweden_data <- read.fst("Sweden_data.fst")
Sweden_data$trstplt <- ifelse(Sweden_data$trstplt > 10, NA, Sweden_data$trstplt)
Sweden_data$trstprl <- ifelse(Sweden_data$trstprl > 10, NA, Sweden_data$trstprl)
Sweden_data$trstprt <- ifelse(Sweden_data$trstprt > 10, NA, Sweden_data$trstprt)
Sweden_data$trust <- scales::rescale(Sweden_data$trstplt + Sweden_data$trstprl + Sweden_data$trstprt, na.rm = TRUE, to = c(0, 100))
Sweden_data$populist <- scales::rescale(Sweden_data$trust, na.rm = TRUE, to=c(100,0))
ggplot(Sweden_data, aes(x = populist)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
theme_minimal() +
labs(title = "Distribution of Populist Scale for Sweden",
x = "Populist Scale",
y = "Count")
## Warning: Removed 2503 rows containing non-finite values (`stat_bin()`).
The histogram shows that the populist data of Sweden has a slight left
skew. The graph shows a standard bell curve with a peak appearing at 50.
The overall trend of the populist scale dies down after 50 while the
trend up until 50 seems to pick up gradually.
Run a linear regression with populist attitudes as the outcome, educational attainment (recoded as a binary “BA or more” and “No BA”), and using the Swedish data. Print the intercept and coefficients only and interpret. Then do a tidy model table displaying the p-value (with digits = 3) and interpret.
Sf <- read.fst("Sweden_data.fst")
Sf <- Sf %>%
mutate(
behave = ipbhprp,
secure = impsafe,
safety = ipstrgv,
tradition = imptrad,
rules = ipfrule
) %>%
mutate(across(c(behave, secure, safety, tradition, rules),
~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
mutate(across(c(behave, secure, safety, tradition, rules), ~ 7 - .x )) %>%
mutate(
populist = scales::rescale(behave + secure + safety + tradition + rules, to=c(0,100), na.rm=TRUE)
)
Sf <- Sf %>% filter(!is.na(populist))
model_data <- Sf %>%
mutate(
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more",
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
)
model1 <- lm(populist ~ educ.ba, data = model_data)
summary(model1)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.04499 0.1690301 343.40030 0.000000e+00
## educ.baBA or more -4.73686 0.3324911 -14.24658 9.664324e-46
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.04499 0.1690301 343.40030 0.000000e+00
## educ.baBA or more -4.73686 0.3324911 -14.24658 9.664324e-46
modelsummary(
list(model1),
fmt = 3,
statistic = "p.value"
)
| (1) | |
|---|---|
| (Intercept) | 58.045 |
| (<0.001) | |
| educ.baBA or more | −4.737 |
| (<0.001) | |
| Num.Obs. | 14328 |
| R2 | 0.014 |
| R2 Adj. | 0.014 |
| AIC | 122558.3 |
| BIC | 122581.0 |
| Log.Lik. | −61276.168 |
| RMSE | 17.42 |
The intercept 58.045 represents the estimated value of the dependent variable when all predictor variables are set to zero. The coefficient -4.737 signifies that indiviudals with a BA or more” education level, compared to those with “No BA” education level, have, on average, a decrease of 4.737 units in their populist attitudes scale score. Accordingly, the p-value “<0.001” indicates that the coefficient is statistically significant which suggests that the education level is associated with differences in the populist attitudes scale score. The R-squared value of 0.014 indicates that approximately 1.4% of the variance in the dependent variable (populist attitudes scale) is explained by the independent variable (education level) in the model. The AIC value of 122558.3 is a measure of the relative quality of the statistical model for a given set of data.
Now using the Italian dataset and the authoritarian values scale (as we did in the tutorial), graph the average by survey year. Interpret.
Italy_data <- read.fst("Italy_data.fst")
af <- read.fst("italy_data.fst")
## Creates an authoritarian values scale based on human modules items (can also do libertarian value scale)
af <- af %>%
mutate(behave = ipbhprp,
secure = impsafe,
safety = ipstrgv,
tradition = imptrad,
rules = ipfrule) %>%
mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
# Apply the reverse coding
mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))
# Now you can calculate 'schwartzauth' after the NA recoding
af$auth <- scales::rescale(af$behave +
af$secure +
af$safety +
af$tradition +
af$rules, to=c(0,100), na.rm=TRUE)
af <- af %>% filter(!is.na(auth))
table(af$secure)
##
## 1 2 3 4 5 6
## 54 139 626 1856 2876 2881
ggplot(af, aes(x = auth)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
theme_minimal() +
labs(title = "Distribution of Authoritarian Scale for Italy",
x = "Authoritarian Scale",
y = "Count")
The graph shows a peak of 1000 as well as a shifted bell curve towards
the right.
Now run a linear regression model, using the modelsummary package, with authoritarian values as the outcome, and generations as the predictor/potential explanatory variable, and using the Italian data again. Rename the coefficients using the coef_rename function. Interpret the coefficients, whether they are statistically significant, and the adjusted R-squared.
model_data1 <- af %>%
mutate(
gen = case_when(
yrbrn >= 1900 & yrbrn <= 1945 ~ "1",
yrbrn >= 1946 & yrbrn <= 1964 ~ "2",
yrbrn >= 1965 & yrbrn <= 1979 ~ "3",
yrbrn >= 1980 & yrbrn <= 1996 ~ "4",
TRUE ~ NA_character_ # Keeping other values as NA
),
# Converting gen to a factor with labels
gen = factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
)
table(model_data1$gen)
##
## Interwar Baby Boomers Gen X Millennials
## 1026 2580 2214 1811
model <- lm(auth ~ gen, data = model_data1)
modelsummary(
list(model),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}"),
statistic = NULL,
coef_omit = "Intercept")
| (1) | |
|---|---|
| genBaby Boomers | −1.1 (0.6)* |
| genGen X | −2.1 (0.6)*** |
| genMillennials | −3.6 (0.6)*** |
| Num.Obs. | 7631 |
| R2 | 0.006 |
| R2 Adj. | 0.006 |
| AIC | 62934.0 |
| BIC | 62968.7 |
| Log.Lik. | −31462.009 |
| RMSE | 14.94 |
models <- list(
"Model 1" = lm(auth ~ gen, data = model_data1)
)
modelsummary(models,
estimate = c("{estimate} ({std.error}){stars}"),
statistic = NULL)
| Model 1 | |
|---|---|
| (Intercept) | 75.345 (0.467)*** |
| genBaby Boomers | −1.125 (0.552)* |
| genGen X | −2.080 (0.564)*** |
| genMillennials | −3.592 (0.584)*** |
| Num.Obs. | 7631 |
| R2 | 0.006 |
| R2 Adj. | 0.006 |
| AIC | 62934.0 |
| BIC | 62968.7 |
| Log.Lik. | −31462.009 |
| RMSE | 14.94 |
Intercept 75.345 shows the expected average authoritarian scale score when other predictor variables are zero. Baby boomers, on average, are 1.1 points lower on the authoritarian values scale while milennials have the most notieceable decrease in authoritarian values with a score of 3.592. The value 0.006 means that the gen variable explains 0.6% of the variance in authoritarian values scale. This model explains an extremely small portion of the varied authortarian values.
Now visualize, using the modelplot() function from the modelsummary package, coefficient estimates and 95% confidence intervals for the following three models:
Model 1 = populist attitudes scale as the outcome; left/right as the single predictor (omitting moderates as we did in the tutorial), and using data for Greece.
Model 2 = populist attitudes scale as the outcome; gender as the single predictor, and using data for Greece.
Model 3 = populist attitudes scale as the outcome; generations as the single predictor (using the four generational category recode we did in the tutorial), and using data for Greece. Interpret.
Greece_data <- read.fst("Greece_data.fst")
Greece_data$trstplt <- ifelse(Greece_data$trstplt > 10, NA, Greece_data$trstplt)
Greece_data$trstprl <- ifelse(Greece_data$trstprl > 10, NA, Greece_data$trstprl)
Greece_data$trstprt <- ifelse(Greece_data$trstprt > 10, NA, Greece_data$trstprt)
Greece_data$trust <- scales::rescale(Greece_data$trstplt + Greece_data$trstprl + Greece_data$trstprt, na.rm = TRUE, to = c(0, 100))
Greece_data$populist <- scales::rescale(Greece_data$trust, na.rm = TRUE, to=c(100,0))
ggplot(Greece_data, aes(x = populist)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
theme_minimal() +
labs(title = "Distribution of Populist Scale for Greece",
x = "Populist Scale",
y = "Count")
## Warning: Removed 2735 rows containing non-finite values (`stat_bin()`).
Greece_data <- Greece_data %>%
mutate(
# Recoding gender
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
gndr == 9 ~ NA_character_,
TRUE ~ as.character(gndr)
),
# Recoding education
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more",
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
# Handle NAs for education levels
edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
# Explicitly making 'No BA' the reference category
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more")),
# Recoding age, setting 999 to NA
age = ifelse(agea == 999, NA, agea),
# Recoding cohort variable, setting years before 1930 and after 2000 to NA
cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
# Recoding generational cohorts based on year of birth
gen = case_when(
yrbrn %in% 1900:1945 ~ "1",
yrbrn %in% 1946:1964 ~ "2",
yrbrn %in% 1965:1979 ~ "3",
yrbrn %in% 1980:1996 ~ "4",
TRUE ~ as.character(cohort) # Keeping other values as character if they do not fit the ranges
),
# Converting cohort to a factor with labels
gen = factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
)
table(model_data$gen)
## < table of extent 0 >
modelG_data <- Greece_data %>%
mutate(
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more",
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more")),
age = ifelse(agea == 999, NA, agea),
cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
gen = case_when(
yrbrn %in% 1900:1945 ~ "1",
yrbrn %in% 1946:1964 ~ "2",
yrbrn %in% 1965:1979 ~ "3",
yrbrn %in% 1980:1996 ~ "4",
TRUE ~ as.character(cohort) # Keeping other values as character if they do not fit the ranges
),
gen = factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
)
table(modelG_data$educ.ba)
##
## No BA BA or more
## 9977 2581
modelG_data <- lm(populist ~ educ.ba, data = modelG_data)
modelplot(modelG_data, coef_names = "Left/Right", title = "Model G: Populist Attitudes Scale vs. Left/Right", conf_level = 0.95)
## Warning in ggplot2::geom_pointrange(ggplot2::aes(y = term, x = estimate, :
## Ignoring unknown parameters: `coef_names` and `title`
model_2 <- lm(populist ~ gndr, data = Greece_data)
modelplot(model_2, coef_names = "Gender", title = "Model 2: Populist Attitudes Scale vs. Gender", conf_level = 0.95)
## Warning in ggplot2::geom_pointrange(ggplot2::aes(y = term, x = estimate, :
## Ignoring unknown parameters: `coef_names` and `title`
model_3 <- lm(populist ~ gen, data = Greece_data)
# Visualize model 3
modelplot(model_3, coef_names = "Generation", title = "Model 3: Populist Attitudes Scale vs. Generation", conf_level = 0.95)
## Warning in ggplot2::geom_pointrange(ggplot2::aes(y = term, x = estimate, :
## Ignoring unknown parameters: `coef_names` and `title`
As generations progress, we can see higher levels of populism by
generation.