packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "interactions", "broom", "knitr")
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
##
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
##
## Loading required package: grid
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loading required package: survival
##
##
## Attaching package: 'survey'
##
##
## The following object is masked from 'package:graphics':
##
## dotchart
##
##
## Registered S3 methods overwritten by 'broom':
## method from
## tidy.glht jtools
## tidy.summary.glht jtools
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "modelsummary" "fst" "infer" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "effects" "carData" "modelsummary" "fst" "infer"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "survey" "survival" "Matrix" "grid" "effects"
## [6] "carData" "modelsummary" "fst" "infer" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[7]]
## [1] "interactions" "survey" "survival" "Matrix" "grid"
## [6] "effects" "carData" "modelsummary" "fst" "infer"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
##
## [[8]]
## [1] "broom" "interactions" "survey" "survival" "Matrix"
## [6] "grid" "effects" "carData" "modelsummary" "fst"
## [11] "infer" "lubridate" "forcats" "stringr" "dplyr"
## [16] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [21] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [26] "datasets" "methods" "base"
##
## [[9]]
## [1] "knitr" "broom" "interactions" "survey" "survival"
## [6] "Matrix" "grid" "effects" "carData" "modelsummary"
## [11] "fst" "infer" "lubridate" "forcats" "stringr"
## [16] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [21] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [26] "utils" "datasets" "methods" "base"
Graph the histogram distribution using the populist scale (as we did in the tutorial) for the country of Sweden. What do you note?
setwd("/Users/kaylapatricia/Desktop/soc222/homework 4")
sweden_data <- read.fst("sweden_data.fst")
sweden_data$trstplt <- ifelse(sweden_data$trstplt > 10, NA, sweden_data$trstplt)
sweden_data$trstprl <- ifelse(sweden_data$trstprl > 10, NA, sweden_data$trstprl)
sweden_data$trstprt <- ifelse(sweden_data$trstprt > 10, NA, sweden_data$trstprt)
sweden_data$trust <- scales::rescale(sweden_data$trstplt + sweden_data$trstprl + sweden_data$trstprt, na.rm = TRUE, to = c(0, 100))
sweden_data$populist <- scales::rescale(sweden_data$trust, na.rm = TRUE, to=c(100,0))
ggplot(sweden_data, aes(x = populist)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
theme_minimal() +
labs(title = "Distribution of Populist Scale for Sweden",
x = "Populist Scale",
y = "Count")
## Warning: Removed 2503 rows containing non-finite values (`stat_bin()`).
The histogram shows a rightwards skew since the data is more
concentrated on the left side and the peak falls on the left side. This
indicates a positive distribution, signifying that more individuals in
Sweden have lower levels of populism and thus, higher trust in
parliament.
Run a linear regression with populist attitudes as the outcome, educational attainment (recoded as a binary “BA or more” and “No BA”), and using the Swedish data. Print the intercept and coefficients only and interpret. Then do a tidy model table displaying the p-value (with digits = 3) and interpret.
dfs <- read.fst("sweden_data.fst")
dfs <- dfs %>%
mutate(
trstplt = ifelse(dfs$trstplt > 10, NA, dfs$trstplt),
trstprl = ifelse(dfs$trstprl > 10, NA, dfs$trstprl),
trstprt = ifelse(dfs$trstprt > 10, NA, dfs$trstprt))
dfs$trust <-scales::rescale(dfs$trstplt + dfs$trstprl + dfs$trstprt, na.rm = TRUE, to=c(0,100))
dfs$populist <-scales::rescale(dfs$trust, na.rm=TRUE, to=c(100,0))
dfs <- dfs %>%
mutate(
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more",
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"),
edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more")))
model<-lm(dfs$populist ~educ.ba, data = dfs)
coefficients_dfs <-coef(model)
print(coefficients_dfs)
## (Intercept) educ.baBA or more
## 51.358680 -7.868121
Intercept (51.4): this number indicates that when education levels are 0, populist attitudes would hold a value of 51.4 Educ.baBA or more coefficient (-7.9): this number represents the prediction that when individuals with a BA or more would have a populist score of about 7.9 points lower than those without a BA.
tidy(model) |>
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 51.359 | 0.181 | 283.363 | 0 |
| educ.baBA or more | -7.868 | 0.350 | -22.494 | 0 |
The intercept value shows that people in Sweden have an average score of 51.4 on the populist scale. A higher populist score indicates less trust in parliament. The educ.baBA coefficient value shows that indiiduals with a BA or more in Sweden have an average populist score that is 7.9 values lower than those without a BA.
Now using the Italian dataset and the authoritarian values scale (as we did in the tutorial), graph the average by survey year. Interpret.
italy_data <- read.fst("italy_data.fst")
dfi <- read.fst("italy_data.fst")
dfi <-dfi %>%
mutate(behave = ipbhprp,
secure = impsafe,
safety = ipstrgv,
tradition = imptrad,
rules = ipfrule) %>%
mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
~na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~7 -.x))
dfi$auth <- scales::rescale(dfi$behave +
dfi$secure +
dfi$safety +
dfi$tradition +
dfi$rules, to=c(0,100), na.rm=TRUE)
dfi <- dfi %>% filter(!is.na(auth))
table(dfi$secure)
##
## 1 2 3 4 5 6
## 54 139 626 1856 2876 2881
dfi$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
dfi$year[dfi$essround == i] <- replacements[i]
}
auth_avg <- dfi %>%
group_by(year) %>%
summarize(auth_avg = mean(auth, na.rm = TRUE))
ggplot(dfi, aes(x = auth)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
theme_minimal() +
labs(title = "Distribution of Authoritarian Scale for Italy",
x = "Authoritarian Scale",
y = "Count")
plot_auth <- ggplot(auth_avg, aes(x = year, y = auth_avg)) +
geom_point(aes(color = auth_avg), alpha = 1.0) +
geom_line(aes(group = 1), color = "blue", linetype = "dashed") +
labs(title = "Authoritarian Scale Average by Year (Italy)",
x = "Survey Year",
y = "Authoritarian Scale Average") +
theme_minimal() +
theme(legend.position = "none") +
scale_y_continuous(limits = c(0, 100))
print(plot_auth)
This graph shows that the average authoritarian scale by year in Italy
has been stable over time with hardly any fluctuations in the graph.
This means people’s attitudes towards authoritarianism in Italy have
remained consistent, societal factors have not changed these attitudes.
This is representative of social stability in Italy.
Now run a linear regression model, using the modelsummary package, with authoritarian values as the outcome, and generations as the predictor/potential explanatory variable, and using the Italian data again. Rename the coefficients using the coef_rename function. Interpret the coefficients, whether they are statistically significant, and the adjusted R-squared.
italy_regress<-dfi %>%
mutate(
cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
gen=case_when(
yrbrn %in% 1900:1945 ~ "1",
yrbrn %in% 1946:1964 ~ "2",
yrbrn %in% 1965:1979 ~ "3",
yrbrn %in% 1980:1996 ~ "4",
TRUE ~ as.character(cohort)),
gen=factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials")))
table(italy_regress$gen)
##
## Interwar Baby Boomers Gen X Millennials
## 1026 2580 2214 1811
italy_model <- lm(auth ~ gen, data = italy_regress)
modelsummary(
list(italy_model),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}"),
statistic = NULL,
coef_omit = "Intercept",
coef_rename = c("genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials"),
title = 'Regression Model for Authoritarian Attitudes in Italy')
| (1) | |
|---|---|
| Baby Boomers | −1.1 (0.6)* |
| Gen X | −2.1 (0.6)*** |
| Millennials | −3.6 (0.6)*** |
| Num.Obs. | 7631 |
| R2 | 0.006 |
| R2 Adj. | 0.006 |
| AIC | 62934.0 |
| BIC | 62968.7 |
| Log.Lik. | −31462.009 |
| RMSE | 14.94 |
The coefficient of -1.1 indicates baby boomers have authoritarian values that are on average, a value of 1.1 lower than the reference category. this coefficient is statistically significant. the coefficient of -2.1 indicates that on average, gen x individuals have less authoritarian values than baby boomers (about 2.1 values less on average). this coefficient is statistically significant. the coefficient of -3.6 indicates that millennials have less authoritarian values than baby boomers (about 3.6 values lower on average). this coefficient is statistically significant. all of the coefficients being statistically significant indicate that the differences in authoritarian values between these generations and baby boomers are highly unlikely to be caused by random chance. the adjusted R-squared value of 0.006 indicates that there is little variation in authoritarian values in Italy, cannot be explained by generations.
Now visualize, using the modelplot() function from the modelsummary package, coefficient estimates and 95% confidence intervals for the following three models:
Model 1 = populist attitudes scale as the outcome; left/right as the single predictor (omitting moderates as we did in the tutorial), and using data for Greece.
Model 2 = populist attitudes scale as the outcome; gender as the single predictor, and using data for Greece.
Model 3 = populist attitudes scale as the outcome; generations as the single predictor (using the four generational category recode we did in the tutorial), and using data for Greece. Interpret.
dfg <- read.fst("greece_data.fst")
dfg$trstplt <- ifelse(dfg$trstplt > 10, NA, dfg$trstplt)
dfg$trstprl <- ifelse(dfg$trstprl > 10, NA, dfg$trstprl)
dfg$trstprt <- ifelse(dfg$trstprt > 10, NA, dfg$trstprt)
dfg$trust <- scales::rescale(dfg$trstplt + dfg$trstprl + dfg$trstprt, na.rm = TRUE, to = c(0, 100))
dfg$populist <-scales::rescale(dfg$trust, na.rm=TRUE, to=c(100,0))
dfg<-dfg %>%
mutate(
lrscale = case_when(
lrscale %in% 0:4 ~ "Left",
lrscale %in% 6:10 ~ "Right", #Note: 5 set to NA because its "Moderate"
lrscale %in% c(77, 88, 99) ~NA_character_,
TRUE ~ NA_character_),
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
gndr == 9 ~ NA_character_,
TRUE ~ as.character(gndr)),
cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
gen=case_when(
yrbrn %in% 1900:1945 ~ "1",
yrbrn %in% 1946:1964 ~ "2",
yrbrn %in% 1965:1979 ~ "3",
yrbrn %in% 1980:1996 ~ "4",
TRUE ~ as.character(cohort)),
gen=factor(gen,
levels = c("1", "2", "3", "4"),
labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials")))%>%
filter(!is.na(lrscale) & !is.na(gndr))
table(dfg$gen)
##
## Interwar Baby Boomers Gen X Millennials
## 1549 1847 1664 1036
model1 <- lm(dfg$populist ~ lrscale, data = dfg)
model2 <- lm(dfg$populist ~ gndr, data = dfg)
model3 <- lm(dfg$populist ~ gen, data = dfg)
models <- list(
"Model 1" = lm(dfg$populist ~ lrscale, data = dfg),
"Model 2" = lm(dfg$populist ~ gndr, data = dfg),
"Model 3" = lm(dfg$populist ~ gen, data = dfg))
modelsummary(models, fmt = 1,
estimate = c("{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL,
coef_rename = c("genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials", "gndrMale" = "Male", "lrscaleRight" = "Right"),
title = 'Regression Models for Populist Attitudes for Greece')
| Model 1 | Model 2 | Model 3 | |
|---|---|---|---|
| (Intercept) | 72.0 (0.5)*** | 67.7 (0.5)*** | 63.0 (0.7)*** |
| Right | −8.8 (0.6)*** | ||
| Male | −1.2 (0.7)+ | ||
| Baby Boomers | 3.9 (0.9)*** | ||
| Gen X | 6.8 (0.9)*** | ||
| Millennials | 6.4 (1.0)*** | ||
| Num.Obs. | 4936 | 4936 | 4822 |
| R2 | 0.037 | 0.001 | 0.013 |
| R2 Adj. | 0.036 | 0.000 | 0.012 |
| AIC | 44723.8 | 44904.5 | 43832.6 |
| BIC | 44743.3 | 44924.1 | 43865.0 |
| Log.Lik. | −22358.894 | −22449.275 | −21911.303 |
| RMSE | 22.44 | 22.85 | 22.76 |
modelplot(models, coef_omit = 'Interc')
Model 1: the intercept of 72.0 indicates that the predicted value of
populist attitudes when the predictor variable of left/right is zero is
72.0. The coefficient for the “Right” predictor variable is -8.8,
indicating that individuals in Greece who are more right-winged have
significantly lower populist attitudes in comparison to left-wing
individuals. This coefficient is statistically significant, indicating
there is a significant positive relationship between political
orientation and populist attitudes. Model 2: the coefficient for the
“Male” predictor variable is -1.2, indicating that males have slightly
lower populist attitudes compared to females, although this value is not
statistically significant. The low adjusted R-squared indicates that
gender is not a predictor of the variance in populist attitudes.
Represented by the green bar, the confidence interval touches 0 on the
x-axis, indicating its statistical insignificance in
predicting/explaining populist attitudes. Model 3: Baby boomers (3.9),
Gen X (6.8) and milennials (6.4) coefficients are all statistically
significant, indicating that in comparison to baby boomers, Gen X and
milennials have similar values for populist attitudes. The R-squared
value of 0.012 indicates that generations as a category can only
slightly account for variance in populist attitudes. Gen X is graphed
furthest to the right, although each category is plotted far right of
the 0 on the x-axis to indicate its statistical significance.