Homework 4 (SOC222)

Setting up environment

# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "interactions") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'infer' was built under R version 4.3.3

## Warning: package 'effects' was built under R version 4.3.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.3.3

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

## Warning: package 'survey' was built under R version 4.3.3

## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: survival
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart

## Warning: package 'interactions' was built under R version 4.3.3

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "effects"      "carData"      "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "interactions" "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"

Task 1

Graph the histogram distribution using the populist scale (as we did in the tutorial) for the country of Sweden. What do you note?

sweden_data <- read.fst("sweden_data.fst")

# Setting values greater than 10 to NA
sweden_data$trstplt <- ifelse(sweden_data$trstplt > 10, NA, sweden_data$trstplt)
sweden_data$trstprl <- ifelse(sweden_data$trstprl > 10, NA, sweden_data$trstprl)
sweden_data$trstprt <- ifelse(sweden_data$trstprt > 10, NA, sweden_data$trstprt)

# Creating and rescaling the trust variable
sweden_data$trust <- scales::rescale(sweden_data$trstplt + sweden_data$trstprl + sweden_data$trstprt, na.rm = TRUE, to = c(0, 100))

# Here's how you would "flip it" because populist is conceived by N + I as 'anti-trust'
# As Schafer explains there are some issues with that conceptualization, but N + I is also widely applied
sweden_data$populist <- scales::rescale(sweden_data$trust, na.rm = TRUE, to=c(100,0))

ggplot(sweden_data, aes(x = populist)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Populist Scale for Sweden",
       x = "Populist Scale",
       y = "Count")

## Warning: Removed 2503 rows containing non-finite values (`stat_bin()`).

Based on the above histograph it demonstrates that the right side is skewed indicating that higher levels of populism versus lower levels of populism. Mode of this data appears to be 50.

Task 2

Run a linear regression with populist attitudes as the outcome, educational attainment (recoded as a binary “BA or more” and “No BA”), and using the Swedish data. Print the intercept and coefficients only and interpret. Then do a tidy model table displaying the p-value (with digits = 3) and interpret.

sweden_data <- sweden_data %>%
  mutate(
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
  )

sweden_pop_model <- lm(populist ~ educ.ba, data = sweden_data)

coefficients_sweden <- coef(sweden_pop_model)
print(coefficients_sweden)

##       (Intercept) educ.baBA or more 
##         51.358680         -7.868121

broom::tidy

## Registered S3 methods overwritten by 'broom':
##   method            from  
##   tidy.glht         jtools
##   tidy.summary.glht jtools

## function (x, ...) 
## {
##     UseMethod("tidy")
## }
## <bytecode: 0x000002280ebfb000>
## <environment: namespace:generics>

tidy (sweden_pop_model)

## # A tibble: 2 × 5
##   term              estimate std.error statistic   p.value
##   <chr>                <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)          51.4      0.181     283.  0        
## 2 educ.baBA or more    -7.87     0.350     -22.5 2.58e-110

tidy(sweden_pop_model) |>
knitr::kable(digits = 3)

term	estimate	std.error	statistic	p.value
(Intercept)	51.359	0.181	283.363	0
educ.baBA or more	-7.868	0.350	-22.494	0

summary(sweden_pop_model)

## 
## Call:
## lm(formula = populist ~ educ.ba, data = sweden_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.359 -14.692  -1.359  11.975  56.509 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        51.3587     0.1812  283.36   <2e-16 ***
## educ.baBA or more  -7.8681     0.3498  -22.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.43 on 15711 degrees of freedom
##   (2503 observations deleted due to missingness)
## Multiple R-squared:  0.0312, Adjusted R-squared:  0.03114 
## F-statistic:   506 on 1 and 15711 DF,  p-value: < 2.2e-16

Based on the displayed data it depicts that the intercept is 51.38 and the co-efficent is -7.8. This indicates that the intercept has a poplist scale of zero while the co-effeicent suggests that those with BA or more with likely experience a decrease over time. Additionally the p-value is less than one demonstrating that th ehypothesis is null.

Task 3

Now using the Italian dataset and the authoritarian values scale (as we did in the tutorial), graph the average by survey year. Interpret.

italy_data <- read.fst("italy_data.fst")
italy_data <- italy_data %>%
  mutate(behave = ipbhprp,
         secure = impsafe,
         safety = ipstrgv,
         tradition = imptrad,
         rules = ipfrule) %>%
  mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
                ~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
  mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))

italy_data$auth <- scales::rescale(italy_data$behave + 
                      italy_data$secure + 
                      italy_data$safety + 
                      italy_data$tradition + 
                      italy_data$rules, to=c(0,100), na.rm=TRUE)


italy_data <- italy_data %>% filter(!is.na(auth))
table(italy_data$secure)

## 
##    1    2    3    4    5    6 
##   54  139  626 1856 2876 2881

table(italy_data$auth)

## 
##   0  12  16  20  24  28  32  36  40  44  48  52  56  60  64  68  72  76  80  84 
##   7   4   4   9  13  20  27  57 113 155 163 323 367 596 647 789 823 897 992 733 
##  88  92  96 100 
## 632 465 299 297

italy_data$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  italy_data$year[italy_data$essround == i] <- replacements[i]
}
auth_avg <- italy_data %>%
  group_by(year) %>%
  summarize(auth_avg = mean(auth, na.rm = TRUE))

plot_auth <- ggplot(auth_avg, aes(x = year, y = auth_avg)) +
  geom_point(aes(color = auth_avg), alpha = 1.0) + 
  geom_line(aes(group = 1), color = "blue", linetype = "dashed") +  
  labs(title = "Authoritarian Scale Average by Year (Italy)",
       x = "Survey Year",
       y = "Authoritarian Scale Average") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_y_continuous(limits = c(0, 100))

print(plot_auth)

The graph depicts a steady decrease, however a more drastic one just after 2015.

Task 4

Now run a linear regression model, using the modelsummary package, with authoritarian values as the outcome, and generations as the predictor/potential explanatory variable, and using the Italian data again. Rename the coefficients using the coef_rename function. Interpret the coefficients, whether they are statistically significant, and the adjusted R-squared.

Task 5

Now visualize, using the modelplot() function from the modelsummary package, coefficient estimates and 95% confidence intervals for the following three models:

Model 1 = populist attitudes scale as the outcome; left/right as the single predictor (omitting moderates as we did in the tutorial), and using data for Greece.

Model 2 = populist attitudes scale as the outcome; gender as the single predictor, and using data for Greece.

Model 3 = populist attitudes scale as the outcome; generations as the single predictor (using the four generational category recode we did in the tutorial), and using data for Greece. Interpret.

greece_data <- read.fst("greece_data.fst")

greece_data$trstplt <- ifelse(greece_data$trstplt > 10, NA, greece_data$trstplt)
greece_data$trstprl <- ifelse(greece_data$trstprl > 10, NA, greece_data$trstprl)
greece_data$trstprt <- ifelse(greece_data$trstprt > 10, NA, greece_data$trstprt)

greece_data$trust <- scales::rescale(greece_data$trstplt + greece_data$trstprl + greece_data$trstprt, na.rm = TRUE, to = c(0, 100))

greece_data$populist <- scales::rescale(greece_data$trust, na.rm = TRUE, to=c(100,0))

model_data <- greece_data %>%
  mutate(
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),

    lrscale = case_when(
      lrscale %in% 0:4 ~ "Left",                     # This time: Values 0 to 3 in 'lrscale' are categorized as "Left"
      lrscale %in% 6:10 ~ "Right",                   # This time: Values 6 to 10 in 'lrscale' are categorized as "Right"
      lrscale %in% c(77, 88, 99) ~ NA_character_,    # Values 77, 88, 99 in 'lrscale' are set as NA
      TRUE ~ NA_character_                           # Any other values not explicitly matched above
    ),

    # Recoding cohort variable, setting years before 1930 and after 2000 to NA
    cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),

    # Recoding generational cohorts based on year of birth
    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(cohort)  # Keeping other values as character if they do not fit the ranges
    ),

    # Converting cohort to a factor with labels
    gen = factor(gen,
                 levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )
table(model_data$gen)

## 
##     Interwar Baby Boomers        Gen X  Millennials 
##         2978         3484         3498         2330

model1 <- lm(populist ~ lrscale, data = model_data)
model2 <- lm(populist ~ gndr, data = model_data)
model3 <- lm(populist ~ gen, data = model_data)

models <- list(
   "Model 1" = lm(populist ~ lrscale, data = model_data),
   "Model 2" = lm(populist ~ gndr, data = model_data),
   "Model 3" = lm(populist ~ gen, data = model_data))
modelsummary(models, fmt = 1,
  estimate  = c("{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL,

coef_rename = c("lrscale" = "Right", "gndrMale" = "Male", "genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials"),
  title = 'Table 3. Regression models for populist attitudes')

Table 3. Regression models for populist attitudes
	Model 1	Model 2	Model 3
(Intercept)	72.0 (0.5)***	70.0 (0.3)***	65.8 (0.5)***
lrscaleRight	−8.8 (0.6)***
Male		−1.0 (0.5)*
Baby Boomers			3.5 (0.7)***
Gen X			5.6 (0.7)***
Millennials			5.8 (0.7)***
Num.Obs.	4936	9823	9562
R2	0.037	0.000	0.009
R2 Adj.	0.036	0.000	0.009
AIC	44723.8	88873.2	86473.9
BIC	44743.3	88894.8	86509.7
Log.Lik.	−22358.894	−44433.590	−43231.944
RMSE	22.44	22.30	22.25

modelplot(models, coef_omit = 'Interc')

Based on the suggested findings, i demonstrates that there is a gender coeffiecent of -0.1 which shows that it is lower in comparison to women. The Generational differences scores close to one another but do have some differences, Baby Boomers scoring about a 4 on the scale, which is lower than both Millienals and GenX indicating that there is a lower populist. Lastly, the r-squared value of -9demonstrates that about 9% of teh Greek society have an authoritan ideology.