Task 1

Graph the histogram distribution using the populist scale (as we did in the tutorial) for the country of Sweden. What do you note?

Loading packages and data

# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "interactions","tidymodels") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
## 
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## 
## Loading required package: grid
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Loading required package: survival
## 
## 
## Attaching package: 'survey'
## 
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## 
## Registered S3 methods overwritten by 'broom':
##   method            from  
##   tidy.glht         jtools
##   tidy.summary.glht jtools
## 
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## 
## ✔ broom        1.0.5      ✔ rsample      1.2.0 
## ✔ dials        1.2.1      ✔ tune         1.1.2 
## ✔ modeldata    1.3.0      ✔ workflows    1.1.4 
## ✔ parsnip      1.2.0      ✔ workflowsets 1.0.1 
## ✔ recipes      1.0.10     ✔ yardstick    1.3.0 
## 
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ Matrix::expand()  masks tidyr::expand()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ Matrix::pack()    masks tidyr::pack()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## ✖ Matrix::unpack()  masks tidyr::unpack()
## ✖ recipes::update() masks Matrix::update(), stats::update()
## • Use tidymodels_prefer() to resolve common conflicts.

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "effects"      "carData"      "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "interactions" "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[8]]
##  [1] "yardstick"    "workflowsets" "workflows"    "tune"         "rsample"     
##  [6] "recipes"      "parsnip"      "modeldata"    "dials"        "scales"      
## [11] "broom"        "tidymodels"   "interactions" "survey"       "survival"    
## [16] "Matrix"       "grid"         "effects"      "carData"      "modelsummary"
## [21] "fst"          "infer"        "lubridate"    "forcats"      "stringr"     
## [26] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [31] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [36] "utils"        "datasets"     "methods"      "base"

sweden <- read_fst("/Users/jessie/SOC222/sweden_data.fst")

# Populist scale (Norris/Inglehart use low levels of trust in parliaments, politicians and parties as an indicator of populism)

# Note: Schafer does it 2 ways, we will focus now on the most straightforward

# Setting values greater than 10 to NA
sweden$trstplt <- ifelse(sweden$trstplt > 10, NA, sweden$trstplt)
sweden$trstprl <- ifelse(sweden$trstprl > 10, NA, sweden$trstprl)
sweden$trstprt <- ifelse(sweden$trstprt > 10, NA, sweden$trstprt)

# Creating and rescaling the trust variable
sweden$trust <- scales::rescale(sweden$trstplt + sweden$trstprl + sweden$trstprt, na.rm = TRUE, to = c(0, 100))

# Here's how you would "flip it" because populist is conceived by N + I as 'anti-trust'
# As Schafer explains there are some issues with that conceptualization, but N + I is also widely applied
sweden$populist <- scales::rescale(sweden$trust, na.rm = TRUE, to=c(100,0))

#visualize
ggplot(sweden, aes(x = populist)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Populist Scale for Hungary",
       x = "Populist Scale",
       y = "Count")

## Warning: Removed 2503 rows containing non-finite values (`stat_bin()`).

sweden <- sweden %>%
  mutate(
    ID = case_when(
      lrscale %in% 0:4 ~ "Left",                     # This time: Values 0 to 3 in 'lrscale' are categorized as "Left"
      lrscale %in% 6:10 ~ "Right",                   # This time: Values 6 to 10 in 'lrscale' are categorized as "Right"
      lrscale %in% c(77, 88, 99) ~ NA_character_,    # Values 77, 88, 99 in 'lrscale' are set as NA
      TRUE ~ NA_character_                           # Any other values not explicitly matched above
    )
  )

table(sweden$ID)

## 
##  Left Right 
##  6188  7406

sweden_pop_model <- lm(populist ~ ID, data = sweden)

coefficients_sweden <- coef(sweden_pop_model)
print(coefficients_sweden)

## (Intercept)     IDRight 
##  48.3188406  -0.7149274

Sweden’s Output Interpretation:

Intercept (48.3): This value represents the average score on the populist scale for the reference group in Sweden, which is the “Left” group. In simpler terms, the expected average populist scale score would be 48.3 for the left, since it is the reference category. This score indicates a certain level of populism, where higher scores suggest stronger populist attitudes.

IDRight Coefficient (-0.7): The negative coefficient associated with the “Right” group indicates that, on average, individuals who identify as ideologically right in Sweden have populist scale scores that are 0.7 points lower than those who identify as left (note: based on our recode).

Comparing right-leaning individuals to left-leaning ones in Sweden, we expect to see the average populist scale score to be 0.7 points lower for the right-leaning group. This suggests that left-leaning individuals exhibit slightly stronger populist attitudes than their right-leaning counterparts within the Swedish context.

Task 2

Run a linear regression with populist attitudes as the outcome, educational attainment (recoded as a binary “BA or more” and “No BA”), and using the Swedish data. Print the intercept and coefficients only and interpret. Then do a tidy model table displaying the p-value (with digits = 3) and interpret.

#recoding educ.ba
linear_data <- sweden %>%
  mutate(
 # Recoding education
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    
    # Handle NAs for education levels
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    
    # Explicitly making 'No BA' the reference category
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
  )



sweden_edu_pop <- lm(populist ~ educ.ba, data = linear_data)
coefficients_linear <- coef(sweden_edu_pop)
print(coefficients_linear)

##       (Intercept) educ.baBA or more 
##         51.358680         -7.868121

Intrepretation: The intercept value of 51.35868 in Sweden represents the expected average score on the populist scale for individuals with the reference category of educational attainment, which is typically “BA or more.” This means that, on average, individuals with at least a bachelor’s degree in Sweden have a populist scale score of approximately 51.36.

tidy(sweden_edu_pop)

## # A tibble: 2 × 5
##   term              estimate std.error statistic   p.value
##   <chr>                <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)          51.4      0.181     283.  0        
## 2 educ.baBA or more    -7.87     0.350     -22.5 2.58e-110

tidy(sweden_edu_pop) |>
knitr::kable(digits = 3)

term	estimate	std.error	statistic	p.value
(Intercept)	51.359	0.181	283.363	0
educ.baBA or more	-7.868	0.350	-22.494	0

Intrepretation: The intercept’s p-value of 0 indicates that it is statistically significant, suggesting that there is a significant difference in the expected average score on the populist scale for individuals with at least a bachelor’s degree compared to those without. Similarly, the p-value for the coefficient associated with “No BA” is also 0, indicating that the difference in the expected average score on the populist scale between individuals without a bachelor’s degree and the reference category is statistically significant.

Task 3

Now using the Italian dataset and the authoritarian values scale (as we did in the tutorial), graph the average by survey year. Interpret.

italy <- read_fst("/Users/jessie/SOC222/italy_data.fst")

italy  <- italy  %>%
  mutate(behave = ipbhprp,
         secure = impsafe,
         safety = ipstrgv,
         tradition = imptrad,
         rules = ipfrule) %>%
  mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
                ~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
  # Apply the reverse coding
  mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))

# Now you can calculate 'schwartzauth' after the NA recoding
italy $auth <- scales::rescale(italy $behave + 
                      italy $secure + 
                      italy $safety + 
                      italy $tradition + 
                      italy $rules, to=c(0,100), na.rm=TRUE)


italy <-italy  %>% filter(!is.na(auth))
# table(italy $secure)
# table(italy $auth)
# ggplot(italy, aes(x = auth)) +
#   geom_histogram(bins = 30, fill = "blue", color = "black") +
#   theme_minimal() +
#   labs(title = "Distribution of Authoritarian Scale for Italy",
#        x = "Authoritarian Scale",
#        y = "Count")

italy$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10)
  {
  italy$year[italy$essround == i] <- replacements[i]
}
auth_avg <- italy %>%
  group_by(year) %>%
  summarize(auth_avg = mean(auth, na.rm = TRUE))

plot_auth <- ggplot(auth_avg, aes(x = year, y = auth_avg)) +
  geom_point(aes(color = auth_avg), alpha = 1.0) + 
  geom_line(aes(group = 1), color = "blue", linetype = "dashed") +  
  labs(title = "Authoritarian Scale Average by Year (Italy)",
       x = "Survey Year",
       y = "Authoritarian Scale Average") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_y_continuous(limits = c(0, 100))

print(plot_auth)

Intrepretation: The authoritarian scale average in Italy shows small fluctuations over the years, remaining relatively stable around 75. From 2002 to 2020, the average score on the authoritarian scale has shown minimal variation, indicating a consistent level of authoritarian attitudes among the population over this period. Despite potential societal or political changes occurring during these years, the overall tendency suggests a consistent level of authoritarian sentiment among the Italian populace, with the average score hovering around 75.

##Task 4 Now run a linear regression model, using the modelsummary package, with authoritarian values as the outcome, and generations as the predictor/potential explanatory variable, and using the Italian data again. Rename the coefficients using the coef_rename function. Interpret the coefficients, whether they are statistically significant, and the adjusted R-squared.

model_data <- italy %>%
  mutate(
    # Recoding cohort variable, setting years before 1930 and after 2000 to NA
    cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
    # Recoding generational cohorts based on year of birth
    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(cohort)  # Keeping other values as character if they do not fit the ranges
    ),

    # Converting cohort to a factor with labels
    gen = factor(gen,
                 levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )
table(model_data$gen)

## 
##     Interwar Baby Boomers        Gen X  Millennials 
##         1026         2580         2214         1811

model <- lm(auth ~ gen, data = model_data)

modelsummary(model, 
  fmt = 1,
  estimate  = c("{estimate} ({std.error}){stars}"),
  statistic = NULL,
  coef_rename = c("genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials"),
  title = 'Table1. Regression model for authoritarian attitudes')

Table1. Regression model for authoritarian attitudes
	(1)
(Intercept)	75.3 (0.5)***
Baby Boomers	−1.1 (0.6)*
Gen X	−2.1 (0.6)***
Millennials	−3.6 (0.6)***
Num.Obs.	7631
R2	0.006
R2 Adj.	0.006
AIC	62934.0
BIC	62968.7
Log.Lik.	−31462.009
RMSE	14.94

Intrepertation: The intercept of 75.3 represents the expected average authoritarian scale score when all other predictor variables are zero.The coefficient for Baby Boomers of -1.1 indicates that, on average, Baby Boomers have authoritarian scale scores 1.1 points lower than the reference group, holding other variables constant. The coefficient for Generation X of -2.1 suggests that, on average, individuals from Generation X have authoritarian scale scores 2.1 points lower than the reference group, controlling for other variables. The coefficient for Millennials of -3.6 implies that, on average, Millennials have authoritarian scale scores 3.6 points lower than the reference group, all else being equal.

##Task 5 Now visualize, using the modelplot() function from the modelsummary package, coefficient estimates and 95% confidence intervals for the following three models:

Model 1 = populist attitudes scale as the outcome; left/right as the single predictor (omitting moderates as we did in the tutorial), and using data for Greece.

Model 2 = populist attitudes scale as the outcome; gender as the single predictor, and using data for Greece.

Model 3 = populist attitudes scale as the outcome; generations as the single predictor (using the four generational category recode we did in the tutorial), and using data for Greece. Interpret.

ess<-read_fst("/Users/jessie/All-ESS-Data.fst")
greece<-ess%>%
  filter(cntry=="GR")

model_data <-greece %>%
  mutate(
    trstplt = ifelse(trstplt > 10, NA, trstplt),
    trstprl = ifelse(trstprl > 10, NA, trstprl),
    trstprt = ifelse(trstprt > 10, NA, trstprt),
    trust = scales::rescale(trstplt + trstprl + trstprt, na.rm = TRUE, to = c(0, 100)),
    populist=scales::rescale(trust, na.rm = TRUE, to=c(100,0)),
    # Recoding 'lrscale' into a new variable 'ID'
    ID = case_when(
      lrscale %in% 0:3 ~ "Left",              # Values 0 to 3 in 'lrscale' are categorized as "Left"
      lrscale %in% 7:10 ~ "Right",           # Values 7 to 10 in 'lrscale' are categorized as "Right"
      #lrscale %in% 4:6 ~ "Moderate",       # Values 4 to 6 in 'lrscale' are categorized as "Moderate"
      lrscale %in% c(4,5,6,77, 88, 99) ~ NA_character_    # Values 77, 88, 99 in 'lrscale' are set as NA
    ),
    
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),

    # Recoding cohort variable, setting years before 1930 and after 2000 to NA
    cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),

    # Recoding generational cohorts based on year of birth
    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(cohort)  # Keeping other values as character if they do not fit the ranges
    ),

    # Converting cohort to a factor with labels
    gen = factor(gen,
                 levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )

models <- list(
   "Model 1" = lm(populist ~ ID, data = model_data),
   "Model 2" = lm(populist ~ gndr, data = model_data),
   "Model 3" = lm(populist ~ gen, data = model_data))

modelsummary(models,
  fmt = 1,
  estimate  = c("{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL)

	Model 1	Model 2	Model 3
(Intercept)	72.4 (0.6)***	70.0 (0.3)***	65.8 (0.5)***
IDRight	−10.5 (0.8)***
gndrMale		−1.0 (0.5)*
genBaby Boomers			3.5 (0.7)***
genGen X			5.6 (0.7)***
genMillennials			5.8 (0.7)***
Num.Obs.	3480	9823	9562
R2	0.048	0.000	0.009
R2 Adj.	0.048	0.000	0.009
AIC	31648.1	88873.2	86473.9
BIC	31666.6	88894.8	86509.7
Log.Lik.	−15821.063	−44433.590	−43231.944
RMSE	22.81	22.30	22.25

modelplot(models, coef_omit = 'Interc')

Homework 4

Jiayi Yu

2024-03-14

Task 1

Sweden’s Output Interpretation:

Task 2

Task 3