Zhang_Leanna_Homework

Loading Packages

# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "interactions","tidymodels") # add any you need here

# Installing new packages
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
## 
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## 
## Loading required package: grid
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Loading required package: survival
## 
## 
## Attaching package: 'survey'
## 
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## 
## Registered S3 methods overwritten by 'broom':
##   method            from  
##   tidy.glht         jtools
##   tidy.summary.glht jtools
## 
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## 
## ✔ broom        1.0.5      ✔ rsample      1.2.0 
## ✔ dials        1.2.1      ✔ tune         1.1.2 
## ✔ modeldata    1.3.0      ✔ workflows    1.1.4 
## ✔ parsnip      1.2.0      ✔ workflowsets 1.0.1 
## ✔ recipes      1.0.10     ✔ yardstick    1.3.0 
## 
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ Matrix::expand()  masks tidyr::expand()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ Matrix::pack()    masks tidyr::pack()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## ✖ Matrix::unpack()  masks tidyr::unpack()
## ✖ recipes::update() masks Matrix::update(), stats::update()
## • Search for functions across packages at https://www.tidymodels.org/find/

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "effects"      "carData"      "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "interactions" "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[8]]
##  [1] "yardstick"    "workflowsets" "workflows"    "tune"         "rsample"     
##  [6] "recipes"      "parsnip"      "modeldata"    "dials"        "scales"      
## [11] "broom"        "tidymodels"   "interactions" "survey"       "survival"    
## [16] "Matrix"       "grid"         "effects"      "carData"      "modelsummary"
## [21] "fst"          "infer"        "lubridate"    "forcats"      "stringr"     
## [26] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [31] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [36] "utils"        "datasets"     "methods"      "base"

sweden_data <- read_fst("sweden_data.fst")

Task 1

Graph the histogram distribution using the populist scale (as we did in the tutorial) for the country of Sweden. What do you note?

sweden_data <- read_fst("sweden_data.fst")

# Populist scale (Norris/Inglehart use low levels of trust in parliaments, politicians and parties as an indicator of populism)

# Note: Schafer does it 2 ways, we will focus now on the most straightforward

# Setting values greater than 10 to NA
sweden_data$trstplt <- ifelse(sweden_data$trstplt > 10, NA, sweden_data$trstplt)
sweden_data$trstprl <- ifelse(sweden_data$trstprl > 10, NA, sweden_data$trstprl)
sweden_data$trstprt <- ifelse(sweden_data$trstprt > 10, NA, sweden_data$trstprt)

# Creating and rescaling the trust variable
sweden_data$trust <- scales::rescale(sweden_data$trstplt + sweden_data$trstprl + sweden_data$trstprt, na.rm = TRUE, to = c(0, 100))

# Here's how you would "flip it" because populist is conceived by N + I as 'anti-trust'
# As Schafer explains there are some issues with that conceptualization, but N + I is also widely applied
sweden_data$populist <- scales::rescale(sweden_data$trust, na.rm = TRUE, to=c(100,0))

#visualize
ggplot(sweden_data, aes(x = populist)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Populist Scale for Hungary",
       x = "Populist Scale",
       y = "Count")

## Warning: Removed 2503 rows containing non-finite values (`stat_bin()`).

sweden_data<- sweden_data %>%
  mutate(
    ID = case_when(
      lrscale %in% 0:4 ~ "Left",                     # This time: Values 0 to 3 in 'lrscale' are categorized as "Left"
      lrscale %in% 6:10 ~ "Right",                   # This time: Values 6 to 10 in 'lrscale' are categorized as "Right"
      lrscale %in% c(77, 88, 99) ~ NA_character_,    # Values 77, 88, 99 in 'lrscale' are set as NA
      TRUE ~ NA_character_                           # Any other values not explicitly matched above
    )
  )

table(sweden_data$ID)

## 
##  Left Right 
##  6188  7406

sweden_data_pop_model <- lm(populist ~ ID, data = sweden_data)

coefficients_sweden_data <- coef(sweden_data_pop_model)
print(coefficients_sweden_data)

## (Intercept)     IDRight 
##  48.3188406  -0.7149274

The intercept value of 48.32 is the average score on the populist scale, which is the left side for the reference group in Sweden. This represents the level of populism. The IDRight value represents individuals on the right side of the scale who are 0.71 points lower than those on the left. Because this value is negative, we can infer that left leaning individuals have larger populist attitudes.

Task 2

Run a linear regression with populist attitudes as the outcome, educational attainment (recoded as a binary “BA or more” and “No BA”), and using the Swedish data. Print the intercept and coefficients only and interpret. Then do a tidy model table displaying the p-value (with digits = 3) and interpret.

#recoding educ.ba
linear_data <- sweden_data %>%
  mutate(
 # Recoding education
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    
    # Handle NAs for education levels
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    
    # Explicitly making 'No BA' the reference category
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
  )



sweden_data_edu_pop <- lm(populist ~ educ.ba, data = linear_data)
coefficients_linear <- coef(sweden_data_edu_pop)
print(coefficients_linear)

##       (Intercept) educ.baBA or more 
##         51.358680         -7.868121

The intercept value of 51.36 is the populist scole of individials with at least a bachelor’s degree in Sweden.

tidy(sweden_data_edu_pop)

## # A tibble: 2 × 5
##   term              estimate std.error statistic   p.value
##   <chr>                <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)          51.4      0.181     283.  0        
## 2 educ.baBA or more    -7.87     0.350     -22.5 2.58e-110

tidy(sweden_data_edu_pop) |>
knitr::kable(digits = 3)

term	estimate	std.error	statistic	p.value
(Intercept)	51.359	0.181	283.363	0
educ.baBA or more	-7.868	0.350	-22.494	0

The P-value for both the intercept and those associated with “No BA” are both 0 making it statistically significant. The intercepts value of 0 illustrates that there is a significant difference in the average score for individuals with and without a bachelors degree. The p-value of 0 for “No BA” illustrates that there is a significant difference between people without a bachelors degree and the reference category. In this case, there is strong evidence against the null hypothesis so we would reject it.

Task 3

Now using the Italian dataset and the authoritarian values scale (as we did in the tutorial), graph the average by survey year. Interpret.

italy_data <- read_fst("italy_data.fst")

italy_data  <- italy_data  %>%
  mutate(behave = ipbhprp,
         secure = impsafe,
         safety = ipstrgv,
         tradition = imptrad,
         rules = ipfrule) %>%
  mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
                ~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
  # Apply the reverse coding
  mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))

# Now you can calculate 'schwartzauth' after the NA recoding
italy_data $auth <- scales::rescale(italy_data $behave + 
                      italy_data $secure + 
                      italy_data $safety + 
                      italy_data $tradition + 
                      italy_data $rules, to=c(0,100), na.rm=TRUE)


italy_data <-italy_data  %>% filter(!is.na(auth))
# table(italy $secure)
# table(italy $auth)
# ggplot(italy, aes(x = auth)) +
#   geom_histogram(bins = 30, fill = "blue", color = "black") +
#   theme_minimal() +
#   labs(title = "Distribution of Authoritarian Scale for Italy",
#        x = "Authoritarian Scale",
#        y = "Count")

italy_data$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10)
  {
  italy_data$year[italy_data$essround == i] <- replacements[i]
}
auth_avg <- italy_data %>%
  group_by(year) %>%
  summarize(auth_avg = mean(auth, na.rm = TRUE))

plot_auth <- ggplot(auth_avg, aes(x = year, y = auth_avg)) +
  geom_point(aes(color = auth_avg), alpha = 1.0) + 
  geom_line(aes(group = 1), color = "blue", linetype = "dashed") +  
  labs(title = "Authoritarian Scale Average by Year (Italy)",
       x = "Survey Year",
       y = "Authoritarian Scale Average") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_y_continuous(limits = c(0, 100))

print(plot_auth)

The table above illustrates that the authoritoritarian scale average has been quite consistent over the years. There has been small changes but the average stays around 75.

Task 4

Now run a linear regression model, using the modelsummary package, with authoritarian values as the outcome, and generations as the predictor/potential explanatory variable, and using the Italian data again. Rename the coefficients using the coef_rename function. Interpret the coefficients, whether they are statistically significant, and the adjusted R-squared.

model_data <- italy_data %>%
  mutate(
    # Recoding cohort variable, setting years before 1930 and after 2000 to NA
    cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),
    # Recoding generational cohorts based on year of birth
    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(cohort)  # Keeping other values as character if they do not fit the ranges
    ),

    # Converting cohort to a factor with labels
    gen = factor(gen,
                 levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )
table(model_data$gen)

## 
##     Interwar Baby Boomers        Gen X  Millennials 
##         1026         2580         2214         1811

model <- lm(auth ~ gen, data = model_data)

modelsummary(model, 
  fmt = 1,
  estimate  = c("{estimate} ({std.error}){stars}"),
  statistic = NULL,
  coef_rename = c("genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials"),
  title = 'Table1. Regression Model for Authoritarian Attitudes')

Table1. Regression Model for Authoritarian Attitudes
	(1)
(Intercept)	75.3 (0.5)***
Baby Boomers	−1.1 (0.6)*
Gen X	−2.1 (0.6)***
Millennials	−3.6 (0.6)***
Num.Obs.	7631
R2	0.006
R2 Adj.	0.006
AIC	62934.0
BIC	62968.7
Log.Lik.	−31462.009
RMSE	14.94

Because the starting point is 75.3, that is what the average score of authoritarian attitudes should be when all other values are at zero. The negative values for baby boomers, gen x and millennials are the amount of values lower than the reference group.

Task 5

Now visualize, using the modelplot() function from the modelsummary package, coefficient estimates and 95% confidence intervals for the following three models:

Model 1 = populist attitudes scale as the outcome; left/right as the single predictor (omitting moderates as we did in the tutorial), and using data for Greece.

Model 2 = populist attitudes scale as the outcome; gender as the single predictor, and using data for Greece.

Model 3 = populist attitudes scale as the outcome; generations as the single predictor (using the four generational category recode we did in the tutorial), and using data for Greece. Interpret.

rm(list=ls()); gc()

##           used  (Mb) gc trigger  (Mb) limit (Mb)  max used  (Mb)
## Ncells 3251391 173.7    5848758 312.4         NA   5772927 308.4
## Vcells 5629730  43.0   84872577 647.6      16384 106090057 809.5

ess<-read_fst("All-ESS-Data.fst")
greece<-ess%>%
  filter(cntry=="GR")

model_data <-greece %>%
  mutate(
    trstplt = ifelse(trstplt > 10, NA, trstplt),
    trstprl = ifelse(trstprl > 10, NA, trstprl),
    trstprt = ifelse(trstprt > 10, NA, trstprt),
    trust = scales::rescale(trstplt + trstprl + trstprt, na.rm = TRUE, to = c(0, 100)),
    populist=scales::rescale(trust, na.rm = TRUE, to=c(100,0)),
    # Recoding 'lrscale' into a new variable 'ID'
    ID = case_when(
      lrscale %in% 0:3 ~ "Left",              # Values 0 to 3 in 'lrscale' are categorized as "Left"
      lrscale %in% 7:10 ~ "Right",           # Values 7 to 10 in 'lrscale' are categorized as "Right"
      #lrscale %in% 4:6 ~ "Moderate",       # Values 4 to 6 in 'lrscale' are categorized as "Moderate"
      lrscale %in% c(4,5,6,77, 88, 99) ~ NA_character_    # Values 77, 88, 99 in 'lrscale' are set as NA
    ),
    
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),

    # Recoding cohort variable, setting years before 1930 and after 2000 to NA
    cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),

    # Recoding generational cohorts based on year of birth
    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(cohort)  # Keeping other values as character if they do not fit the ranges
    ),

    # Converting cohort to a factor with labels
    gen = factor(gen,
                 levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )

models <- list(
   "Model 1" = lm(populist ~ ID, data = model_data),
   "Model 2" = lm(populist ~ gndr, data = model_data),
   "Model 3" = lm(populist ~ gen, data = model_data))

modelsummary(models,
  fmt = 1,
  estimate  = c("{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL)

	Model 1	Model 2	Model 3
(Intercept)	72.4 (0.6)***	70.0 (0.3)***	65.8 (0.5)***
IDRight	−10.5 (0.8)***
gndrMale		−1.0 (0.5)*
genBaby Boomers			3.5 (0.7)***
genGen X			5.6 (0.7)***
genMillennials			5.8 (0.7)***
Num.Obs.	3480	9823	9562
R2	0.048	0.000	0.009
R2 Adj.	0.048	0.000	0.009
AIC	31648.1	88873.2	86473.9
BIC	31666.6	88894.8	86509.7
Log.Lik.	−15821.063	−44433.590	−43231.944
RMSE	22.81	22.30	22.25

modelplot(models, coef_omit = 'Interc')

Zhang_Leanna_Homework_4

2024-03-18

Task 1

Task 2

Task 3

Task 4

Task 5

End