k_meryem

Setting up environment

# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "interactions") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'infer' was built under R version 4.3.3

## Warning: package 'effects' was built under R version 4.3.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.3.3

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

## Warning: package 'survey' was built under R version 4.3.3

## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: survival
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart

## Warning: package 'interactions' was built under R version 4.3.3

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "effects"      "carData"      "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "interactions" "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"

Task 1

Graph the histogram distribution using the populist scale (as we did in the tutorial) for the country of Sweden. What do you note?

Ans:

# Read in the Sweden dataset:
sweden_data <- read.fst("sweden_data.fst")

# Populist scale (Norris/Inglehart use low levels of trust in parliaments, politicians and parties as an indicator of populism)

# Setting values greater than 10 to NA
sweden_data$trstplt <- ifelse(sweden_data$trstplt > 10, NA, sweden_data$trstplt)
sweden_data$trstprl <- ifelse(sweden_data$trstprl > 10, NA, sweden_data$trstprl)
sweden_data$trstprt <- ifelse(sweden_data$trstprt > 10, NA, sweden_data$trstprt)

# Creating and rescaling the trust variable
sweden_data$trust <- scales::rescale(sweden_data$trstplt + sweden_data$trstprl + sweden_data$trstprt, na.rm = TRUE, to = c(0, 100))

# Here's how you would "flip it" because populist is conceived by N + I as 'anti-trust'
# As Schafer explains there are some issues with that conceptualization, but N + I is also widely applied
sweden_data$populist <- scales::rescale(sweden_data$trust, na.rm = TRUE, to=c(100,0))

# Graphing the histogram distribution using the populist scale:
ggplot(sweden_data, aes(x = populist)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Populist Scale for Sweden",
       x = "Populist Scale",
       y = "Count")

## Warning: Removed 2503 rows containing non-finite values (`stat_bin()`).

What do i see: Compared to Hungary and Spain, which we saw in the tutorial had two modes at slightly less than 50 and 100 on the populist scale, I see that Sweden has one mode that is slightly less than 50. Additionally, I notice that the graph appears to be slightly tilted to the right, with high populist scale counts falling between 0 and 50 and low populist scale counts exceeding 50. The Norway histogram that we saw in the tutorial appears a lot like this one.

Task 2

Run a linear regression with populist attitudes as the outcome, educational attainment (recoded as a binary “BA or more” and “No BA”), and using the Swedish data. Print the intercept and coefficients only and interpret. Then do a tidy model table displaying the p-value (with digits = 3) and interpret.

# Recode educational attainment as a binary
sweden_data <- sweden_data %>%
  mutate(
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    # Handle NAs for education levels
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    
    # Explicitly making 'No BA' the reference category
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
  )

# Check that the recoding worked as expected:
table(sweden_data$educ.ba)

## 
##      No BA BA or more 
##      13477       4739

# Run a linear regression
sweden_pop_model <- lm(populist ~ educ.ba, data = sweden_data)

# Print the intercept and coefficients only:
coefficients_sweden <- coef(sweden_pop_model)
print(coefficients_sweden)

##       (Intercept) educ.baBA or more 
##         51.358680         -7.868121

Thus, 51.36 would be the predicted average populist scale score in Sweden. According to this score, Swedish populist views are more robust and unbiased. The populist scale scores of those who identify as politically right in Sweden are, on average, 7.868121 points lower than those of those who identify as left, according to the negative coefficient linked to the “Right” category. This indicates that those who lean left in Sweden have more populist views than people who lean right. (“No BA” individuals exhibit stronger populist attitudes than the “BA or more” individuals.)

# Now do a tidy model table displaying p-value and interpret:
broom::tidy(sweden_pop_model) |>
knitr::kable(digits = 3)

## Registered S3 methods overwritten by 'broom':
##   method            from  
##   tidy.glht         jtools
##   tidy.summary.glht jtools

term	estimate	std.error	statistic	p.value
(Intercept)	51.359	0.181	283.363	0
educ.baBA or more	-7.868	0.350	-22.494	0

summary(sweden_pop_model)

## 
## Call:
## lm(formula = populist ~ educ.ba, data = sweden_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.359 -14.692  -1.359  11.975  56.509 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        51.3587     0.1812  283.36   <2e-16 ***
## educ.baBA or more  -7.8681     0.3498  -22.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.43 on 15711 degrees of freedom
##   (2503 observations deleted due to missingness)
## Multiple R-squared:  0.0312, Adjusted R-squared:  0.03114 
## F-statistic:   506 on 1 and 15711 DF,  p-value: < 2.2e-16

p-value of a very small number close to 0 indicates that it is statistically significant and we can say that there is a statistically significant association between populist attitudes and educational attainment.

Task 3

Now using the Italian dataset and the authoritarian values scale (as we did in the tutorial), graph the average by survey year. Interpret. Auth flipped non auth not flipped))

italy_data <- read.fst("italy_data.fst")

# Load necessary libraries
library(dplyr)
library(ggplot2)

# Read the Italian dataset
italy_data <- read.fst("italy_data.fst")

# Create authoritarian values scale based on human modules items
italy_data <- italy_data %>%
  mutate(behave = ipbhprp,
         secure = impsafe,
         safety = ipstrgv,
         tradition = imptrad,
         rules = ipfrule) %>%
  mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
                ~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
  # Apply reverse coding
  mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))

# Now you can calculate 'schwartzauth' after the NA recoding
italy_data$auth <- scales::rescale(italy_data$behave + 
                      italy_data$secure + 
                      italy_data$safety + 
                      italy_data$tradition + 
                      italy_data$rules, to=c(0,100), na.rm=TRUE)


italy_data <- italy_data %>% filter(!is.na(auth))
table(italy_data$secure)

## 
##    1    2    3    4    5    6 
##   54  139  626 1856 2876 2881

italy_data$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  italy_data$year[italy_data$essround == i] <- replacements[i]
}
auth_avg <- italy_data %>%
  group_by(year) %>%
  summarize(auth_avg = mean(auth, na.rm = TRUE))

plot_auth <- ggplot(auth_avg, aes(x = year, y = auth_avg)) +
  geom_point(aes(color = auth_avg), alpha = 1.0) + 
  geom_line(aes(group = 1), color = "blue", linetype = "dashed") +  
  labs(title = "Authoritarian Scale Average by Year (Italy)",
       x = "Survey Year",
       y = "Authoritarian Scale Average") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_y_continuous(limits = c(0, 100))

print(plot_auth)

The visual shows small, but linear fluctuations in the authoritarian scale average by year. Starting from 75 in 2012, dropping to around 73-4 by 2016, rising slightly to about 74.5 btw 2018 - 2020.

Task 4

Now run a linear regression model, using the modelsummary package, with authoritarian values as the outcome, and generations as the predictor/potential explanatory variable, and using the Italian data again. Rename the coefficients using the coef_rename function. Interpret the coefficients, whether they are statistically significant, and the adjusted R-squared.

model_data <- italy_data %>%
  mutate(
    # Recoding cohort variable, setting years before 1930 and after 2000 to NA
    cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),

    # Recoding generational cohorts based on year of birth
    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(cohort)  # Keeping other values as character if they do not fit the ranges
    ),

    # Converting cohort to a factor with labels
    gen = factor(gen,
                 levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )
table(model_data$gen)

## 
##     Interwar Baby Boomers        Gen X  Millennials 
##         1026         2580         2214         1811

# Next, run a linear regression model and rename the coefficients:
model <- lm(auth ~ gen, data = model_data)
modelsummary(
  list(model),coef_rename = c("genBaby Boomers" = "Baby Boomers", "genGen X" = "Gen X", "genMillennials" = "Millennials"))

	(1)
(Intercept)	75.345
	(0.467)
Baby Boomers	−1.125
	(0.552)
Gen X	−2.080
	(0.564)
Millennials	−3.592
	(0.584)
Num.Obs.	7631
R2	0.006
R2 Adj.	0.006
AIC	62934.0
BIC	62968.7
Log.Lik.	−31462.009
RMSE	14.94

As a result, the negative coefficient shows that Baby Boomers are less authoritarian than the interwar generation. Comparing Gen X and Millennials to the interwar generation, there is a similar drop in authoritarian ideology.

When Gen X - Interwar generation are compared, Gen X exhibits scores that are, on average, 2.080 points lower on the authoritarian values scale. This is a larger decrease than was observed for the Baby Boomers generation compared to the Interwar generation. Again, there is no triple asterisks for this coefficient indicating that we do not have a very low p-value below our 0.05 alpha threshold, and therefore we cannot say that this is statistically significant.

The small value of the adjusted R squared (value is 0.006) means that the ‘gen’ variable explains approximately 0.6% of the variance in authoritarian values scales for Italy, after accounting for the number of predictors which is 1 (taking into account number of predictors is used in adjusted R squared but not R squared). This suggests that this model only explains a very small portion of the variability in authoritarian values and that there are likely other parameters outside of generational differences that may play a significant role in shaping these attitudes.

Task 5

Now visualize, using the modelplot() function from the modelsummary package, coefficient estimates and 95% confidence intervals for the following three models:

Model 1 = populist attitudes scale as the outcome; left/right as the single predictor (omitting moderates as we did in the tutorial), and using data for Greece.

Model 2 = populist attitudes scale as the outcome; gender as the single predictor, and using data for Greece.

Model 3 = populist attitudes scale as the outcome; generations as the single predictor (using the four generational category recode we did in the tutorial), and using data for Greece. Interpret.

## Model #1

greece_data <- read.fst("greece_data.fst")

greece_data$trstplt <- ifelse(greece_data$trstplt > 10, NA, greece_data$trstplt)
greece_data$trstprl <- ifelse(greece_data$trstprl > 10, NA, greece_data$trstprl)
greece_data$trstprt <- ifelse(greece_data$trstprt > 10, NA, greece_data$trstprt)

greece_data$trust <- scales::rescale(greece_data$trstplt + greece_data$trstprl + greece_data$trstprt, na.rm = TRUE, to = c(0, 100))

greece_data$populist <- scales::rescale(greece_data$trust, na.rm = TRUE, to=c(100,0))

greece_data <- greece_data %>%
  mutate(
    ID = case_when(
      lrscale %in% 0:4 ~ "Left",                     
      lrscale %in% 6:10 ~ "Right",                  
      lrscale %in% c(77, 88, 99) ~ NA_character_,   
      TRUE ~ NA_character_                          
    )
  )

greece_lr_model <- lm(populist ~ ID, data = greece_data)

Model #2

greece_data <- greece_data %>%
  mutate(
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ))

greece_gndr_model <- lm(populist ~ gndr, data = greece_data)

Model#3

greece_data <- greece_data %>%
  mutate(
    cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),

    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(cohort)  # Keeping other values as character if they do not fit the ranges
    ),

    gen = factor(gen,
                 levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )
table(greece_data$gen)

## 
##     Interwar Baby Boomers        Gen X  Millennials 
##         2978         3484         3498         2330

greece_gen_model <- lm(populist ~ gen, data = greece_data)

models <- list(
   "Model 1" = lm(populist ~ ID, data = greece_data),
   "Model 2" = lm(populist ~ gndr, data = greece_data),
   "Model 3" = lm(populist ~ gen, data = greece_data))
modelsummary(models, fmt = 1,
  estimate  = c("{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL)

	Model 1	Model 2	Model 3
(Intercept)	72.0 (0.5)***	70.0 (0.3)***	65.8 (0.5)***
IDRight	−8.8 (0.6)***
gndrMale		−1.0 (0.5)*
genBaby Boomers			3.5 (0.7)***
genGen X			5.6 (0.7)***
genMillennials			5.8 (0.7)***
Num.Obs.	4936	9823	9562
R2	0.037	0.000	0.009
R2 Adj.	0.036	0.000	0.009
AIC	44723.8	88873.2	86473.9
BIC	44743.3	88894.8	86509.7
Log.Lik.	−22358.894	−44433.590	−43231.944
RMSE	22.44	22.30	22.25

modelplot(models, coef_omit = 'Interc')

The graph shows that three generations coefficients are above 0, with the “Millenials” being the largest. The gender coefficient is negative and closer to 0 than “IDRight” which is also negative but is still further.

k_meryem_hw4

2024-03-19