Kim_Roy_Homework

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "interactions") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'infer' was built under R version 4.3.3

## Warning: package 'effects' was built under R version 4.3.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.3.3

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

## Warning: package 'survey' was built under R version 4.3.3

## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: survival
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart

## Warning: package 'interactions' was built under R version 4.3.3

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "effects"      "carData"      "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "interactions" "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"

task 1

Graph the histogram distribution using the populist scale (as we did in the tutorial) for the country of Sweden. What do you note?

Sweden_data <- read.fst("Sweden_data.fst")

Sweden_data$trstplt <- ifelse(Sweden_data$trstplt > 10, NA, Sweden_data$trstplt)
Sweden_data$trstprl <- ifelse(Sweden_data$trstprl > 10, NA, Sweden_data$trstprl)
Sweden_data$trstprt <- ifelse(Sweden_data$trstprt > 10, NA, Sweden_data$trstprt)

Sweden_data$trust <- scales::rescale(Sweden_data$trstplt + Sweden_data$trstprl + Sweden_data$trstprt, na.rm = TRUE, to = c(0, 100))

Sweden_data$populist <- scales::rescale(Sweden_data$trust, na.rm = TRUE, to=c(100,0))

ggplot(Sweden_data, aes(x = populist)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Populist Scale for Sweden",
       x = "Populist Scale",
       y = "Count")

## Warning: Removed 2503 rows containing non-finite values (`stat_bin()`).

The histogram shows that the populist data of Sweden has a slight left skew. The graph shows a standard bell curve with a peak appearing at 50. The overall trend of the populist scale dies down after 50 while the trend up until 50 seems to pick up gradually.

Task 2

Run a linear regression with populist attitudes as the outcome, educational attainment (recoded as a binary “BA or more” and “No BA”), and using the Swedish data. Print the intercept and coefficients only and interpret. Then do a tidy model table displaying the p-value (with digits = 3) and interpret.

Sf <- read.fst("Sweden_data.fst")

Sf <- Sf %>%
  mutate(
    behave = ipbhprp,
    secure = impsafe,
    safety = ipstrgv,
    tradition = imptrad,
    rules = ipfrule
  ) %>%
  mutate(across(c(behave, secure, safety, tradition, rules),
                ~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
  mutate(across(c(behave, secure, safety, tradition, rules), ~ 7 - .x )) %>%
  mutate(
    populist = scales::rescale(behave + secure + safety + tradition + rules, to=c(0,100), na.rm=TRUE)
  )

Sf <- Sf %>% filter(!is.na(populist))

model_data <- Sf %>%
  mutate(
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
  )

model1 <- lm(populist ~ educ.ba, data = model_data)

summary(model1)$coefficients

##                   Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)       58.04499  0.1690301 343.40030 0.000000e+00
## educ.baBA or more -4.73686  0.3324911 -14.24658 9.664324e-46

##                   Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)       58.04499  0.1690301 343.40030 0.000000e+00
## educ.baBA or more -4.73686  0.3324911 -14.24658 9.664324e-46

modelsummary(
  list(model1),
  fmt = 3,
  statistic = "p.value"
)

	(1)
(Intercept)	58.045
	(<0.001)
educ.baBA or more	−4.737
	(<0.001)
Num.Obs.	14328
R2	0.014
R2 Adj.	0.014
AIC	122558.3
BIC	122581.0
Log.Lik.	−61276.168
RMSE	17.42

The intercept 58.045 represents the estimated value of the dependent variable when all predictor variables are set to zero. The coefficient -4.737 signifies that indiviudals with a BA or more” education level, compared to those with “No BA” education level, have, on average, a decrease of 4.737 units in their populist attitudes scale score. Accordingly, the p-value “<0.001” indicates that the coefficient is statistically significant which suggests that the education level is associated with differences in the populist attitudes scale score. The R-squared value of 0.014 indicates that approximately 1.4% of the variance in the dependent variable (populist attitudes scale) is explained by the independent variable (education level) in the model. The AIC value of 122558.3 is a measure of the relative quality of the statistical model for a given set of data.

task 3

Now using the Italian dataset and the authoritarian values scale (as we did in the tutorial), graph the average by survey year. Interpret.

Italy_data <- read.fst("Italy_data.fst")

af <- read.fst("italy_data.fst")
## Creates an authoritarian values scale based on human modules items (can also do libertarian value scale)
af <- af %>%
  mutate(behave = ipbhprp,
         secure = impsafe,
         safety = ipstrgv,
         tradition = imptrad,
         rules = ipfrule) %>%
  mutate(across(c("behave", "secure", "safety", "tradition", "rules"),
                ~ na_if(.x, 7) %>% na_if(8) %>% na_if(9))) %>%
  # Apply the reverse coding
  mutate(across(c("behave", "secure", "safety", "tradition", "rules"), ~ 7 - .x ))

# Now you can calculate 'schwartzauth' after the NA recoding
af$auth <- scales::rescale(af$behave + 
                      af$secure + 
                      af$safety + 
                      af$tradition + 
                      af$rules, to=c(0,100), na.rm=TRUE)


af <- af %>% filter(!is.na(auth))
table(af$secure)

## 
##    1    2    3    4    5    6 
##   54  139  626 1856 2876 2881

ggplot(af, aes(x = auth)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Authoritarian Scale for Italy",
       x = "Authoritarian Scale",
       y = "Count")

The graph shows a peak of 1000 as well as a shifted bell curve towards the right.

task 4

Now run a linear regression model, using the modelsummary package, with authoritarian values as the outcome, and generations as the predictor/potential explanatory variable, and using the Italian data again. Rename the coefficients using the coef_rename function. Interpret the coefficients, whether they are statistically significant, and the adjusted R-squared.

model_data1 <- af %>%
  mutate(
    gen = case_when(
      yrbrn >= 1900 & yrbrn <= 1945 ~ "1",
      yrbrn >= 1946 & yrbrn <= 1964 ~ "2",
      yrbrn >= 1965 & yrbrn <= 1979 ~ "3",
      yrbrn >= 1980 & yrbrn <= 1996 ~ "4",
      TRUE ~ NA_character_  # Keeping other values as NA
    ),
    # Converting gen to a factor with labels
    gen = factor(gen,
                 levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )
table(model_data1$gen)

## 
##     Interwar Baby Boomers        Gen X  Millennials 
##         1026         2580         2214         1811

model <- lm(auth ~ gen, data = model_data1)

modelsummary(
  list(model),
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}"),
  statistic = NULL,
  coef_omit = "Intercept")

	(1)
genBaby Boomers	−1.1 (0.6)*
genGen X	−2.1 (0.6)***
genMillennials	−3.6 (0.6)***
Num.Obs.	7631
R2	0.006
R2 Adj.	0.006
AIC	62934.0
BIC	62968.7
Log.Lik.	−31462.009
RMSE	14.94

models <- list(
  "Model 1" = lm(auth ~ gen, data = model_data1)
)

modelsummary(models,
             estimate = c("{estimate} ({std.error}){stars}"),
             statistic = NULL)

	Model 1
(Intercept)	75.345 (0.467)***
genBaby Boomers	−1.125 (0.552)*
genGen X	−2.080 (0.564)***
genMillennials	−3.592 (0.584)***
Num.Obs.	7631
R2	0.006
R2 Adj.	0.006
AIC	62934.0
BIC	62968.7
Log.Lik.	−31462.009
RMSE	14.94

Intercept 75.345 shows the expected average authoritarian scale score when other predictor variables are zero. Baby boomers, on average, are 1.1 points lower on the authoritarian values scale while milennials have the most notieceable decrease in authoritarian values with a score of 3.592. The value 0.006 means that the gen variable explains 0.6% of the variance in authoritarian values scale. This model explains an extremely small portion of the varied authortarian values.

task 5

Now visualize, using the modelplot() function from the modelsummary package, coefficient estimates and 95% confidence intervals for the following three models:

Model 1 = populist attitudes scale as the outcome; left/right as the single predictor (omitting moderates as we did in the tutorial), and using data for Greece.

Model 2 = populist attitudes scale as the outcome; gender as the single predictor, and using data for Greece.

Model 3 = populist attitudes scale as the outcome; generations as the single predictor (using the four generational category recode we did in the tutorial), and using data for Greece. Interpret.

Greece_data <- read.fst("Greece_data.fst")

Greece_data$trstplt <- ifelse(Greece_data$trstplt > 10, NA, Greece_data$trstplt)
Greece_data$trstprl <- ifelse(Greece_data$trstprl > 10, NA, Greece_data$trstprl)
Greece_data$trstprt <- ifelse(Greece_data$trstprt > 10, NA, Greece_data$trstprt)

Greece_data$trust <- scales::rescale(Greece_data$trstplt + Greece_data$trstprl + Greece_data$trstprt, na.rm = TRUE, to = c(0, 100))

Greece_data$populist <- scales::rescale(Greece_data$trust, na.rm = TRUE, to=c(100,0))

ggplot(Greece_data, aes(x = populist)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Populist Scale for Greece",
       x = "Populist Scale",
       y = "Count")

## Warning: Removed 2735 rows containing non-finite values (`stat_bin()`).

Greece_data <- Greece_data %>%
  mutate(
    # Recoding gender
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),

 # Recoding education
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    
    # Handle NAs for education levels
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    
    # Explicitly making 'No BA' the reference category
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more")),

    # Recoding age, setting 999 to NA
    age = ifelse(agea == 999, NA, agea),

    # Recoding cohort variable, setting years before 1930 and after 2000 to NA
    cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),

    # Recoding generational cohorts based on year of birth
    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(cohort)  # Keeping other values as character if they do not fit the ranges
    ),

    # Converting cohort to a factor with labels
    gen = factor(gen,
                 levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )
table(model_data$gen)

## < table of extent 0 >

modelG_data <- Greece_data %>%
  mutate(
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
  
   educ.ba = factor(educ.ba, levels = c("No BA", "BA or more")),

   age = ifelse(agea == 999, NA, agea),

   cohort = ifelse(yrbrn < 1930 | yrbrn > 2000, NA, yrbrn),

    gen = case_when(
      yrbrn %in% 1900:1945 ~ "1",
      yrbrn %in% 1946:1964 ~ "2",
      yrbrn %in% 1965:1979 ~ "3",
      yrbrn %in% 1980:1996 ~ "4",
      TRUE ~ as.character(cohort)  # Keeping other values as character if they do not fit the ranges
    ),

    gen = factor(gen,
                 levels = c("1", "2", "3", "4"),
                 labels = c("Interwar", "Baby Boomers", "Gen X", "Millennials"))
  )

table(modelG_data$educ.ba)

## 
##      No BA BA or more 
##       9977       2581

modelG_data <- lm(populist ~ educ.ba, data = modelG_data)

modelplot(modelG_data, coef_names = "Left/Right", title = "Model G: Populist Attitudes Scale vs. Left/Right", conf_level = 0.95)

## Warning in ggplot2::geom_pointrange(ggplot2::aes(y = term, x = estimate, :
## Ignoring unknown parameters: `coef_names` and `title`

model_2 <- lm(populist ~ gndr, data = Greece_data)

modelplot(model_2, coef_names = "Gender", title = "Model 2: Populist Attitudes Scale vs. Gender", conf_level = 0.95)

## Warning in ggplot2::geom_pointrange(ggplot2::aes(y = term, x = estimate, :
## Ignoring unknown parameters: `coef_names` and `title`

model_3 <- lm(populist ~ gen, data = Greece_data)

# Visualize model 3
modelplot(model_3, coef_names = "Generation", title = "Model 3: Populist Attitudes Scale vs. Generation", conf_level = 0.95)

## Warning in ggplot2::geom_pointrange(ggplot2::aes(y = term, x = estimate, :
## Ignoring unknown parameters: `coef_names` and `title`

As generations progress, we can see higher levels of populism by generation.

Kim_Roy_Homework_4

2024-03-19

task 1

Task 2

task 3

task 4

task 5