# Packages required by this analysis.
packages <- c(
  "tidyverse", "fst", "modelsummary", "viridis",
  "kableExtra", "flextable", "officer"
)

# Find the packages that are not installed yet. system.file() returns "" for
# a missing package, which is much cheaper than building the full
# installed.packages() matrix just to test a handful of names.
is_installed <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
new_packages <- packages[!is_installed]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package. invisible() stops lapply() from printing the search
# path once per package.
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"
# Restore the objects saved in the 2018 GSS/EGP file; the data are read from
# the object named `df`.
load("/Users/liuyichen/Downloads/gss2018_egp.RData")

# Turn the EGP class codes into an ordered factor with descriptive labels.
# Levels and labels are paired by position.
gss <- df %>%
  mutate(
    egp = ordered(
      egp,
      levels = c(
        "I", "II", "IIIa", "IIIb", "IVa", "IVb",
        "IVc", "V", "VI", "VIIa", "VIIb"
      ),
      labels = c(
        "Higher-grade professionals, managers, large proprietors",
        "Lower-grade professionals, technicians, non-manual supervisors",
        "Higher-grade routine non-manual employees (admin/commerce)",
        "Lower-grade routine non-manual employees (sales/service)",
        "Small proprietors, artisans with employees",
        "Small proprietors, artisans without employees",
        "Farmers, small-holders, self-employed in primary production",
        "Lower-grade technicians, manual supervisors",
        "Skilled manual workers",
        "Semi- and unskilled manual workers (not agriculture)",
        "Agricultural and primary production workers"
      )
    )
  )
# Frequency table of natcrime (table() leaves out missing values by default).
table(gss[["natcrime"]])
## 
##  too little about right    too much          DK         IAP          NA 
##       23616        9317        2148           0           0           0
# Distinct values that occur in natcrime, missing values included.
unique(gss[["natcrime"]])
## [1] too little  <NA>        about right too much   
## Levels: too little about right too much DK IAP NA
# Switch to the 2022 GSS file. load() returns the names of the objects it
# restored; checking them guarantees that `df` really came from this file
# instead of being an object left over from an earlier load() call.
loaded_objects <- load("/Users/liuyichen/Downloads/gss2022.Rdata")
if (!"df" %in% loaded_objects) {
  stop("gss2022.Rdata does not contain an object named `df`", call. = FALSE)
}
gss <- df

Task 1: Data Cleaning and Recoding

Objective: Clean and recode the variables to ensure they are ready for analysis.

Recode polviews into three categories: “Liberal”, “Moderate”, and “Conservative”. Clean sex, degree, and race but retain the relevant categories.

# Recode the four analysis variables, then drop incomplete rows. The filter
# runs after the recode so that values the recode cannot classify are removed
# instead of surviving as NA categories in later tables and plots.
gss_cleaned <- gss %>%
  mutate(
    # Compare lower-cased text so matching does not depend on factor storage
    # or capitalisation.
    across(c(polviews, sex, degree, race), ~ str_to_lower(as.character(.x))),
    # Collapse every label mentioning liberal / moderate / conservative into
    # three ordered groups. Exact matching on "liberal", "moderate" and
    # "conservative" left most respondents unclassified.
    # NOTE(review): presumably a seven-point scale ("extremely liberal", ...,
    # "moderate, middle of the road", ...) -- confirm with
    # levels(gss$polviews).
    polviews = factor(
      case_when(
        str_detect(polviews, "liberal") ~ "Liberal",
        str_detect(polviews, "moderate") ~ "Moderate",
        str_detect(polviews, "conservative") ~ "Conservative",
        TRUE ~ NA_character_
      ),
      levels = c("Liberal", "Moderate", "Conservative")
    ),
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    ),
    # Pattern matching tolerates label variants of the same category.
    # NOTE(review): variants such as "associate/junior college" and
    # "bachelor's" are assumed -- confirm with levels(gss$degree).
    # "less than high school" must be tested before "high school".
    degree = factor(
      case_when(
        str_detect(degree, "less than high school") ~ "less than high school",
        str_detect(degree, "high school") ~ "high school",
        str_detect(degree, "junior college") ~ "junior college",
        str_detect(degree, "bachelor") ~ "bachelor",
        str_detect(degree, "graduate") ~ "graduate",
        TRUE ~ NA_character_
      ),
      levels = c(
        "less than high school", "high school", "junior college",
        "bachelor", "graduate"
      )
    )
  ) %>%
  filter(!is.na(polviews), !is.na(sex), !is.na(degree), !is.na(race))

# Give the four analysis variables presentation-ready column names; each
# entry maps a new name to an existing column.
gss_cleaned <- rename(
  gss_cleaned,
  all_of(c(
    "Political Views" = "polviews",
    "Respondent Sex" = "sex",
    "Highest Degree" = "degree",
    "Respondent Race" = "race"
  ))
)

Task 2: Data Summary

Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.

Select the variables of interest: polviews, sex, degree, and race.

Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.

# Drop rows of the raw data that are missing sex, degree, or race.
gss <- filter(gss, !(is.na(sex) | is.na(degree) | is.na(race)))

# Keep only the four relabelled variables for the summary table.
gss_filtered <- dplyr::select(
  gss_cleaned,
  all_of(c(
    "Political Views", "Respondent Sex", "Highest Degree", "Respondent Race"
  ))
)

# Counts and percentages for each category of the selected variables.
categorical_summary <- datasummary_skim(
  data = gss_filtered,
  type = "categorical"
)
categorical_summary
tinytable_1kqftbaeitt4v0d8io30
N %
Political Views conservative 9316 14.9
liberal 7596 12.2
NA 45515 72.9
Respondent Sex female 34482 55.2
male 27945 44.8
Highest Degree graduate 5460 8.7
high school 31800 50.9
less than high school 10980 17.6
NA 14187 22.7
Respondent Race black 8506 13.6
other 3841 6.2
white 50080 80.2
# Build the same categorical summary again, this time requesting the
# kableExtra output format.
categorical_summary_relabelled <- datasummary_skim(
  data = dplyr::select(
    gss_cleaned,
    all_of(c(
      "Political Views", "Respondent Sex", "Highest Degree", "Respondent Race"
    ))
  ),
  output = "kableExtra",
  type = "categorical"
)
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.
# Style the kableExtra table: bootstrap row options, a bold white-on-green
# header row, a bold first column, and a spanning title above the last three
# columns.
categorical_summary_relabelled %>%
  kableExtra::kable_styling(
    # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  kableExtra::row_spec(
    0,
    bold = TRUE, color = "white", background = "#4CAF50"
  ) %>%
  kableExtra::column_spec(1, bold = TRUE) %>%
  kableExtra::add_header_above(
    c(" " = 1, "Summary Statistics for Categorical Variables" = 3)
  )
Summary Statistics for Categorical Variables
N %
Political Views conservative 9316 14.9
liberal 7596 12.2
NA 45515 72.9
Respondent Sex female 34482 55.2
male 27945 44.8
Highest Degree graduate 5460 8.7
high school 31800 50.9
less than high school 10980 17.6
NA 14187 22.7
Respondent Race black 8506 13.6
other 3841 6.2
white 50080 80.2

Task 3: Visualization of Political Views by Gender

Objective: Create a bar chart showing the distribution of political views by gender.

Create a bar chart showing the distribution of political views by gender. Use a color palette that clearly differentiates the categories.

# Bar chart of political views filled by respondent sex. position = "fill"
# rescales every bar to 100%, so each bar shows the gender split within one
# political-views category.
ggplot(gss_cleaned, aes(x = `Political Views`, fill = `Respondent Sex`)) +
  geom_bar(position = "fill", color = "black", alpha = 0.8) +
  scale_y_continuous(labels = scales::percent_format()) +
  # Explicit discrete viridis palette so the fill categories are clearly
  # separated; begin/end trim the darkest and lightest extremes.
  scale_fill_viridis_d(begin = 0.2, end = 0.8) +
  labs(
    title = "Distribution of Political Views by Gender",
    x = "Political Views",
    y = "Proportion",
    fill = "Gender"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom",
    # Replaces guide_legend(title.position = ), which is deprecated as of
    # ggplot2 3.5.0.
    legend.title.position = "top",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8)
  ) +
  guides(fill = guide_legend(nrow = 1, byrow = TRUE))