Problem Set 2

packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer")

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"

load("/Users/liuyichen/Downloads/gss2018_egp.RData")
gss <- df

gss <- gss %>%
  mutate(egp = factor(egp, levels = c(
    "I",
    "II",
    "IIIa",
    "IIIb",
    "IVa",
    "IVb",
    "IVc",
    "V",
    "VI",
    "VIIa",
    "VIIb"
  ), labels = c(
    "Higher-grade professionals, managers, large proprietors",
    "Lower-grade professionals, technicians, non-manual supervisors",
    "Higher-grade routine non-manual employees (admin/commerce)",
    "Lower-grade routine non-manual employees (sales/service)",
    "Small proprietors, artisans with employees",
    "Small proprietors, artisans without employees",
    "Farmers, small-holders, self-employed in primary production",
    "Lower-grade technicians, manual supervisors",
    "Skilled manual workers",
    "Semi- and unskilled manual workers (not agriculture)",
    "Agricultural and primary production workers"
  ), ordered = TRUE))

table(gss$natcrime)

## 
##  too little about right    too much          DK         IAP          NA 
##       23616        9317        2148           0           0           0

unique(gss$natcrime)

## [1] too little  <NA>        about right too much   
## Levels: too little about right too much DK IAP NA

load("/Users/liuyichen/Downloads/gss2022.Rdata")
gss <- df

Task 1: Data Cleaning and Recoding

Objective: Clean and recode the variables to ensure they are ready for analysis.

Recode polviews into three categories: “Liberal”, “Moderate”, and “Conservative”. Clean sex, degree, and race but retain the relevant categories.

gss_cleaned <- gss %>%
  filter(!is.na(polviews), !is.na(sex), !is.na(degree), !is.na(race)) %>%
  mutate(
    polviews = case_when(
      polviews %in% c("liberal", "moderate", "conservative") ~ polviews,
      TRUE ~ NA_character_
    ),
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    ),
    degree = case_when(
      degree %in% c("less than high school", "high school", "junior college", "bachelor", "graduate") ~ degree,
      TRUE ~ NA_character_
    )
  )

gss_cleaned <- gss_cleaned %>%
  rename(
    "Political Views" = polviews,
    "Respondent Sex" = sex,
    "Highest Degree" = degree,
    "Respondent Race" = race
  )

Task 2: Data Summary

Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.

Select the variables of interest: polviews, sex, degree, and race.

Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.

gss <- gss %>%
  filter(!is.na(sex), !is.na(degree), !is.na(race))

gss_filtered <- gss_cleaned %>%
  dplyr::select(`Political Views`, `Respondent Sex`, `Highest Degree`, `Respondent Race`)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary

tinytable_1kqftbaeitt4v0d8io30

		N	%
Political Views	conservative	9316	14.9
	liberal	7596	12.2
	NA	45515	72.9
Respondent Sex	female	34482	55.2
	male	27945	44.8
Highest Degree	graduate	5460	8.7
	high school	31800	50.9
	less than high school	10980	17.6
	NA	14187	22.7
Respondent Race	black	8506	13.6
	other	3841	6.2
	white	50080	80.2

categorical_summary_relabelled <- datasummary_skim(
  gss_cleaned %>%
    dplyr::select(`Political Views`, `Respondent Sex`, `Highest Degree`, `Respondent Race`),
  type = "categorical", 
  output = "kableExtra" 
)

## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.

categorical_summary_relabelled %>%
  kableExtra::kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% 
  kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#4CAF50") %>% 
  kableExtra::column_spec(1, bold = TRUE) %>% 
  kableExtra::add_header_above(c(" " = 1, "Summary Statistics for Categorical Variables" = 3))

	Summary Statistics for Categorical Variables
		N	%
Political Views	conservative	9316	14.9
	liberal	7596	12.2
	NA	45515	72.9
Respondent Sex	female	34482	55.2
	male	27945	44.8
Highest Degree	graduate	5460	8.7
	high school	31800	50.9
	less than high school	10980	17.6
	NA	14187	22.7
Respondent Race	black	8506	13.6
	other	3841	6.2
	white	50080	80.2

Task 3: Visualization of Political Views by Gender

Objective: Create a bar chart showing the distribution of political views by gender.

Create a bar chart showing the distribution of political views by gender. Use a color palette that clearly differentiates the categories.

ggplot(gss_cleaned, aes(x = `Political Views`, fill = `Respondent Sex`)) +
  geom_bar(position = "fill", color = "black", alpha = 0.8) +
  scale_y_continuous(labels = scales::percent_format()) + 
  labs(
    title = "Distribution of Political Views by Gender",
    x = "Political Views",
    y = "Proportion",
    fill = "Gender"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), 
    legend.position = "bottom", 
    legend.title = element_text(size = 10), 
    legend.text = element_text(size = 8) 
  ) +
  guides(fill = guide_legend(nrow = 1, byrow = TRUE, title.position = "top"))

Task 4: Trends Over Time

Objective: Visualize trends in religious attendance over time.

Select the year and attend variables from the GSS dataset.

Create a line plot showing the proportion of each category of religious attendance over time.

gss_yearly <- gss %>%
  group_by(year, attend) %>% 
  summarize(count = n(), .groups = 'drop') %>% 
  group_by(year) %>% 
  mutate(total = sum(count), 
         proportion = count / total) 

ggplot(gss_yearly, aes(x = year, y = proportion, color = attend, group = attend)) +
  geom_line(size = 1.2) + 
  scale_color_brewer(palette = "Set3") + 
  labs(title = "Evolution of Religious Attendance Over Time", 
       x = "Year", 
       y = "Proportion", 
       color = "Religious Attendance") + 
  theme_minimal() + 
  theme(legend.position = "bottom")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 34 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Task 5: Comparison Trends Objective: Create a stacked bar chart showing the distribution of fejobaff (preferential hiring) across different age groups.

Create an age group variable by categorizing age into “18-29”, “30-44”, “45-59”, “60+”. Create a stacked bar chart showing the distribution of the fejobaff response categories for each age group.

gss_age <- gss %>%
  mutate(age_group = case_when(
    age >= 18 & age <= 29 ~ "18-29",
    age >= 30 & age <= 44 ~ "30-44",
    age >= 45 & age <= 59 ~ "45-59",
    age >= 60 ~ "60+",
    TRUE ~ NA_character_
  ))

gss_fejobaff <- gss_age %>%
  group_by(age_group, fejobaff) %>% 
  summarize(count = n(), .groups = 'drop') %>% 
  group_by(age_group) %>% 
  mutate(total = sum(count), 
         proportion = count / total) %>% 
  filter(!is.na(fejobaff)) 

ggplot(gss_fejobaff, aes(x = age_group, y = proportion, fill = fejobaff)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set3") + 
  labs(title = "Distribution of Preferential Hiring Across Age Groups", 
       x = "Age Group", 
       y = "Proportion", 
       fill = "Preferential Hiring") + 
  theme_minimal() + 
  theme(legend.position = "bottom")

Problem Set 2

Yichen Liu

2024-07-14

Task 1: Data Cleaning and Recoding

Task 2: Data Summary

Task 3: Visualization of Political Views by Gender

Task 4: Trends Over Time