Problem Set 2: Tasks 2-5

# Packages required by this problem set.
packages <- c(
  "tidyverse", "fst", "modelsummary", "viridis",
  "kableExtra", "flextable", "officer"
)

# Install only the packages that are not already present in the library.
new_packages <- setdiff(packages, installed.packages()[, "Package"])
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package. lapply() returns one search-path vector per package;
# invisible() keeps that list from being printed into the rendered document.
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"
# load() restores the saved objects into the current environment and returns
# only their *names* (a character vector), not the data itself.
loaded_objects <- load("gss2022.RData")

# The data frame is expected under the name `df`. Without this check a missing
# `df` would silently resolve to the stats::df() function instead of failing.
stopifnot("df" %in% loaded_objects)
gss <- df

Task 2: Data Summary

Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.

Select the variables of interest: polviews, sex, degree, and race.

Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.

# Keep only substantive response categories for the four variables of interest;
# every other value (e.g. "don't know", "no answer", inapplicable) becomes NA.
gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c(
        "extremely conservative", "conservative", "slightly conservative",
        "moderate, middle of the road",
        "slightly liberal", "liberal", "extremely liberal"
      ) ~ polviews,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    ),
    # NOTE(review): with only "junior college" and "bachelor" in this list the
    # rendered summary showed no rows for those two groups and 21.8% NA, i.e.
    # the labels did not match the data. The GSS value labels are presumably
    # "associate/junior college" and "bachelor's" -- confirm with
    # unique(gss$degree). Both spellings are accepted here.
    degree = case_when(
      degree %in% c(
        "less than high school", "high school",
        "junior college", "associate/junior college",
        "bachelor", "bachelor's",
        "graduate"
      ) ~ degree,
      TRUE ~ NA_character_
    ),
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    )
  )
# Restrict the data to the four variables summarised in Task 2.
gss_filtered <- dplyr::select(gss, polviews, sex, degree, race)

# Frequency table (N and %) for every categorical variable in the subset.
categorical_summary <- gss_filtered %>%
  datasummary_skim(type = "categorical")
categorical_summary
N %
polviews conservative 9361 12.9
extremely conservative 2165 3.0
extremely liberal 2081 2.9
liberal 7623 10.5
moderate, middle of the road 23992 33.1
slightly conservative 9596 13.3
slightly liberal 7900 10.9
NA 9672 13.4
sex female 40301 55.7
male 31977 44.2
NA 112 0.2
degree graduate 5953 8.2
high school 36446 50.3
less than high school 14192 19.6
NA 15799 21.8
race black 10215 14.1
other 4411 6.1
white 57657 79.6
NA 107 0.1
# Drop respondents with a missing value on any analysis variable, then give
# the category labels presentation-ready capitalisation.
gss_cleaned <- gss %>%
  filter(
    !is.na(polviews), !is.na(sex), !is.na(degree), !is.na(race),
    !is.na(fefam), !is.na(libhomo), !is.na(attend)
  ) %>%
  mutate(
    # Each recode() must be closed before the next column starts. Previously
    # the polviews call was left open, so the sex/degree/race recodes were
    # passed to it as extra replacement arguments and never applied.
    polviews = recode(
      polviews,
      "extremely conservative" = "Extremely Conservative",
      "conservative" = "Conservative",
      "slightly conservative" = "Slightly Conservative",
      "moderate, middle of the road" = "Moderate, Middle of the Road",
      "slightly liberal" = "Slightly Liberal",
      "liberal" = "Liberal",
      "extremely liberal" = "Extremely Liberal"
    ),
    sex = recode(sex, "male" = "Male", "female" = "Female"),
    # The two alternative spellings are included in case the data uses the
    # GSS labels "associate/junior college" / "bachelor's" (TODO confirm);
    # recode() leaves values that match no name unchanged.
    degree = recode(
      degree,
      "less than high school" = "Less than High School",
      "high school" = "High School",
      "junior college" = "Junior College",
      "associate/junior college" = "Associate/Junior College",
      "bachelor" = "Bachelor",
      "bachelor's" = "Bachelor's",
      "graduate" = "Graduate"
    ),
    race = recode(race, "white" = "White", "black" = "Black", "other" = "Other")
  )

# Replace the short variable names with descriptive labels so that the
# summary table shows human-readable row headers.
gss_cleaned <- dplyr::rename(
  gss_cleaned,
  `Think of Self as Liberal or Conservative` = polviews,
  `Respondent Sex` = sex,
  `Highest Degree` = degree,
  `Respondent Race` = race
)

# Categorical summary of the relabelled variables, returned as a kableExtra
# object so it can be styled afterwards.
categorical_summary_relabelled <- gss_cleaned %>%
  dplyr::select(
    `Think of Self as Liberal or Conservative`,
    `Respondent Sex`,
    `Highest Degree`,
    `Respondent Race`
  ) %>%
  datasummary_skim(type = "categorical", output = "kableExtra")

# Style the summary table: compact striped layout, a coloured bold header row,
# a bold first column, and a spanning title above the remaining three columns.
categorical_summary_relabelled %>%
  kableExtra::kable_styling(
    # FALSE rather than F: `F` is an ordinary variable that can be reassigned.
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#4CAF50") %>%
  kableExtra::column_spec(1, bold = TRUE) %>%
  kableExtra::add_header_above(
    c(" " = 1, "Summary Statistics for Categorical Variables" = 3)
  )
Summary Statistics for Categorical Variables
N %
Think of Self as Liberal or Conservative Conservative 1742 14.7
Extremely Conservative 419 3.5
Extremely Liberal 382 3.2
Liberal 1463 12.3
Moderate, Middle of the Road 4767 40.1
Slightly Conservative 1664 14.0
Slightly Liberal 1438 12.1
Respondent Sex female 6648 56.0
male 5227 44.0
Highest Degree graduate 1372 11.6
high school 8015 67.5
less than high school 2488 21.0
Respondent Race black 1639 13.8
other 728 6.1
white 9508 80.1

Task 3: Visualization of Political Views by Gender

Objective: Create a bar chart showing the distribution of political views by gender.

Plot the number of respondents in each political-view category, with separate bars for each gender, and use a color palette that clearly differentiates the categories.

# Number of respondents per political-view category (NA is counted as its own
# row) together with each category's share of all rows, in percent.
polviews_counts <- dplyr::count(gss_filtered, polviews)
polviews_summary <- dplyr::mutate(polviews_counts, pct = n / sum(n) * 100)
# Political views form an ordinal scale, so show them from most liberal to
# most conservative instead of ggplot's default alphabetical order.
polviews_order <- c(
  "extremely liberal", "liberal", "slightly liberal",
  "moderate, middle of the road",
  "slightly conservative", "conservative", "extremely conservative"
)

# Grouped bar chart of political views by gender. Rows with a missing
# political view or gender are dropped so that no "NA" bars are drawn.
gss_filtered %>%
  mutate(polviews = factor(polviews, levels = polviews_order)) %>%
  filter(!is.na(polviews), !is.na(sex)) %>%
  ggplot(aes(x = polviews, fill = sex)) +
  geom_bar(position = "dodge", color = "black") +
  labs(
    title = "Distribution of Political Views by Gender",
    x = "Political Views",
    y = "Count"
  ) +
  scale_fill_brewer(palette = "Set1", name = "Gender") +
  theme_minimal() +
  # Tilt the long category labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))