knitr::opts_chunk$set(echo = TRUE)

# Packages this analysis depends on (add any you need here)
packages <- c(
  "tidyverse", "fst", "modelsummary", "viridis",
  "kableExtra", "flextable", "officer"
)

# Install only the packages that are not installed already
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach the packages. library() returns the list of attached packages, so a
# bare lapply() would auto-print one such listing per package; invisible()
# keeps that noise out of the console and the rendered document.
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"

Data

file.exists("gss2022.RData")  # Check if the file exists in the current working directory
## [1] TRUE
# load() restores the saved objects into the workspace and returns only their
# *names*, so its return value is not the data set itself. Keep the names in a
# separate variable and confirm the expected object `df` was actually restored
# before copying it to `gss`.
loaded_objects <- load("gss2022.RData")
stopifnot("df" %in% loaded_objects)
gss <- df

# Frequency of each response category of polviews
table(gss$polviews)
## 
##             extremely liberal                       liberal 
##                          2081                          7623 
##              slightly liberal  moderate, middle of the road 
##                          7900                         23992 
##         slightly conservative                  conservative 
##                          9596                          9361 
##        extremely conservative                    don't know 
##                          2165                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0
# Distinct values of polviews that actually occur in the data
unique(gss[["polviews"]])
## [1] <NA>                         moderate, middle of the road
## [3] slightly conservative        conservative                
## [5] liberal                      extremely conservative      
## [7] slightly liberal             extremely liberal           
## 20 Levels: extremely liberal liberal ... see codebook

Task 1 - Data Cleaning and Recoding

# Keep only the substantive response categories of each variable; any other
# value (including existing NAs) is set to NA.
gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c(
        "extremely liberal",
        "liberal",
        "slightly liberal",
        "moderate, middle of the road",
        "slightly conservative",
        "conservative",
        "extremely conservative"
      ) ~ polviews,
      .default = NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      .default = NA_character_
    ),
    degree = case_when(
      degree %in% c(
        "less than high school",
        "high school",
        "associate/junior college",
        "bachelor's",
        "graduate"
      ) ~ degree,
      .default = NA_character_
    ),
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      .default = NA_character_
    )
  )


# Restrict the data to the four variables of interest
gss_filtered <- dplyr::select(gss, polviews, sex, degree, race)

# Tabulate counts and percentages for every categorical variable; wrapping the
# assignment in parentheses stores the table and displays it in one step.
(categorical_summary <- datasummary_skim(gss_filtered, type = "categorical"))
tinytable_9h4br54s3o1k4lplw8r3
N %
polviews conservative 9361 12.9
extremely conservative 2165 3.0
extremely liberal 2081 2.9
liberal 7623 10.5
moderate, middle of the road 23992 33.1
slightly conservative 9596 13.3
slightly liberal 7900 10.9
NA 9672 13.4
sex female 40301 55.7
male 31977 44.2
NA 112 0.2
degree associate/junior college 4355 6.0
bachelor's 11248 15.5
graduate 5953 8.2
high school 36446 50.3
less than high school 14192 19.6
NA 196 0.3
race black 10215 14.1
other 4411 6.1
white 57657 79.6
NA 107 0.1
# Build the analysis data set in one pass:
#   1. drop rows with a missing value in any of the four variables,
#   2. collapse polviews into Liberal / Moderate / Conservative and
#      capitalise the labels of the remaining variables,
#   3. give the columns more informative names.
gss_cleaned <- gss %>%
  drop_na(polviews, sex, degree, race) %>%
  mutate(
    polviews = recode(
      polviews,
      !!!c(
        "extremely liberal" = "Liberal",
        "liberal" = "Liberal",
        "slightly liberal" = "Liberal",
        "moderate, middle of the road" = "Moderate",
        "slightly conservative" = "Conservative",
        "conservative" = "Conservative",
        "extremely conservative" = "Conservative"
      )
    ),
    sex = recode(sex, !!!c("female" = "Female", "male" = "Male")),
    degree = recode(
      degree,
      !!!c(
        "associate/junior college" = "Associate/Junior College",
        "bachelor's" = "Bachelor's",
        "graduate" = "Graduate",
        "high school" = "High School",
        "less than high school" = "Less than High School"
      )
    ),
    race = recode(
      race,
      !!!c("black" = "Black", "white" = "White", "other" = "Other")
    )
  ) %>%
  rename(
    `Political Views` = polviews,
    Gender = sex,
    Education = degree,
    Race = race
  )

# Counts by sex in the selected (not yet NA-filtered) data
table(gss_filtered$sex)
## 
## female   male 
##  40301  31977

Task 2 - Data Summary

# Summarise the relabeled categorical variables and return the result as a
# flextable object so it can be styled further
categorical_summary_flextable <- gss_cleaned %>%
  dplyr::select(`Political Views`, Gender, Education, Race) %>%
  datasummary_skim(type = "categorical", output = "flextable")
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.
# Customize the table appearance with flextable.
# The table's column keys are " " (one space), "  " (two spaces), "N" and "%",
# so header labels have to be keyed by those names; labels keyed by
# "Variable", "Value" or "Freq" match no column of this table.
categorical_summary_flextable <- categorical_summary_flextable %>%
  set_header_labels(
    values = list(
      " " = "Variable",
      "  " = "Value",
      "N" = "Frequency"
    )
  ) %>%
  theme_box() %>%
  bold(part = "header") %>%
  bg(part = "header", bg = "#4CAF50") %>%
  color(part = "header", color = "white") %>%
  # Drop all borders, then add back only the inner vertical rules
  border_remove() %>%
  border_inner_v(border = fp_border(color = "black", width = 1)) %>%
  autofit()

# Display the object by name rather than through print(): in a knitted
# document print() only emits a text description of the flextable, whereas
# auto-printing renders the table itself.
categorical_summary_flextable
## a flextable object.
## col_keys: ` `, `  `, `N`, `%` 
## header has 1 row(s) 
## body has 13 row(s) 
## original dataset sample: 
##                                    N    %
## 1 Political Views Conservative 21025 33.7
## 2                      Liberal 17531 28.1
## 3                     Moderate 23871 38.2
## 4          Gender       Female 34482 55.2
## 5                         Male 27945 44.8

Task 3 - Visualization of Political Views by Gender

# Start again from the raw data. load() returns only the *names* of the
# objects it restores, so capture those separately and make sure the expected
# object `df` is present before copying it to `gss`.
loaded_objects <- load("gss2022.RData")
stopifnot("df" %in% loaded_objects)
gss <- df

Recoding Sex

# Encode sex numerically by its position in c("female", "male"):
# female = 1, male = 2, and every other value becomes NA
gss <- gss %>%
  mutate(sex = as.numeric(match(sex, c("female", "male"))))

# Keep only respondents with a valid sex code and convert the codes into a
# labelled factor
gss_filtered_1 <- gss %>%
  filter(!is.na(sex)) %>%
  mutate(sex = factor(sex, levels = c(1, 2), labels = c("Female", "Male")))

Cross Tab (Political Views and Gender)

# Create a cross-tabulation of polviews and sex: one row per political view,
# one column per gender. pivot_wider() replaces the superseded spread();
# names_sort = TRUE orders the new columns by the factor levels of sex
# (Female, Male), and combinations with no observations are filled with 0.
cross_tab_full <- gss_filtered_1 %>%
  count(polviews, sex) %>%
  pivot_wider(
    names_from = sex,
    values_from = n,
    values_fill = 0L,
    names_sort = TRUE
  )

# Create and style the table with a footnote
cross_tab_full %>%
  # Set column names and align columns centrally
  kable(col.names = c("Political Views", "Female", "Male"), align = "c") %>%
  # Apply table styling
  kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
  # Add a footnote with the data source
  add_footnote(label = "Data: General Social Survey (1972-2022)")
Political Views Female Male
extremely liberal 1121 955
liberal 4313 3302
slightly liberal 4377 3513
moderate, middle of the road 13980 9974
slightly conservative 4875 4713
conservative 4827 4517
extremely conservative 1103 1055
NA 5705 3948
a Data: General Social Survey (1972-2022)

Bar Chart

# Dodged bar chart of respondent counts per political view, split by gender
gss_filtered_1 %>%
  count(polviews, sex) %>%
  ggplot(aes(x = polviews, y = n, fill = sex)) +
  # geom_col() draws bars whose heights are the precomputed counts in `n`
  geom_col(position = "dodge") +
  scale_fill_brewer(name = "Gender", palette = "Set1") +
  labs(
    x = "Political View",
    y = "Count",
    title = "Political Identification by Gender",
    subtitle = "General Social Survey, 1972-2022"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

Task 5

#Objective: Create a stacked bar chart showing the distribution of fejobaff (preferential hiring) across different age groups.
#Create an age group variable by categorizing age into “18-29”, “30-44”, “45-59”, “60+”. Create a stacked bar chart showing the distribution of the fejobaff response categories for each age group.

gss <- load("gss2022.RData")