# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"
gss <- load("gss2018_egp.RData")
gss <- df
gss <- load("gss2022.RData")
gss <- df

Task 1: Data Cleaning and Recoding

Objective: Clean and recode the variables to ensure they are ready for analysis.

Recode polviews into three categories: “Liberal”, “Moderate”, and “Conservative”. Clean sex, degree, and race but retain the relevant categories.

# Recode and clean variables
gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c("liberal", "moderate", "conservative") ~ polviews, 
      TRUE ~ NA_character_
    ),
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    ),
    degree = case_when(
      degree %in% c("less than high school", "high school", "junior college", "bachelor", "graduate") ~ degree,
      TRUE ~ NA_character_
    )
  )

Task 2: Data Summary

Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.

Select the variables of interest: polviews, sex, degree, and race.

Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.

# Filter to variables of interest
gss_filtered <- gss %>%
  dplyr::select(polviews, race, sex, degree)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary 
tinytable_ip1cstvl99bue1gajhse
N %
polviews conservative 9361 12.9
liberal 7623 10.5
NA 55406 76.5
race black 10215 14.1
other 4411 6.1
white 57657 79.6
NA 107 0.1
sex female 40301 55.7
male 31977 44.2
NA 112 0.2
degree graduate 5953 8.2
high school 36446 50.3
less than high school 14192 19.6
NA 15799 21.8
gss_cleaned <- gss %>%
  filter(!is.na(polviews), !is.na(race), !is.na(sex), 
          !is.na(degree)) %>%
  mutate(
   polviews = recode(polviews, "liberal" = "Liberal", "moderate" = "Moderate", "conservative" = "Conservative"),
    race = recode(race, "white" = "White", "black" = "Black", "other" = "Other"),
    sex = recode(sex, "male" = "Male", "female" = "Female"),
    degree = recode(degree, "less than high school" = "Less than High School", "high school" = "High School", "junior college" = "Junior College", "bachelor" = "Bachelor", "graduate" = "Graduate"),
  )

gss_cleaned <- gss_cleaned %>%
  rename(
    "Political Views" = polviews,
    "Respondent Race" = race,
    "Respondent Sex" = sex,
    "Highest Degree" = degree
  )
table(gss$polviews)
## 
## conservative      liberal 
##         9361         7623
unique(gss$polviews)
## [1] NA             "conservative" "liberal"
table(gss$sex)
## 
## female   male 
##  40301  31977
unique(gss$sex)
## [1] "female" "male"   NA
table(gss$degree)
## 
##              graduate           high school less than high school 
##                  5953                 36446                 14192
unique(gss$degree)
## [1] NA                      "less than high school" "high school"          
## [4] "graduate"
table(gss$race)
## 
## black other white 
## 10215  4411 57657
unique(gss$race)
## [1] "white" "black" "other" NA
# Create summary for relabeled categorical variables
categorical_summary_relabelled <- datasummary_skim(
  gss_cleaned %>%
    dplyr::select(`Political Views`, `Respondent Race`, `Respondent Sex`, `Highest Degree`,), # Select categorical variables
  type = "categorical", # Specify the type of variables to summarize
  output = "kableExtra" # Specify the output format
)
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.
# Customize the table appearance
categorical_summary_relabelled %>%
  kableExtra::kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% # Apply table styling options
  kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#4CAF50") %>% # Customize the header row
  kableExtra::column_spec(1, bold = TRUE) %>% # Make the first column bold
  kableExtra::add_header_above(c(" " = 1, "Summary Statistics for Categorical Variables" = 3)) # Add a header above the table
Summary Statistics for Categorical Variables
N %
Political Views Conservative 6945 55.2
Liberal 5627 44.8
Respondent Race Black 1717 13.7
Other 772 6.1
White 10083 80.2
Respondent Sex Female 6813 54.2
Male 5759 45.8
Highest Degree Graduate 1943 15.5
High School 7940 63.2
Less than High School 2689 21.4
# Create summary for relabeled categorical variables with flextable
categorical_summary_flextable <- datasummary_skim(
  gss_cleaned %>%
    dplyr::select(`Political Views`, `Respondent Race`, `Respondent Sex`, `Highest Degree`),
  type = "categorical",
  output = "flextable"
)
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.
# Customize the table appearance with flextable
categorical_summary_flextable <- categorical_summary_flextable %>%
  set_header_labels(Variable = "Variable", Value = "Value", Freq = "Frequency") %>%
  theme_box() %>%
  bold(part = "header") %>%
  bg(part = "header", bg = "#4CAF50") %>%
  color(part = "header", color = "white") %>%
  border_remove() %>%
  border_inner_v(border = fp_border(color = "black", width = 1)) %>%
  autofit()

print(categorical_summary_flextable) 
## a flextable object.
## col_keys: ` `, `  `, `N`, `%` 
## header has 1 row(s) 
## body has 10 row(s) 
## original dataset sample: 
##                                    N    %
## 1 Political Views Conservative  6945 55.2
## 2                      Liberal  5627 44.8
## 3 Respondent Race        Black  1717 13.7
## 4                        Other   772  6.1
## 5                        White 10083 80.2

Task 3: Visualization of Political Views by Gender

Objective: Create a bar chart showing the distribution of political views by gender.

Create a bar chart showing the distribution of political views by gender. Use a color palette that clearly differentiates the categories.

# Summary
gss_filtered <- gss %>%
  filter(polviews %in% c("liberal", "moderate", "conservative"))
gss_sex <- gss_filtered %>%
  group_by (sex, polviews) %>%
  summarize(count = n(), .groups = 'drop') %>%
  group_by(sex) %>%
  mutate(total = sum(count),
         proportion = count / total)


gss_filtered <- gss %>%
  filter(polviews %in% c("liberal", "moderate", "conservative"))
gss_sex <- gss_filtered %>%
  group_by(sex, polviews) %>%
  summarize(count = n(), .groups = 'drop') %>%
  group_by(sex) %>%
  mutate(total = sum(count),
         proportion = count / total)
# Plot political views based on gender
ggplot(gss_sex, aes(x = sex, y = proportion, color = polviews, group = polviews)) +
  geom_line(size = 1.2) +
  scale_color_brewer(palette = "Dark2") +
  labs(title = "Distribution of Political Views by Gender",
       x = "Sex",
       y = "Proportion",
       color = "Political Views") +
  theme_minimal() +
  theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.