Problem set 2

# Packages used in this problem set (add any you need here)
packages <- c(
  "tidyverse", "fst", "modelsummary", "viridis",
  "kableExtra", "flextable", "officer"
)

# Install whichever of them are not installed yet
new_packages <- setdiff(packages, rownames(installed.packages()))
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Load the packages. library() returns the vector of attached packages, so
# the lapply() result is wrapped in invisible() to keep that list of vectors
# from being printed.
invisible(lapply(packages, library, character.only = TRUE))

## Warning: package 'ggplot2' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'fst' was built under R version 4.3.3

## Warning: package 'modelsummary' was built under R version 4.3.3

## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)

## Warning: package 'viridis' was built under R version 4.3.3

## Loading required package: viridisLite
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

## Warning: package 'flextable' was built under R version 4.3.3

## 
## Attaching package: 'flextable'
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## The following object is masked from 'package:purrr':
## 
##     compose

## Warning: package 'officer' was built under R version 4.3.3

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"

# load() restores the saved objects into the workspace and returns only their
# names, so its result is used to check what was loaded, not as the data.
loaded_objects <- load("gss2022.RData")

# Without this check a file that lacks `df` would silently make `gss` the
# stats::df() function, because that name always exists on the search path.
stopifnot("df" %in% loaded_objects)
gss <- df

Task 1: Data Cleaning and Recoding

Objective: Clean and recode the variables to ensure they are ready for analysis.

Recode polviews into three categories: “Liberal”, “Moderate”, and “Conservative”. Clean sex, degree, and race but retain the relevant categories.

# Recode and clean variables.
# polviews is collapsed into Liberal / Moderate / Conservative; sex, degree
# and race keep their categories, and any value outside the listed labels
# becomes NA.
# GSS extracts usually label the middle category "moderate, middle of the
# road" and the degree levels "associate/junior college" and "bachelor's",
# so both spellings are accepted; matching only the short forms would turn
# those respondents into NA.
# NOTE(review): confirm the exact labels with levels(gss$polviews) and
# levels(gss$degree).
gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c("extremely liberal", "liberal", "slightly liberal") ~ "Liberal",
      polviews %in% c("moderate", "moderate, middle of the road") ~ "Moderate",
      polviews %in% c("extremely conservative", "conservative", "slightly conservative") ~ "Conservative",
      TRUE ~ NA_character_
    ),
    # as.character() keeps the result a plain character column whether the
    # input is a factor or already character
    sex = case_when(
      sex %in% c("male", "female") ~ as.character(sex),
      TRUE ~ NA_character_
    ),
    degree = case_when(
      degree %in% c(
        "less than high school", "high school",
        "junior college", "associate/junior college",
        "bachelor", "bachelor's",
        "graduate"
      ) ~ as.character(degree),
      TRUE ~ NA_character_
    ),
    race = case_when(
      race %in% c("white", "black", "other") ~ as.character(race),
      TRUE ~ NA_character_
    )
  )

Task 2: Data Summary

Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.

Select the variables of interest: polviews, sex, degree, and race.

Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.

# Frequency table (counts and percentages) for the four recoded categorical
# variables, returned as a flextable object
categorical_summary_flextable <- datasummary_skim(
  data = dplyr::select(gss, polviews, sex, degree, race),
  output = "flextable",
  type = "categorical"
)

## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.

# Customize the table appearance with flextable.
# The header labels are keyed by the table's actual column keys (one blank,
# two blanks, "N" and "%"); names that are not column keys have no effect.
categorical_summary_flextable <- categorical_summary_flextable %>%
  set_header_labels(
    values = list(
      " " = "Variable",
      "  " = "Value",
      "N" = "Frequency",
      "%" = "Percent"
    )
  ) %>%
  theme_box() %>%
  bold(part = "header") %>%
  bg(part = "header", bg = "#4CAF50") %>%
  color(part = "header", color = "white") %>%
  border_remove() %>%
  border_inner_v(border = fp_border(color = "black", width = 1)) %>%
  autofit()

# Auto-print the object so knitr renders the table itself; an explicit
# print() only writes a text summary of the flextable object.
categorical_summary_flextable

## a flextable object.
## col_keys: ` `, `  `, `N`, `%` 
## header has 1 row(s) 
## body has 14 row(s) 
## original dataset sample: 
##                             N    %
## 1 polviews Conservative 21122 29.2
## 2               Liberal 17604 24.3
## 3                    NA 33664 46.5
## 4      sex       female 40301 55.7
## 5                  male 31977 44.2

Task 3: Visualization of Political Views by Gender

Objective: Create a bar chart showing the distribution of political views by gender.

Create a bar chart showing the distribution of political views by gender. Use a color palette that clearly differentiates the categories.

# Keep respondents with both a political view and a sex recorded, and put
# the political views in ideological order; a plain character column would
# be laid out alphabetically on the x-axis.
gss_filtered <- gss %>%
  filter(!is.na(polviews), !is.na(sex)) %>%
  mutate(
    polviews = factor(polviews, levels = c("Liberal", "Moderate", "Conservative"))
  )

# Create a bar chart showing the distribution of political views by gender
ggplot(gss_filtered, aes(x = polviews, fill = sex)) +
  geom_bar(position = "dodge") + # Side-by-side bars, one per gender
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Distribution of Political Views by Gender",
       x = "Political Views",
       y = "Count",
       fill = "Gender") +
  theme_minimal() +
  theme(legend.position = "bottom")

Task 4: Trends Over Time

Objective: Visualize trends in religious attendance over time.

Select the year and attend variables from the GSS data.

Create a line plot showing the proportion of each category of religious attendance over time.

# Select the year and attend variables, and filter out NA values
gss_filtered <- gss %>%
  dplyr::select(year, attend) %>%
  filter(!is.na(year), !is.na(attend))

# Count respondents per year and attendance category, then express each
# count as a share of that year's total. The result is ungrouped at the end
# so later steps do not inherit the per-year grouping.
gss_yearly <- gss_filtered %>%
  count(year, attend, name = "count") %>%
  group_by(year) %>%
  mutate(total = sum(count),
         proportion = count / total) %>%
  ungroup()

# Create a line plot to visualize the trends in religious attendance over time
ggplot(gss_yearly, aes(x = year, y = proportion, color = attend, group = attend)) +
  # One line per attendance category; `linewidth` replaces the `size`
  # aesthetic, which is deprecated for lines since ggplot2 3.4.0
  geom_line(linewidth = 1.2) +
  scale_color_brewer(palette = "Set3") + # Use a color palette for better differentiation
  labs(title = "Trends in Religious Attendance Over Time", # Add plot title
       x = "Year", # Label x-axis
       y = "Proportion", # Label y-axis
       color = "Religious Attendance") + # Label the legend
  theme_minimal() + # Apply a minimal theme to the plot
  theme(legend.position = "bottom") # Position the legend at the bottom

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Task 5: Comparison Trends

Objective: Create a stacked bar chart showing the distribution of fejobaff (preferential hiring) across different age groups.

Create an age group variable by categorizing age into four groups: “18-29”, “30-44”, “45-59”, and “60+”. Then create a stacked bar chart showing the distribution of the fejobaff response categories for each age group.

# Bucket respondents into four age bands; any age that falls outside the
# bands (including a missing age) gets NA
gss <- mutate(
  gss,
  age_group = case_when(
    age >= 60 ~ "60+",
    age >= 45 & age <= 59 ~ "45-59",
    age >= 30 & age <= 44 ~ "30-44",
    age >= 18 & age <= 29 ~ "18-29",
    .default = NA_character_
  )
)

# Keep only respondents with both a fejobaff answer and an age group
gss_filtered <- drop_na(gss, fejobaff, age_group)

# Stacked bars: one bar per age group, segmented by fejobaff response
ggplot(data = gss_filtered, mapping = aes(x = age_group, fill = fejobaff)) +
  geom_bar(position = "stack") +
  scale_fill_brewer(palette = "Set2") + # Qualitative palette for the responses
  labs(
    title = "Distribution of Preferential Hiring Responses by Age Group",
    x = "Age Group",
    y = "Count",
    fill = "Preferential Hiring Response"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Slant the x-axis labels

## End