Problem Set 2

# Echo the source code of every chunk in the knitted document
knitr::opts_chunk$set(echo = TRUE)

# Packages this problem set depends on (add any you need here)
packages <- c(
  "tidyverse", "fst", "modelsummary", "viridis",
  "kableExtra", "flextable", "officer"
)

# Install only the packages that are not installed already
new_packages <- setdiff(packages, installed.packages()[, "Package"])
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach the packages. library() returns the vector of attached packages, so
# the lapply() result is wrapped in invisible() to keep that list of vectors
# from being printed into the document.
invisible(lapply(packages, library, character.only = TRUE))

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"

Data

# Confirm that the GSS data file is present in the current working directory
# before it is loaded below (prints TRUE or FALSE).
file.exists("gss2022.RData")

## [1] TRUE

# load() restores the saved objects into the environment and returns only
# their names, so its result is not assigned. The data frame stored in the
# file is called `df`; copy it to `gss` for the rest of the analysis.
load("gss2022.RData")
gss <- df

# Frequency of each political-views category (unused factor levels show as 0)
table(gss$polviews)

## 
##             extremely liberal                       liberal 
##                          2081                          7623 
##              slightly liberal  moderate, middle of the road 
##                          7900                         23992 
##         slightly conservative                  conservative 
##                          9596                          9361 
##        extremely conservative                    don't know 
##                          2165                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0

# List the distinct values that actually occur in polviews, including NA
unique(gss$polviews)

## [1] <NA>                         moderate, middle of the road
## [3] slightly conservative        conservative                
## [5] liberal                      extremely conservative      
## [7] slightly liberal             extremely liberal           
## 20 Levels: extremely liberal liberal ... see codebook

Task 1 - Data Cleaning and Recoding

# Substantive response categories to keep for each variable of interest;
# any other value in these columns is replaced with NA.
valid_responses <- list(
  polviews = c(
    "extremely liberal", "liberal", "slightly liberal",
    "moderate, middle of the road", "slightly conservative",
    "conservative", "extremely conservative"
  ),
  sex = c("male", "female"),
  degree = c(
    "less than high school", "high school",
    "associate/junior college", "bachelor's", "graduate"
  ),
  race = c("white", "black", "other")
)

# Recode and clean variables: apply the same keep-or-NA rule to each column,
# looking up the allowed categories by the name of the column being processed.
gss <- gss %>%
  mutate(
    across(
      all_of(names(valid_responses)),
      function(column) {
        case_when(
          column %in% valid_responses[[cur_column()]] ~ column,
          .default = NA_character_
        )
      }
    )
  )


# Keep only the four variables of interest
gss_filtered <- dplyr::select(gss, polviews, sex, degree, race)

# Frequency / percentage table for each categorical variable
categorical_summary <- gss_filtered %>%
  datasummary_skim(type = "categorical")
categorical_summary

tinytable_9h4br54s3o1k4lplw8r3

		N	%
polviews	conservative	9361	12.9
	extremely conservative	2165	3.0
	extremely liberal	2081	2.9
	liberal	7623	10.5
	moderate, middle of the road	23992	33.1
	slightly conservative	9596	13.3
	slightly liberal	7900	10.9
	NA	9672	13.4
sex	female	40301	55.7
	male	31977	44.2
	NA	112	0.2
degree	associate/junior college	4355	6.0
	bachelor's	11248	15.5
	graduate	5953	8.2
	high school	36446	50.3
	less than high school	14192	19.6
	NA	196	0.3
race	black	10215	14.1
	other	4411	6.1
	white	57657	79.6
	NA	107	0.1

# Build the analysis data set: drop rows with a missing value on any of the
# four variables, relabel the categories, and give the columns readable names.
gss_cleaned <- gss %>%
  drop_na(polviews, sex, degree, race) %>%
  mutate(
    # Collapse the seven-point scale into Liberal / Moderate / Conservative
    polviews = recode(
      polviews,
      "extremely liberal" = "Liberal",
      "liberal" = "Liberal",
      "slightly liberal" = "Liberal",
      "moderate, middle of the road" = "Moderate",
      "slightly conservative" = "Conservative",
      "conservative" = "Conservative",
      "extremely conservative" = "Conservative"
    ),
    sex = recode(
      sex,
      "female" = "Female",
      "male" = "Male"
    ),
    degree = recode(
      degree,
      "associate/junior college" = "Associate/Junior College",
      "bachelor's" = "Bachelor's",
      "graduate" = "Graduate",
      "high school" = "High School",
      "less than high school" = "Less than High School"
    ),
    race = recode(
      race,
      "black" = "Black",
      "white" = "White",
      "other" = "Other"
    )
  ) %>%
  # Rename the variables to be more informative
  rename(
    `Political Views` = polviews,
    Gender = sex,
    Education = degree,
    Race = race
  )

# Sex counts in the data before incomplete rows were dropped
table(gss_filtered$sex)

## 
## female   male 
##  40301  31977

Task 2 - Data Summary

# Summarize the relabeled categorical variables and return the result as a
# flextable object so it can be styled further.
categorical_summary_flextable <- gss_cleaned %>%
  dplyr::select(`Political Views`, `Gender`, `Education`, `Race`) %>%
  datasummary_skim(type = "categorical", output = "flextable")

## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.

# Customize the table appearance with flextable
categorical_summary_flextable <- categorical_summary_flextable %>%
  # The column keys of this table are " ", "  ", "N" and "%"; labels for keys
  # that do not exist have no effect, so relabel the existing "N" column.
  set_header_labels(N = "Frequency") %>%
  theme_box() %>%
  bold(part = "header") %>%
  bg(part = "header", bg = "#4CAF50") %>%
  color(part = "header", color = "white") %>%
  # Drop the box borders and keep only thin vertical separators
  border_remove() %>%
  border_inner_v(border = fp_border(color = "black", width = 1)) %>%
  autofit()

# Auto-print the object so knitr renders the formatted table; calling print()
# in a non-interactive session only writes a text description of the object.
categorical_summary_flextable

## a flextable object.
## col_keys: ` `, `  `, `N`, `%` 
## header has 1 row(s) 
## body has 13 row(s) 
## original dataset sample: 
##                                    N    %
## 1 Political Views Conservative 21025 33.7
## 2                      Liberal 17531 28.1
## 3                     Moderate 23871 38.2
## 4          Gender       Female 34482 55.2
## 5                         Male 27945 44.8

Task 3 - Visualization of Political Views by Gender

# Reload the original data for this task. load() returns only the names of
# the restored objects, so it is called for its side effect and the restored
# data frame `df` is then copied to `gss`.
load("gss2022.RData")
gss <- df

Recoding Sex

# Encode sex numerically (1 = female, 2 = male); every other value becomes NA
gss <- mutate(
  gss,
  sex = case_when(
    sex == "female" ~ 1,
    sex == "male" ~ 2,
    .default = NA_real_
  )
)

# Keep respondents with a valid sex code and turn the code into a labeled
# factor for tables and plots.
gss_filtered_1 <- gss %>%
  filter(!is.na(sex)) %>%
  mutate(sex = factor(sex, levels = c(1, 2), labels = c("Female", "Male")))

Cross Tab (Political Views and Gender)

# Create cross-tabulation of polviews and sex: one row per political view,
# one count column per gender (0 where a combination does not occur).
cross_tab_full <- gss_filtered_1 %>%
  count(polviews, sex) %>%
  pivot_wider(names_from = sex, values_from = n, values_fill = 0)

# Create and style the table with a footnote
cross_tab_full %>%
  # Set column names and center-align the columns
  kable(col.names = c("Political Views", "Female", "Male"), align = "c") %>%
  # Apply striped styling without stretching the table to full width
  kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
  # Add a footnote with the data source
  add_footnote(label = "Data: General Social Survey (1972-2022)")

Political Views	Female	Male
extremely liberal	1121	955
liberal	4313	3302
slightly liberal	4377	3513
moderate, middle of the road	13980	9974
slightly conservative	4875	4713
conservative	4827	4517
extremely conservative	1103	1055
NA	5705	3948
^a Data: General Social Survey (1972-2022)

Bar Chart

# Dodged bar chart of respondent counts for each political view, split by
# gender. The counts are computed first, so bar heights are taken directly
# from the `n` column.
ggplot(
  count(gss_filtered_1, polviews, sex),
  aes(x = polviews, y = n, fill = sex)
) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "Set1", name = "Gender") +
  labs(
    title = "Political Identification by Gender",
    subtitle = "General Social Survey, 1972-2022",
    x = "Political View",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

Task 4 - Trends Over Time

# Reload the original data for this task. load() returns only the names of
# the restored objects, so it is called for its side effect and the restored
# data frame `df` is then copied to `gss`.
load("gss2022.RData")
gss <- df

# Objective: Visualize trends in religious attendance over time.
# Select the year and attend variables from the GSS dataset.
# Create a line plot showing the proportion of each category of religious
# attendance over time.

# Frequency of each attendance category (unused factor levels show as 0)
table(gss$attend)

## 
##                         never         less than once a year 
##                         13855                          5825 
##    about once or twice a year          several times a year 
##                          9415                          8752 
##            about once a month             2-3 times a month 
##                          4831                          6114 
##             nearly every week                    every week 
##                          4029                         13659 
##          several times a week                    don't know 
##                          5210                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0

# Prepare the data: count respondents per year and attendance category, then
# express each count as a share of all respondents in that year. Rows with a
# missing attendance value are kept here, so they count toward the yearly total.
gss_yearly <- gss %>%
  count(year, attend, name = "count") %>%
  group_by(year) %>%
  mutate(
    total = sum(count), # Total respondents per year
    proportion = count / total # Share of each attendance category per year
  ) %>%
  ungroup() # Do not leave the result grouped for later steps

# Create a line plot to visualize the evolution of religious attendance over
# time. Rows with a missing attendance category cannot be drawn as a line, so
# they are dropped explicitly instead of relying on ggplot2 to discard them
# with a warning.
gss_yearly %>%
  filter(!is.na(attend)) %>%
  ggplot(aes(x = year, y = proportion, color = attend, group = attend)) +
  # `linewidth` replaces the `size` aesthetic for lines (ggplot2 >= 3.4.0)
  geom_line(linewidth = 1.2) +
  scale_color_brewer(palette = "Set3") + # Distinct color per category
  labs(
    title = "Evolution of Religious Attendance Over Time", # Plot title
    x = "Year", # Label x-axis
    y = "Attendance", # Label y-axis
    color = "Frequency" # Label the legend
  ) +
  theme_minimal() + # Apply a minimal theme to the plot
  theme(legend.position = "bottom") # Position the legend at the bottom

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 34 rows containing missing values or values outside the scale range
## (`geom_line()`).

Task 5

# Objective: Create a stacked bar chart showing the distribution of fejobaff
# (preferential hiring) across different age groups.
# Create an age group variable by categorizing age into "18-29", "30-44",
# "45-59", "60+". Create a stacked bar chart showing the distribution of the
# fejobaff response categories for each age group.

# Reload the original data for this task. load() returns only the names of
# the restored objects; assigning its result would leave `gss` holding the
# string "df" rather than the data, so the restored data frame is copied
# explicitly.
load("gss2022.RData")
gss <- df

Problem Set 2

Benny Konishi

2024-07-14

Task 1 - Data Cleaning and Recoding

Task 2 - Data Summary

Task 3 - Visualization of Political Views by Gender

Task 4 - Trends Over Time

Task 5