Problem Set 2.

# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"

gss <- load("gss2018_egp.RData")
gss <- df

gss <- load("gss2022.RData")
gss <- df

Task 1: Data Cleaning and Recoding

Objective: Clean and recode the variables to ensure they are ready for analysis.

Recode polviews into three categories: “Liberal”, “Moderate”, and “Conservative”. Clean sex, degree, and race but retain the relevant categories.

# Recode and clean variables
gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c("liberal", "moderate", "conservative") ~ polviews, 
      TRUE ~ NA_character_
    ),
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    ),
    degree = case_when(
      degree %in% c("less than high school", "high school", "junior college", "bachelor", "graduate") ~ degree,
      TRUE ~ NA_character_
    )
  )

Task 2: Data Summary

Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.

Select the variables of interest: polviews, sex, degree, and race.

Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.

# Filter to variables of interest
gss_filtered <- gss %>%
  dplyr::select(polviews, race, sex, degree)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary

tinytable_ip1cstvl99bue1gajhse

		N	%
polviews	conservative	9361	12.9
	liberal	7623	10.5
	NA	55406	76.5
race	black	10215	14.1
	other	4411	6.1
	white	57657	79.6
	NA	107	0.1
sex	female	40301	55.7
	male	31977	44.2
	NA	112	0.2
degree	graduate	5953	8.2
	high school	36446	50.3
	less than high school	14192	19.6
	NA	15799	21.8

gss_cleaned <- gss %>%
  filter(!is.na(polviews), !is.na(race), !is.na(sex), 
          !is.na(degree)) %>%
  mutate(
   polviews = recode(polviews, "liberal" = "Liberal", "moderate" = "Moderate", "conservative" = "Conservative"),
    race = recode(race, "white" = "White", "black" = "Black", "other" = "Other"),
    sex = recode(sex, "male" = "Male", "female" = "Female"),
    degree = recode(degree, "less than high school" = "Less than High School", "high school" = "High School", "junior college" = "Junior College", "bachelor" = "Bachelor", "graduate" = "Graduate"),
  )

gss_cleaned <- gss_cleaned %>%
  rename(
    "Political Views" = polviews,
    "Respondent Race" = race,
    "Respondent Sex" = sex,
    "Highest Degree" = degree
  )

table(gss$polviews)

## 
## conservative      liberal 
##         9361         7623

unique(gss$polviews)

## [1] NA             "conservative" "liberal"

table(gss$sex)

## 
## female   male 
##  40301  31977

unique(gss$sex)

## [1] "female" "male"   NA

table(gss$degree)

## 
##              graduate           high school less than high school 
##                  5953                 36446                 14192

unique(gss$degree)

## [1] NA                      "less than high school" "high school"          
## [4] "graduate"

table(gss$race)

## 
## black other white 
## 10215  4411 57657

unique(gss$race)

## [1] "white" "black" "other" NA

# Create summary for relabeled categorical variables
categorical_summary_relabelled <- datasummary_skim(
  gss_cleaned %>%
    dplyr::select(`Political Views`, `Respondent Race`, `Respondent Sex`, `Highest Degree`,), # Select categorical variables
  type = "categorical", # Specify the type of variables to summarize
  output = "kableExtra" # Specify the output format
)

## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.

# Customize the table appearance
categorical_summary_relabelled %>%
  kableExtra::kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% # Apply table styling options
  kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#4CAF50") %>% # Customize the header row
  kableExtra::column_spec(1, bold = TRUE) %>% # Make the first column bold
  kableExtra::add_header_above(c(" " = 1, "Summary Statistics for Categorical Variables" = 3)) # Add a header above the table

	Summary Statistics for Categorical Variables
		N	%
Political Views	Conservative	6945	55.2
	Liberal	5627	44.8
Respondent Race	Black	1717	13.7
	Other	772	6.1
	White	10083	80.2
Respondent Sex	Female	6813	54.2
	Male	5759	45.8
Highest Degree	Graduate	1943	15.5
	High School	7940	63.2
	Less than High School	2689	21.4

# Create summary for relabeled categorical variables with flextable
categorical_summary_flextable <- datasummary_skim(
  gss_cleaned %>%
    dplyr::select(`Political Views`, `Respondent Race`, `Respondent Sex`, `Highest Degree`),
  type = "categorical",
  output = "flextable"
)

## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.

# Customize the table appearance with flextable
categorical_summary_flextable <- categorical_summary_flextable %>%
  set_header_labels(Variable = "Variable", Value = "Value", Freq = "Frequency") %>%
  theme_box() %>%
  bold(part = "header") %>%
  bg(part = "header", bg = "#4CAF50") %>%
  color(part = "header", color = "white") %>%
  border_remove() %>%
  border_inner_v(border = fp_border(color = "black", width = 1)) %>%
  autofit()

print(categorical_summary_flextable)

## a flextable object.
## col_keys: ` `, `  `, `N`, `%` 
## header has 1 row(s) 
## body has 10 row(s) 
## original dataset sample: 
##                                    N    %
## 1 Political Views Conservative  6945 55.2
## 2                      Liberal  5627 44.8
## 3 Respondent Race        Black  1717 13.7
## 4                        Other   772  6.1
## 5                        White 10083 80.2

Task 3: Visualization of Political Views by Gender

Objective: Create a bar chart showing the distribution of political views by gender.

Create a bar chart showing the distribution of political views by gender. Use a color palette that clearly differentiates the categories.

# Summary
gss_filtered <- gss %>%
  filter(polviews %in% c("liberal", "moderate", "conservative"))

gss_sex <- gss_filtered %>%
  group_by (sex, polviews) %>%
  summarize(count = n(), .groups = 'drop') %>%
  group_by(sex) %>%
  mutate(total = sum(count),
         proportion = count / total)


gss_filtered <- gss %>%
  filter(polviews %in% c("liberal", "moderate", "conservative"))
gss_sex <- gss_filtered %>%
  group_by(sex, polviews) %>%
  summarize(count = n(), .groups = 'drop') %>%
  group_by(sex) %>%
  mutate(total = sum(count),
         proportion = count / total)
# Plot political views based on gender
ggplot(gss_sex, aes(x = sex, y = proportion, color = polviews, group = polviews)) +
  geom_line(size = 1.2) +
  scale_color_brewer(palette = "Dark2") +
  labs(title = "Distribution of Political Views by Gender",
       x = "Sex",
       y = "Proportion",
       color = "Political Views") +
  theme_minimal() +
  theme(legend.position = "bottom")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Task 4: Trends Over Time

Objective: Visualize trends in religious attendance over time.

Select the year and attend variables from the GSS dataset.

Create a line plot showing the proportion of each category of religious attendance over time.

# Prepare the data
gss_yearly <- gss %>%
  group_by(attend, relig) %>% # Group by religious attendance
  summarize(count = n(), .groups = 'drop') %>% # Calculate the count for each group
  group_by(attend) %>% #Group by attendance
  mutate(total = sum(count), # Calculate the total count overtime
         proportion = count / total) # Calculate the proportion of religion attendance per year

# Create a line plot to visualize the evolution of religious attendance over time
ggplot(gss_yearly, aes(x = attend, y = proportion, color = relig, group = relig)) +
  geom_line(size = 1.2) + # Create lines for each religious attendance with increased line size
  scale_color_brewer(palette = "Set3") + # Use a color palette for better differentiation
  labs(title = "Evolution of Religious Attendance Over Time", # Add plot title
       x = "Attendance", # Label x-axis
       y = "Proportion", # Label y-axis
       color = "Religions") + # Label the legend
    theme_minimal() + # Apply a minimal theme to the plot
  theme(legend.position = "bottom") # Position the legend at the bottom

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set3 is 12
## Returning the palette you asked for with that many colors

## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_line()`).

Task 5: Comparison Trends

Objective: Create a stacked bar chart showing the distribution of fejobaff (preferential hiring) across different age groups.

Create an age group variable by categorizing age into “18-29”, “30-44”, “45-59”, “60+”. Create a stacked bar chart showing the distribution of the fejobaff response categories for each age group.

gss_cleaned <- gss %>%
  filter(!is.na(age), !is.na(fejobaff)) %>%
  mutate(
    age = recode(age, "18 - 29" = "18-29", "30 - 44" = "30-44", "45 - 59" = "45-59", "60 + " = "60+"), 
  )

## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `age = recode(...)`.
## Caused by warning in `recode.numeric()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

gss_cleaned <- gss_cleaned %>%
  rename(
    "Preferential Hiring" = fejobaff,
    "Age Groups" = age
  )

# Recode the age variable
gss_filtered <- gss_filtered %>%
  mutate
    age = case_when
    age %in% c("18-29") ~ "Age Group 1"

## age %in% c("18-29") ~ "Age Group 1"

    age %in% c("30-44") ~ "Age Group 2"

## age %in% c("30-44") ~ "Age Group 2"

    age %in% c("45-59") ~ "Age Group 3"

## age %in% c("45-59") ~ "Age Group 3"

    age %in% c("60+") ~ "Age Group 4"

## age %in% c("60+") ~ "Age Group 4"

      TRUE ~ NA_character_

## TRUE ~ NA_character_

gss_filtered %>% count(age, fejobaff) %>% group_by(age_recoded) %>% mutate(proportion = n / sum(n)) %>% ggplot(aes(x = age_recoded, y = proportion, fill = fejobaff)) + geom_bar(stat = “identity”, position = “fill”) + scale_fill_brewer(palette = “Set2”, name = “Hiring Preferences”) + labs(title = “Hiring Preferences Based on Age”, x = ““, y =”Proportion”) + theme_minimal() + theme(legend.position = “bottom”)

P.S. For the last question, I tried very hard to solve it but couldn’t. I couldn’t figure it out completely, so I thought it’d be best to show as much work as I could. The second part would not run and prevent me from being able to save my work, so I had to insert it as normal text and not an R code.