Problem Set 2

# Packages used throughout this problem set.
packages <- c(
  "tidyverse", "fst", "modelsummary", "viridis",
  "kableExtra", "flextable", "officer"
)

# Install whichever of them are not yet present in the local library.
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package. lapply() returns one search-path vector per package;
# invisible() keeps that list from being printed into the rendered document.
invisible(lapply(packages, library, character.only = TRUE))

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"

# load() restores the saved objects into the current environment under their
# original names and returns only a character vector of those names, so its
# return value is kept separately rather than assigned to `gss`.
loaded_objects <- load("gss2022.RData")

# Guard: without this check a missing `df` would silently resolve to the
# stats::df() function (the F-distribution density) instead of the survey data.
stopifnot("df" %in% loaded_objects)

gss <- df

Task 2: Data Summary

Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.

Select the variables of interest: polviews, sex, degree, and race.

Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.

# Keep only the substantive response categories of the four summary variables;
# every other value is set to NA.
gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c(
        "extremely conservative", "conservative", "slightly conservative",
        "moderate, middle of the road",
        "slightly liberal", "liberal", "extremely liberal"
      ) ~ polviews,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    ),
    # With only "junior college" and "bachelor" listed, no rows matched those
    # two categories and both groups were turned into NA. The alternative
    # spellings below are accepted as well.
    # NOTE(review): "associate/junior college" and "bachelor's" are the assumed
    # spellings in the data -- confirm with unique(df$degree).
    degree = case_when(
      degree %in% c(
        "less than high school", "high school",
        "junior college", "associate/junior college",
        "bachelor", "bachelor's",
        "graduate"
      ) ~ degree,
      TRUE ~ NA_character_
    ),
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    )
  )

# Restrict the data to the four variables of interest.
gss_filtered <- dplyr::select(gss, polviews, sex, degree, race)

# Frequency table (N and %) for every category of each selected variable.
categorical_summary <- datasummary_skim(
  gss_filtered,
  type = "categorical"
)

# Display the table.
categorical_summary

		N	%
polviews	conservative	9361	12.9
	extremely conservative	2165	3.0
	extremely liberal	2081	2.9
	liberal	7623	10.5
	moderate, middle of the road	23992	33.1
	slightly conservative	9596	13.3
	slightly liberal	7900	10.9
	NA	9672	13.4
sex	female	40301	55.7
	male	31977	44.2
	NA	112	0.2
degree	graduate	5953	8.2
	high school	36446	50.3
	less than high school	14192	19.6
	NA	15799	21.8
race	black	10215	14.1
	other	4411	6.1
	white	57657	79.6
	NA	107	0.1

# Complete cases only, with title-cased category labels for presentation.
# NOTE(review): fefam, libhomo and attend are not shown in the summary table,
# yet rows missing any of them are dropped here too -- confirm this is intended.
gss_cleaned <- gss %>%
  filter(
    !is.na(polviews), !is.na(sex), !is.na(degree), !is.na(race),
    !is.na(fefam), !is.na(libhomo), !is.na(attend)
  ) %>%
  mutate(
    # Each recode() call is closed before the next column starts. Previously
    # the polviews call was left open, so the sex/degree/race recodes were
    # passed to it as arguments and those columns were never relabelled.
    polviews = recode(
      polviews,
      "extremely conservative" = "Extremely Conservative",
      "conservative" = "Conservative",
      "slightly conservative" = "Slightly Conservative",
      "moderate, middle of the road" = "Moderate, Middle of the Road",
      "slightly liberal" = "Slightly Liberal",
      "liberal" = "Liberal",
      "extremely liberal" = "Extremely Liberal"
    ),
    sex = recode(sex, "male" = "Male", "female" = "Female"),
    # recode() leaves values without a mapping unchanged, so listing spellings
    # that do not occur in the data is harmless.
    degree = recode(
      degree,
      "less than high school" = "Less than High School",
      "high school" = "High School",
      "junior college" = "Junior College",
      "associate/junior college" = "Associate/Junior College",
      "bachelor" = "Bachelor",
      "bachelor's" = "Bachelor's",
      "graduate" = "Graduate"
    ),
    race = recode(race, "white" = "White", "black" = "Black", "other" = "Other")
  )

# Swap the short variable names for descriptive labels so that the summary
# table shows readable row headers.
gss_cleaned <- rename(
  gss_cleaned,
  `Think of Self as Liberal or Conservative` = polviews,
  `Respondent Sex` = sex,
  `Highest Degree` = degree,
  `Respondent Race` = race
)

# Categorical summary of the relabelled variables, returned as a kableExtra
# object so that it can be styled below.
categorical_summary_relabelled <- datasummary_skim(
  gss_cleaned %>%
    dplyr::select(
      `Think of Self as Liberal or Conservative`,
      `Respondent Sex`,
      `Highest Degree`,
      `Respondent Race`
    ),
  type = "categorical",
  output = "kableExtra"
)

# Styling: striped compact table, green header row, bold first column, and a
# spanning title above the remaining columns.
categorical_summary_relabelled %>%
  kableExtra::kable_styling(
    full_width = FALSE, # spelled out: `F` is an ordinary, reassignable variable
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  kableExtra::row_spec(
    0,
    bold = TRUE, color = "white", background = "#4CAF50"
  ) %>%
  kableExtra::column_spec(1, bold = TRUE) %>%
  # The header spans (1 + 3) have to add up to the table's column count.
  kableExtra::add_header_above(
    c(" " = 1, "Summary Statistics for Categorical Variables" = 3)
  )

	Summary Statistics for Categorical Variables
		N	%
Think of Self as Liberal or Conservative	Conservative	1742	14.7
	Extremely Conservative	419	3.5
	Extremely Liberal	382	3.2
	Liberal	1463	12.3
	Moderate, Middle of the Road	4767	40.1
	Slightly Conservative	1664	14.0
	Slightly Liberal	1438	12.1
Respondent Sex	female	6648	56.0
	male	5227	44.0
Highest Degree	graduate	1372	11.6
	high school	8015	67.5
	less than high school	2488	21.0
Respondent Race	black	1639	13.8
	other	728	6.1
	white	9508	80.1

Task 3: Visualization of Political Views by Gender

Objective: Create a bar chart showing the distribution of political views by gender.

Create a bar chart showing the distribution of political views by gender. Use a color palette that clearly differentiates the categories.

# Overall share of each political-view category (NA is kept as its own row).
polviews_summary <- gss_filtered %>%
  count(polviews) %>%
  mutate(pct = n / sum(n) * 100)

# Ideological ordering for the x axis; without it the categories are laid out
# alphabetically.
polviews_order <- c(
  "extremely liberal", "liberal", "slightly liberal",
  "moderate, middle of the road",
  "slightly conservative", "conservative", "extremely conservative"
)

# Drop respondents with a missing political view or sex so that no "NA" bars
# or "NA" legend entry appear in the chart.
polviews_by_sex <- gss_filtered %>%
  filter(!is.na(polviews), !is.na(sex)) %>%
  mutate(polviews = factor(polviews, levels = polviews_order))

ggplot(polviews_by_sex, aes(x = polviews, fill = sex)) +
  geom_bar(position = "dodge", color = "black") +
  labs(
    title = "Distribution of Political Views by Gender",
    x = "Political Views",
    y = "Count"
  ) +
  scale_fill_brewer(palette = "Set1", name = "Gender") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Task 4: Trends Over Time

Objective: Visualize trends in religious attendance over time.

Select the year and attend variables from the GSS dataset.

Create a line plot showing the proportion of each category of religious attendance over time.

# Yearly share of each religious-attendance category.
# Rows with a missing `attend` are removed first, so the proportions are taken
# among valid responses and no NA series reaches the plot.
gss_yearly <- gss %>%
  filter(!is.na(attend)) %>%
  group_by(year, attend) %>%
  summarize(count = n(), .groups = "drop") %>%
  group_by(year) %>%
  mutate(
    total = sum(count),
    proportion = count / total
  ) %>%
  ungroup() # do not leave the result grouped by year

ggplot(
  gss_yearly,
  aes(x = year, y = proportion, color = attend, group = attend)
) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(linewidth = 1.2) +
  scale_color_brewer(palette = "Set3") +
  labs(
    title = "Evolution of Religious Attendance Over Time",
    x = "Year",
    y = "Proportion",
    color = "Religious Attendance"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 34 rows containing missing values (`geom_line()`).

Task 5: Comparison Trends

Objective: Create a stacked bar chart showing the distribution of fejobaff (preferential hiring) across different age groups.

Create an age group variable by categorizing age into “18-29”, “30-44”, “45-59”, “60+”. Create a stacked bar chart showing the distribution of the fejobaff response categories for each age group.

# Response categories in substantive order (favor -> oppose); used both for
# cleaning and for ordering the stacked segments.
# NOTE(review): confirm these labels match the spellings in the data, e.g.
# with unique(gss$fejobaff); values that do not match become NA.
fejobaff_levels <- c(
  "strongly favor", "not strongly favor",
  "not strongly oppose", "strongly oppose"
)

# Add the cleaned response and the age bracket as columns. The row filter is
# deliberately NOT applied to `gss` itself, so the full data set remains
# available to any later analysis.
gss <- gss %>%
  mutate(
    fejobaff = case_when(
      fejobaff %in% fejobaff_levels ~ fejobaff,
      TRUE ~ NA_character_
    ),
    age_group = case_when(
      age >= 18 & age <= 29 ~ "18-29",
      age >= 30 & age <= 44 ~ "30-44",
      age >= 45 & age <= 59 ~ "45-59",
      age >= 60 ~ "60+",
      TRUE ~ NA_character_
    )
  )

# Within-age-group share of each response, computed on complete cases only.
fejobaff_summary <- gss %>%
  filter(!is.na(fejobaff), !is.na(age_group)) %>%
  count(age_group, fejobaff) %>%
  group_by(age_group) %>%
  mutate(total = sum(n), proportion = n / total) %>%
  ungroup() %>%
  mutate(fejobaff = factor(fejobaff, levels = fejobaff_levels))

ggplot(
  fejobaff_summary,
  aes(x = age_group, y = proportion, fill = fejobaff)
) +
  # geom_col() is the direct form of geom_bar(stat = "identity").
  geom_col(position = "fill", color = "black") +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_brewer(palette = "Set2", name = "Preferential Hiring") +
  labs(
    title = "Distribution of Preferential Hiring Views by Age Group",
    x = "Age Group",
    y = "Proportion"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 18, hjust = 0.5),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )