packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer")
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
##
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
##
##
##
## Attaching package: 'flextable'
##
##
## The following objects are masked from 'package:kableExtra':
##
## as_image, footnote
##
##
## The following object is masked from 'package:purrr':
##
## compose
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "viridis" "viridisLite" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[5]]
## [1] "kableExtra" "viridis" "viridisLite" "modelsummary" "fst"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "flextable" "kableExtra" "viridis" "viridisLite" "modelsummary"
## [6] "fst" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[7]]
## [1] "officer" "flextable" "kableExtra" "viridis" "viridisLite"
## [6] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
load("/Users/liuyichen/Downloads/gss2018_egp.RData")
gss <- df
gss <- gss %>%
mutate(egp = factor(egp, levels = c(
"I",
"II",
"IIIa",
"IIIb",
"IVa",
"IVb",
"IVc",
"V",
"VI",
"VIIa",
"VIIb"
), labels = c(
"Higher-grade professionals, managers, large proprietors",
"Lower-grade professionals, technicians, non-manual supervisors",
"Higher-grade routine non-manual employees (admin/commerce)",
"Lower-grade routine non-manual employees (sales/service)",
"Small proprietors, artisans with employees",
"Small proprietors, artisans without employees",
"Farmers, small-holders, self-employed in primary production",
"Lower-grade technicians, manual supervisors",
"Skilled manual workers",
"Semi- and unskilled manual workers (not agriculture)",
"Agricultural and primary production workers"
), ordered = TRUE))
table(gss$natcrime)
##
## too little about right too much DK IAP NA
## 23616 9317 2148 0 0 0
unique(gss$natcrime)
## [1] too little <NA> about right too much
## Levels: too little about right too much DK IAP NA
load("/Users/liuyichen/Downloads/gss2022.Rdata")
gss <- df
Objective: Clean and recode the variables to ensure they are ready for analysis.
Recode polviews into three categories: “Liberal”, “Moderate”, and “Conservative”. Clean sex, degree, and race but retain the relevant categories.
gss_cleaned <- gss %>%
filter(!is.na(polviews), !is.na(sex), !is.na(degree), !is.na(race)) %>%
mutate(
polviews = case_when(
polviews %in% c("liberal", "moderate", "conservative") ~ polviews,
TRUE ~ NA_character_
),
race = case_when(
race %in% c("white", "black", "other") ~ race,
TRUE ~ NA_character_
),
sex = case_when(
sex %in% c("male", "female") ~ sex,
TRUE ~ NA_character_
),
degree = case_when(
degree %in% c("less than high school", "high school", "junior college", "bachelor", "graduate") ~ degree,
TRUE ~ NA_character_
)
)
gss_cleaned <- gss_cleaned %>%
rename(
"Political Views" = polviews,
"Respondent Sex" = sex,
"Highest Degree" = degree,
"Respondent Race" = race
)
Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.
Select the variables of interest: polviews, sex, degree, and race.
Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.
gss <- gss %>%
filter(!is.na(sex), !is.na(degree), !is.na(race))
gss_filtered <- gss_cleaned %>%
dplyr::select(`Political Views`, `Respondent Sex`, `Highest Degree`, `Respondent Race`)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary
| N | % | ||
|---|---|---|---|
| Political Views | conservative | 9316 | 14.9 |
| liberal | 7596 | 12.2 | |
| NA | 45515 | 72.9 | |
| Respondent Sex | female | 34482 | 55.2 |
| male | 27945 | 44.8 | |
| Highest Degree | graduate | 5460 | 8.7 |
| high school | 31800 | 50.9 | |
| less than high school | 10980 | 17.6 | |
| NA | 14187 | 22.7 | |
| Respondent Race | black | 8506 | 13.6 |
| other | 3841 | 6.2 | |
| white | 50080 | 80.2 |
categorical_summary_relabelled <- datasummary_skim(
gss_cleaned %>%
dplyr::select(`Political Views`, `Respondent Sex`, `Highest Degree`, `Respondent Race`),
type = "categorical",
output = "kableExtra"
)
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
## produced by the `tinytable` backend.
categorical_summary_relabelled %>%
kableExtra::kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#4CAF50") %>%
kableExtra::column_spec(1, bold = TRUE) %>%
kableExtra::add_header_above(c(" " = 1, "Summary Statistics for Categorical Variables" = 3))
| N | % | ||
|---|---|---|---|
| Political Views | conservative | 9316 | 14.9 |
| liberal | 7596 | 12.2 | |
| NA | 45515 | 72.9 | |
| Respondent Sex | female | 34482 | 55.2 |
| male | 27945 | 44.8 | |
| Highest Degree | graduate | 5460 | 8.7 |
| high school | 31800 | 50.9 | |
| less than high school | 10980 | 17.6 | |
| NA | 14187 | 22.7 | |
| Respondent Race | black | 8506 | 13.6 |
| other | 3841 | 6.2 | |
| white | 50080 | 80.2 |
Objective: Create a bar chart showing the distribution of political views by gender.
Create a bar chart showing the distribution of political views by gender. Use a color palette that clearly differentiates the categories.
ggplot(gss_cleaned, aes(x = `Political Views`, fill = `Respondent Sex`)) +
geom_bar(position = "fill", color = "black", alpha = 0.8) +
scale_y_continuous(labels = scales::percent_format()) +
labs(
title = "Distribution of Political Views by Gender",
x = "Political Views",
y = "Proportion",
fill = "Gender"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom",
legend.title = element_text(size = 10),
legend.text = element_text(size = 8)
) +
guides(fill = guide_legend(nrow = 1, byrow = TRUE, title.position = "top"))
Objective: Visualize trends in religious attendance over time.
Select the year and attend variables from the GSS dataset.
Create a line plot showing the proportion of each category of religious attendance over time.
gss_yearly <- gss %>%
group_by(year, attend) %>%
summarize(count = n(), .groups = 'drop') %>%
group_by(year) %>%
mutate(total = sum(count),
proportion = count / total)
ggplot(gss_yearly, aes(x = year, y = proportion, color = attend, group = attend)) +
geom_line(size = 1.2) +
scale_color_brewer(palette = "Set3") +
labs(title = "Evolution of Religious Attendance Over Time",
x = "Year",
y = "Proportion",
color = "Religious Attendance") +
theme_minimal() +
theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 34 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Task 5: Comparison Trends Objective: Create a stacked bar chart
showing the distribution of fejobaff (preferential hiring) across
different age groups.
Create an age group variable by categorizing age into “18-29”, “30-44”, “45-59”, “60+”. Create a stacked bar chart showing the distribution of the fejobaff response categories for each age group.
gss_age <- gss %>%
mutate(age_group = case_when(
age >= 18 & age <= 29 ~ "18-29",
age >= 30 & age <= 44 ~ "30-44",
age >= 45 & age <= 59 ~ "45-59",
age >= 60 ~ "60+",
TRUE ~ NA_character_
))
gss_fejobaff <- gss_age %>%
group_by(age_group, fejobaff) %>%
summarize(count = n(), .groups = 'drop') %>%
group_by(age_group) %>%
mutate(total = sum(count),
proportion = count / total) %>%
filter(!is.na(fejobaff))
ggplot(gss_fejobaff, aes(x = age_group, y = proportion, fill = fejobaff)) +
geom_bar(stat = "identity") +
scale_fill_brewer(palette = "Set3") +
labs(title = "Distribution of Preferential Hiring Across Age Groups",
x = "Age Group",
y = "Proportion",
fill = "Preferential Hiring") +
theme_minimal() +
theme(legend.position = "bottom")