# Packages required by this analysis.
packages <- c(
  "tidyverse", "fst", "modelsummary", "viridis",
  "kableExtra", "flextable", "officer"
)

# Install whichever of them are not yet present in the local library.
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package. library() returns the vector of attached packages, so
# the lapply() result is wrapped in invisible() to keep it out of the output.
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
##
##
##
## Attaching package: 'flextable'
##
##
## The following objects are masked from 'package:kableExtra':
##
## as_image, footnote
##
##
## The following object is masked from 'package:purrr':
##
## compose
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "viridis" "viridisLite" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[5]]
## [1] "kableExtra" "viridis" "viridisLite" "modelsummary" "fst"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "flextable" "kableExtra" "viridis" "viridisLite" "modelsummary"
## [6] "fst" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[7]]
## [1] "officer" "flextable" "kableExtra" "viridis" "viridisLite"
## [6] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
# load() restores the saved objects into the workspace and returns only their
# names, so its result is not assigned. The file is expected to provide a data
# frame called `df`, which is then referred to as `gss`.
load("gss2022.RData")
gss <- df
Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.
Select the variables of interest: polviews, sex, degree, and race.
Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.
# Keep only the substantive response categories of the four summary variables;
# every other value (including values that are already missing) becomes NA.
gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c(
        "extremely conservative", "conservative", "slightly conservative",
        "moderate, middle of the road",
        "slightly liberal", "liberal", "extremely liberal"
      ) ~ polviews,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    ),
    # NOTE(review): with only "junior college" and "bachelor" listed, the
    # resulting summary showed no rows for those two levels and about 22%
    # missing, which suggests the data labels them differently. The usual GSS
    # spellings "associate/junior college" and "bachelor's" are accepted as
    # well -- confirm against unique(gss$degree).
    degree = case_when(
      degree %in% c(
        "less than high school", "high school",
        "junior college", "associate/junior college",
        "bachelor", "bachelor's",
        "graduate"
      ) ~ degree,
      TRUE ~ NA_character_
    ),
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    )
  )
# Restrict the data to the four variables being summarised.
gss_filtered <- dplyr::select(gss, polviews, sex, degree, race)

# Build the categorical summary table for those columns and print it.
categorical_summary <- gss_filtered %>%
  datasummary_skim(type = "categorical")
categorical_summary
| | | N | % |
|---|---|---|---|
| polviews | conservative | 9361 | 12.9 |
| | extremely conservative | 2165 | 3.0 |
| | extremely liberal | 2081 | 2.9 |
| | liberal | 7623 | 10.5 |
| | moderate, middle of the road | 23992 | 33.1 |
| | slightly conservative | 9596 | 13.3 |
| | slightly liberal | 7900 | 10.9 |
| | NA | 9672 | 13.4 |
| sex | female | 40301 | 55.7 |
| | male | 31977 | 44.2 |
| | NA | 112 | 0.2 |
| degree | graduate | 5953 | 8.2 |
| | high school | 36446 | 50.3 |
| | less than high school | 14192 | 19.6 |
| | NA | 15799 | 21.8 |
| race | black | 10215 | 14.1 |
| | other | 4411 | 6.1 |
| | white | 57657 | 79.6 |
| | NA | 107 | 0.1 |
# Keep respondents with non-missing values on the four summary variables and
# on fefam, libhomo and attend, then give every response category a
# title-case display label.
gss_cleaned <- gss %>%
  filter(
    !is.na(polviews), !is.na(sex), !is.na(degree), !is.na(race),
    !is.na(fefam), !is.na(libhomo), !is.na(attend)
  ) %>%
  mutate(
    # Each recode() call is closed before the next column begins. Previously
    # the polviews call was left open, so the sex/degree/race recodes were
    # swallowed as its arguments and those columns kept their raw labels.
    polviews = recode(
      polviews,
      "extremely conservative" = "Extremely Conservative",
      "conservative" = "Conservative",
      "slightly conservative" = "Slightly Conservative",
      "moderate, middle of the road" = "Moderate, Middle of the Road",
      "slightly liberal" = "Slightly Liberal",
      "liberal" = "Liberal",
      "extremely liberal" = "Extremely Liberal"
    ),
    sex = recode(sex, "male" = "Male", "female" = "Female"),
    # recode() leaves values without a matching entry untouched, so listing
    # alternative spellings of the college levels is harmless if the data
    # does not contain them. NOTE(review): confirm the actual labels.
    degree = recode(
      degree,
      "less than high school" = "Less than High School",
      "high school" = "High School",
      "junior college" = "Junior College",
      "associate/junior college" = "Associate/Junior College",
      "bachelor" = "Bachelor",
      "bachelor's" = "Bachelor's",
      "graduate" = "Graduate"
    ),
    race = recode(race, "white" = "White", "black" = "Black", "other" = "Other")
  )
# Swap the short variable names for the descriptive labels used in the table.
gss_cleaned <- rename(
  gss_cleaned,
  `Think of Self as Liberal or Conservative` = polviews,
  `Respondent Sex` = sex,
  `Highest Degree` = degree,
  `Respondent Race` = race
)
# Summarise the relabelled columns and request a kableExtra object so the
# table can be styled with kableExtra functions afterwards.
categorical_summary_relabelled <- gss_cleaned %>%
  dplyr::select(
    `Think of Self as Liberal or Conservative`,
    `Respondent Sex`,
    `Highest Degree`,
    `Respondent Race`
  ) %>%
  datasummary_skim(type = "categorical", output = "kableExtra")
# Style and print the summary table: compact striped layout, a white-on-green
# bold header row, a bold first column, and a spanning title over the last
# three columns.
categorical_summary_relabelled %>%
  kableExtra::kable_styling(
    # FALSE is spelled out: the shorthand F is an ordinary variable that can
    # be reassigned.
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#4CAF50") %>%
  kableExtra::column_spec(1, bold = TRUE) %>%
  kableExtra::add_header_above(
    c(" " = 1, "Summary Statistics for Categorical Variables" = 3)
  )
| | | N | % |
|---|---|---|---|
| Think of Self as Liberal or Conservative | Conservative | 1742 | 14.7 |
| | Extremely Conservative | 419 | 3.5 |
| | Extremely Liberal | 382 | 3.2 |
| | Liberal | 1463 | 12.3 |
| | Moderate, Middle of the Road | 4767 | 40.1 |
| | Slightly Conservative | 1664 | 14.0 |
| | Slightly Liberal | 1438 | 12.1 |
| Respondent Sex | female | 6648 | 56.0 |
| | male | 5227 | 44.0 |
| Highest Degree | graduate | 1372 | 11.6 |
| | high school | 8015 | 67.5 |
| | less than high school | 2488 | 21.0 |
| Respondent Race | black | 1639 | 13.8 |
| | other | 728 | 6.1 |
| | white | 9508 | 80.1 |
Objective: Create a bar chart showing the distribution of political views by gender.
Create a bar chart showing the distribution of political views by gender. Use a color palette that clearly differentiates the categories.
# Count and percentage of respondents per political-views category (missing
# values are counted as their own row). Not used by the plot below.
polviews_summary <- gss_filtered %>%
  count(polviews) %>%
  mutate(pct = n / sum(n) * 100)

# Dodged bar chart of political views by gender. For the plot only, rows with
# a missing view or gender are dropped so NA is not drawn as a category, and
# the views are ordered from most liberal to most conservative instead of
# alphabetically.
gss_filtered %>%
  filter(!is.na(polviews), !is.na(sex)) %>%
  mutate(
    polviews = factor(
      polviews,
      levels = c(
        "extremely liberal", "liberal", "slightly liberal",
        "moderate, middle of the road",
        "slightly conservative", "conservative", "extremely conservative"
      )
    )
  ) %>%
  ggplot(aes(x = polviews, fill = sex)) +
  geom_bar(position = "dodge", color = "black") +
  labs(
    title = "Distribution of Political Views by Gender",
    x = "Political Views",
    y = "Count"
  ) +
  scale_fill_brewer(palette = "Set1", name = "Gender") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Objective: Visualize trends in religious attendance over time.
Select the year and attend variables from the GSS dataset.
Create a line plot showing the proportion of each category of religious attendance over time.
# Per survey year: number of respondents in each attendance category and that
# category's share of the year's total. The total includes rows whose attend
# value is missing. The result is ungrouped so later steps do not silently
# operate per year.
gss_yearly <- gss %>%
  group_by(year, attend) %>%
  summarize(count = n(), .groups = "drop") %>%
  group_by(year) %>%
  mutate(
    total = sum(count),
    proportion = count / total
  ) %>%
  ungroup()

# One line per attendance category over time. Rows with a missing attend
# value are excluded from the plot explicitly, and `linewidth` replaces the
# `size` argument that ggplot2 deprecated for lines in 3.4.0.
gss_yearly %>%
  filter(!is.na(attend)) %>%
  ggplot(aes(x = year, y = proportion, color = attend, group = attend)) +
  geom_line(linewidth = 1.2) +
  scale_color_brewer(palette = "Set3") +
  labs(
    title = "Evolution of Religious Attendance Over Time",
    x = "Year",
    y = "Proportion",
    color = "Religious Attendance"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 34 rows containing missing values (`geom_line()`).
Objective: Create a stacked bar chart showing the distribution of fejobaff (preferential hiring) across different age groups.
Create an age group variable by categorizing age into “18-29”, “30-44”, “45-59”, “60+”. Create a stacked bar chart showing the distribution of the fejobaff response categories for each age group.
# Derive the age bands and keep only the four substantive fejobaff answers.
# Ages under 18 or missing fall through to NA, as does any other fejobaff
# value.
gss <- gss %>%
  mutate(
    age_group = case_when(
      age >= 18 & age <= 29 ~ "18-29",
      age >= 30 & age <= 44 ~ "30-44",
      age >= 45 & age <= 59 ~ "45-59",
      age >= 60 ~ "60+",
      TRUE ~ NA_character_
    ),
    fejobaff = case_when(
      fejobaff %in% c(
        "strongly favor", "not strongly favor",
        "not strongly oppose", "strongly oppose"
      ) ~ fejobaff,
      TRUE ~ NA_character_
    )
  )

# From here on `gss` holds only the rows with both an age group and a valid
# fejobaff answer.
gss <- gss %>%
  filter(!(is.na(fejobaff) | is.na(age_group)))
# Number of respondents per age group and fejobaff answer, plus each answer's
# share within its age group. The grouping is removed at the end so the
# summary table is not left grouped by age_group.
fejobaff_summary <- gss %>%
  count(age_group, fejobaff) %>%
  group_by(age_group) %>%
  mutate(
    total = sum(n),
    proportion = n / total
  ) %>%
  ungroup()
# Stacked bars scaled to 100% per age group, one segment per fejobaff answer.
# geom_col() draws the precomputed proportions as given.
ggplot(
  fejobaff_summary,
  aes(x = age_group, y = proportion, fill = fejobaff)
) +
  geom_col(position = "fill", color = "black") +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_brewer(palette = "Set2", name = "Preferential Hiring") +
  labs(
    title = "Distribution of Preferential Hiring Views by Age Group",
    x = "Age Group",
    y = "Proportion"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(size = 18, hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )