# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
##
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
##
##
##
## Attaching package: 'flextable'
##
##
## The following objects are masked from 'package:kableExtra':
##
## as_image, footnote
##
##
## The following object is masked from 'package:purrr':
##
## compose
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "viridis" "viridisLite" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[5]]
## [1] "kableExtra" "viridis" "viridisLite" "modelsummary" "fst"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "flextable" "kableExtra" "viridis" "viridisLite" "modelsummary"
## [6] "fst" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[7]]
## [1] "officer" "flextable" "kableExtra" "viridis" "viridisLite"
## [6] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
gss <- load("gss2018_egp.RData")
gss <- df
gss <- load("gss2022.RData")
gss <- df
Objective: Clean and recode the variables to ensure they are ready for analysis.
Recode polviews into three categories: “Liberal”, “Moderate”, and “Conservative”. Clean sex, degree, and race but retain the relevant categories.
# Recode and clean variables
gss <- gss %>%
mutate(
polviews = case_when(
polviews %in% c("liberal", "moderate", "conservative") ~ polviews,
TRUE ~ NA_character_
),
race = case_when(
race %in% c("white", "black", "other") ~ race,
TRUE ~ NA_character_
),
sex = case_when(
sex %in% c("male", "female") ~ sex,
TRUE ~ NA_character_
),
degree = case_when(
degree %in% c("less than high school", "high school", "junior college", "bachelor", "graduate") ~ degree,
TRUE ~ NA_character_
)
)
Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.
Select the variables of interest: polviews, sex, degree, and race.
Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.
# Filter to variables of interest
gss_filtered <- gss %>%
dplyr::select(polviews, race, sex, degree)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary
| N | % | ||
|---|---|---|---|
| polviews | conservative | 9361 | 12.9 |
| liberal | 7623 | 10.5 | |
| NA | 55406 | 76.5 | |
| race | black | 10215 | 14.1 |
| other | 4411 | 6.1 | |
| white | 57657 | 79.6 | |
| NA | 107 | 0.1 | |
| sex | female | 40301 | 55.7 |
| male | 31977 | 44.2 | |
| NA | 112 | 0.2 | |
| degree | graduate | 5953 | 8.2 |
| high school | 36446 | 50.3 | |
| less than high school | 14192 | 19.6 | |
| NA | 15799 | 21.8 |
gss_cleaned <- gss %>%
filter(!is.na(polviews), !is.na(race), !is.na(sex),
!is.na(degree)) %>%
mutate(
polviews = recode(polviews, "liberal" = "Liberal", "moderate" = "Moderate", "conservative" = "Conservative"),
race = recode(race, "white" = "White", "black" = "Black", "other" = "Other"),
sex = recode(sex, "male" = "Male", "female" = "Female"),
degree = recode(degree, "less than high school" = "Less than High School", "high school" = "High School", "junior college" = "Junior College", "bachelor" = "Bachelor", "graduate" = "Graduate"),
)
gss_cleaned <- gss_cleaned %>%
rename(
"Political Views" = polviews,
"Respondent Race" = race,
"Respondent Sex" = sex,
"Highest Degree" = degree
)
table(gss$polviews)
##
## conservative liberal
## 9361 7623
unique(gss$polviews)
## [1] NA "conservative" "liberal"
table(gss$sex)
##
## female male
## 40301 31977
unique(gss$sex)
## [1] "female" "male" NA
table(gss$degree)
##
## graduate high school less than high school
## 5953 36446 14192
unique(gss$degree)
## [1] NA "less than high school" "high school"
## [4] "graduate"
table(gss$race)
##
## black other white
## 10215 4411 57657
unique(gss$race)
## [1] "white" "black" "other" NA
# Create summary for relabeled categorical variables
categorical_summary_relabelled <- datasummary_skim(
gss_cleaned %>%
dplyr::select(`Political Views`, `Respondent Race`, `Respondent Sex`, `Highest Degree`,), # Select categorical variables
type = "categorical", # Specify the type of variables to summarize
output = "kableExtra" # Specify the output format
)
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
## produced by the `tinytable` backend.
# Customize the table appearance
categorical_summary_relabelled %>%
kableExtra::kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% # Apply table styling options
kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#4CAF50") %>% # Customize the header row
kableExtra::column_spec(1, bold = TRUE) %>% # Make the first column bold
kableExtra::add_header_above(c(" " = 1, "Summary Statistics for Categorical Variables" = 3)) # Add a header above the table
| N | % | ||
|---|---|---|---|
| Political Views | Conservative | 6945 | 55.2 |
| Liberal | 5627 | 44.8 | |
| Respondent Race | Black | 1717 | 13.7 |
| Other | 772 | 6.1 | |
| White | 10083 | 80.2 | |
| Respondent Sex | Female | 6813 | 54.2 |
| Male | 5759 | 45.8 | |
| Highest Degree | Graduate | 1943 | 15.5 |
| High School | 7940 | 63.2 | |
| Less than High School | 2689 | 21.4 |
# Create summary for relabeled categorical variables with flextable
categorical_summary_flextable <- datasummary_skim(
gss_cleaned %>%
dplyr::select(`Political Views`, `Respondent Race`, `Respondent Sex`, `Highest Degree`),
type = "categorical",
output = "flextable"
)
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
## produced by the `tinytable` backend.
# Customize the table appearance with flextable
categorical_summary_flextable <- categorical_summary_flextable %>%
set_header_labels(Variable = "Variable", Value = "Value", Freq = "Frequency") %>%
theme_box() %>%
bold(part = "header") %>%
bg(part = "header", bg = "#4CAF50") %>%
color(part = "header", color = "white") %>%
border_remove() %>%
border_inner_v(border = fp_border(color = "black", width = 1)) %>%
autofit()
print(categorical_summary_flextable)
## a flextable object.
## col_keys: ` `, ` `, `N`, `%`
## header has 1 row(s)
## body has 10 row(s)
## original dataset sample:
## N %
## 1 Political Views Conservative 6945 55.2
## 2 Liberal 5627 44.8
## 3 Respondent Race Black 1717 13.7
## 4 Other 772 6.1
## 5 White 10083 80.2
Objective: Create a bar chart showing the distribution of political views by gender.
Create a bar chart showing the distribution of political views by gender. Use a color palette that clearly differentiates the categories.
# Summary
gss_filtered <- gss %>%
filter(polviews %in% c("liberal", "moderate", "conservative"))
gss_sex <- gss_filtered %>%
group_by (sex, polviews) %>%
summarize(count = n(), .groups = 'drop') %>%
group_by(sex) %>%
mutate(total = sum(count),
proportion = count / total)
gss_filtered <- gss %>%
filter(polviews %in% c("liberal", "moderate", "conservative"))
gss_sex <- gss_filtered %>%
group_by(sex, polviews) %>%
summarize(count = n(), .groups = 'drop') %>%
group_by(sex) %>%
mutate(total = sum(count),
proportion = count / total)
# Plot political views based on gender
ggplot(gss_sex, aes(x = sex, y = proportion, color = polviews, group = polviews)) +
geom_line(size = 1.2) +
scale_color_brewer(palette = "Dark2") +
labs(title = "Distribution of Political Views by Gender",
x = "Sex",
y = "Proportion",
color = "Political Views") +
theme_minimal() +
theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Objective: Visualize trends in religious attendance over time.
Select the year and attend variables from the GSS dataset.
Create a line plot showing the proportion of each category of religious attendance over time.
# Prepare the data
gss_yearly <- gss %>%
group_by(attend, relig) %>% # Group by religious attendance
summarize(count = n(), .groups = 'drop') %>% # Calculate the count for each group
group_by(attend) %>% #Group by attendance
mutate(total = sum(count), # Calculate the total count overtime
proportion = count / total) # Calculate the proportion of religion attendance per year
# Create a line plot to visualize the evolution of religious attendance over time
ggplot(gss_yearly, aes(x = attend, y = proportion, color = relig, group = relig)) +
geom_line(size = 1.2) + # Create lines for each religious attendance with increased line size
scale_color_brewer(palette = "Set3") + # Use a color palette for better differentiation
labs(title = "Evolution of Religious Attendance Over Time", # Add plot title
x = "Attendance", # Label x-axis
y = "Proportion", # Label y-axis
color = "Religions") + # Label the legend
theme_minimal() + # Apply a minimal theme to the plot
theme(legend.position = "bottom") # Position the legend at the bottom
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set3 is 12
## Returning the palette you asked for with that many colors
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_line()`).
Objective: Create a stacked bar chart showing the distribution of fejobaff (preferential hiring) across different age groups.
Create an age group variable by categorizing age into “18-29”, “30-44”, “45-59”, “60+”. Create a stacked bar chart showing the distribution of the fejobaff response categories for each age group.
gss_cleaned <- gss %>%
filter(!is.na(age), !is.na(fejobaff)) %>%
mutate(
age = recode(age, "18 - 29" = "18-29", "30 - 44" = "30-44", "45 - 59" = "45-59", "60 + " = "60+"),
)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `age = recode(...)`.
## Caused by warning in `recode.numeric()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
gss_cleaned <- gss_cleaned %>%
rename(
"Preferential Hiring" = fejobaff,
"Age Groups" = age
)
# Recode the age variable
gss_filtered <- gss_filtered %>%
mutate
age = case_when
age %in% c("18-29") ~ "Age Group 1"
## age %in% c("18-29") ~ "Age Group 1"
age %in% c("30-44") ~ "Age Group 2"
## age %in% c("30-44") ~ "Age Group 2"
age %in% c("45-59") ~ "Age Group 3"
## age %in% c("45-59") ~ "Age Group 3"
age %in% c("60+") ~ "Age Group 4"
## age %in% c("60+") ~ "Age Group 4"
TRUE ~ NA_character_
## TRUE ~ NA_character_
gss_filtered %>% count(age, fejobaff) %>% group_by(age_recoded) %>% mutate(proportion = n / sum(n)) %>% ggplot(aes(x = age_recoded, y = proportion, fill = fejobaff)) + geom_bar(stat = “identity”, position = “fill”) + scale_fill_brewer(palette = “Set2”, name = “Hiring Preferences”) + labs(title = “Hiring Preferences Based on Age”, x = ““, y =”Proportion”) + theme_minimal() + theme(legend.position = “bottom”)