Let’s first set up all our packages and our dataset. We will be working with the General Social Survey dataset from 2022
packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer") # add any you need here
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
##
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
##
##
##
## Attaching package: 'flextable'
##
##
## The following objects are masked from 'package:kableExtra':
##
## as_image, footnote
##
##
## The following object is masked from 'package:purrr':
##
## compose
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "viridis" "viridisLite" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[5]]
## [1] "kableExtra" "viridis" "viridisLite" "modelsummary" "fst"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "flextable" "kableExtra" "viridis" "viridisLite" "modelsummary"
## [6] "fst" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[7]]
## [1] "officer" "flextable" "kableExtra" "viridis" "viridisLite"
## [6] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
setwd("C:/Users/matej/OneDrive/Desktop/U of T/Summer 2024/SOC252/RMarkdowns")
gss <- load("gss2022.Rdata")
gss <- df
Now to recode and clean our variables
polviews must be recoded into 3 categories Liberal, Conservative, and Moderate. sex, degree, and race must all be cleaned into relevant categories without recoding.
# Lets take a look at our variables
table(gss$polviews)
##
## extremely liberal liberal
## 2081 7623
## slightly liberal moderate, middle of the road
## 7900 23992
## slightly conservative conservative
## 9596 9361
## extremely conservative don't know
## 2165 0
## iap I don't have a job
## 0 0
## dk, na, iap no answer
## 0 0
## not imputable_(2147483637) not imputable_(2147483638)
## 0 0
## refused skipped on web
## 0 0
## uncodeable not available in this release
## 0 0
## not available in this year see codebook
## 0 0
unique(gss$polviews)
## [1] <NA> moderate, middle of the road
## [3] slightly conservative conservative
## [5] liberal extremely conservative
## [7] slightly liberal extremely liberal
## 20 Levels: extremely liberal liberal ... see codebook
table(gss$sex)
##
## male female
## 31977 40301
## don't know iap
## 0 0
## I don't have a job dk, na, iap
## 0 0
## no answer not imputable_(2147483637)
## 0 0
## not imputable_(2147483638) refused
## 0 0
## skipped on web uncodeable
## 0 0
## not available in this release not available in this year
## 0 0
## see codebook
## 0
unique(gss$sex)
## [1] female male <NA>
## 15 Levels: male female don't know iap I don't have a job ... see codebook
table(gss$degree)
##
## less than high school high school
## 14192 36446
## associate/junior college bachelor's
## 4355 11248
## graduate don't know
## 5953 0
## iap I don't have a job
## 0 0
## dk, na, iap no answer
## 0 0
## not imputable_(2147483637) not imputable_(2147483638)
## 0 0
## refused skipped on web
## 0 0
## uncodeable not available in this release
## 0 0
## not available in this year see codebook
## 0 0
unique(gss$degree)
## [1] bachelor's less than high school high school
## [4] graduate associate/junior college <NA>
## 18 Levels: less than high school high school ... see codebook
table(gss$race)
##
## white black
## 57657 10215
## other don't know
## 4411 0
## iap I don't have a job
## 0 0
## dk, na, iap no answer
## 0 0
## not imputable_(2147483637) not imputable_(2147483638)
## 0 0
## refused skipped on web
## 0 0
## uncodeable not available in this release
## 0 0
## not available in this year see codebook
## 0 0
unique(gss$race)
## [1] white black other <NA>
## 16 Levels: white black other don't know iap I don't have a job ... see codebook
# Great, they exist, let's clean polviews, sex, degree, and race, we'll just keep relevant categories like "male" and "female" and remove na values. It's important to keep syntax and case sensitivity in mind here
gss <- gss %>%
mutate(
polviews = case_when(
polviews %in% c("liberal", "moderate, middle of the road", "conservative") ~ polviews,
TRUE ~ NA_character_
),
race = case_when(
race %in% c("white", "black", "other") ~ race,
TRUE ~ NA_character_
),
sex = case_when(
sex %in% c("male", "female") ~ sex,
TRUE ~ NA_character_
),
degree = case_when(
degree %in% c("less than high school", "high school", "associate/junior college", "bachelor's", "graduate") ~ degree,
TRUE ~ NA_character_
)
)
Now we need our data summary table of our selected variables
gss_filtered <- gss %>%
dplyr::select(polviews, race, sex, degree)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary
| N | % | ||
|---|---|---|---|
| polviews | conservative | 9361 | 12.9 |
| liberal | 7623 | 10.5 | |
| moderate, middle of the road | 23992 | 33.1 | |
| NA | 31414 | 43.4 | |
| race | black | 10215 | 14.1 |
| other | 4411 | 6.1 | |
| white | 57657 | 79.6 | |
| NA | 107 | 0.1 | |
| sex | female | 40301 | 55.7 |
| male | 31977 | 44.2 | |
| NA | 112 | 0.2 | |
| degree | associate/junior college | 4355 | 6.0 |
| bachelor's | 11248 | 15.5 | |
| graduate | 5953 | 8.2 | |
| high school | 36446 | 50.3 | |
| less than high school | 14192 | 19.6 | |
| NA | 196 | 0.3 |
Everything seems to be functioning correctly in the datasummary table, we can proceed with recoding and cleaning up some of these labels
gss_cleaned <- gss %>%
filter(!is.na(polviews), !is.na(race), !is.na(sex), !is.na(degree)) %>%
mutate(
polviews = recode(polviews, "liberal" = "Liberal", "moderate, middle of the road" = "Moderate", "conservative" = "Conservative"),
sex = recode(sex, "male" = "Male", "female" = "Female"),
race = recode(race, "white" = "White", "black" = "Black", "other" = "Other"),
degree = recode(degree, "less than high school" = "Less than High School", "high school" = "High School", "assosciate/junior college" = "Junior College", "bachelor" = "Bachelor", "graduate" = "Graduate")
)
gss_cleaned <- gss_cleaned %>%
rename(
"Political Views" = polviews,
"Respondent Race" = race,
"Respondent Sex" = sex,
"Highest Degree Obtained" = degree,
)
# We've cleaned the variables' labels and properly recoded polviews. Now we can make a categorical summary table with flextable.
categorical_summary_flextable <- datasummary_skim(
gss_cleaned %>%
dplyr::select(`Political Views`, `Respondent Race`, `Respondent Sex`, `Highest Degree Obtained`),
type = "categorical",
output = "flextable"
)
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
## produced by the `tinytable` backend.
categorical_summary_flextable <- categorical_summary_flextable %>%
set_header_labels(Variable = "Variable", Value = "Value", Freq = "Frequency") %>%
theme_box() %>%
bold(part = "header") %>%
bg(part = "header", bg = "deepskyblue4") %>%
color(part = "header", color = "white") %>%
border_remove() %>%
border_inner_v(border = fp_border(color = "black", width = 1)) %>%
autofit()
flextable::htmltools_value(categorical_summary_flextable)
|
| N | % |
|---|---|---|---|
Political Views | Conservative | 9316 | 22.8 |
Liberal | 7596 | 18.6 | |
Moderate | 23871 | 58.5 | |
Respondent Race | Black | 5571 | 13.7 |
Other | 2559 | 6.3 | |
White | 32653 | 80.1 | |
Respondent Sex | Female | 23042 | 56.5 |
Male | 17741 | 43.5 | |
Highest Degree Obtained | associate/junior college | 2619 | 6.4 |
bachelor's | 6115 | 15.0 | |
Graduate | 3313 | 8.1 | |
High School | 21371 | 52.4 | |
Less than High School | 7365 | 18.1 |
Now we can see our variables, nice, clean, and ignoring NA values ## Task 3 Our next objective is to create a bar chart. We want to focus on our variables polviews and sex so that we may show the distribution of political views by gender.
library(dplyr)
library(ggplot2)
gss_filtered <- gss %>%
filter(!is.na(polviews), !is.na(sex)) %>%
mutate(
polviews = recode(polviews, "liberal" = "Liberal", "moderate, middle of the road" = "Moderate", "conservative" = "Conservative"),
sex = recode(sex, "male" = "Male", "female" = "Female")
)
gss_filtered %>%
count(polviews, sex) %>%
ggplot(aes(x = sex, y = n, fill = polviews)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_brewer(palette = "Set1", name = "Political view") +
labs(title = "Distribution of political views by gender",
subtitle = "General Social Survey, 2022",
x = "Gender",
y = "Count") +
theme_minimal() +
theme(legend.position = "bottom")
What a readable bar chart! We can clearly see that, in the gss2022 dataset that women answered the polviews question with our 3 categories more than men did. If we were to adjust it with a stacked bar chart we would likely notice a pretty simillar distribution between political views and gender.
For this objective we are going to be using the variables “Religious Attendance” known as “attend” and the “year” variable so that we may observe religious attendance over time in years
Lets prepare our variables
gss_yearly <- gss %>%
group_by(year, attend) %>%
summarize(count = n(), .groups = 'drop') %>%
group_by(year) %>%
mutate(total = sum(count),
proportion = count / total)
# Now to code our line plot
ggplot(gss_yearly, aes(x = year, y = proportion, color = attend, group = attend)) +
geom_line(size = 1.2) +
scale_color_brewer(palette = "Dark2") +
labs(title = "Evolution of Religious Attendance Over Time",
x = "Year",
y = "Proportion",
color = "Religious Attendance") +
theme_minimal() +
theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
## Warning: Removed 68 rows containing missing values or values outside the scale range
## (`geom_line()`).
With the line plot above, visualizing trends in religious attendance is easy. It is interesting to note that the only 2 attendance groups that drastically increased and decreased most over time are the never attending religious services group (saw a great increase over time in years) and attending religious services every week group (saw a great decrease over time). Clearly, the society that the GSS surveyed has decreased it’s religious attendance as the years went by. Perhaps we can infer that the GSS surveys a more secular society? Very interesting line plot.
Our final task is to create a bar chart showing the distribution of preferential hiring across different age groups. While we do have a variable for preferential hiring (fejobaff), we do not have one for age groups.
We do have a variable for age though, lets categorize it into groups so we may have an age group variable and then construct the bar chart.
library(dplyr)
library(ggplot2)
# Creating a variable for age group
gss <- gss %>%
mutate(age_group = case_when(
age >= 18 & age <= 29 ~ "18-29",
age >= 30 & age <= 44 ~ "30-44",
age >= 45 & age <= 59 ~ "45-59",
age >= 60 ~ "60+",
TRUE ~ NA_character_
))
# Constructing the bar chart
gss_filtered <- gss %>%
filter(!is.na(fejobaff), !is.na(age_group))
fejobaff_summary <- gss_filtered %>%
count(age_group, fejobaff) %>%
group_by(age_group) %>%
mutate(total = sum(n),
proportion = n / total)
ggplot(fejobaff_summary, aes(x = factor(age_group), y = proportion, fill = fejobaff)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_brewer(palette = "Pastel1") +
labs(title = "Distribution of preferential hiring across different age groups",
x = "Age Group",
y = "Proportion",
fill = "For or against preferential hiring and promotion of women") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "bottom",
legend.title = element_text(size = 10),
legend.text = element_text(size = 8)) +
guides(fill = guide_legend(nrow = 3, byrow = TRUE, title.position = "top"))
From this chart, while the distribution for preferential hiring is
pretty similar for the most part, it is clear that those in the age
group 18-29 strongly oppose the preferential hiring and promotion of
women way less than those in the age group 60 and above do.