Instructions

Setup

Let’s first set up all our packages and our dataset. We will be working with the General Social Survey dataset from 2022

packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer") # add any you need here

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"
setwd("C:/Users/matej/OneDrive/Desktop/U of T/Summer 2024/SOC252/RMarkdowns")

gss <- load("gss2022.Rdata")
gss <- df

Task 1

Now to recode and clean our variables

polviews must be recoded into 3 categories Liberal, Conservative, and Moderate. sex, degree, and race must all be cleaned into relevant categories without recoding.

# Lets take a look at our variables
table(gss$polviews)
## 
##             extremely liberal                       liberal 
##                          2081                          7623 
##              slightly liberal  moderate, middle of the road 
##                          7900                         23992 
##         slightly conservative                  conservative 
##                          9596                          9361 
##        extremely conservative                    don't know 
##                          2165                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0
unique(gss$polviews)
## [1] <NA>                         moderate, middle of the road
## [3] slightly conservative        conservative                
## [5] liberal                      extremely conservative      
## [7] slightly liberal             extremely liberal           
## 20 Levels: extremely liberal liberal ... see codebook
table(gss$sex)
## 
##                          male                        female 
##                         31977                         40301 
##                    don't know                           iap 
##                             0                             0 
##            I don't have a job                   dk, na, iap 
##                             0                             0 
##                     no answer    not imputable_(2147483637) 
##                             0                             0 
##    not imputable_(2147483638)                       refused 
##                             0                             0 
##                skipped on web                    uncodeable 
##                             0                             0 
## not available in this release    not available in this year 
##                             0                             0 
##                  see codebook 
##                             0
unique(gss$sex)
## [1] female male   <NA>  
## 15 Levels: male female don't know iap I don't have a job ... see codebook
table(gss$degree)
## 
##         less than high school                   high school 
##                         14192                         36446 
##      associate/junior college                    bachelor's 
##                          4355                         11248 
##                      graduate                    don't know 
##                          5953                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0
unique(gss$degree)
## [1] bachelor's               less than high school    high school             
## [4] graduate                 associate/junior college <NA>                    
## 18 Levels: less than high school high school ... see codebook
table(gss$race)
## 
##                         white                         black 
##                         57657                         10215 
##                         other                    don't know 
##                          4411                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0
unique(gss$race)
## [1] white black other <NA> 
## 16 Levels: white black other don't know iap I don't have a job ... see codebook
# Great, they exist, let's clean polviews, sex, degree, and race, we'll just keep relevant categories like "male" and "female" and remove na values. It's important to keep syntax and case sensitivity in mind here

gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c("liberal", "moderate, middle of the road", "conservative") ~ polviews,
      TRUE ~ NA_character_
    ),
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
  ),
  sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
  ),
  degree = case_when(
      degree %in% c("less than high school", "high school", "associate/junior college", "bachelor's", "graduate") ~ degree,
      TRUE ~ NA_character_
  )
  )

Task 2

Now we need our data summary table of our selected variables

gss_filtered <- gss %>%
  dplyr::select(polviews, race, sex, degree)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary 
tinytable_ryiudzcpd7xl62vlm3ml
N %
polviews conservative 9361 12.9
liberal 7623 10.5
moderate, middle of the road 23992 33.1
NA 31414 43.4
race black 10215 14.1
other 4411 6.1
white 57657 79.6
NA 107 0.1
sex female 40301 55.7
male 31977 44.2
NA 112 0.2
degree associate/junior college 4355 6.0
bachelor's 11248 15.5
graduate 5953 8.2
high school 36446 50.3
less than high school 14192 19.6
NA 196 0.3

Everything seems to be functioning correctly in the datasummary table, we can proceed with recoding and cleaning up some of these labels

gss_cleaned <- gss %>%
  filter(!is.na(polviews), !is.na(race), !is.na(sex), !is.na(degree)) %>%
  mutate(
    polviews = recode(polviews, "liberal" = "Liberal", "moderate, middle of the road" = "Moderate", "conservative" = "Conservative"),
    sex = recode(sex, "male" = "Male", "female" = "Female"),
    race = recode(race, "white" = "White", "black" = "Black", "other" = "Other"),
    degree = recode(degree, "less than high school" = "Less than High School", "high school" = "High School", "assosciate/junior college" = "Junior College", "bachelor" = "Bachelor", "graduate" = "Graduate")
  )
gss_cleaned <- gss_cleaned %>%
  rename(
    "Political Views" = polviews,
    "Respondent Race" = race,
    "Respondent Sex" = sex,
    "Highest Degree Obtained" = degree,
  )

# We've cleaned the variables' labels and properly recoded polviews. Now we can make a categorical summary table with flextable.

categorical_summary_flextable <- datasummary_skim(
  gss_cleaned %>%
    dplyr::select(`Political Views`, `Respondent Race`, `Respondent Sex`, `Highest Degree Obtained`),
  type = "categorical",
  output = "flextable"
)
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.
categorical_summary_flextable <- categorical_summary_flextable %>%
  set_header_labels(Variable = "Variable", Value = "Value", Freq = "Frequency") %>%
  theme_box() %>%
  bold(part = "header") %>%
  bg(part = "header", bg = "deepskyblue4") %>%
  color(part = "header", color = "white") %>%
  border_remove() %>%
  border_inner_v(border = fp_border(color = "black", width = 1)) %>%
  autofit()

flextable::htmltools_value(categorical_summary_flextable)

N

%

Political Views

Conservative

9316

22.8

Liberal

7596

18.6

Moderate

23871

58.5

Respondent Race

Black

5571

13.7

Other

2559

6.3

White

32653

80.1

Respondent Sex

Female

23042

56.5

Male

17741

43.5

Highest Degree Obtained

associate/junior college

2619

6.4

bachelor's

6115

15.0

Graduate

3313

8.1

High School

21371

52.4

Less than High School

7365

18.1

Now we can see our variables, nice, clean, and ignoring NA values ## Task 3 Our next objective is to create a bar chart. We want to focus on our variables polviews and sex so that we may show the distribution of political views by gender.

library(dplyr)
library(ggplot2)

gss_filtered <- gss %>%
  filter(!is.na(polviews), !is.na(sex)) %>% 
  mutate(
    polviews = recode(polviews, "liberal" = "Liberal", "moderate, middle of the road" = "Moderate", "conservative" = "Conservative"),
    sex = recode(sex, "male" = "Male", "female" = "Female")
  )

gss_filtered %>%
  count(polviews, sex) %>%
  ggplot(aes(x = sex, y = n, fill = polviews)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_brewer(palette = "Set1", name = "Political view") +
  labs(title = "Distribution of political views by gender",
       subtitle = "General Social Survey, 2022",
       x = "Gender",
       y = "Count") +
  theme_minimal() +
  theme(legend.position = "bottom")

What a readable bar chart! We can clearly see that, in the gss2022 dataset that women answered the polviews question with our 3 categories more than men did. If we were to adjust it with a stacked bar chart we would likely notice a pretty simillar distribution between political views and gender.

Task 4

For this objective we are going to be using the variables “Religious Attendance” known as “attend” and the “year” variable so that we may observe religious attendance over time in years

Lets prepare our variables

gss_yearly <- gss %>%
  group_by(year, attend) %>% 
  summarize(count = n(), .groups = 'drop') %>% 
  group_by(year) %>% 
  mutate(total = sum(count), 
         proportion = count / total)

# Now to code our line plot
ggplot(gss_yearly, aes(x = year, y = proportion, color = attend, group = attend)) +
  geom_line(size = 1.2) + 
  scale_color_brewer(palette = "Dark2") + 
  labs(title = "Evolution of Religious Attendance Over Time", 
       x = "Year", 
       y = "Proportion", 
       color = "Religious Attendance") + 
  theme_minimal() + 
  theme(legend.position = "bottom") 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
## Warning: Removed 68 rows containing missing values or values outside the scale range
## (`geom_line()`).

With the line plot above, visualizing trends in religious attendance is easy. It is interesting to note that the only 2 attendance groups that drastically increased and decreased most over time are the never attending religious services group (saw a great increase over time in years) and attending religious services every week group (saw a great decrease over time). Clearly, the society that the GSS surveyed has decreased it’s religious attendance as the years went by. Perhaps we can infer that the GSS surveys a more secular society? Very interesting line plot.

Task 5

Our final task is to create a bar chart showing the distribution of preferential hiring across different age groups. While we do have a variable for preferential hiring (fejobaff), we do not have one for age groups.

We do have a variable for age though, lets categorize it into groups so we may have an age group variable and then construct the bar chart.

library(dplyr)
library(ggplot2)

# Creating a variable for age group
gss <- gss %>%
  mutate(age_group = case_when(
    age >= 18 & age <= 29 ~ "18-29",
    age >= 30 & age <= 44 ~ "30-44",
    age >= 45 & age <= 59 ~ "45-59",
    age >= 60 ~ "60+",
    TRUE ~ NA_character_
  ))

# Constructing the bar chart
gss_filtered <- gss %>%
  filter(!is.na(fejobaff), !is.na(age_group))

fejobaff_summary <- gss_filtered %>%
  count(age_group, fejobaff) %>%
  group_by(age_group) %>%
  mutate(total = sum(n),
         proportion = n / total)

ggplot(fejobaff_summary, aes(x = factor(age_group), y = proportion, fill = fejobaff)) + 
  geom_bar(stat = "identity", position = "fill") + 
  scale_y_continuous(labels = scales::percent_format()) + 
  scale_fill_brewer(palette = "Pastel1") + 
  labs(title = "Distribution of preferential hiring across different age groups", 
       x = "Age Group", 
       y = "Proportion", 
       fill = "For or against preferential hiring and promotion of women") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1), 
        legend.position = "bottom", 
        legend.title = element_text(size = 10), 
        legend.text = element_text(size = 8)) + 
  guides(fill = guide_legend(nrow = 3, byrow = TRUE, title.position = "top"))

From this chart, while the distribution for preferential hiring is pretty similar for the most part, it is clear that those in the age group 18-29 strongly oppose the preferential hiring and promotion of women way less than those in the age group 60 and above do.

The End

Thanks for reading my problem set!
Mateja Dokic