Loading packages and dataset

packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer") 

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"
setwd("C:/Users/matej/OneDrive/Desktop/U of T/Summer 2024/SOC252/RMarkdowns")

gss <- load("gss2022.Rdata")
gss <- df

Cleaning variables

Here is what our variables look like before cleaning

table(gss$polviews)
## 
##             extremely liberal                       liberal 
##                          2081                          7623 
##              slightly liberal  moderate, middle of the road 
##                          7900                         23992 
##         slightly conservative                  conservative 
##                          9596                          9361 
##        extremely conservative                    don't know 
##                          2165                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0
unique(gss$polviews)
## [1] <NA>                         moderate, middle of the road
## [3] slightly conservative        conservative                
## [5] liberal                      extremely conservative      
## [7] slightly liberal             extremely liberal           
## 20 Levels: extremely liberal liberal ... see codebook
table(gss$reliten)
## 
##                        strong               not very strong 
##                         22652                         23738 
##        somewhat strong (vol.)                   no religion 
##                          5736                          7629 
##                    don't know                           iap 
##                             0                             0 
##            I don't have a job                   dk, na, iap 
##                             0                             0 
##                     no answer    not imputable_(2147483637) 
##                             0                             0 
##    not imputable_(2147483638)                       refused 
##                             0                             0 
##                skipped on web                    uncodeable 
##                             0                             0 
## not available in this release    not available in this year 
##                             0                             0 
##                  see codebook 
##                             0
unique(gss$reliten)
## [1] <NA>                   strong                 not very strong       
## [4] somewhat strong (vol.) no religion           
## 17 Levels: strong not very strong somewhat strong (vol.) ... see codebook
table(gss$sexeduc)
## 
##                         favor                        oppose 
##                         35639                          5127 
##   depends on age/grade (vol.)                    don't know 
##                             9                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0
unique(gss$sexeduc)
## [1] <NA>                        favor                      
## [3] oppose                      depends on age/grade (vol.)
## 16 Levels: favor oppose depends on age/grade (vol.) don't know ... see codebook

Now to clean the data. Its important to note syntax in all the categories’ names in these variables

# Removing NA values
gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c("extremely liberal", "liberal", "slightly liberal", "moderate, middle of the road", "slightly conservative", "conservative", "extremely conservative") ~ polviews,
        TRUE ~ NA_character_
    ),
    reliten = case_when(
      reliten %in% c("strong", "not very strong", "somewhat strong (vol.)", "no religion") ~ reliten,
      TRUE ~ NA_character_
    ),
    #we need to make sexeduc a dichotomous variable
    sexeduc = case_when(
      sexeduc %in% c("favor", "oppose") ~ sexeduc,
      TRUE ~ NA_character_
    )
  )
gss_filtered <- gss %>%
  dplyr::select(polviews, reliten, sexeduc)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary
tinytable_4vkylh25pe20huo4cdxw
N %
polviews conservative 9361 12.9
extremely conservative 2165 3.0
extremely liberal 2081 2.9
liberal 7623 10.5
moderate, middle of the road 23992 33.1
slightly conservative 9596 13.3
slightly liberal 7900 10.9
NA 9672 13.4
reliten no religion 7629 10.5
not very strong 23738 32.8
somewhat strong (vol.) 5736 7.9
strong 22652 31.3
NA 12635 17.5
sexeduc favor 35639 49.2
oppose 5127 7.1
NA 31624 43.7
# Now, removing NA values and making it reader friendly

gss_cleaned <- gss %>%
  filter(!is.na(polviews), !is.na(reliten), !is.na(sexeduc)) %>%
  mutate(
    polviews = recode(polviews, 
                      "extremely liberal" = "Extremely Liberal", 
                      "liberal" = "Liberal", 
                      "slightly liberal" = "Slightly Liberal", 
                      "moderate, middle of the road" = "Moderate", 
                      "slightly conservative" = "Slightly Conservative", 
                      "conservative" = "Conservative", 
                      "extremely conservative" = "Extremely Conservative"),
    polviews = factor(polviews, levels = c("Extremely Liberal", "Liberal", "Slightly Liberal", "Moderate", "Slightly Conservative", "Conservative", "Extremely Conservative")),
    sexeduc = recode(sexeduc, "favor" = "Favor", "oppose" = "Oppose"),
    sexeduc = factor(sexeduc, levels = c("Oppose", "Favor")),
    reliten = recode(reliten, 
                    "strong" = "Strong",
                    "somewhat strong (vol.)" = "Somewhat Strong",
                    "not very strong" = "Not Very Strong",
                    "no religion" = "No Religion"
                    ),
    reliten = factor(reliten, levels = c("No Religion", "Not Very Strong", "Somewhat Strong", "Strong"))
  )

gss_cleaned <- gss_cleaned %>%
  rename(
    "Political Views" = polviews,
    "Strength of Religious Affiliation" = reliten,
    "Attitude on Public School Sex Education" = sexeduc
  )

Exploratory Data Visualizations

We will create two relevant visualizations that explore our main variables and their relationships. The visualizations we will be constructing are a diverging stacked bar plot and a grouped bar plot

library(ggplot2)
library(dplyr)

# Diverging stacked bar chart
ggplot(gss_cleaned, aes(x = `Political Views`, fill = `Attitude on Public School Sex Education`)) +
  geom_bar(position = "fill", width = 0.7) +
  facet_wrap(~ `Strength of Religious Affiliation`, ncol = 2) +
  scale_fill_manual(values = c("Favor" = "#1f78b4", "Oppose" = "#e31a1c")) + 
  labs(
    title = "Attitudes on Sex Education by Political Views\nand Religious Affiliation",
    x = "Political Views",
    y = "Proportion",
    fill = "Attitude on Sex Education"
  ) +
  coord_flip() +
  theme_minimal(base_size = 14) + 
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"), 
    axis.text.x = element_text(angle = 45, hjust = 1), 
    legend.position = "top", 
    legend.title = element_blank(), 
    strip.background = element_rect(fill = "#f0f0f0", color = NA), 
    strip.text = element_text(face = "bold"),
     plot.margin = margin(20, 20, 20, 20)
    
  )

#Now our grouped bar plot
ggplot(gss_cleaned, aes(x = `Political Views`, fill = `Attitude on Public School Sex Education`)) +
  geom_bar(position = "dodge", color = "black", width = 0.7) +
  facet_wrap(~ `Strength of Religious Affiliation`, ncol = 2) +
  scale_fill_manual(values = c("Favor" = "#1f78b4", "Oppose" = "#e31a1c")) + 
  labs(
    title = "Attitudes on Sex Education by Political Views\nand Religious Affiliation",
    x = "Political Views",
    y = "Count",
    fill = "Attitude on Sex Education"
  ) +
  theme_minimal(base_size = 14) + 
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16), 
    axis.text.x = element_text(angle = 45, hjust = 1), 
    axis.title.x = element_text(face = "bold"), 
    axis.title.y = element_text(face = "bold"), 
    legend.position = "top", 
    legend.title = element_blank(), 
    strip.background = element_rect(fill = "#f0f0f0", color = NA), 
    strip.text = element_text(face = "bold"), 
    panel.grid.major = element_line(color = "grey80"), 
    panel.grid.minor = element_blank() 
  )

Initial Regression Analysis

We will make a simple linear regression here to analyze 3 models, 2 single predictors with polviews and reliten respectively and one multiple predictor incorporating both. We’ll compare the models in one regression table with sjplot.

# Lets make our models first
model1 <- lm(`Attitude on Public School Sex Education` ~ `Political Views`, data = gss_cleaned)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
model2 <- lm(`Attitude on Public School Sex Education` ~ `Strength of Religious Affiliation`, data = gss_cleaned)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in model.response(mf, "numeric"): '-' not meaningful for factors
model3 <- lm(`Attitude on Public School Sex Education` ~ `Political Views` + `Strength of Religious Affiliation`, data = gss_cleaned)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in model.response(mf, "numeric"): '-' not meaningful for factors
# Now our table with sjPlot
library(sjPlot)

tab_model(model1, model2, model3)
  Attitude on Public School
Sex Education
Attitude on Public School
Sex Education
Attitude on Public School
Sex Education
Predictors Estimates CI p Estimates CI p Estimates CI p
(Intercept) 1.92 1.95 1.96
Political Views [Liberal] 0.01 0.01
Political Views [Slightly
Liberal]
0.00 0.00
Political Views
[Moderate]
-0.02 -0.01
Political Views [Slightly
Conservative]
-0.03 -0.02
Political Views
[Conservative]
-0.15 -0.13
Political Views
[Extremely Conservative]
-0.26 -0.23
Strength of Religious
Affiliation [Not Very
Strong]
-0.04 -0.03
Strength of Religious
Affiliation [Somewhat
Strong]
-0.07 -0.05
Strength of Religious
Affiliation [Strong]
-0.14 -0.11
Observations 32437 32437 32437