Stepping Stone 2 Parts 4 & 5

Loading packages and dataset

packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer") 

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"

setwd("C:/Users/matej/OneDrive/Desktop/U of T/Summer 2024/SOC252/RMarkdowns")

gss <- load("gss2022.Rdata")
gss <- df

Cleaning variables

Here is what our variables look like before cleaning

table(gss$polviews)

## 
##             extremely liberal                       liberal 
##                          2081                          7623 
##              slightly liberal  moderate, middle of the road 
##                          7900                         23992 
##         slightly conservative                  conservative 
##                          9596                          9361 
##        extremely conservative                    don't know 
##                          2165                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0

unique(gss$polviews)

## [1] <NA>                         moderate, middle of the road
## [3] slightly conservative        conservative                
## [5] liberal                      extremely conservative      
## [7] slightly liberal             extremely liberal           
## 20 Levels: extremely liberal liberal ... see codebook

table(gss$reliten)

## 
##                        strong               not very strong 
##                         22652                         23738 
##        somewhat strong (vol.)                   no religion 
##                          5736                          7629 
##                    don't know                           iap 
##                             0                             0 
##            I don't have a job                   dk, na, iap 
##                             0                             0 
##                     no answer    not imputable_(2147483637) 
##                             0                             0 
##    not imputable_(2147483638)                       refused 
##                             0                             0 
##                skipped on web                    uncodeable 
##                             0                             0 
## not available in this release    not available in this year 
##                             0                             0 
##                  see codebook 
##                             0

unique(gss$reliten)

## [1] <NA>                   strong                 not very strong       
## [4] somewhat strong (vol.) no religion           
## 17 Levels: strong not very strong somewhat strong (vol.) ... see codebook

table(gss$sexeduc)

## 
##                         favor                        oppose 
##                         35639                          5127 
##   depends on age/grade (vol.)                    don't know 
##                             9                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0

unique(gss$sexeduc)

## [1] <NA>                        favor                      
## [3] oppose                      depends on age/grade (vol.)
## 16 Levels: favor oppose depends on age/grade (vol.) don't know ... see codebook

Now to clean the data. Its important to note syntax in all the categories’ names in these variables

# Removing NA values
gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c("extremely liberal", "liberal", "slightly liberal", "moderate, middle of the road", "slightly conservative", "conservative", "extremely conservative") ~ polviews,
        TRUE ~ NA_character_
    ),
    reliten = case_when(
      reliten %in% c("strong", "not very strong", "somewhat strong (vol.)", "no religion") ~ reliten,
      TRUE ~ NA_character_
    ),
    #we need to make sexeduc a dichotomous variable
    sexeduc = case_when(
      sexeduc %in% c("favor", "oppose") ~ sexeduc,
      TRUE ~ NA_character_
    )
  )
gss_filtered <- gss %>%
  dplyr::select(polviews, reliten, sexeduc)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary

tinytable_4vkylh25pe20huo4cdxw

		N	%
polviews	conservative	9361	12.9
	extremely conservative	2165	3.0
	extremely liberal	2081	2.9
	liberal	7623	10.5
	moderate, middle of the road	23992	33.1
	slightly conservative	9596	13.3
	slightly liberal	7900	10.9
	NA	9672	13.4
reliten	no religion	7629	10.5
	not very strong	23738	32.8
	somewhat strong (vol.)	5736	7.9
	strong	22652	31.3
	NA	12635	17.5
sexeduc	favor	35639	49.2
	oppose	5127	7.1
	NA	31624	43.7

# Now, removing NA values and making it reader friendly

gss_cleaned <- gss %>%
  filter(!is.na(polviews), !is.na(reliten), !is.na(sexeduc)) %>%
  mutate(
    polviews = recode(polviews, 
                      "extremely liberal" = "Extremely Liberal", 
                      "liberal" = "Liberal", 
                      "slightly liberal" = "Slightly Liberal", 
                      "moderate, middle of the road" = "Moderate", 
                      "slightly conservative" = "Slightly Conservative", 
                      "conservative" = "Conservative", 
                      "extremely conservative" = "Extremely Conservative"),
    polviews = factor(polviews, levels = c("Extremely Liberal", "Liberal", "Slightly Liberal", "Moderate", "Slightly Conservative", "Conservative", "Extremely Conservative")),
    sexeduc = recode(sexeduc, "favor" = "Favor", "oppose" = "Oppose"),
    sexeduc = factor(sexeduc, levels = c("Oppose", "Favor")),
    reliten = recode(reliten, 
                    "strong" = "Strong",
                    "somewhat strong (vol.)" = "Somewhat Strong",
                    "not very strong" = "Not Very Strong",
                    "no religion" = "No Religion"
                    ),
    reliten = factor(reliten, levels = c("No Religion", "Not Very Strong", "Somewhat Strong", "Strong"))
  )

gss_cleaned <- gss_cleaned %>%
  rename(
    "Political Views" = polviews,
    "Strength of Religious Affiliation" = reliten,
    "Attitude on Public School Sex Education" = sexeduc
  )

Exploratory Data Visualizations

We will create two relevant visualizations that explore our main variables and their relationships. The visualizations we will be constructing are a diverging stacked bar plot and a grouped bar plot

library(ggplot2)
library(dplyr)

# Diverging stacked bar chart
ggplot(gss_cleaned, aes(x = `Political Views`, fill = `Attitude on Public School Sex Education`)) +
  geom_bar(position = "fill", width = 0.7) +
  facet_wrap(~ `Strength of Religious Affiliation`, ncol = 2) +
  scale_fill_manual(values = c("Favor" = "#1f78b4", "Oppose" = "#e31a1c")) + 
  labs(
    title = "Attitudes on Sex Education by Political Views\nand Religious Affiliation",
    x = "Political Views",
    y = "Proportion",
    fill = "Attitude on Sex Education"
  ) +
  coord_flip() +
  theme_minimal(base_size = 14) + 
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"), 
    axis.text.x = element_text(angle = 45, hjust = 1), 
    legend.position = "top", 
    legend.title = element_blank(), 
    strip.background = element_rect(fill = "#f0f0f0", color = NA), 
    strip.text = element_text(face = "bold"),
     plot.margin = margin(20, 20, 20, 20)
    
  )

#Now our grouped bar plot
ggplot(gss_cleaned, aes(x = `Political Views`, fill = `Attitude on Public School Sex Education`)) +
  geom_bar(position = "dodge", color = "black", width = 0.7) +
  facet_wrap(~ `Strength of Religious Affiliation`, ncol = 2) +
  scale_fill_manual(values = c("Favor" = "#1f78b4", "Oppose" = "#e31a1c")) + 
  labs(
    title = "Attitudes on Sex Education by Political Views\nand Religious Affiliation",
    x = "Political Views",
    y = "Count",
    fill = "Attitude on Sex Education"
  ) +
  theme_minimal(base_size = 14) + 
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16), 
    axis.text.x = element_text(angle = 45, hjust = 1), 
    axis.title.x = element_text(face = "bold"), 
    axis.title.y = element_text(face = "bold"), 
    legend.position = "top", 
    legend.title = element_blank(), 
    strip.background = element_rect(fill = "#f0f0f0", color = NA), 
    strip.text = element_text(face = "bold"), 
    panel.grid.major = element_line(color = "grey80"), 
    panel.grid.minor = element_blank() 
  )

Initial Regression Analysis

We will make a simple linear regression here to analyze 3 models, 2 single predictors with polviews and reliten respectively and one multiple predictor incorporating both. We’ll compare the models in one regression table with sjplot.

# Lets make our models first
model1 <- lm(`Attitude on Public School Sex Education` ~ `Political Views`, data = gss_cleaned)

## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored

## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors

model2 <- lm(`Attitude on Public School Sex Education` ~ `Strength of Religious Affiliation`, data = gss_cleaned)

## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in model.response(mf, "numeric"): '-' not meaningful for factors

model3 <- lm(`Attitude on Public School Sex Education` ~ `Political Views` + `Strength of Religious Affiliation`, data = gss_cleaned)

## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in model.response(mf, "numeric"): '-' not meaningful for factors

# Now our table with sjPlot
library(sjPlot)

tab_model(model1, model2, model3)

	Attitude on Public School Sex Education			Attitude on Public School Sex Education			Attitude on Public School Sex Education
Predictors	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p
(Intercept)	1.92			1.95			1.96
Political Views [Liberal]	0.01						0.01
Political Views [Slightly Liberal]	0.00						0.00
Political Views [Moderate]	-0.02						-0.01
Political Views [Slightly Conservative]	-0.03						-0.02
Political Views [Conservative]	-0.15						-0.13
Political Views [Extremely Conservative]	-0.26						-0.23
Strength of Religious Affiliation [Not Very Strong]				-0.04			-0.03
Strength of Religious Affiliation [Somewhat Strong]				-0.07			-0.05
Strength of Religious Affiliation [Strong]				-0.14			-0.11
Observations	32437			32437			32437

Stepping Stone 2 Parts 4 & 5

Mateja Dokic

2024-08-04

Loading packages and dataset

Cleaning variables

Exploratory Data Visualizations

Initial Regression Analysis