packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer")
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
##
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
##
##
##
## Attaching package: 'flextable'
##
##
## The following objects are masked from 'package:kableExtra':
##
## as_image, footnote
##
##
## The following object is masked from 'package:purrr':
##
## compose
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "viridis" "viridisLite" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[5]]
## [1] "kableExtra" "viridis" "viridisLite" "modelsummary" "fst"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "flextable" "kableExtra" "viridis" "viridisLite" "modelsummary"
## [6] "fst" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[7]]
## [1] "officer" "flextable" "kableExtra" "viridis" "viridisLite"
## [6] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
setwd("C:/Users/matej/OneDrive/Desktop/U of T/Summer 2024/SOC252/RMarkdowns")
gss <- load("gss2022.Rdata")
gss <- df
Here is what our variables look like before cleaning
table(gss$polviews)
##
## extremely liberal liberal
## 2081 7623
## slightly liberal moderate, middle of the road
## 7900 23992
## slightly conservative conservative
## 9596 9361
## extremely conservative don't know
## 2165 0
## iap I don't have a job
## 0 0
## dk, na, iap no answer
## 0 0
## not imputable_(2147483637) not imputable_(2147483638)
## 0 0
## refused skipped on web
## 0 0
## uncodeable not available in this release
## 0 0
## not available in this year see codebook
## 0 0
unique(gss$polviews)
## [1] <NA> moderate, middle of the road
## [3] slightly conservative conservative
## [5] liberal extremely conservative
## [7] slightly liberal extremely liberal
## 20 Levels: extremely liberal liberal ... see codebook
table(gss$reliten)
##
## strong not very strong
## 22652 23738
## somewhat strong (vol.) no religion
## 5736 7629
## don't know iap
## 0 0
## I don't have a job dk, na, iap
## 0 0
## no answer not imputable_(2147483637)
## 0 0
## not imputable_(2147483638) refused
## 0 0
## skipped on web uncodeable
## 0 0
## not available in this release not available in this year
## 0 0
## see codebook
## 0
unique(gss$reliten)
## [1] <NA> strong not very strong
## [4] somewhat strong (vol.) no religion
## 17 Levels: strong not very strong somewhat strong (vol.) ... see codebook
table(gss$sexeduc)
##
## favor oppose
## 35639 5127
## depends on age/grade (vol.) don't know
## 9 0
## iap I don't have a job
## 0 0
## dk, na, iap no answer
## 0 0
## not imputable_(2147483637) not imputable_(2147483638)
## 0 0
## refused skipped on web
## 0 0
## uncodeable not available in this release
## 0 0
## not available in this year see codebook
## 0 0
unique(gss$sexeduc)
## [1] <NA> favor
## [3] oppose depends on age/grade (vol.)
## 16 Levels: favor oppose depends on age/grade (vol.) don't know ... see codebook
Now to clean the data. Its important to note syntax in all the categories’ names in these variables
# Removing NA values
gss <- gss %>%
mutate(
polviews = case_when(
polviews %in% c("extremely liberal", "liberal", "slightly liberal", "moderate, middle of the road", "slightly conservative", "conservative", "extremely conservative") ~ polviews,
TRUE ~ NA_character_
),
reliten = case_when(
reliten %in% c("strong", "not very strong", "somewhat strong (vol.)", "no religion") ~ reliten,
TRUE ~ NA_character_
),
#we need to make sexeduc a dichotomous variable
sexeduc = case_when(
sexeduc %in% c("favor", "oppose") ~ sexeduc,
TRUE ~ NA_character_
)
)
gss_filtered <- gss %>%
dplyr::select(polviews, reliten, sexeduc)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary
| N | % | ||
|---|---|---|---|
| polviews | conservative | 9361 | 12.9 |
| extremely conservative | 2165 | 3.0 | |
| extremely liberal | 2081 | 2.9 | |
| liberal | 7623 | 10.5 | |
| moderate, middle of the road | 23992 | 33.1 | |
| slightly conservative | 9596 | 13.3 | |
| slightly liberal | 7900 | 10.9 | |
| NA | 9672 | 13.4 | |
| reliten | no religion | 7629 | 10.5 |
| not very strong | 23738 | 32.8 | |
| somewhat strong (vol.) | 5736 | 7.9 | |
| strong | 22652 | 31.3 | |
| NA | 12635 | 17.5 | |
| sexeduc | favor | 35639 | 49.2 |
| oppose | 5127 | 7.1 | |
| NA | 31624 | 43.7 |
# Now, removing NA values and making it reader friendly
gss_cleaned <- gss %>%
filter(!is.na(polviews), !is.na(reliten), !is.na(sexeduc)) %>%
mutate(
polviews = recode(polviews,
"extremely liberal" = "Extremely Liberal",
"liberal" = "Liberal",
"slightly liberal" = "Slightly Liberal",
"moderate, middle of the road" = "Moderate",
"slightly conservative" = "Slightly Conservative",
"conservative" = "Conservative",
"extremely conservative" = "Extremely Conservative"),
polviews = factor(polviews, levels = c("Extremely Liberal", "Liberal", "Slightly Liberal", "Moderate", "Slightly Conservative", "Conservative", "Extremely Conservative")),
sexeduc = recode(sexeduc, "favor" = "Favor", "oppose" = "Oppose"),
sexeduc = factor(sexeduc, levels = c("Oppose", "Favor")),
reliten = recode(reliten,
"strong" = "Strong",
"somewhat strong (vol.)" = "Somewhat Strong",
"not very strong" = "Not Very Strong",
"no religion" = "No Religion"
),
reliten = factor(reliten, levels = c("No Religion", "Not Very Strong", "Somewhat Strong", "Strong"))
)
gss_cleaned <- gss_cleaned %>%
rename(
"Political Views" = polviews,
"Strength of Religious Affiliation" = reliten,
"Attitude on Public School Sex Education" = sexeduc
)
We will create two relevant visualizations that explore our main variables and their relationships. The visualizations we will be constructing are a diverging stacked bar plot and a grouped bar plot
library(ggplot2)
library(dplyr)
# Diverging stacked bar chart
ggplot(gss_cleaned, aes(x = `Political Views`, fill = `Attitude on Public School Sex Education`)) +
geom_bar(position = "fill", width = 0.7) +
facet_wrap(~ `Strength of Religious Affiliation`, ncol = 2) +
scale_fill_manual(values = c("Favor" = "#1f78b4", "Oppose" = "#e31a1c")) +
labs(
title = "Attitudes on Sex Education by Political Views\nand Religious Affiliation",
x = "Political Views",
y = "Proportion",
fill = "Attitude on Sex Education"
) +
coord_flip() +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top",
legend.title = element_blank(),
strip.background = element_rect(fill = "#f0f0f0", color = NA),
strip.text = element_text(face = "bold"),
plot.margin = margin(20, 20, 20, 20)
)
#Now our grouped bar plot
ggplot(gss_cleaned, aes(x = `Political Views`, fill = `Attitude on Public School Sex Education`)) +
geom_bar(position = "dodge", color = "black", width = 0.7) +
facet_wrap(~ `Strength of Religious Affiliation`, ncol = 2) +
scale_fill_manual(values = c("Favor" = "#1f78b4", "Oppose" = "#e31a1c")) +
labs(
title = "Attitudes on Sex Education by Political Views\nand Religious Affiliation",
x = "Political Views",
y = "Count",
fill = "Attitude on Sex Education"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
legend.position = "top",
legend.title = element_blank(),
strip.background = element_rect(fill = "#f0f0f0", color = NA),
strip.text = element_text(face = "bold"),
panel.grid.major = element_line(color = "grey80"),
panel.grid.minor = element_blank()
)
We will make a simple linear regression here to analyze 3 models, 2 single predictors with polviews and reliten respectively and one multiple predictor incorporating both. We’ll compare the models in one regression table with sjplot.
# Lets make our models first
model1 <- lm(`Attitude on Public School Sex Education` ~ `Political Views`, data = gss_cleaned)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
model2 <- lm(`Attitude on Public School Sex Education` ~ `Strength of Religious Affiliation`, data = gss_cleaned)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in model.response(mf, "numeric"): '-' not meaningful for factors
model3 <- lm(`Attitude on Public School Sex Education` ~ `Political Views` + `Strength of Religious Affiliation`, data = gss_cleaned)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in model.response(mf, "numeric"): '-' not meaningful for factors
# Now our table with sjPlot
library(sjPlot)
tab_model(model1, model2, model3)
|
Attitude on Public School Sex Education |
Attitude on Public School Sex Education |
Attitude on Public School Sex Education |
|||||||
|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | 1.92 | 1.95 | 1.96 | ||||||
| Political Views [Liberal] | 0.01 | 0.01 | |||||||
|
Political Views [Slightly Liberal] |
0.00 | 0.00 | |||||||
|
Political Views [Moderate] |
-0.02 | -0.01 | |||||||
|
Political Views [Slightly Conservative] |
-0.03 | -0.02 | |||||||
|
Political Views [Conservative] |
-0.15 | -0.13 | |||||||
|
Political Views [Extremely Conservative] |
-0.26 | -0.23 | |||||||
|
Strength of Religious Affiliation [Not Very Strong] |
-0.04 | -0.03 | |||||||
|
Strength of Religious Affiliation [Somewhat Strong] |
-0.07 | -0.05 | |||||||
|
Strength of Religious Affiliation [Strong] |
-0.14 | -0.11 | |||||||
| Observations | 32437 | 32437 | 32437 | ||||||