Loading packages and dataset

# Packages needed by this document.
packages <- c(
  "tidyverse", "fst", "modelsummary", "viridis",
  "kableExtra", "flextable", "officer"
)

# system.file() returns "" for a package that is not installed. This is a
# cheaper availability check than scanning the full installed.packages() matrix.
is_installed <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
new_packages <- packages[!is_installed]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package. invisible() keeps lapply()'s return value (one
# search-path listing per package) from being printed into the document.
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"
# NOTE(review): this absolute, machine-specific path only works on the
# author's computer; kept so that relative paths keep resolving as before.
setwd("C:/Users/matej/OneDrive/Desktop/U of T/Summer 2024/SOC252/RMarkdowns")

# load() restores the saved objects into the workspace and returns only their
# *names* (a character vector), so its result is not the data itself.
loaded_objects <- load("gss2022.Rdata")
if (!"df" %in% loaded_objects) {
  stop("gss2022.Rdata does not contain an object named 'df'", call. = FALSE)
}

# The survey data is stored in the file under the name `df`.
gss <- df

Cleaning variables

Here is what our variables look like before cleaning.

# Frequency of every polviews level (levels with no respondents show 0).
table(gss[["polviews"]])
## 
##             extremely liberal                       liberal 
##                          2081                          7623 
##              slightly liberal  moderate, middle of the road 
##                          7900                         23992 
##         slightly conservative                  conservative 
##                          9596                          9361 
##        extremely conservative                    don't know 
##                          2165                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0
# Distinct polviews values that actually occur in the data.
unique(gss[["polviews"]])
## [1] <NA>                         moderate, middle of the road
## [3] slightly conservative        conservative                
## [5] liberal                      extremely conservative      
## [7] slightly liberal             extremely liberal           
## 20 Levels: extremely liberal liberal ... see codebook
# Frequency of every attend level (levels with no respondents show 0).
table(gss[["attend"]])
## 
##                         never         less than once a year 
##                         13855                          5825 
##    about once or twice a year          several times a year 
##                          9415                          8752 
##            about once a month             2-3 times a month 
##                          4831                          6114 
##             nearly every week                    every week 
##                          4029                         13659 
##          several times a week                    don't know 
##                          5210                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0
# Distinct attend values that actually occur in the data.
unique(gss[["attend"]])
##  [1] about once or twice a year every week                
##  [3] about once a month         never                     
##  [5] several times a year       several times a week      
##  [7] 2-3 times a month          less than once a year     
##  [9] nearly every week          <NA>                      
## 22 Levels: never less than once a year ... see codebook
# Frequency of every sexeduc level (levels with no respondents show 0).
table(gss[["sexeduc"]])
## 
##                         favor                        oppose 
##                         35639                          5127 
##   depends on age/grade (vol.)                    don't know 
##                             9                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0
# Distinct sexeduc values that actually occur in the data.
unique(gss[["sexeduc"]])
## [1] <NA>                        favor                      
## [3] oppose                      depends on age/grade (vol.)
## 16 Levels: favor oppose depends on age/grade (vol.) don't know ... see codebook

Let's clean these up.

# Keep only the substantive response categories of each variable. Any other
# value, including values that are already missing, becomes NA. Each column
# comes out as a character vector.
gss <- mutate(
  gss,
  polviews = if_else(
    polviews %in% c(
      "extremely liberal", "liberal", "slightly liberal",
      "moderate, middle of the road", "slightly conservative",
      "conservative", "extremely conservative"
    ),
    as.character(polviews),
    NA_character_
  ),
  attend = if_else(
    attend %in% c(
      "about once or twice a year", "every week", "about once a month",
      "never", "several times a year", "several times a week",
      "2-3 times a month", "less than once a year", "nearly every week"
    ),
    as.character(attend),
    NA_character_
  ),
  # sexeduc is reduced to two categories: anything other than favor/oppose
  # (e.g. "depends on age/grade (vol.)") is set to NA.
  sexeduc = if_else(
    sexeduc %in% c("favor", "oppose"),
    as.character(sexeduc),
    NA_character_
  )
)
# Restrict the data to the three variables of interest.
gss_filtered <- dplyr::select(gss, polviews, attend, sexeduc)

# Build the categorical summary (counts and percentages per category) and
# display it.
categorical_summary <- gss_filtered %>%
  datasummary_skim(type = "categorical")
categorical_summary
tinytable_p9rmde9iq8fok8bimbbg
N %
polviews conservative 9361 12.9
extremely conservative 2165 3.0
extremely liberal 2081 2.9
liberal 7623 10.5
moderate, middle of the road 23992 33.1
slightly conservative 9596 13.3
slightly liberal 7900 10.9
NA 9672 13.4
attend 2-3 times a month 6114 8.4
about once a month 4831 6.7
about once or twice a year 9415 13.0
every week 13659 18.9
less than once a year 5825 8.0
nearly every week 4029 5.6
never 13855 19.1
several times a week 5210 7.2
several times a year 8752 12.1
NA 700 1.0
sexeduc favor 35639 49.2
oppose 5127 7.1
NA 31624 43.7

Data summary table

Let's add some finishing cosmetic touches and remove the rows with missing values (NA).

# Drop respondents with a missing value on any of the three variables, then
# turn each variable into an ordered-for-display factor with readable labels.
#
# factor(x, levels = <raw values>, labels = <display labels>) relabels and
# orders in one step. Writing each display label once avoids the earlier
# mismatch between the recoded value ("Abt 1-2/year") and the factor level
# ("Abt 1-2 times/year"), which silently turned that whole attendance
# category into NA.
gss_cleaned <- gss %>%
  filter(!is.na(polviews), !is.na(attend), !is.na(sexeduc)) %>%
  mutate(
    polviews = factor(
      polviews,
      levels = c(
        "extremely liberal", "liberal", "slightly liberal",
        "moderate, middle of the road", "slightly conservative",
        "conservative", "extremely conservative"
      ),
      labels = c(
        "Extremely Liberal", "Liberal", "Slightly Liberal",
        "Moderate", "Slightly Conservative",
        "Conservative", "Extremely Conservative"
      )
    ),
    # "Oppose" is listed first, so it is the first (reference) level.
    sexeduc = factor(
      sexeduc,
      levels = c("oppose", "favor"),
      labels = c("Oppose", "Favor")
    ),
    # Levels run from least to most frequent attendance.
    attend = factor(
      attend,
      levels = c(
        "never", "less than once a year", "about once or twice a year",
        "several times a year", "about once a month", "2-3 times a month",
        "nearly every week", "every week", "several times a week"
      ),
      labels = c(
        "Never", "Less than 1/year", "Abt 1-2 times/year",
        "Several times/year", "Abt 1/month", "2-3 times/month",
        "Nearly every week", "Every week", "Several times/week"
      )
    )
  )

# Give the three variables presentation-ready column names for the table.
gss_cleaned <- dplyr::rename(
  gss_cleaned,
  `Political Views` = polviews,
  `Religiosity by Attendance` = attend,
  `Attitude on Public School Sex Education` = sexeduc
)

# Summarise the three renamed variables as a categorical table, rendered
# through the flextable backend so it can be styled with flextable functions.
categorical_summary_flextable <- gss_cleaned %>%
  dplyr::select(
    `Political Views`,
    `Religiosity by Attendance`,
    `Attitude on Public School Sex Education`
  ) %>%
  datasummary_skim(type = "categorical", output = "flextable")
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.
# Style the summary table: relabel the count column, colour the header, and
# keep only thin vertical rules between columns.
#
# set_header_labels() matches on the table's column keys. The previous keys
# (Variable / Value / Freq) matched no column, so the rendered header still
# read "N"; the count column's key is "N".
# NOTE(review): the other column keys are not visible here — confirm with
# categorical_summary_flextable$col_keys before relabelling them.
categorical_summary_flextable <- categorical_summary_flextable %>%
  set_header_labels(N = "Frequency") %>%
  theme_box() %>%
  bold(part = "header") %>%
  bg(part = "header", bg = "deepskyblue4") %>%
  color(part = "header", color = "white") %>%
  # Remove every border, then add back only the inner vertical ones.
  border_remove() %>%
  border_inner_v(border = fp_border(color = "black", width = 1)) %>%
  autofit()

# Emit the table as HTML for the rendered document.
flextable::htmltools_value(categorical_summary_flextable)

N

%

Political Views

Extremely Liberal

1320

3.5

Liberal

4695

12.3

Slightly Liberal

4867

12.8

Moderate

14489

38.0

Slightly Conservative

5787

15.2

Conservative

5689

14.9

Extremely Conservative

1310

3.4

Religiosity by Attendance

Never

7789

20.4

Less than 1/year

3118

8.2

Abt 1-2 times/year

0

0.0

Several times/year

4470

11.7

Abt 1/month

2579

6.8

2-3 times/month

3290

8.6

Nearly every week

2027

5.3

Every week

7167

18.8

Several times/week

2610

6.8

Attitude on Public School Sex Education

Oppose

4588

12.0

Favor

33569

88.0

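The datasummary table above shows the category counts and percentages for our three variables: political views, religious attendance, and attitude toward sex education in public schools.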