# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## 载入需要的程序包:viridisLite
## 
## 
## 载入程序包:'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## 载入程序包:'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"
gss <- load("gss2022.RData")
gss <- df
table(gss$polviews)
## 
##             extremely liberal                       liberal 
##                          2081                          7623 
##              slightly liberal  moderate, middle of the road 
##                          7900                         23992 
##         slightly conservative                  conservative 
##                          9596                          9361 
##        extremely conservative                    don't know 
##                          2165                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0
unique(gss$polviews)
## [1] <NA>                         moderate, middle of the road
## [3] slightly conservative        conservative                
## [5] liberal                      extremely conservative      
## [7] slightly liberal             extremely liberal           
## 20 Levels: extremely liberal liberal ... see codebook
gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c("liberal", "moderate", "conservative") ~ polviews,
      TRUE ~ NA_character_
    ),
    
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    ),
    degree = case_when(
      degree %in% c("less than high school", "high school", "junior college", "bachelor", "graduate") ~ degree,
      TRUE ~ NA_character_
    ),
   
  )
gss_filtered <- gss %>%
  dplyr::select(polviews, race, sex, degree)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary 
tinytable_t94fwob7vx8im4izs8in
N %
polviews conservative 9361 12.9
liberal 7623 10.5
NA 55406 76.5
race black 10215 14.1
other 4411 6.1
white 57657 79.6
NA 107 0.1
sex female 40301 55.7
male 31977 44.2
NA 112 0.2
degree graduate 5953 8.2
high school 36446 50.3
less than high school 14192 19.6
NA 15799 21.8
gss_cleaned <- gss %>%
  filter(!is.na(polviews), 
         !is.na(race), !is.na(sex), !is.na(degree)) %>%
  mutate(
    polviews = recode(polviews, "conservative" = "Conservative", "liberal" = "Liberal", "moderate" = "Moderate"),
    race = recode(race, "white" = "White", "black" = "Black", "other" = "Other"),
    sex = recode(sex, "male" = "Male", "female" = "Female"),
    degree = recode(degree, "less than high school" = "Less than High School", "high school" = "High School", "junior college" = "Junior College", "bachelor" = "Bachelor", "graduate" = "Graduate"),
  )
gss_cleaned <- gss_cleaned %>%
  rename(
    "Political Views" = polviews,
    "Respondent Race" = race,
    "Respondent Sex" = sex,
    "Highest Degree" = degree,
  )
categorical_summary_relabelled <- datasummary_skim(
  gss_cleaned %>%
    dplyr::select(`Political Views`, `Respondent Race`, `Respondent Sex`, `Highest Degree`), # Select categorical variables
  type = "categorical", # Specify the type of variables to summarize
  output = "kableExtra" # Specify the output format
)
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.
categorical_summary_relabelled %>%
  kableExtra::kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% # Apply table styling options
  kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#4CAF50") %>% # Customize the header row
  kableExtra::column_spec(1, bold = TRUE) %>% # Make the first column bold
  kableExtra::add_header_above(c(" " = 1, "Summary Statistics for Categorical Variables" = 3)) # Add a header above the table
Summary Statistics for Categorical Variables
N %
Political Views Conservative 6945 55.2
Liberal 5627 44.8
Respondent Race Black 1717 13.7
Other 772 6.1
White 10083 80.2
Respondent Sex Female 6813 54.2
Male 5759 45.8
Highest Degree Graduate 1943 15.5
High School 7940 63.2
Less than High School 2689 21.4
table(df$polviews)
## 
##             extremely liberal                       liberal 
##                          2081                          7623 
##              slightly liberal  moderate, middle of the road 
##                          7900                         23992 
##         slightly conservative                  conservative 
##                          9596                          9361 
##        extremely conservative                    don't know 
##                          2165                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0
table(df$sex)
## 
##                          male                        female 
##                         31977                         40301 
##                    don't know                           iap 
##                             0                             0 
##            I don't have a job                   dk, na, iap 
##                             0                             0 
##                     no answer    not imputable_(2147483637) 
##                             0                             0 
##    not imputable_(2147483638)                       refused 
##                             0                             0 
##                skipped on web                    uncodeable 
##                             0                             0 
## not available in this release    not available in this year 
##                             0                             0 
##                  see codebook 
##                             0
polviews_summary <- gss %>%
  count(polviews) %>% # Count the occurrences of each religious preference
  mutate(pct = n / sum(n) * 100) # Calculate the percentage of each preference
ggplot(gss, aes(x = polviews)) +
  geom_bar(fill = "lightblue", color = "black") + # Create a bar plot with light blue fill and black borders
  labs(title = "Distribution of Political Views", x = "Religious Preference", y = "Count") + # Add title and axis labels
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis text for better readability

gss_sex <- gss %>%
  group_by(sex, polviews) %>% # Group by year and religious preference
  summarize(count = n(), .groups = 'drop') %>% # Calculate the count for each group
  group_by(sex) %>% # Group by year
  mutate(total = sum(count), # Calculate the total count per year
         proportion = count / total) # Calculate the proportion of each religious preference per year


ggplot(gss_sex, aes(x = sex, y = proportion, color = polviews, group = polviews)) +
  geom_line(size = 1.2) + # Create lines for each religious preference with increased line size
  scale_color_brewer(palette = "Set3") + # Use a color palette for better differentiation
  labs(title = "Evolution of Religious Preferences Over Time", # Add plot title
       x = "Sex", # Label x-axis
       y = "Proportion", # Label y-axis
       color = "Political Views") + # Label the legend
  theme_minimal() + # Apply a minimal theme to the plot
  theme(legend.position = "bottom") # Position the legend at the bottom
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_line()`).

gss_filtered <- gss %>%
  filter(polviews %in% c("protestant", "catholic", "jewish", "none", "other"))


gss_yearly <- gss_filtered %>%
  group_by(sex, polviews) %>%
  summarize(count = n(), .groups = 'drop') %>%
  group_by(sex) %>%
  mutate(total = sum(count),
         proportion = count / total)


ggplot(gss_yearly, aes(x = sex, y = proportion, color = polviews, group = polviews)) +
  geom_line(size = 1.2) +
  scale_color_brewer(palette = "Dark2") +
  labs(title = "Evolution of Political Views Over Time",
       x = "Sex",
       y = "Proportion",
       color = "Political Views") +
  theme_minimal() +
  theme(legend.position = "bottom")

gss_filtered_clean <- gss_filtered %>%
  filter(!is.na(year) & !is.na(fejobaff) & !is.na(age))


gss_yearly <- gss_filtered_clean %>%
  count(year, fejobaff, age) %>%  # Count occurrences for each combination of year, partyid_recoded, and abany
  group_by(year, fejobaff) %>%  # Group by year and political identity
  mutate(total = sum(n),  # Calculate the total count per year and political identity
         proportion = n / total) %>%  # Calculate the proportion of each response within each year and political identity
  filter(age == "Yes")  # Filter to keep only 'Yes' responses


print(head(gss_yearly))
## # A tibble: 0 × 6
## # Groups:   year, fejobaff [0]
## # ℹ 6 variables: year <int>, fejobaff <fct>, age <int>, n <int>, total <int>,
## #   proportion <dbl>
ggplot(gss_yearly, aes(x = year, y = proportion, color = fejobaff)) +
  geom_line(size = 1.2) +  # Create line plot with increased line size
  scale_color_brewer(palette = "Dark2", name = "Political Identification") +  # Use Dark2 color palette for lines and set legend title
  labs(title = "Preferential Hiring by Year and age",
       subtitle = "General Social Survey, 1972-2022",
       x = "Year",
       y = "Preferential Hiring") +  # Add title, subtitle, and axis labels
  theme_minimal() +  # Apply minimal theme
  theme(legend.position = "bottom")  # Position legend at the bottom