HW 2

# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "viridis", "kableExtra", "flextable", "officer") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## 载入需要的程序包：viridisLite
## 
## 
## 载入程序包：'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## 载入程序包：'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "kableExtra"   "viridis"      "viridisLite"  "modelsummary" "fst"         
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "flextable"    "kableExtra"   "viridis"      "viridisLite"  "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "officer"      "flextable"    "kableExtra"   "viridis"      "viridisLite" 
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"

gss <- load("gss2022.RData")
gss <- df

table(gss$polviews)

## 
##             extremely liberal                       liberal 
##                          2081                          7623 
##              slightly liberal  moderate, middle of the road 
##                          7900                         23992 
##         slightly conservative                  conservative 
##                          9596                          9361 
##        extremely conservative                    don't know 
##                          2165                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0

unique(gss$polviews)

## [1] <NA>                         moderate, middle of the road
## [3] slightly conservative        conservative                
## [5] liberal                      extremely conservative      
## [7] slightly liberal             extremely liberal           
## 20 Levels: extremely liberal liberal ... see codebook

gss <- gss %>%
  mutate(
    polviews = case_when(
      polviews %in% c("liberal", "moderate", "conservative") ~ polviews,
      TRUE ~ NA_character_
    ),
    
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    ),
    degree = case_when(
      degree %in% c("less than high school", "high school", "junior college", "bachelor", "graduate") ~ degree,
      TRUE ~ NA_character_
    ),
   
  )

gss_filtered <- gss %>%
  dplyr::select(polviews, race, sex, degree)
categorical_summary <- datasummary_skim(gss_filtered, type = "categorical")
categorical_summary

tinytable_t94fwob7vx8im4izs8in

		N	%
polviews	conservative	9361	12.9
	liberal	7623	10.5
	NA	55406	76.5
race	black	10215	14.1
	other	4411	6.1
	white	57657	79.6
	NA	107	0.1
sex	female	40301	55.7
	male	31977	44.2
	NA	112	0.2
degree	graduate	5953	8.2
	high school	36446	50.3
	less than high school	14192	19.6
	NA	15799	21.8

gss_cleaned <- gss %>%
  filter(!is.na(polviews), 
         !is.na(race), !is.na(sex), !is.na(degree)) %>%
  mutate(
    polviews = recode(polviews, "conservative" = "Conservative", "liberal" = "Liberal", "moderate" = "Moderate"),
    race = recode(race, "white" = "White", "black" = "Black", "other" = "Other"),
    sex = recode(sex, "male" = "Male", "female" = "Female"),
    degree = recode(degree, "less than high school" = "Less than High School", "high school" = "High School", "junior college" = "Junior College", "bachelor" = "Bachelor", "graduate" = "Graduate"),
  )

gss_cleaned <- gss_cleaned %>%
  rename(
    "Political Views" = polviews,
    "Respondent Race" = race,
    "Respondent Sex" = sex,
    "Highest Degree" = degree,
  )

categorical_summary_relabelled <- datasummary_skim(
  gss_cleaned %>%
    dplyr::select(`Political Views`, `Respondent Race`, `Respondent Sex`, `Highest Degree`), # Select categorical variables
  type = "categorical", # Specify the type of variables to summarize
  output = "kableExtra" # Specify the output format
)

## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.

categorical_summary_relabelled %>%
  kableExtra::kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% # Apply table styling options
  kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#4CAF50") %>% # Customize the header row
  kableExtra::column_spec(1, bold = TRUE) %>% # Make the first column bold
  kableExtra::add_header_above(c(" " = 1, "Summary Statistics for Categorical Variables" = 3)) # Add a header above the table

	Summary Statistics for Categorical Variables
		N	%
Political Views	Conservative	6945	55.2
	Liberal	5627	44.8
Respondent Race	Black	1717	13.7
	Other	772	6.1
	White	10083	80.2
Respondent Sex	Female	6813	54.2
	Male	5759	45.8
Highest Degree	Graduate	1943	15.5
	High School	7940	63.2
	Less than High School	2689	21.4

table(df$polviews)

## 
##             extremely liberal                       liberal 
##                          2081                          7623 
##              slightly liberal  moderate, middle of the road 
##                          7900                         23992 
##         slightly conservative                  conservative 
##                          9596                          9361 
##        extremely conservative                    don't know 
##                          2165                             0 
##                           iap            I don't have a job 
##                             0                             0 
##                   dk, na, iap                     no answer 
##                             0                             0 
##    not imputable_(2147483637)    not imputable_(2147483638) 
##                             0                             0 
##                       refused                skipped on web 
##                             0                             0 
##                    uncodeable not available in this release 
##                             0                             0 
##    not available in this year                  see codebook 
##                             0                             0

table(df$sex)

## 
##                          male                        female 
##                         31977                         40301 
##                    don't know                           iap 
##                             0                             0 
##            I don't have a job                   dk, na, iap 
##                             0                             0 
##                     no answer    not imputable_(2147483637) 
##                             0                             0 
##    not imputable_(2147483638)                       refused 
##                             0                             0 
##                skipped on web                    uncodeable 
##                             0                             0 
## not available in this release    not available in this year 
##                             0                             0 
##                  see codebook 
##                             0

polviews_summary <- gss %>%
  count(polviews) %>% # Count the occurrences of each religious preference
  mutate(pct = n / sum(n) * 100) # Calculate the percentage of each preference

ggplot(gss, aes(x = polviews)) +
  geom_bar(fill = "lightblue", color = "black") + # Create a bar plot with light blue fill and black borders
  labs(title = "Distribution of Political Views", x = "Religious Preference", y = "Count") + # Add title and axis labels
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis text for better readability

gss_sex <- gss %>%
  group_by(sex, polviews) %>% # Group by year and religious preference
  summarize(count = n(), .groups = 'drop') %>% # Calculate the count for each group
  group_by(sex) %>% # Group by year
  mutate(total = sum(count), # Calculate the total count per year
         proportion = count / total) # Calculate the proportion of each religious preference per year


ggplot(gss_sex, aes(x = sex, y = proportion, color = polviews, group = polviews)) +
  geom_line(size = 1.2) + # Create lines for each religious preference with increased line size
  scale_color_brewer(palette = "Set3") + # Use a color palette for better differentiation
  labs(title = "Evolution of Religious Preferences Over Time", # Add plot title
       x = "Sex", # Label x-axis
       y = "Proportion", # Label y-axis
       color = "Political Views") + # Label the legend
  theme_minimal() + # Apply a minimal theme to the plot
  theme(legend.position = "bottom") # Position the legend at the bottom

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_line()`).

gss_filtered <- gss %>%
  filter(polviews %in% c("protestant", "catholic", "jewish", "none", "other"))


gss_yearly <- gss_filtered %>%
  group_by(sex, polviews) %>%
  summarize(count = n(), .groups = 'drop') %>%
  group_by(sex) %>%
  mutate(total = sum(count),
         proportion = count / total)


ggplot(gss_yearly, aes(x = sex, y = proportion, color = polviews, group = polviews)) +
  geom_line(size = 1.2) +
  scale_color_brewer(palette = "Dark2") +
  labs(title = "Evolution of Political Views Over Time",
       x = "Sex",
       y = "Proportion",
       color = "Political Views") +
  theme_minimal() +
  theme(legend.position = "bottom")

gss_filtered_clean <- gss_filtered %>%
  filter(!is.na(year) & !is.na(fejobaff) & !is.na(age))


gss_yearly <- gss_filtered_clean %>%
  count(year, fejobaff, age) %>%  # Count occurrences for each combination of year, partyid_recoded, and abany
  group_by(year, fejobaff) %>%  # Group by year and political identity
  mutate(total = sum(n),  # Calculate the total count per year and political identity
         proportion = n / total) %>%  # Calculate the proportion of each response within each year and political identity
  filter(age == "Yes")  # Filter to keep only 'Yes' responses


print(head(gss_yearly))

## # A tibble: 0 × 6
## # Groups:   year, fejobaff [0]
## # ℹ 6 variables: year <int>, fejobaff <fct>, age <int>, n <int>, total <int>,
## #   proportion <dbl>

ggplot(gss_yearly, aes(x = year, y = proportion, color = fejobaff)) +
  geom_line(size = 1.2) +  # Create line plot with increased line size
  scale_color_brewer(palette = "Dark2", name = "Political Identification") +  # Use Dark2 color palette for lines and set legend title
  labs(title = "Preferential Hiring by Year and age",
       subtitle = "General Social Survey, 1972-2022",
       x = "Year",
       y = "Preferential Hiring") +  # Add title, subtitle, and axis labels
  theme_minimal() +  # Apply minimal theme
  theme(legend.position = "bottom")  # Position legend at the bottom

HW 2

Lyra Dong

2024-07-15