# Load the saved workspace image from the working directory.
# NOTE(review): presumably this supplies the `df` data frame that Task 1
# recodes -- confirm against the contents of the .RData file.
load("gss2018_egp.RData")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(modelsummary)
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
library(flextable)
## 
## Attaching package: 'flextable'
## 
## The following object is masked from 'package:purrr':
## 
##     compose
library(ggplot2)
library(RColorBrewer)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following objects are masked from 'package:flextable':
## 
##     as_image, footnote
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(readxl)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# Preview the year/id_/age/fejobaff extract. The tibble is only printed here,
# not stored; Task 5 reads this same file again when it builds the age groups.
read_excel("GSS2.xlsx")
## # A tibble: 10,229 × 4
##     year   id_ age   fejobaff           
##    <dbl> <dbl> <chr> <chr>              
##  1  1996     3 55    Strongly oppose    
##  2  1996     8 29    Strongly favor     
##  3  1996    15 53    Not strongly oppose
##  4  1996    16 31    Strongly oppose    
##  5  1996    18 32    Strongly favor     
##  6  1996    19 44    Not strongly oppose
##  7  1996    23 41    Strongly favor     
##  8  1996    29 42    Strongly oppose    
##  9  1996    31 40    Not strongly oppose
## 10  1996    34 25    Not strongly oppose
## # ℹ 10,219 more rows
# Preview the year/id_/attend extract. The result is printed but not assigned
# to a name, so it is not available to later code.
# NOTE(review): this path points at the home directory, whereas GSS2.xlsx is
# read from the working directory -- confirm the location is intentional.
read_excel("~/GSS3.xlsx")
## # A tibble: 71,690 × 3
##     year   id_ attend                    
##    <dbl> <dbl> <chr>                     
##  1  1972     1 About once or twice a year
##  2  1972     2 Every week                
##  3  1972     3 About once a month        
##  4  1972     4 Never                     
##  5  1972     5 Never                     
##  6  1972     6 About once or twice a year
##  7  1972     7 Every week                
##  8  1972     8 Never                     
##  9  1972     9 Several times a year      
## 10  1972    10 Several times a week      
## # ℹ 71,680 more rows

Task 1: Data Cleaning and Recoding

Objective: Clean and recode the variables to ensure they are ready for analysis.

Recode polviews (stored in this dataset as Y_polid) into three categories: “Liberal”, “Moderate”, and “Conservative”. Clean sex (female), degree (D_edudg), and race (white), but retain the relevant categories.

# Task 1: collapse the political-views labels into three groups and replace
# the sex, race, and education codes with readable labels.
# NOTE(review): only codes 1-4 of D_edudg are mapped; if other codes exist
# they may become NA and be dropped by the filter below -- confirm the coding.
df1 <- df %>%
  mutate(
    Y_polid = recode(
      Y_polid,
      "extremely liberal" = "Liberal",
      "liberal" = "Liberal",
      "slightly liberal" = "Liberal",
      "moderate" = "Moderate",
      "slghtly conservative" = "Conservative",
      "conservative" = "Conservative",
      "extrmly conservative" = "Conservative"
    ),
    female = recode(female, "0" = "Male", "1" = "Female"),
    white = recode(white, "0" = "Non-white", "1" = "White"),
    D_edudg = recode(
      D_edudg,
      "1" = "Less than a high school diploma",
      "2" = "High school diploma",
      "3" = "Associate degree",
      "4" = "Bachelors"
    )
  )

# Keep only respondents with all four variables observed.
dfclean <- df1 %>%
  filter(
    !is.na(female) & !is.na(white) & !is.na(D_edudg) & !is.na(Y_polid)
  )

# Give the analysis variables presentation-ready names.
dfnew <- dfclean %>%
  rename(
    Sex = female,
    Race = white,
    Highest_Degree = D_edudg,
    Political_Leaning = Y_polid
  )

Task 2: Data Summary

Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.

Select the variables of interest: polviews, sex, degree, and race.

Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.

# Task 2: categorical summary of the four recoded variables.
# The table is rendered with the flextable backend, as the task asks, instead
# of kableExtra, and autofit() sizes the columns to their contents.
dfnewfilter <- dfnew %>%
  dplyr::select(Sex, Race, Highest_Degree, Political_Leaning) %>%
  datasummary_skim(type = "categorical", output = "flextable") %>%
  flextable::autofit()

# Print the table; previously the summary was assigned but never displayed.
dfnewfilter
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.

Task 3: Visualization of Political Views by Gender

Objective: Create a bar chart showing the distribution of political views by gender.

Create a bar chart showing the distribution of political views by gender. Use a color palette that clearly differentiates the categories.

# Count respondents in each Sex x Political_Leaning cell, then express each
# cell as a share of its Sex total.
dfsummary <- dfnew %>%
  count(Sex, Political_Leaning) %>%
  group_by(Sex) %>%
  mutate(
    total = sum(n),
    Proportion = n / total
  ) %>%
  # Drop the grouping so later verbs on dfsummary are not silently per-Sex.
  ungroup()

# Stacked 100% bar chart of political leaning within each sex.
# `y` is mapped with a normal named argument; the earlier `(y = Proportion)`
# form was a positional assignment expression that only worked by accident.
ggplot(dfsummary, aes(x = Sex, y = Proportion, fill = Political_Leaning)) +
  # geom_col() is the direct equivalent of geom_bar(stat = "identity").
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_brewer(palette = "PiYG") +
  labs(
    title = "Distribution of Political Beliefs by Sex",
    x = "Sex",
    y = "Proportion"
  )

Task 4: Trends Over Time

Objective: Visualize trends in religious attendance over time.

Select the year and attend variables from the GSS dataset.

Create a line plot showing the proportion of each category of religious attendance over time.

Task 5: Comparison Trends

Objective: Create a stacked bar chart showing the distribution of fejobaff (preferential hiring) across different age groups.

Create an age group variable by categorizing age into “18-29”, “30-44”, “45-59”, “60+”. Create a stacked bar chart showing the distribution of the fejobaff response categories for each age group.

# Task 5: read the fejobaff extract and bin respondents into four age groups.
# read_excel() returns `age` as text for this file, so comparing it with
# numbers directly would be a string comparison. Parse the leading digits to a
# number first; entries with no digits become NA and get no age group.
# NOTE(review): the object is named GSS3 but is built from GSS2.xlsx; the name
# is kept because the following code refers to it.
GSS3 <- read_xlsx("GSS2.xlsx") %>%
  mutate(
    age_years = as.numeric(stringr::str_extract(as.character(age), "\\d+")),
    # case_when() takes the first matching rule, so each lower bound alone
    # defines its band once the higher bands have been ruled out.
    agegroup = case_when(
      age_years >= 60 ~ "60+",
      age_years >= 45 ~ "45-59",
      age_years >= 30 ~ "30-44",
      age_years >= 18 ~ "18-29"
    )
  ) %>%
  # The helper column is only needed for binning; keep the original columns.
  select(-age_years)

# Count responses in each agegroup x fejobaff cell, then express each cell as
# a share of its age-group total.
GSS4 <- GSS3 %>%
  count(agegroup, fejobaff) %>%
  group_by(agegroup) %>%
  mutate(
    total = sum(n),
    Proportion = n / total
  ) %>%
  # Drop the grouping so later verbs on GSS4 are not silently per-agegroup.
  ungroup()

# Stacked 100% bar chart of fejobaff responses within each age group.
# `y` is mapped with a normal named argument; the earlier `(y = Proportion)`
# form was a positional assignment expression that only worked by accident.
ggplot(GSS4, aes(x = agegroup, y = Proportion, fill = fejobaff)) +
  # geom_col() is the direct equivalent of geom_bar(stat = "identity").
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_brewer(palette = "Accent") +
  labs(
    title = "Preferential Hiring by Age",
    x = "agegroup",
    y = "Proportion"
  )