# Restore the objects saved in gss2018_egp.RData into the current environment.
# NOTE(review): the recoding step later in this script reads a data frame
# named `df`, which is presumably restored by this call; confirm.
load(file = "gss2018_egp.RData")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(modelsummary)
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
library(flextable)
##
## Attaching package: 'flextable'
##
## The following object is masked from 'package:purrr':
##
## compose
library(ggplot2)
library(RColorBrewer)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following objects are masked from 'package:flextable':
##
## as_image, footnote
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(readxl)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Preview the GSS2 workbook. The result is not assigned, so the tibble is
# only printed.
read_excel(path = "GSS2.xlsx")
## # A tibble: 10,229 × 4
## year id_ age fejobaff
## <dbl> <dbl> <chr> <chr>
## 1 1996 3 55 Strongly oppose
## 2 1996 8 29 Strongly favor
## 3 1996 15 53 Not strongly oppose
## 4 1996 16 31 Strongly oppose
## 5 1996 18 32 Strongly favor
## 6 1996 19 44 Not strongly oppose
## 7 1996 23 41 Strongly favor
## 8 1996 29 42 Strongly oppose
## 9 1996 31 40 Not strongly oppose
## 10 1996 34 25 Not strongly oppose
## # ℹ 10,219 more rows
# Preview the GSS3 workbook. The result is not assigned, so the tibble is
# only printed.
# NOTE(review): this path is relative to the home directory, while GSS2.xlsx
# is read from the working directory; confirm that is intended.
read_excel(path = "~/GSS3.xlsx")
## # A tibble: 71,690 × 3
## year id_ attend
## <dbl> <dbl> <chr>
## 1 1972 1 About once or twice a year
## 2 1972 2 Every week
## 3 1972 3 About once a month
## 4 1972 4 Never
## 5 1972 5 Never
## 6 1972 6 About once or twice a year
## 7 1972 7 Every week
## 8 1972 8 Never
## 9 1972 9 Several times a year
## 10 1972 10 Several times a week
## # ℹ 71,680 more rows
Task 1: Data Cleaning and Recoding. Objective: Clean and recode the variables to ensure they are ready for analysis.
Recode polviews into three categories: “Liberal”, “Moderate”, and “Conservative”. Clean sex, degree, and race but retain the relevant categories.
# Task 1 recoding: collapse the seven Y_polid labels into Liberal / Moderate /
# Conservative, and replace the codes in female, white and D_edudg with
# descriptive labels.
# NOTE(review): the keys "slghtly conservative" and "extrmly conservative" are
# presumably spelled as the labels appear in the data; verify before
# "correcting" them.
df1 <- df %>%
  mutate(
    Y_polid = recode(
      Y_polid,
      "extremely liberal" = "Liberal",
      "liberal" = "Liberal",
      "slightly liberal" = "Liberal",
      "moderate" = "Moderate",
      "slghtly conservative" = "Conservative",
      "conservative" = "Conservative",
      "extrmly conservative" = "Conservative"
    ),
    female = recode(female, "0" = "Male", "1" = "Female"),
    white = recode(white, "0" = "Non-white", "1" = "White"),
    D_edudg = recode(
      D_edudg,
      "1" = "Less than a high school diploma",
      "2" = "High school diploma",
      "3" = "Associate degree",
      "4" = "Bachelors"
    )
  )
# Keep only rows where all four analysis variables are non-missing.
dfclean <- filter(
  df1,
  !is.na(female) & !is.na(white) & !is.na(D_edudg) & !is.na(Y_polid)
)
# Give the four analysis columns presentation-ready names
# (new name = old name).
dfnew <- dfclean %>%
  rename(
    Sex = female,
    Race = white,
    Highest_Degree = D_edudg,
    Political_Leaning = Y_polid
  )
Task 2: Data Summary. Objective: Generate a summary table for selected variables using the datasummary_skim function from the modelsummary package.
Select the variables of interest: polviews, sex, degree, and race.
Generate a categorical summary table for these variables, clean the labels, and display it using the flextable package for styling.
# Build a categorical summary table of the four renamed variables with the
# kableExtra backend. The table is stored in `dfnewfilter`, not printed.
dfnewfilter <- dfnew %>%
  dplyr::select(Sex, Race, Highest_Degree, Political_Leaning) %>%
  datasummary_skim(type = "categorical", output = "kableExtra")
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
## produced by the `tinytable` backend.
Task 3: Visualization of Political Views by Gender. Objective: Create a bar chart showing the distribution of political views by gender.
Create a bar chart showing the distribution of political views by gender. Use a color palette that clearly differentiates the categories.
# Count respondents per Sex x Political_Leaning cell, then compute each
# cell's share of its Sex total.
dfsummary <- dfnew %>%
  count(Sex, Political_Leaning) %>%
  group_by(Sex) %>%
  mutate(
    total = sum(n),          # respondents of this Sex
    Proportion = n / total   # share of this leaning within the Sex
  ) %>%
  # Drop the grouping so later verbs on dfsummary are not silently applied
  # per Sex.
  ungroup()
# Stacked bar chart of political leaning within each sex, drawn from the
# pre-computed proportions in dfsummary. position = "fill" rescales each bar
# to a total height of 1, so the y axis can be shown as a percentage.
ggplot(dfsummary, aes(x = Sex, y = Proportion, fill = Political_Leaning)) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_brewer(palette = "PiYG") +
  labs(
    title = "Distribution of Political Beliefs by Sex",
    x = "Sex",
    y = "Proportion"
  )
Task 4: Trends Over Time. Objective: Visualize trends in religious attendance over time.
Select the year and attend variables from the GSS dataset.
Create a line plot showing the proportion of each category of religious attendance over time.
Task 5: Comparison Trends. Objective: Create a stacked bar chart showing the distribution of fejobaff (preferential hiring) across different age groups.
Create an age group variable by categorizing age into “18-29”, “30-44”, “45-59”, “60+”. Create a stacked bar chart showing the distribution of the fejobaff response categories for each age group.
# Read the GSS2 workbook (year, id_, age, fejobaff) and add an `agegroup`
# column with the bins 18-29, 30-44, 45-59 and 60+.
#
# `age` is read as a character column, so comparing it directly with numbers
# would compare strings, not ages. The leading run of digits is therefore
# extracted and converted to a number first. Values with no digits become NA
# and get an NA agegroup, as do ages under 18.
# NOTE(review): the object is called GSS3 although it is read from GSS2.xlsx.
GSS3 <- read_xlsx("GSS2.xlsx") %>%
  mutate(
    # Temporary numeric copy of age; removed again at the end of mutate().
    age_years = as.numeric(stringr::str_extract(as.character(age), "\\d+")),
    agegroup = case_when(
      age_years >= 60 ~ "60+",
      age_years >= 45 ~ "45-59",
      age_years >= 30 ~ "30-44",
      age_years >= 18 ~ "18-29"
    ),
    age_years = NULL
  )
# Count responses per agegroup x fejobaff cell, then compute each cell's
# share of its age-group total.
GSS4 <- GSS3 %>%
  count(agegroup, fejobaff) %>%
  group_by(agegroup) %>%
  mutate(
    total = sum(n),          # responses in this age group
    Proportion = n / total   # share of this answer within the age group
  ) %>%
  # Drop the grouping so later verbs on GSS4 are not silently applied per
  # age group.
  ungroup()
# Stacked bar chart of the fejobaff answers within each age group, drawn from
# the pre-computed proportions in GSS4. position = "fill" rescales each bar
# to a total height of 1, so the y axis can be shown as a percentage.
ggplot(GSS4, aes(x = agegroup, y = Proportion, fill = fejobaff)) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_brewer(palette = "Accent") +
  labs(
    title = "Preferential Hiring by Age",
    x = "agegroup",
    y = "Proportion"
  )