# Inspect the raw frequency distribution of the 'class' variable
table(gss_data[["class"]])
##
## lower class working class
## 4601 30940
## middle class upper class
## 31014 2302
## no class don't know
## 1 0
## iap I don't have a job
## 0 0
## dk, na, iap no answer
## 0 0
## not imputable_(2147483637) not imputable_(2147483638)
## 0 0
## refused skipped on web
## 0 0
## uncodeable not available in this release
## 0 0
## not available in this year see codebook
## 0 0
# List the distinct values (including NA) that actually occur in 'class'
unique(gss_data[["class"]])
## [1] middle class working class upper class lower class <NA>
## [6] no class
## 18 Levels: lower class working class middle class upper class ... see codebook
# Inspect the raw frequency distribution of the 'degree' variable
table(gss_data[["degree"]])
##
## less than high school high school
## 14192 36446
## associate/junior college bachelor's
## 4355 11248
## graduate don't know
## 5953 0
## iap I don't have a job
## 0 0
## dk, na, iap no answer
## 0 0
## not imputable_(2147483637) not imputable_(2147483638)
## 0 0
## refused skipped on web
## 0 0
## uncodeable not available in this release
## 0 0
## not available in this year see codebook
## 0 0
# List the distinct values (including NA) that actually occur in 'degree'
unique(gss_data[["degree"]])
## [1] bachelor's less than high school high school
## [4] graduate associate/junior college <NA>
## 18 Levels: less than high school high school ... see codebook
# Inspect the raw frequency distribution of the 'polint' variable
table(gss_data[["polint"]])
##
## very interested fairly interested
## 864 1330
## somewhat interested not very interested
## 1630 932
## not at all interested can't choose
## 561 0
## iap I don't have a job
## 0 0
## dk, na, iap no answer
## 0 0
## not imputable_(2147483637) not imputable_(2147483638)
## 0 0
## refused skipped on web
## 0 0
## uncodeable not available in this release
## 0 0
## not available in this year see codebook
## 0 0
# List the distinct values (including NA) that actually occur in 'polint'
unique(gss_data[["polint"]])
## [1] <NA> fairly interested somewhat interested
## [4] not at all interested not very interested very interested
## 18 Levels: very interested fairly interested ... see codebook
# Recode 'class' into four title-cased categories ordered from lowest to
# highest; any other response (e.g. "no class") or missing value becomes NA.
gss_data <- gss_data %>%
  mutate(
    class_cleaned = factor(
      as.character(class),
      levels = c("lower class", "working class", "middle class", "upper class"),
      labels = c("Lower Class", "Working Class", "Middle Class", "Upper Class")
    )
  ) %>%
  # Keep only respondents aged 18-29 that have a valid class category.
  # NOTE: this overwrites gss_data, so all later steps see the subset only.
  filter(!is.na(class_cleaned), age >= 18, age <= 29)
# Sanity check: tabulate the cleaned variable
count(gss_data, class_cleaned)
## class_cleaned n
## 1 Lower Class 914
## 2 Working Class 7184
## 3 Middle Class 5255
## 4 Upper Class 302
# Recode 'polint' into five title-cased categories ordered from most to least
# interested; any other response or missing value becomes NA.
gss_data <- gss_data %>%
  mutate(
    polint_cleaned = factor(
      as.character(polint),
      levels = c(
        "very interested",
        "fairly interested",
        "somewhat interested",
        "not very interested",
        "not at all interested"
      ),
      labels = c(
        "Very Interested",
        "Fairly Interested",
        "Somewhat Interested",
        "Not Very Interested",
        "Not at All Interested"
      )
    )
  ) %>%
  # Keep only respondents aged 18-29 that answered the political interest item
  filter(!is.na(polint_cleaned), age >= 18, age <= 29)
# Sanity check: tabulate the cleaned variable
count(gss_data, polint_cleaned)
## polint_cleaned n
## 1 Very Interested 57
## 2 Fairly Interested 163
## 3 Somewhat Interested 243
## 4 Not Very Interested 172
## 5 Not at All Interested 109
# Recode 'degree' into four title-cased categories ordered from lowest to
# highest attainment. 'associate/junior college' is deliberately left out of
# the levels (it is fairly uncommon, especially among ages 18-29), so those
# respondents become NA and are dropped by the filter below.
gss_data <- gss_data %>%
  mutate(
    degree_cleaned = factor(
      as.character(degree),
      levels = c("less than high school", "high school", "bachelor's", "graduate"),
      labels = c("Less Than High School", "High School", "Bachelor's", "Graduate")
    )
  ) %>%
  # Keep only respondents aged 18-29 that have one of the retained degree levels
  filter(!is.na(degree_cleaned), age >= 18, age <= 29)
# Sanity check: tabulate the cleaned variable
count(gss_data, degree_cleaned)
## degree_cleaned n
## 1 Less Than High School 93
## 2 High School 477
## 3 Bachelor's 104
## 4 Graduate 20
# Attach 0-based numeric scores to the cleaned factors. Each score is the
# position of the label in the vector below minus one, so a higher score means
# higher class, more political interest, or more education. Labels that are
# not listed (and NA) yield NA.
gss_data <- gss_data %>%
  mutate(
    class_cleaned_numeric = match(
      class_cleaned,
      c("Lower Class", "Working Class", "Middle Class", "Upper Class")
    ) - 1,
    # Listed from least to most interested, i.e. the reverse of the factor's
    # level order, so "Very Interested" scores 4.
    polint_cleaned_numeric = match(
      polint_cleaned,
      c(
        "Not at All Interested",
        "Not Very Interested",
        "Somewhat Interested",
        "Fairly Interested",
        "Very Interested"
      )
    ) - 1,
    degree_cleaned_numeric = match(
      degree_cleaned,
      c("Less Than High School", "High School", "Bachelor's", "Graduate")
    ) - 1
  )
# Give the numeric score columns human-readable names; these names are what
# appear in the summary and regression tables further down.
gss_data <- rename(
  gss_data,
  `Social Class` = class_cleaned_numeric,
  `Political Interest` = polint_cleaned_numeric,
  `Educational Attainment` = degree_cleaned_numeric
)
# Descriptive statistics (with inline histograms) for the three numeric
# analysis variables
gss_data %>%
  select(`Social Class`, `Political Interest`, `Educational Attainment`) %>%
  datasummary_skim(histogram = TRUE)
|   | Unique | Missing Pct. | Mean | SD | Min | Median | Max | Histogram |
|---|---|---|---|---|---|---|---|---|
| Social Class | 4 | 0 | 1.3 | 0.6 | 0.0 | 1.0 | 3.0 | |
| Political Interest | 5 | 0 | 1.8 | 1.1 | 0.0 | 2.0 | 4.0 | |
| Educational Attainment | 4 | 0 | 1.1 | 0.6 | 0.0 | 1.0 | 3.0 | |
# Boxplot of the numeric political interest score for each social class,
# one fixed pastel colour per class. class_cleaned is already a factor with
# levels in Lower -> Upper order, so it is mapped directly.
ggplot(
  gss_data,
  aes(x = class_cleaned, y = `Political Interest`, fill = class_cleaned)
) +
  geom_boxplot() +
  scale_fill_manual(
    values = c(
      "Lower Class" = "#FF9999",
      "Working Class" = "#66B2FF",
      "Middle Class" = "#99FF99",
      "Upper Class" = "#FFCC99"
    )
  ) +
  labs(
    title = "Political Interest by Socioeconomic Status",
    x = "Socioeconomic Status",
    y = "Political Interest",
    fill = "Socioeconomic Status"
  ) +
  theme_minimal()
# 100% stacked (proportional) horizontal bar chart: within each educational
# attainment level the bar shows the share of each social class, with one
# facet per political interest level. degree_cleaned and class_cleaned are
# already factors in the desired level order, so they are mapped directly.
ggplot(gss_data, aes(x = degree_cleaned, fill = class_cleaned)) +
  # position = "fill" rescales every bar to sum to 1
  geom_bar(position = "fill", width = 0.7) +
  facet_wrap(~ polint_cleaned, ncol = 2) +
  scale_fill_manual(
    name = "Socioeconomic Status",
    values = c(
      "Lower Class" = "#FF9999",   # light red
      "Working Class" = "#66B2FF", # light blue
      "Middle Class" = "#99FF99",  # light green
      "Upper Class" = "#FFCC99"    # light orange
    )
  ) +
  labs(
    title = "Socioeconomic Status by Educational Attainment\nand Political Interest",
    x = "Educational Attainment",
    y = "Proportion"
  ) +
  # Flip so the education categories run down the vertical axis
  coord_flip() +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top",
    strip.background = element_rect(fill = "#f0f0f0", color = NA),
    strip.text = element_text(face = "bold"),
    plot.margin = margin(20, 20, 20, 20)
  )
# Ordinary least squares fits of the numeric political interest score:
#   model1 - social class only
#   model2 - educational attainment only
#   model3 - both predictors together
model1 <- lm(
  formula = `Political Interest` ~ `Social Class`,
  data = gss_data
)
model2 <- lm(
  formula = `Political Interest` ~ `Educational Attainment`,
  data = gss_data
)
model3 <- lm(
  formula = `Political Interest` ~ `Social Class` + `Educational Attainment`,
  data = gss_data
)
# Side-by-side regression table for the three models (sjPlot), with custom
# column headings; "<br>" inserts line breaks inside each model's header.
tab_model(
  model1,
  model2,
  model3,
  string.pred = "Predictors",
  string.est = "Coefficient",
  string.ci = "CI",
  string.p = "P-Value",
  dv.labels = c(
    "Model 1<br>Social Class<br>VS<br>Political Interest",
    "Model 2<br>Educational Attainment<br>VS<br>Political Interest",
    "Model 3<br>Social Class + Educational Attainment<br>VS<br>Political Interest"
  )
)
|   | Model 1 Social Class VS Political Interest |   |   | Model 2 Educational Attainment VS Political Interest |   |   | Model 3 Social Class + Educational Attainment VS Political Interest |   |   |
|---|---|---|---|---|---|---|---|---|---|
| Predictors | Coefficient | CI | P-Value | Coefficient | CI | P-Value | Coefficient | CI | P-Value |
| (Intercept) | 1.56 | 1.36 – 1.76 | <0.001 | 1.47 | 1.31 – 1.64 | <0.001 | 1.31 | 1.08 – 1.54 | <0.001 |
| Social Class | 0.21 | 0.07 – 0.34 | 0.004 |   |   |   | 0.14 | 0.01 – 0.28 | 0.042 |
| Educational Attainment |   |   |   | 0.33 | 0.20 – 0.47 | <0.001 | 0.31 | 0.17 – 0.44 | <0.001 |
| Observations | 694 |   |   | 694 |   |   | 694 |   |   |
| R2 / R2 adjusted | 0.012 / 0.011 |   |   | 0.034 / 0.032 |   |   | 0.040 / 0.037 |   |   |
# Bar plot of the mean political interest score for each social class.
# The labelled factor class_cleaned is mapped to both x and fill, so the axis
# AND the legend show class names rather than the numeric codes 0-3. Colours
# are matched to classes by name, which stays correct even if a class has no
# rows and is dropped from the scale.
ggplot(
  gss_data,
  aes(x = class_cleaned, y = `Political Interest`, fill = class_cleaned)
) +
  # Collapse each class to a single bar whose height is the group mean
  stat_summary(fun = mean, geom = "bar", position = "dodge", width = 0.7) +
  scale_fill_manual(
    values = c(
      "Lower Class" = "#FF9999",
      "Working Class" = "#66B2FF",
      "Middle Class" = "#99FF99",
      "Upper Class" = "#FFCC99"
    )
  ) +
  labs(
    title = "Average Political Interest by Social Class",
    x = "Social Class",
    # Spell out the scale end points so the numeric axis is interpretable
    y = "Political Interest\n(0 = Not at All Interested, 4 = Very Interested)",
    fill = "Social Class"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.title.y = element_text(size = 12) # smaller font for the two-line label
  )