library(ggplot2)
library(dplyr)
library(tidyr)
library(janitor)
library(png)
library(ggimage)
library(readr)
library(forcats)
library(scales)
library(stringr)
library(tibble)
library(viridis)
income <- read.csv("~/Downloads/income.csv")STA 313 - Spring 2024 - HW 3
From Du Bois to COVID
1 - Du Bois challenge.
income <- income |>
janitor::clean_names()
income_two <- income |>
pivot_longer(3:7) |>
filter(!is.na(value)) |>
filter(value > 1) |>
mutate(name = toupper(name)) |>
mutate(textcol = ifelse(name == "RENT", "1", "0")) |>
mutate(average_income = paste0(round(average_income))) |>
mutate(class_income = paste0(class, " | ", "$", average_income))
income_two$class_income <- factor(
income_two$class_income,
levels = c(
"$1000 AND OVER | $1125",
"$750-1000 | $880",
"$500-750 | $547",
"$400-500 | $434",
"$300-400 | $336",
"$200-300 | $249",
"$100-200 | $139"
)
)
income_two$name <-
factor(income_two$name,
levels = c("OTHER", "TAX", "CLOTHES", "FOOD", "RENT"))
p1 <- income_two |>
ggplot(aes(
fill = name,
x = value,
y = class_income,
label = paste0(round(value), "%")
),
color = "black") +
geom_bar(
position = "fill",
stat = "identity",
color = "#495057",
width = 0.7,
linewidth = 0.2
) +
geom_text(
aes(color = textcol),
position = position_fill(vjust = 0.5),
size = 3,
show.legend = FALSE
) +
theme(
legend.position = "top",
plot.margin = margin(20, 40, 0, 20),
axis.title = element_text(size = 8),
text = element_text(family = "mono"),
axis.ticks = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y.left = element_blank(),
plot.title.position = "plot",
plot.title = element_text(size = 12.2),
panel.grid = element_blank(),
legend.text = element_text(size = 7, margin = margin(r = 0.82, unit = "cm")),
legend.margin = margin(0, 24, 0, 0),
legend.box.margin = margin(10, 0, -5, 0),
plot.tag.position = c(0.17, 0.75),
plot.tag = element_text(family = "mono", size = 7),
plot.caption = element_text(hjust = 0.5, vjust = 7),
legend.key.height = unit(0.5, 'cm'),
legend.key.width = unit(1.25, 'cm'),
legend.title = element_text(size = 10, hjust = 0.5)
) +
scale_fill_manual(
labels =
c("OTHER EXPENSES AND SAVINGS", "DIRECT TAXES", "CLOTHES", "FOOD", "RENT"),
values = c("lightgrey", "#8e9aaf", "#d78879", "#a08294", "#161213"),
guide = guide_legend(
reverse = TRUE,
title.position = "top",
title = "ANNUAL EXPENDITURE FOR",
label.position = "top"
)) +
scale_color_manual(values = c("black", "white")) +
labs(
fill = "",
x = "",
y = "",
title = "INCOME AND EXPENDITURE OF 150 NEGRO FAMILIES IN ATLANTA, GA.,U.S.A.",
tag = "CLASS | ACTUAL AVERAGE",
caption = "FOR FURTHER STATISTICS RAISE THIS FRAME."
) +
scale_y_discrete(expand = expansion(add = 1)) +
scale_x_continuous(expand = expansion(mult = c(0.02, 0.05)))Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic argument:
• color = "black"
ℹ Did you misspell an argument name?
ggbackground(p1, "~/Downloads/paper.png")Warning: `aes_()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`
ℹ The deprecated feature was likely used in the ggimage package.
Please report the issue at <https://github.com/YuLab-SMU/ggimage/issues>.
https://rpubs.com/ejhollowood/du-bois
I used this code as a baseline after pivoting my data. However, a good amount of modifications needed to be made to the code, so I would say the final result is quite different in a lot of ways.
2 - COVID survey - interpret.
As discussed above, this plot is very information dense. On the y axis, there are different demographic groups The data is grouped according to five different questions about the COVID vaccine, and each x axis contains the mean Likert score. 1. It’s interesting to see that most people who are under 20 believe that the COVID vaccine is safe and the errors bars are relatively low compared to the other age groups. This makes sense because younger people tend to have a more trusting view of the government / are more politically open, so more young people would likely trust what the government is saying about receiving the vaccine. 2. I honestly thought there would be more differences between males and females across questions, since men tend to lean more right. However, the error bars and mean likert values are extremely similar. Non binary third gender is similar as well; prefer not to saying has large differences compared to the other three gender categories listed on the survey, possbily because it is not as well defined. 3. For four out of the six questions, American Indians / Alaskan natives had the highest magnitude error bars, which means there is more variation of responses in this particular demographic. This makes sense because part of the demographic could have a mistrust of government (and therefore vaccines) due to past offenses. Although the mean score was slightly higher (in the direction of disagree), American Indians / Alaskan natives were somewhat similar to other races, meaning part of the population must have a trust in vaccines.
3 - COVID survey - reconstruct.
covid_survey <- read_csv("~/Downloads/covid-survey.csv")
covid_survey_skip <- read_csv("~/Downloads/covid-survey.csv", skip = 1)
dim(covid_survey_skip)[1] 1121 14
covid_survey_filter <- covid_survey_skip |>
filter(!if_all(-response_id, is.na))
dim(covid_survey_filter)[1] 1111 14
covid_survey_named <- covid_survey_filter |>
mutate(
exp_flu_vax = case_when(exp_flu_vax == 0 ~ "No",
exp_flu_vax == 1 ~ "Yes"),
exp_already_vax = case_when(exp_already_vax == 0 ~ "No",
exp_already_vax == 1 ~ "Yes"),
exp_profession = case_when(exp_profession == 0 ~ "Medical",
exp_profession == 1 ~ "Nursing"),
exp_gender = case_when(
exp_gender == 0 ~ "Male",
exp_gender == 1 ~ "Female",
exp_gender == 3 ~ "Non-binary third gender",
exp_gender == 4 ~ "Prefer not to say"
),
exp_race = case_when(
exp_race == 1 ~ "American Indian / Alaskan Native",
exp_race == 2 ~ "Asian",
exp_race == 3 ~ "Black / African American",
exp_race == 4 ~ "Native Hawaiian / Other Pacific Islander",
exp_race == 5 ~ "White"
),
exp_ethnicity = case_when(
exp_ethnicity == 1 ~ "Hispanic / Latino",
exp_ethnicity == 2 ~ "Non-Hispanic/Non-Latino"
),
exp_age_bin = case_when(
exp_age_bin == 0 ~ "<20",
exp_age_bin == 20 ~ "21-25",
exp_age_bin == 25 ~ "26-30",
exp_age_bin == 30 ~ ">30"
)
) |>
mutate(
exp_age_bin = fct_relevel(exp_age_bin,
"<20", "21-25", "26-30", ">30")) |>
mutate(across(starts_with("exp"), as.factor))
dim(covid_survey_named)[1] 1111 14
covid_survey_longer <- covid_survey_named |>
pivot_longer(cols = starts_with("exp_"),
names_to = "explanatory",
values_to = "explanatory_value") |>
filter(!is.na(explanatory_value)) |>
pivot_longer(cols = starts_with("resp_"),
names_to = "response",
values_to = "response_value")
covid_survey_longer <- covid_survey_longer |>
mutate(explanatory_value = factor(explanatory_value))
covid_survey_renamed <- covid_survey_longer |>
mutate(
response = case_when(
response == "resp_safety" ~
"Based on my understanding, I believe the vaccine is safe",
response == "resp_confidence_science" ~
"I am confident in the scientific vetting process for the new COVID vaccines",
response == "resp_feel_safe_at_work" ~
"Getting the vaccine will make me feel safer at work",
response == "resp_will_recommend" ~
"I will recommend the vaccine to family, friends, and community members",
response == "resp_trust_info" ~
"I trust the information that I have received about the vaccines",
response == "resp_concern_safety" ~
"I am concerned about the safety and side effects of the vaccine"
)
) |>
mutate(
explanatory = case_when(
explanatory == "All" ~ "All",
explanatory == "exp_age_bin" ~ "Age",
explanatory == "exp_gender" ~ "Gender",
explanatory == "exp_race" ~ "Race",
explanatory == "exp_ethnicity" ~ "Ethnicity",
explanatory == "exp_profession" ~ "Profession",
explanatory == "exp_already_vax" ~ "Had COVID vaccine",
explanatory == "exp_flu_vax" ~ "Had flu vaccine this year"
))
covid_survey_summary_stats_by_group <- covid_survey_renamed |>
group_by(explanatory, explanatory_value, response) |>
summarise(
mean = round(mean(response_value, na.rm = TRUE), 2),
low = round(quantile(response_value, 0.1, na.rm = TRUE), 0),
high = quantile(response_value, 0.9, na.rm = TRUE),
.groups = "drop"
)
covid_survey_summary_stats_all <- covid_survey_renamed |>
group_by(response) |>
summarise(
mean = round(mean(response_value, na.rm = TRUE), 2),
low = round(quantile(response_value, 0.1, na.rm = TRUE), 0),
high = quantile(response_value, 0.9, na.rm = TRUE),
.groups = "drop"
) |>
mutate(explanatory = "All",
explanatory_value = factor(""))
covid_survey_summary_stats <- bind_rows(covid_survey_summary_stats_all,
covid_survey_summary_stats_by_group)The first pivot longer is taking columns that start with exp and turning them into values for a new column titled explanatory. It is then taking the values from the previous columns starting with exp and creating a new column titled explanatory_value. The second pivot is doing the same thing with columns starting with resp.
#| label: three-plot
covid_survey_summary_stats |>
mutate(
explanatory = factor(explanatory, levels = c(
"All",
"Age",
"Gender",
"Race",
"Ethnicity",
"Profession",
"Had COVID vaccine",
"Had flu vaccine this year"
))) |>
ggplot(aes(y = explanatory_value, x = mean, xmin = low, xmax = high)) +
geom_errorbar(width = 0.2, linewidth = 0.25) +
geom_point(size = .75) +
facet_grid(explanatory ~ response, labeller = label_wrap_gen(20),
space = "free", scales = "free_y") +
theme_void() +
labs(title = "",
x = "Mean Likert score\n
(Error bars range from 10th to 90th percentile)") +
scale_x_continuous(breaks = 1:5, labels = c("1", "2", "3", "4 ", "5")) +
scale_y_discrete(labels = label_wrap(25)) +
theme(
axis.text.x = element_text(hjust = 1, size = 5),
strip.background = element_rect(fill = "gray90"),
strip.placement = "outside",
strip.text.x.top = element_text(size = 5),
strip.text.y.right = element_text(size = 5),
axis.text.y = element_text(size = 5, hjust = 1),
panel.spacing.x = unit(0.1, "lines"),
panel.spacing.y = unit(0.1, "lines"),
plot.margin = margin(t = 15, r = 15, b = 15, l = 15, unit = "mm"),
axis.title.x = element_text(size = 7),
strip.text = element_text(margin = margin(10, 10, 10, 10))
)4 - COVID survey - re-reconstruct.
two_covid_survey_summary_stats_by_group <- covid_survey_renamed |>
group_by(explanatory, explanatory_value, response) |>
summarise(
mean = round(mean(response_value, na.rm = TRUE), 2),
low = round(quantile(response_value, 0.25, na.rm = TRUE), 0),
high = quantile(response_value, 0.75, na.rm = TRUE),
.groups = "drop"
)
two_covid_survey_summary_stats_all <- covid_survey_renamed |>
group_by(response) |>
summarise(
mean = round(mean(response_value, na.rm = TRUE), 2),
low = round(quantile(response_value, 0.25, na.rm = TRUE), 0),
high = quantile(response_value, 0.75, na.rm = TRUE),
.groups = "drop"
) |>
mutate(
explanatory = "All",
explanatory_value = factor(""))
two_covid_survey_summary_stats <- bind_rows(
two_covid_survey_summary_stats_all,
two_covid_survey_summary_stats_by_group)#| label: four-plot
two_covid_survey_summary_stats |>
mutate(
explanatory = factor(explanatory, levels = c(
"All",
"Age",
"Gender",
"Race",
"Ethnicity",
"Profession",
"Had COVID vaccine",
"Had flu vaccine this year"
))) |>
ggplot(aes(y = explanatory_value, x = mean, xmin = low, xmax = high)) +
geom_errorbar(width = 0.2, linewidth = 0.25) +
geom_point(size = .75) +
facet_grid(explanatory ~ response, scales = "free_y", space = "free",
labeller = label_wrap_gen(20)) +
theme_void() +
labs(title = "",
x = "Mean Likert score\n
(Error bars range from 25th to 75th percentile)") +
scale_x_continuous(breaks = 1:5, labels = c("1", "2", "3", "4 ", "5")) +
scale_y_discrete(labels = label_wrap(25)) +
theme(
axis.text.x = element_text(hjust = 1, size = 5),
strip.background = element_rect(fill = "gray90"),
strip.placement = "outside",
strip.text.x.top = element_text(size = 5),
strip.text.y.right = element_text(size = 5),
axis.text.y = element_text(size = 5, hjust = 1),
panel.spacing.x = unit(0.1, "lines"),
panel.spacing.y = unit(0.1, "lines"),
plot.margin = margin(t = 15, r = 15, b = 15, l = 15, unit = "mm"),
axis.title.x = element_text(size = 7)
)The main thing this shows is that when you change the percentiles the error bars are smaller since the percentiles are in a smaller range. The points themselves are similarily placed. This doesn’t change my conclusions about the data. Since the points are in the somewhat same location, the data feels reliable.
5 - COVID survey - another view.
part b.
five_covid_survey_summary_stats_p <- covid_survey_renamed |>
count(response, response_value) |>
group_by(response) |>
mutate(p = n/sum(n)) |>
mutate(
likert = case_when(
response_value == 1 ~ "Strongly Agree",
response_value == 2 ~ "Somewhat Agree",
response_value == 3 ~ "Neither Agree Nor Disagree",
response_value == 4 ~ "Somewhat Disagree",
response_value == 5 ~ "Strongly Disagree",
TRUE ~ NA_character_)) |>
mutate(likert = fct_relevel(likert,
"Strongly Disagree",
"Somewhat Disagree",
"Neither Agree Nor Disagree",
"Somewhat Agree",
"Strongly Agree")) |>
mutate(likert = (fct_rev(likert))) |>
filter(!is.na(response), !is.na(likert)) |>
mutate(response = as.factor(response))
five_covid_survey_summary_o <- five_covid_survey_summary_stats_p |>
filter(likert == "Strongly Disagree") |>
arrange(p) |>
rowid_to_column(var = "order") |>
select(response, order)
five_covid_survey_summary_stats_p <- five_covid_survey_summary_stats_p |>
mutate(response = fct_relevel(response,
"I trust the information that I have received about the vaccines",
"I will recommend the vaccine to family, friends, and community members",
"I am confident in the scientific vetting process for the new COVID vaccines",
"Getting the vaccine will make me feel safer at work",
"Based on my understanding, I believe the vaccine is safe",
"I am concerned about the safety and side effects of the vaccine"))#| label: five-one-plot
five_covid_survey_summary_stats_p |>
ggplot(aes(x = p, y = response, fill = likert)) +
geom_col() +
geom_vline(xintercept = c(0.25, 0.5, 0.75),
color = "white", linetype = "dotted") +
scale_x_continuous(labels = scales::label_percent()) +
scale_fill_brewer(palette = "Spectral", direction = 1) +
theme_minimal() +
theme(
legend.position = "top",
legend.margin = margin(0, 0, 0, -300),
legend.text = element_text(size = 4),
legend.key.size = unit(0.25, "cm")) +
labs(
x = "Percentage",
title = "How much do you agree:",
fill = "",
y = ""
)part. a
five_covid_survey_summary_stats_p_step_one <-
five_covid_survey_summary_stats_p |>
mutate(p = if_else(likert == "Neither Agree Nor Disagree", p / 2, p))
five_covid_survey_summary_stats_p_step_two <-
five_covid_survey_summary_stats_p |>
mutate(likert_category = case_when(
likert %in% c("Strongly Agree", "Somewhat Agree") ~ "pos",
likert %in% c("Strongly Disagree", "Somewhat Disagree") ~ "neg",
TRUE ~ "pos_neg"
)) |>
group_by(response, likert_category) |>
mutate(total_sum = sum(p)) |>
pivot_wider(names_from = likert_category,
values_from = total_sum) |>
mutate(pos = if_else(!(is.na(pos_neg)), pos_neg, pos),
neg = if_else(!(is.na(pos_neg)), pos_neg, neg)) |>
select(-pos_neg) |>
arrange(neg) |>
rowid_to_column(var = "order") |>
select(order, response)
five_covid_survey_summary_stats_p_div <-
left_join(
five_covid_survey_summary_stats_p_step_one,
five_covid_survey_summary_stats_p_step_two,
by = "response",
relationship = "many-to-many"
)
five_covid_survey_summary_stats_p_div <-
five_covid_survey_summary_stats_p_div |>
mutate(p = if_else(
likert %in% c(
"Strongly Disagree",
"Somewhat Disagree",
"Neither Agree Nor Disagree"
),
-p,
p)) #| label: five-two-plot
#| warning: FALSE
five_covid_survey_summary_stats_p_div$likert <- factor(five_covid_survey_summary_stats_p_div$likert,
levels = c("Strongly Disagree", "Somewhat Disagree", "Neither Agree Nor Disagree", "Somewhat Agree", "Strongly Agree"))
five_covid_survey_summary_stats_p_div |>
ggplot(aes(y = response, fill = likert, x = p)) +
geom_col() +
scale_fill_brewer(palette = "Spectral", type = "div") +
scale_fill_brewer(palette = "Spectral", direction = 1) +
theme_minimal() +
expand_limits(x = c(0, 10)) +
theme(
legend.position = "top",
legend.margin = margin(0, 0, 0, -300),
legend.text = element_text(size = 4),
legend.key.size = unit(0.25, "cm"),
plot.margin = margin(1, 1, 1, 1)) +
labs(
x = "Proportion",
title = "How much do you agree:",
fill = "",
y = ""
)Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
part c. The main ways these two plots are different is that the diverging bar chart makes the dissent more noticeable than the agreements, since it has negative proportions. The 100 percent chart almost feels a little bit easier to read, since the length of the bars are the same. However, they both showcase the main point of the data, which is that a majority of people agree with the statements listed.