library(tidyverse)
library(plotly)
Assignment 3
Exercise 1
When you click the Render button a document will be generated that includes both content and the output of embedded code. You can embed code like this:
You can add options to executable code like this
= read_csv("https://raw.githubusercontent.com/Aranaur/aranaur.rbind.io/main/datasets/delta/delta.csv")
covid_df
covid_df
This dataset has 268166 rows and 3 columns. Each row represents a record about a person who has been infected with the virus. The vaccine
column contains information about the vaccine status of the person. It has 2 unique values: Vaccinated, Unvaccinated.
The age
column contains information about the age of the person. It has 2 unique values: <50, 50+.
The outcome
column contains information about the health outcome of the person. It has 2 unique values: died, survived.
Exercise 2
This data in dataset come from an observational study. This is because the data was collected by observing people who were infected with the virus. The data is not from a randomized controlled trial. In contrast experimental study is a study where the investigator assigns the treatment to the subjects. In this case, the investigator would have to assign the vaccine status to the subjects. This is not possible in this case because the data was collected by observing people who were infected with the virus.
Exercise 3
library(tidyverse)
library(plotly)
<- covid_df %>%
p group_by(vaccine) %>%
summarize(
total_count = n(),
death_count = sum(outcome == "died"),
death_rate = (death_count / total_count) * 100
%>%
) ggplot(aes(x = vaccine, y = death_rate, fill = vaccine)) +
geom_bar(stat = "identity", width = 0.5) +
labs(
title = "Death Rate by Vaccine Status",
x = "Vaccine Status",
y = "Death Rate (%)",
fill = "Vaccine Status"
+
) scale_y_continuous(labels = scales::percent_format(scale = 1)) +
theme_minimal()
ggplotly(p, tooltip = c("x", "y"))
Exercise 4
<- covid_df %>%
p group_by(vaccine, outcome) %>%
summarize(count = n(), .groups = "drop") %>%
group_by(vaccine) %>%
mutate(percentage = count / sum(count) * 100) %>%
ggplot(aes(x = vaccine, y = percentage, fill = outcome, text = paste("Vaccine:", vaccine, "<br>Outcome:", outcome, "<br>Percentage:", round(percentage, 2), "%"))) +
geom_bar(stat = "identity", position = "fill", width = 0.3) +
labs(
title = "Vaccine and Health Outcome",
x = "Vaccine Status",
y = "Proportion (%)",
fill = "Health Outcome"
+
) scale_y_continuous(labels = scales::percent_format(scale = 100)) +
theme_minimal()
ggplotly(p, tooltip = "text")
Exercise 5
<- covid_df %>%
p filter(vaccine == "Vaccinated") %>%
group_by(age, outcome) %>%
summarize(count = n(), .groups = "drop") %>%
group_by(outcome) %>%
mutate(percentage = count / sum(count) * 100) %>%
ggplot(aes(x = age, y = percentage, fill = outcome, text = paste("Age:", age, "<br>Outcome:", outcome, "<br>Percentage:", round(percentage, 2), "%"))) +
geom_bar(stat = "identity", position = "fill", width = 0.3) +
labs(
title = "Vaccinated People: Age and Health Outcome",
x = "Age",
y = "Proportion (%)",
fill = "Health Outcome"
+
) scale_y_continuous(labels = scales::percent_format(scale = 100)) +
theme_minimal()
ggplotly(p, tooltip = "text")
<- covid_df %>%
p filter(vaccine == "Unvaccinated") %>%
group_by(age, outcome) %>%
summarize(count = n(), .groups = "drop") %>%
group_by(outcome) %>%
mutate(percentage = count / sum(count) * 100) %>%
ggplot(aes(x = age, y = percentage, fill = outcome, text = paste("Age:", age, "<br>Outcome:", outcome, "<br>Percentage:", round(percentage, 2), "%"))) +
geom_bar(stat = "identity", position = "fill", width = 0.3) +
labs(
title = "Unvaccinated People: Age and Health Outcome",
x = "Age",
y = "Proportion (%)",
fill = "Health Outcome"
+
) scale_y_continuous(labels = scales::percent_format(scale = 100)) +
theme_minimal()
ggplotly(p, tooltip = "text")
Exercise 6
In 2021, among those in the UK who were COVID Delta cases, the vaccinated were less likely to die than the unvaccinated. For those under 50, those who were unvaccinated were more likely to die than those who were vaccinated. For those 50 and up, those who were unvaccinated were more likely to die than those who were vaccinated.
Exercise 7
A confounding variable in this context is age. Age is a strong predictor of mortality for COVID-19, and older individuals are both more likely to be vaccinated and more vulnerable to severe outcomes from the virus.
Exercise 8
<- covid_df %>%
p filter(age != "<50") %>%
group_by(vaccine) %>%
summarize(
total_seniors = nrow(covid_df %>% filter(age != "<50")),
senior_count = n(),
proportion_seniors = (senior_count / total_seniors) * 100
%>%
) ggplot(aes(x = vaccine, y = proportion_seniors, fill = vaccine, text = paste(
"Vaccine:", vaccine,
"<br>Proportion seniors:", round(proportion_seniors, 2), "%"
+
))) geom_bar(stat = "identity", width = 0.5) +
labs(
title = "Proportion of Seniors (50 and Up) by Vaccination Status",
x = "Vaccine Status",
y = "Proportion of Seniors (%)",
fill = "Vaccine Status"
+
) scale_y_continuous(labels = scales::percent_format(scale = 1)) +
theme_minimal()
ggplotly(p, tooltip = "text")
Summary
<- covid_df %>%
death_percentages group_by(vaccine) %>%
summarize(
total_count = n(),
death_count = sum(outcome == "died"),
death_percentage = (death_count / total_count) * 100
)
::kable(death_percentages) knitr
vaccine | total_count | death_count | death_percentage |
---|---|---|---|
Unvaccinated | 151052 | 250 | 0.1655059 |
Vaccinated | 117114 | 477 | 0.4072955 |
<- covid_df %>%
under_50_rates filter(age == "<50") %>%
group_by(vaccine) %>%
summarize(
total_count = n(),
death_count = sum(outcome == "died"),
death_rate = (death_count / total_count) * 100
)
<- covid_df %>%
over_50_rates filter(age != "<50") %>%
group_by(vaccine) %>%
summarize(
total_count = n(),
death_count = sum(outcome == "died"),
death_rate = (death_count / total_count) * 100
)
::kable(under_50_rates) knitr
vaccine | total_count | death_count | death_rate |
---|---|---|---|
Unvaccinated | 147612 | 45 | 0.0304853 |
Vaccinated | 89807 | 18 | 0.0200430 |
::kable(over_50_rates) knitr
vaccine | total_count | death_count | death_rate |
---|---|---|---|
Unvaccinated | 3440 | 205 | 5.959302 |
Vaccinated | 27307 | 459 | 1.680888 |
library(tidyr)
<- bind_rows(
combined_rates %>% mutate(age_group = "Under 50"),
under_50_rates %>% mutate(age_group = "50 and Up")
over_50_rates
)
<- combined_rates %>%
pivoted_data pivot_longer(cols = death_rate, names_to = "measure", values_to = "value")
::kable(pivoted_data) knitr
vaccine | total_count | death_count | age_group | measure | value |
---|---|---|---|---|---|
Unvaccinated | 147612 | 45 | Under 50 | death_rate | 0.0304853 |
Vaccinated | 89807 | 18 | Under 50 | death_rate | 0.0200430 |
Unvaccinated | 3440 | 205 | 50 and Up | death_rate | 5.9593023 |
Vaccinated | 27307 | 459 | 50 and Up | death_rate | 1.6808877 |
<- covid_df %>%
age_distribution group_by(vaccine, age) %>%
summarize(count = n()) %>%
mutate(proportion = (count / sum(count)) * 100)
::kable(age_distribution) knitr
vaccine | age | count | proportion |
---|---|---|---|
Unvaccinated | 50+ | 3440 | 2.277361 |
Unvaccinated | <50 | 147612 | 97.722639 |
Vaccinated | 50+ | 27307 | 23.316598 |
Vaccinated | <50 | 89807 | 76.683402 |
<- age_distribution %>%
weighted_death_rate filter(age != "<50") %>%
left_join(over_50_rates, by = "vaccine") %>%
mutate(weighted_rate = death_rate * (proportion / 100))
::kable(weighted_death_rate) knitr
vaccine | age | count | proportion | total_count | death_count | death_rate | weighted_rate |
---|---|---|---|---|---|---|---|
Unvaccinated | 50+ | 3440 | 2.277361 | 3440 | 205 | 5.959302 | 0.1357149 |
Vaccinated | 50+ | 27307 | 23.316598 | 27307 | 459 | 1.680888 | 0.3919258 |