Assignment 3

Author
Affiliation

Moisieiev Vasyl

Kyiv School of Economics

Exercise 1

When you click the Render button a document will be generated that includes both content and the output of embedded code. You can embed code like this:

library(tidyverse)
library(plotly)

You can add options to executable code like this

covid_df = read_csv("https://raw.githubusercontent.com/Aranaur/aranaur.rbind.io/main/datasets/delta/delta.csv")

covid_df

This dataset has 268166 rows and 3 columns. Each row represents a record about a person who has been infected with the virus. The vaccine column contains information about the vaccine status of the person. It has 2 unique values: Vaccinated, Unvaccinated.

The age column contains information about the age of the person. It has 2 unique values: <50, 50+.

The outcome column contains information about the health outcome of the person. It has 2 unique values: died, survived.

Exercise 2

This data in dataset come from an observational study. This is because the data was collected by observing people who were infected with the virus. The data is not from a randomized controlled trial. In contrast experimental study is a study where the investigator assigns the treatment to the subjects. In this case, the investigator would have to assign the vaccine status to the subjects. This is not possible in this case because the data was collected by observing people who were infected with the virus.

Exercise 3

library(tidyverse)
library(plotly)

p <- covid_df %>%
  group_by(vaccine) %>%
  summarize(
    total_count = n(),
    death_count = sum(outcome == "died"),
    death_rate = (death_count / total_count) * 100
  ) %>%
  ggplot(aes(x = vaccine, y = death_rate, fill = vaccine)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(
    title = "Death Rate by Vaccine Status",
    x = "Vaccine Status",
    y = "Death Rate (%)",
    fill = "Vaccine Status"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  theme_minimal()

ggplotly(p, tooltip = c("x", "y"))

Exercise 4

p <- covid_df %>%
  group_by(vaccine, outcome) %>%
  summarize(count = n(), .groups = "drop") %>%
  group_by(vaccine) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  ggplot(aes(x = vaccine, y = percentage, fill = outcome, text = paste("Vaccine:", vaccine, "<br>Outcome:", outcome, "<br>Percentage:", round(percentage, 2), "%"))) +
  geom_bar(stat = "identity", position = "fill", width = 0.3) +
  labs(
    title = "Vaccine and Health Outcome",
    x = "Vaccine Status",
    y = "Proportion (%)",
    fill = "Health Outcome"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 100)) +
  theme_minimal()

ggplotly(p, tooltip = "text")

Exercise 5

p <- covid_df %>%
  filter(vaccine == "Vaccinated") %>%
  group_by(age, outcome) %>%
  summarize(count = n(), .groups = "drop") %>%
  group_by(outcome) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  ggplot(aes(x = age, y = percentage, fill = outcome, text = paste("Age:", age, "<br>Outcome:", outcome, "<br>Percentage:", round(percentage, 2), "%"))) +
  geom_bar(stat = "identity", position = "fill", width = 0.3) +
  labs(
    title = "Vaccinated People: Age and Health Outcome",
    x = "Age",
    y = "Proportion (%)",
    fill = "Health Outcome"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 100)) +
  theme_minimal()

ggplotly(p, tooltip = "text")
p <- covid_df %>%
  filter(vaccine == "Unvaccinated") %>%
  group_by(age, outcome) %>%
  summarize(count = n(), .groups = "drop") %>%
  group_by(outcome) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  ggplot(aes(x = age, y = percentage, fill = outcome, text = paste("Age:", age, "<br>Outcome:", outcome, "<br>Percentage:", round(percentage, 2), "%"))) +
  geom_bar(stat = "identity", position = "fill", width = 0.3) +
  labs(
    title = "Unvaccinated People: Age and Health Outcome",
    x = "Age",
    y = "Proportion (%)",
    fill = "Health Outcome"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 100)) +
  theme_minimal()

ggplotly(p, tooltip = "text")

Exercise 6

In 2021, among those in the UK who were COVID Delta cases, the vaccinated were less likely to die than the unvaccinated. For those under 50, those who were unvaccinated were more likely to die than those who were vaccinated. For those 50 and up, those who were unvaccinated were more likely to die than those who were vaccinated.

Exercise 7

A confounding variable in this context is age. Age is a strong predictor of mortality for COVID-19, and older individuals are both more likely to be vaccinated and more vulnerable to severe outcomes from the virus.

Exercise 8

p <- covid_df %>%
  filter(age != "<50") %>%
  group_by(vaccine) %>%
  summarize(
    total_seniors = nrow(covid_df %>% filter(age != "<50")),
    senior_count = n(),
    proportion_seniors = (senior_count / total_seniors) * 100
  ) %>%
  ggplot(aes(x = vaccine, y = proportion_seniors, fill = vaccine, text = paste(
    "Vaccine:", vaccine,
    "<br>Proportion seniors:", round(proportion_seniors, 2), "%"
  ))) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(
    title = "Proportion of Seniors (50 and Up) by Vaccination Status",
    x = "Vaccine Status",
    y = "Proportion of Seniors (%)",
    fill = "Vaccine Status"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  theme_minimal()

ggplotly(p, tooltip = "text")

Summary

death_percentages <- covid_df %>%
  group_by(vaccine) %>%
  summarize(
    total_count = n(),
    death_count = sum(outcome == "died"),
    death_percentage = (death_count / total_count) * 100
  )

knitr::kable(death_percentages)
vaccine total_count death_count death_percentage
Unvaccinated 151052 250 0.1655059
Vaccinated 117114 477 0.4072955
under_50_rates <- covid_df %>%
  filter(age == "<50") %>%
  group_by(vaccine) %>%
  summarize(
    total_count = n(),
    death_count = sum(outcome == "died"),
    death_rate = (death_count / total_count) * 100
  )

over_50_rates <- covid_df %>%
  filter(age != "<50") %>%
  group_by(vaccine) %>%
  summarize(
    total_count = n(),
    death_count = sum(outcome == "died"),
    death_rate = (death_count / total_count) * 100
  )

knitr::kable(under_50_rates)
vaccine total_count death_count death_rate
Unvaccinated 147612 45 0.0304853
Vaccinated 89807 18 0.0200430
knitr::kable(over_50_rates)
vaccine total_count death_count death_rate
Unvaccinated 3440 205 5.959302
Vaccinated 27307 459 1.680888
library(tidyr)

combined_rates <- bind_rows(
  under_50_rates %>% mutate(age_group = "Under 50"),
  over_50_rates %>% mutate(age_group = "50 and Up")
)

pivoted_data <- combined_rates %>%
  pivot_longer(cols = death_rate, names_to = "measure", values_to = "value")

knitr::kable(pivoted_data)
vaccine total_count death_count age_group measure value
Unvaccinated 147612 45 Under 50 death_rate 0.0304853
Vaccinated 89807 18 Under 50 death_rate 0.0200430
Unvaccinated 3440 205 50 and Up death_rate 5.9593023
Vaccinated 27307 459 50 and Up death_rate 1.6808877
age_distribution <- covid_df %>%
  group_by(vaccine, age) %>%
  summarize(count = n()) %>%
  mutate(proportion = (count / sum(count)) * 100)

knitr::kable(age_distribution)
vaccine age count proportion
Unvaccinated 50+ 3440 2.277361
Unvaccinated <50 147612 97.722639
Vaccinated 50+ 27307 23.316598
Vaccinated <50 89807 76.683402
weighted_death_rate <- age_distribution %>%
  filter(age != "<50") %>%
  left_join(over_50_rates, by = "vaccine") %>%
  mutate(weighted_rate = death_rate * (proportion / 100))

knitr::kable(weighted_death_rate)
vaccine age count proportion total_count death_count death_rate weighted_rate
Unvaccinated 50+ 3440 2.277361 3440 205 5.959302 0.1357149
Vaccinated 50+ 27307 23.316598 27307 459 1.680888 0.3919258