#Load the data
covid_data <- read_csv("mydata/figure-b.2.3.1_global-5050.csv")
## Rows: 190 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): Country code, Country, Case & death data by sex?, Cases date, Case...
## dbl (3): Cases where sex-disaggregated data is available, Deaths where sex-...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Clean the data
covid_clean <- covid_data %>%
select(
Country,
total_cases = `Cases where sex-disaggregated data is available`,
cases_male_pct = `Cases (% male)`,
cases_female_pct = `Cases (% female)`,
total_deaths = `Deaths where sex-disaggregated data is available`,
deaths_male_pct = `Deaths (% male)`,
deaths_female_pct = `Deaths (% female)`
) %>%
# remove rows without totals
filter(!is.na(total_cases) & !is.na(total_deaths)) %>%
# convert percentages from string ("67.16%") to numeric
mutate(
cases_male_pct = as.numeric(gsub("%", "", cases_male_pct)),
cases_female_pct = as.numeric(gsub("%", "", cases_female_pct)),
deaths_male_pct = as.numeric(gsub("%", "", deaths_male_pct)),
deaths_female_pct = as.numeric(gsub("%", "", deaths_female_pct)),
# calculate estimated counts
male_cases = total_cases * (cases_male_pct/100),
female_cases = total_cases * (cases_female_pct/100),
male_deaths = total_deaths * (deaths_male_pct/100),
female_deaths = total_deaths * (deaths_female_pct/100)
)
#Use the guiding question: How do male and female COVID-19 cases and deaths vary across countries? #Group by country and summarize totals
gender_summary <- covid_clean %>%
group_by(Country) %>%
summarize(
total_male_cases = sum(male_cases, na.rm=TRUE),
total_female_cases = sum(female_cases, na.rm=TRUE),
total_male_deaths = sum(male_deaths, na.rm=TRUE),
total_female_deaths = sum(female_deaths, na.rm=TRUE)
)
head(gender_summary)
## # A tibble: 6 × 5
## Country total_male_cases total_female_cases total_male_deaths
## <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan 35784. 17498. 1493.
## 2 Albania 51807. 56124. 1217.
## 3 Argentina 1041526. 1051573. 24789.
## 4 Australia 14169. 14707. 440.
## 5 Austria 225873. 238505. 4398.
## 6 Bangladesh 388229. 158572. 6480.
## # ℹ 1 more variable: total_female_deaths <dbl>
#Create two insights: Are male or female cases higher across countries, and which gender experienced more deaths proportionally
#Average cases across countries
avg_cases <- gender_summary %>%
summarize(
avg_male_cases = mean(total_male_cases, na.rm=TRUE),
avg_female_cases = mean(total_female_cases, na.rm=TRUE)
)
#Deaths proportion
death_ratio <- gender_summary %>%
mutate(
male_death_rate = total_male_deaths / (total_male_cases + 1),
female_death_rate = total_female_deaths / (total_female_cases + 1)
) %>%
summarize(
avg_male_death_rate = mean(male_death_rate, na.rm=TRUE),
avg_female_death_rate = mean(female_death_rate, na.rm=TRUE)
)
avg_cases
## # A tibble: 1 × 2
## avg_male_cases avg_female_cases
## <dbl> <dbl>
## 1 347373. 360094.
death_ratio
## # A tibble: 1 × 2
## avg_male_death_rate avg_female_death_rate
## <dbl> <dbl>
## 1 0.0403 0.0458
#Make a visualization of the top countries
cases_long <- gender_summary %>%
select(Country, total_male_cases, total_female_cases) %>%
tidyr::pivot_longer(cols = starts_with("total"),
names_to = "gender",
values_to = "cases") %>%
mutate(gender = ifelse(gender == "total_male_cases", "Male", "Female"))
top_countries <- cases_long %>%
group_by(Country) %>%
summarize(total_cases = sum(cases)) %>%
top_n(15, total_cases)
ggplot(cases_long %>% filter(Country %in% top_countries$Country),
aes(x=reorder(Country, -cases), y=cases, fill=gender)) +
geom_bar(stat="identity", position="dodge") +
labs(title="Top 15 Countries: COVID-19 Cases by Gender",
x="Country", y="Total Cases") +
theme(axis.text.x = element_text(angle=45, hjust=1))