#Load the data

covid_data <- read_csv("mydata/figure-b.2.3.1_global-5050.csv")
## Rows: 190 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): Country code, Country, Case & death data by sex?, Cases date, Case...
## dbl  (3): Cases where sex-disaggregated data is available, Deaths where sex-...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Clean the data

covid_clean <- covid_data %>%
  select(
    Country,
    total_cases = `Cases where sex-disaggregated data is available`,
    cases_male_pct = `Cases (% male)`,
    cases_female_pct = `Cases (% female)`,
    total_deaths = `Deaths where sex-disaggregated data is available`,
    deaths_male_pct = `Deaths (% male)`,
    deaths_female_pct = `Deaths (% female)`
  ) %>%
  # remove rows without totals
  filter(!is.na(total_cases) & !is.na(total_deaths)) %>%
  # convert percentages from string ("67.16%") to numeric
  mutate(
    cases_male_pct = as.numeric(gsub("%", "", cases_male_pct)),
    cases_female_pct = as.numeric(gsub("%", "", cases_female_pct)),
    deaths_male_pct = as.numeric(gsub("%", "", deaths_male_pct)),
    deaths_female_pct = as.numeric(gsub("%", "", deaths_female_pct)),
    # calculate estimated counts
    male_cases = total_cases * (cases_male_pct/100),
    female_cases = total_cases * (cases_female_pct/100),
    male_deaths = total_deaths * (deaths_male_pct/100),
    female_deaths = total_deaths * (deaths_female_pct/100)
  )

#Use the guiding question: How do male and female COVID-19 cases and deaths vary across countries? #Group by country and summarize totals

gender_summary <- covid_clean %>%
  group_by(Country) %>%
  summarize(
    total_male_cases = sum(male_cases, na.rm=TRUE),
    total_female_cases = sum(female_cases, na.rm=TRUE),
    total_male_deaths = sum(male_deaths, na.rm=TRUE),
    total_female_deaths = sum(female_deaths, na.rm=TRUE)
  )

head(gender_summary)
## # A tibble: 6 × 5
##   Country     total_male_cases total_female_cases total_male_deaths
##   <chr>                  <dbl>              <dbl>             <dbl>
## 1 Afghanistan           35784.             17498.             1493.
## 2 Albania               51807.             56124.             1217.
## 3 Argentina           1041526.           1051573.            24789.
## 4 Australia             14169.             14707.              440.
## 5 Austria              225873.            238505.             4398.
## 6 Bangladesh           388229.            158572.             6480.
## # ℹ 1 more variable: total_female_deaths <dbl>

#Create two insights: Are male or female cases higher across countries, and which gender experienced more deaths proportionally

#Average cases across countries
avg_cases <- gender_summary %>%
  summarize(
    avg_male_cases = mean(total_male_cases, na.rm=TRUE),
    avg_female_cases = mean(total_female_cases, na.rm=TRUE)
  )

#Deaths proportion
death_ratio <- gender_summary %>%
  mutate(
    male_death_rate = total_male_deaths / (total_male_cases + 1),
    female_death_rate = total_female_deaths / (total_female_cases + 1)
  ) %>%
  summarize(
    avg_male_death_rate = mean(male_death_rate, na.rm=TRUE),
    avg_female_death_rate = mean(female_death_rate, na.rm=TRUE)
  )

avg_cases
## # A tibble: 1 × 2
##   avg_male_cases avg_female_cases
##            <dbl>            <dbl>
## 1        347373.          360094.
death_ratio
## # A tibble: 1 × 2
##   avg_male_death_rate avg_female_death_rate
##                 <dbl>                 <dbl>
## 1              0.0403                0.0458

#Make a visualization of the top countries

cases_long <- gender_summary %>%
  select(Country, total_male_cases, total_female_cases) %>%
  tidyr::pivot_longer(cols = starts_with("total"),
                      names_to = "gender",
                      values_to = "cases") %>%
  mutate(gender = ifelse(gender == "total_male_cases", "Male", "Female"))

top_countries <- cases_long %>%
  group_by(Country) %>%
  summarize(total_cases = sum(cases)) %>%
  top_n(15, total_cases)

ggplot(cases_long %>% filter(Country %in% top_countries$Country),
       aes(x=reorder(Country, -cases), y=cases, fill=gender)) +
  geom_bar(stat="identity", position="dodge") +
  labs(title="Top 15 Countries: COVID-19 Cases by Gender",
       x="Country", y="Total Cases") +
  theme(axis.text.x = element_text(angle=45, hjust=1))