library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Data tidying of who data set
# Reshape `who` to long form: one row per country, year, type, sex and age,
# keeping only the non-missing case counts.
who_tidy <- who %>%
  pivot_longer(
    cols = new_sp_m014:newrel_f65,
    # Column names look like "new_sp_m014" or "newrel_f65": a "new" prefix
    # (with or without a trailing "_"), the type, "_", a one-letter sex, then
    # the age code. Splitting them in one pattern avoids patching "newrel"
    # first and chaining two superseded separate() calls.
    names_to = c("type", "sex", "age"),
    names_pattern = "^new_?([^_]+)_(.)(.*)$",
    values_to = "cases",
    values_drop_na = TRUE
  ) %>%
  # Drop the iso2 / iso3 columns; country is kept as the identifier.
  select(-iso2, -iso3)

who_tidy
## # A tibble: 76,046 × 6
## country year type sex age cases
## <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan 1997 sp m 014 0
## 2 Afghanistan 1997 sp m 1524 10
## 3 Afghanistan 1997 sp m 2534 6
## 4 Afghanistan 1997 sp m 3544 3
## 5 Afghanistan 1997 sp m 4554 5
## 6 Afghanistan 1997 sp m 5564 2
## 7 Afghanistan 1997 sp m 65 0
## 8 Afghanistan 1997 sp f 014 5
## 9 Afghanistan 1997 sp f 1524 38
## 10 Afghanistan 1997 sp f 2534 36
## # ℹ 76,036 more rows
Plot the total number of TB cases in the world across years
# Sum the case counts within each year.
yearly_totals <- who_tidy %>%
  group_by(year) %>%
  summarise(total_cases = sum(cases), .groups = "drop")

# Line chart of the yearly totals, with a point marking each year.
yearly_totals %>%
  ggplot(aes(x = year, y = total_cases)) +
  geom_line(colour = "steelblue", linewidth = 1) +
  geom_point(colour = "darkblue", size = 2) +
  ggtitle("Total Global Tuberculosis Cases Over Time") +
  xlab("Year") +
  ylab("Total Cases") +
  # Show the y-axis labels with thousands separators.
  scale_y_continuous(labels = scales::label_comma()) +
  theme_minimal()

Find out which country has the highest male-to-female ratio of TB
cases in 2010.
# For 2010, total the cases per country and sex, spread the sexes into `f`
# and `m` columns, and keep the country with the largest m / f ratio.
highest_ratio_2010 <- who_tidy %>%
  filter(year == 2010) %>%
  group_by(country, sex) %>%
  summarise(total_cases = sum(cases), .groups = "drop") %>%
  pivot_wider(names_from = sex, values_from = total_cases, values_fill = 0) %>%
  # Countries with no female cases are dropped so the ratio stays finite.
  filter(f > 0) %>%
  mutate(ratio = m / f) %>%
  # with_ties = FALSE returns a single row even if the top ratio is shared.
  slice_max(ratio, n = 1, with_ties = FALSE)

highest_ratio_2010
## # A tibble: 1 × 4
## country f m ratio
## <chr> <dbl> <dbl> <dbl>
## 1 Antigua and Barbuda 1 5 5