library(tidyverse)
library (plotly)
library(gganimate)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(GGally)
library(knitr)

Import My Data

The main data source is the WHO Global Tuberculosis Report dataset (available at https://extranet.who.int/tme/generateCSV.asp?ds=estimates). These data are compiled and published annually by the World Health Organization from national TB programs and global health monitoring systems.

The dataset covers about 200 countries worldwide, but this project focuses on 10 Southeast Asian countries: Myanmar, Thailand, Indonesia, Malaysia, Vietnam, the Philippines, Laos, Brunei, Singapore and Cambodia.

# Read the TB burden estimates CSV into a tibble.
# NOTE(review): this absolute path only resolves on the original author's
# machine; point it at your local copy of the CSV before running.
TB_data <- read_csv("C:/Users/User/OneDrive/Coursera Courses/Data visualization & dashboarding with R - John Hopkin/Course 5/Module 3/2) Peer review assignment final project/TB_burden_countries_2025-07-28.csv")

# Countries to keep. The names must match the values in the `country` column
# exactly, otherwise the rows are silently dropped by the filter below.
asean_countries <- c(
  "Myanmar", "Thailand", "Indonesia", "Malaysia", "Viet Nam", "Philippines",
  "Lao People's Democratic Republic", "Cambodia", "Brunei Darussalam",
  "Singapore"
)

TB_data <- TB_data %>% filter(country %in% asean_countries)

# Save the filtered data as a .csv file (for future use).
# row.names = FALSE stops write.csv() from adding an unnamed column of row
# numbers, which would come back as a spurious extra column on re-import.
write.csv(TB_data, "TB_data_asean.csv", row.names = FALSE)
# Find the working directory (where the .csv file was saved)
getwd()
## [1] "C:/Users/User/OneDrive/Coursera Courses/Data visualization & dashboarding with R - John Hopkin/Course 5/Module 3/2) Peer review assignment final project"

Figure 1

For my first figure, I am going to create a Line Plot: Trends of TB incidence rate (per 100,000 population) (y axis) over time (2000–2023) (x axis) for Southeast Asian countries. I have made the plot interactive with ggplotly(): hovering over a line shows the country, year and incidence.

# Figure 1: one incidence line per country.
# The `text` aesthetic is not drawn by geom_line(); it only carries the
# HTML-formatted tooltip that ggplotly() displays on hover.
fig_1 <- TB_data %>%
  ggplot(aes(
    x = year,
    y = e_inc_100k,
    colour = country,
    group = country,
    text = paste(
      "Country:", country,
      "<br>Year:", year,
      "<br>Incidence:", e_inc_100k
    )
  )) +
  geom_line() +
  labs(
    title = "Trend of TB Incidence Over Time",
    x = "Year",
    y = "TB Incidence Rate (per 100,000)",
    colour = "Country"
  )

# Convert to an interactive plotly widget, showing only the custom tooltip.
ggplotly(fig_1, tooltip = "text")

Figure 2

For my second figure, I am going to create a Bar Chart: Comparison of TB mortality (with and without HIV) across Southeast Asian countries. The plot is animated across the years using gganimate.

# Reshape the two mortality columns into long format: one row per
# country-year-category, with the rate stored in `mortality`.
TB_data_long <- TB_data %>%
  select(
    country,
    year,
    mor_withoutHIV = e_mort_exc_tbhiv_100k,
    mor_withHIV = e_mort_tbhiv_100k
  ) %>%
  pivot_longer(
    cols = c(mor_withoutHIV, mor_withHIV),
    names_to = "category",
    values_to = "mortality"
  )

# Horizontal dodged bars, animated through the years. `{closest_state}` in the
# subtitle is filled in by transition_states() with the year being shown.
fig_2 <- ggplot(TB_data_long, aes(y = country, x = mortality, fill = category)) +
  geom_col(position = "dodge") +
  labs(
    x = "TB Mortality Rate (per 100,000)",
    y = "Country",
    title = "TB Mortality in ASEAN Countries",
    subtitle = "Year: {closest_state}"
  ) +
  # Explicit breaks pin each label to its category value, so the legend cannot
  # be mislabelled if the level order ever changes.
  scale_fill_discrete(
    name = "HIV Status",
    breaks = c("mor_withHIV", "mor_withoutHIV"),
    labels = c("TB with HIV", "TB without HIV")
  ) +
  transition_states(year, transition_length = 3, state_length = 4)

# Render the animation once and reuse the result for both display and export
# (rendering is the slow step, so it should not be repeated).
anim <- animate(fig_2)
anim

# Save the animation as .gif in the working directory folder
anim_save("AnimFig.gif", animation = anim)

Figure 3

For the third figure, I will display a Scatter Plot: Plot TB incidence rate (per 100,000 population) on the x-axis against TB mortality rate (per 100,000 population) on the y-axis for Southeast Asian countries for the recent 3 years (2021, 2022, 2023).

# Figure 3: restrict the data to 2021-2023 for the scatter plot.
TB_data_3year <- filter(TB_data, year %in% 2021:2023)

# Incidence against mortality, one point per country, one panel per year.
TB_data_3year %>%
  ggplot(aes(x = e_inc_100k, y = e_mort_100k, colour = country)) +
  geom_point() +
  facet_wrap(vars(year)) +
  labs(
    title = "TB Incidence vs TB Mortality in ASEAN Countries",
    x = "TB Incidence Rate (per 100,000)",
    y = "TB Mortality Rate (per 100,000)",
    colour = "Country"
  )

Figure 4

For the fourth figure, I will display a Box Plot: Plot country on the x-axis and TB treatment coverage on the y-axis for Southeast Asian countries.

# Figure 4: distribution of `c_cdr` per country across all years in TB_data.
ggplot(TB_data, aes(x = country, y = c_cdr, fill = country)) +
  geom_boxplot() +
  labs(
    x = "Country",
    y = "TB Treatment Coverage",
    title = "TB Treatment Coverage in ASEAN Countries"
  ) +
  # The x axis already names each country, so the fill legend is redundant.
  guides(fill = "none") +
  # hjust = 1 anchors the end of each rotated label at its tick; without it the
  # labels are centred on the tick and run up into the plot panel.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Figure 5

For my fifth figure, I am going to create a Stacked Area Plot: TB incidence rate (per 100,000 population) - with or without HIV (y axis) over time (2000–2023) (x axis) for Myanmar.

# Figure 5 data: Myanmar incidence split into two non-overlapping components.
# e_inc_100k is labelled "All TB" and so already includes the TB-with-HIV
# cases; stacking it on top of e_inc_tbhiv_100k would count those cases twice
# and overstate the total. The HIV-negative part is therefore derived by
# subtraction, so that the stacked height equals the overall incidence.
TB_data_long2 <- TB_data %>%
  filter(country == "Myanmar") %>%
  mutate(inc_withoutHIV = e_inc_100k - e_inc_tbhiv_100k) %>%
  select(country, year, inc_withoutHIV, inc_withHIV = e_inc_tbhiv_100k) %>%
  pivot_longer(
    cols = c(inc_withoutHIV, inc_withHIV),
    names_to = "category",
    values_to = "tb_incidence"
  )

# geom_area() stacks the categories by default.
ggplot(TB_data_long2, aes(x = year, y = tb_incidence, fill = category)) +
  geom_area() +
  labs(
    x = "Year",
    y = "TB Incidence Rate (per 100,000)",
    title = "Trend of TB Incidence over Time in Myanmar (with or without HIV)"
  ) +
  # Explicit breaks tie each legend label to its category value.
  scale_fill_discrete(
    name = "HIV Status",
    breaks = c("inc_withHIV", "inc_withoutHIV"),
    labels = c("TB with HIV", "TB without HIV")
  )

Figure 6

For my sixth figure, I am going to create a Lollipop Plot: TB incidence rate (per 100,000 population) for Southeast Asian countries in 2023.

# Rows for 2023 only; this subset is reused by the later 2023 figures.
TB_data_2023 <- filter(TB_data, year == 2023)

# Figure 6: lollipop chart. Each stem runs from 0 to the country's incidence
# and the dot marks the value itself. The segment layer inherits `y` from the
# plot-level mapping.
TB_data_2023 %>%
  ggplot(aes(x = e_inc_100k, y = country)) +
  geom_segment(
    aes(x = 0, xend = e_inc_100k, yend = country),
    colour = "black"
  ) +
  geom_point(colour = "blue", size = 4) +
  labs(
    title = "TB Incidence Rate in ASEAN Countries (2023)",
    x = "TB Incidence Rate (per 100,000 population)",
    y = "Country"
  ) +
  theme_minimal()

Figure 7

For my seventh figure, I am going to create a Dot Plot: TB mortality rate (per 100,000 population) for Southeast Asian countries in 2023.

# Figure 7: one dot per country showing the 2023 value of e_mort_100k.
ggplot(TB_data_2023, aes(x = country, y = e_mort_100k)) +
  geom_point(color = "blue", size = 3) +
  labs(
    title = "TB Mortality Rate in ASEAN Countries (2023)",
    y = "TB Mortality Rate (per 100,000 population)",
    x = "Country"
  ) +
  # hjust = 1 anchors the end of each rotated label at its tick; without it the
  # labels are centred on the tick and run up into the plot panel.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Figure 8

For my eighth figure, I am going to create a Pie Chart: TB Treatment Coverage for Southeast Asian countries in 2023.

# Figure 8: base-graphics pie chart of the 2023 `c_cdr` values.
# pie() stops with an error if any value is NA, so rows with a missing c_cdr
# are dropped first; when nothing is missing the chart is unchanged.
TB_pie_data <- TB_data_2023 %>% filter(!is.na(c_cdr))

pie(TB_pie_data$c_cdr,
    labels = TB_pie_data$country,
    main = "TB Treatment Coverage by Country (2023)",
    # one colour per slice
    col = rainbow(nrow(TB_pie_data)))

Figure 9

For my ninth figure, I am going to create a Choropleth Map: TB incidence rate (per 100,000 population) in 2023 for Southeast Asian countries.

# World country polygons as an sf object.
world <- ne_countries(scale = "medium", returnclass = "sf")

# 2023 incidence per country, with three country names recoded so that they
# can be matched against the `admin` column of the map data below.
TB_data_2023_map <- TB_data_2023 %>%
  select(country, e_inc_100k) %>%
  mutate(
    country = recode(
      country,
      "Lao People's Democratic Republic" = "Laos",
      "Viet Nam" = "Vietnam",
      "Brunei Darussalam" = "Brunei"
    )
  )

# Uncomment to compare the two sets of names when checking for mismatches:
# unique(TB_data_2023_map$country)
# unique(world$admin)

# Keep only the polygons whose `admin` name appears in the TB data, then
# attach the incidence values to them.
map_data <- world %>%
  filter(admin %in% TB_data_2023_map$country) %>%
  left_join(TB_data_2023_map, by = c("admin" = "country"))

# One centroid per polygon, used as the anchor point for its text label.
centroids <- st_centroid(map_data$geometry)

# Map data with the centroid coordinates bound on as columns X and Y.
label_data <- cbind(map_data, st_coordinates(centroids))

# Choropleth map: polygons filled by incidence, country names at centroids.
ggplot(map_data) +
  geom_sf(aes(fill = e_inc_100k), colour = "white") +
  geom_text(
    data = label_data,
    aes(x = X, y = Y, label = admin),
    size = 3,
    colour = "black"
  ) +
  scale_fill_viridis_c(name = "TB Incidence\n(per 100,000)", option = "C") +
  labs(
    title = "TB Incidence Rate in ASEAN Countries (2023)",
    caption = "Data source: WHO TB Estimates",
    x = "",
    y = ""
  ) +
  theme_minimal()

Figure 10

For my tenth figure, I am going to create a Heat map: TB Incidence Rate (per 100,000 population) across Southeast Asian countries over time (2000-2023).

# Figure 10: heat map with one tile per country-year, filled by e_inc_100k.
ggplot(data = TB_data, aes(x = year, y = country, fill = e_inc_100k)) +
  # white borders separate neighbouring tiles
  geom_tile(color = "white") +
  # scale_fill_distiller() defaults to direction = -1, which reverses the
  # palette so that the highest values get the palest colour. direction = 1
  # keeps YlOrRd in its natural order: higher incidence -> darker red.
  scale_fill_distiller(
    name = "TB Incidence\n(per 100k)",
    palette = "YlOrRd",
    direction = 1
  ) +
  labs(
    title = "TB Incidence Rate in ASEAN Countries Over Time",
    x = "Year",
    y = "Country"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Figure 11

For my eleventh figure, I am going to show the relationships between multiple numerical variables in the dataset using the ggpairs() function.

# Take the 2023 rows, keep the country name plus four numeric indicators, and
# drop any row that has a missing value in one of the kept columns.
TB_ggpairs_data <- TB_data_2023 %>%
  select(country, e_inc_100k, e_mort_100k, c_cdr, e_pop_num) %>%
  na.omit()

# Pairs plot of the numeric indicators only; `country` is removed first so
# that it is not drawn as a variable in the matrix.
TB_ggpairs_data %>%
  select(-country) %>%
  ggpairs(title = "Pairwise Relationships in TB Indicators (ASEAN, 2023)")