library(tidyverse)
library (plotly)
library(gganimate)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(GGally)
library(knitr)
The main data source is the WHO Global Tuberculosis Report dataset, available at https://extranet.who.int/tme/generateCSV.asp?ds=estimates. These data are compiled and published annually by the World Health Organization from national TB programs and global health monitoring systems.
The dataset covers about 200 countries worldwide, but this project focuses on 10 Southeast Asian countries: Myanmar, Thailand, Indonesia, Malaysia, Vietnam, the Philippines, Laos, Brunei, Singapore, and Cambodia.
# Load the WHO TB burden estimates.
# NOTE(review): this is an absolute path on the original author's machine;
# point it at your own copy of the CSV before running elsewhere.
TB_data <- read_csv("C:/Users/User/OneDrive/Coursera Courses/Data visualization & dashboarding with R - John Hopkin/Course 5/Module 3/2) Peer review assignment final project/TB_burden_countries_2025-07-28.csv")

# Keep only the ten Southeast Asian countries. The names must match the
# spelling used in the dataset's `country` column exactly.
TB_data <- TB_data %>%
  filter(
    country %in% c(
      "Myanmar", "Thailand", "Indonesia", "Malaysia", "Viet Nam",
      "Philippines", "Lao People's Democratic Republic", "Cambodia",
      "Brunei Darussalam", "Singapore"
    )
  )

# Save the filtered data as a .csv file (for future use).
# row.names = FALSE stops write.csv() from adding the row index as an extra
# unnamed first column, which would come back as a spurious variable on re-read.
write.csv(TB_data, "TB_data_asean.csv", row.names = FALSE)

# Show the working directory (where the .csv file was saved).
getwd()
## [1] "C:/Users/User/OneDrive/Coursera Courses/Data visualization & dashboarding with R - John Hopkin/Course 5/Module 3/2) Peer review assignment final project"
For my first figure, I am going to create a Line Plot: trends in TB incidence rate (per 100,000 population) (y-axis) over time (2000–2023) (x-axis) for Southeast Asian countries. I have made the plot interactive with ggplotly, so hovering over a line shows its label text.
# Figure 1: one incidence line per country. The `text` aesthetic is not drawn
# by ggplot itself; it is the HTML tooltip that ggplotly() shows on hover.
fig_1 <- TB_data %>%
  ggplot(
    aes(
      x = year,
      y = e_inc_100k,
      colour = country,
      group = country,
      text = paste(
        "Country:", country,
        "<br>Year:", year,
        "<br>Incidence:", e_inc_100k
      )
    )
  ) +
  geom_line() +
  labs(
    title = "Trend of TB Incidence Over Time",
    x = "Year",
    y = "TB Incidence Rate (per 100,000)",
    colour = "Country"
  )

# Convert to an interactive plotly widget, showing only the custom tooltip.
ggplotly(fig_1, tooltip = "text")
For my second figure, I am going to create a Bar Chart: a comparison of TB mortality (with and without HIV) across Southeast Asian countries. I will animate the plot across the years with gganimate.
# Reshape the two mortality estimates to long format: one row per
# country, year and HIV category.
TB_data_long <- TB_data %>%
  select(
    country,
    year,
    mor_withoutHIV = e_mort_exc_tbhiv_100k,
    mor_withHIV = e_mort_tbhiv_100k
  ) %>%
  pivot_longer(
    cols = c(mor_withoutHIV, mor_withHIV),
    names_to = "category",
    values_to = "mortality"
  )

# Figure 2: dodged horizontal bars, animated one year at a time.
fig_2 <- ggplot(TB_data_long, aes(x = mortality, y = country, fill = category)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  labs(
    title = "TB Mortality in ASEAN Countries",
    subtitle = "Year: {closest_state}",
    x = "TB Mortality Rate (per 100,000)",
    y = "Country"
  ) +
  scale_fill_discrete(
    name = "HIV Status",
    # Named labels bind each legend text to its category explicitly instead of
    # relying on the alphabetical order of the category values.
    labels = c(
      mor_withHIV = "TB with HIV",
      mor_withoutHIV = "TB without HIV"
    )
  ) +
  transition_states(year, transition_length = 3, state_length = 4)

# Render the animation once and reuse the result for both display and saving
# (rendering is slow, and the original rendered it twice).
anim <- animate(fig_2)
anim

# Save the animation as a .gif in the working directory folder.
anim_save("AnimFig.gif", animation = anim)
For the third figure, I will display a Scatter Plot: TB incidence rate (per 100,000 population) on the x-axis against TB mortality rate (per 100,000 population) on the y-axis for Southeast Asian countries for the three most recent years (2021, 2022, 2023).
# Restrict the data to 2021-2023 for the faceted scatter plot.
TB_data_3year <- TB_data %>%
  filter(year %in% 2021:2023)

# Figure 3: incidence vs. mortality, one panel per year, coloured by country.
TB_data_3year %>%
  ggplot(aes(x = e_inc_100k, y = e_mort_100k, colour = country)) +
  geom_point() +
  facet_wrap(vars(year)) +
  labs(
    title = "TB Incidence vs TB Mortality in ASEAN Countries",
    x = "TB Incidence Rate (per 100,000)",
    y = "TB Mortality Rate (per 100,000)",
    colour = "Country"
  )
For the fourth figure, I will display a Box Plot: Plot country on the x-axis and TB treatment coverage on the y-axis for Southeast Asian countries.
# Figure 4: distribution of treatment coverage (c_cdr) across all years,
# one box per country.
ggplot(TB_data, aes(x = country, y = c_cdr, fill = country)) +
  geom_boxplot() +
  labs(
    title = "TB Treatment Coverage in ASEAN Countries",
    x = "Country",
    y = "TB Treatment Coverage"
  ) +
  # The fill only repeats the x-axis, so the legend is redundant.
  guides(fill = "none") +
  # hjust = 1 anchors the end of each rotated label at its tick mark; without
  # it the long country names are centred on the tick and hard to match up.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
For my fifth figure, I am going to create a Stacked Area Plot: TB incidence rate (per 100,000 population) - with or without HIV (y axis) over time (2000–2023) (x axis) for Myanmar.
# Split Myanmar's total incidence into HIV-negative and HIV-positive parts.
# e_inc_100k is the all-TB rate and already contains the HIV-positive cases
# in e_inc_tbhiv_100k, so stacking the two raw columns would count the
# HIV-positive cases twice and push the stack above the true total.
TB_data_long2 <- TB_data %>%
  filter(country == "Myanmar") %>%
  transmute(
    country,
    year,
    inc_withoutHIV = e_inc_100k - e_inc_tbhiv_100k,
    inc_withHIV = e_inc_tbhiv_100k
  ) %>%
  pivot_longer(
    cols = c(inc_withoutHIV, inc_withHIV),
    names_to = "category",
    values_to = "tb_incidence"
  )

# Figure 5: stacked area; the top of the stack is the total TB incidence.
ggplot(TB_data_long2, aes(x = year, y = tb_incidence, fill = category)) +
  geom_area() +
  labs(
    title = "Trend of TB Incidence over Time in Myanmar (with or without HIV)",
    x = "Year",
    y = "TB Incidence Rate (per 100,000)"
  ) +
  scale_fill_discrete(
    name = "HIV Status",
    # Named labels tie each legend entry to its category explicitly.
    labels = c(
      inc_withHIV = "TB with HIV",
      inc_withoutHIV = "TB without HIV"
    )
  )
For my sixth figure, I am going to create a Lollipop Plot: TB incidence rate (per 100,000 population) for Southeast Asian countries in 2023.
# Subset of the data for the year 2023.
TB_data_2023 <- TB_data %>%
  filter(year == 2023)

# Figure 6: lollipop chart, a stem from zero to each country's incidence rate
# with a dot at the end.
TB_data_2023 %>%
  ggplot(aes(x = e_inc_100k, y = country)) +
  geom_segment(
    aes(x = 0, xend = e_inc_100k, y = country, yend = country),
    colour = "black"
  ) +
  geom_point(colour = "blue", size = 4) +
  theme_minimal() +
  labs(
    title = "TB Incidence Rate in ASEAN Countries (2023)",
    x = "TB Incidence Rate (per 100,000 population)",
    y = "Country"
  )
For my seventh figure, I am going to create a Dot Plot: TB mortality rate (per 100,000 population) for Southeast Asian countries in 2023.
# Figure 7: dot plot of the 2023 TB mortality rate, one point per country.
ggplot(TB_data_2023, aes(x = country, y = e_mort_100k)) +
  geom_point(color = "blue", size = 3) +
  labs(
    title = "TB Mortality Rate in ASEAN Countries (2023)",
    x = "Country",
    y = "TB Mortality Rate (per 100,000 population)"
  ) +
  # hjust = 1 anchors the end of each rotated label at its tick mark; without
  # it the long country names are centred on the tick and hard to match up.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
For my eighth figure, I am going to create a Pie Chart: TB Treatment Coverage for Southeast Asian countries in 2023.
# Figure 8: pie chart of 2023 treatment coverage (c_cdr) by country.
# pie() stops with "'x' values must be positive" if any value is NA, so
# countries with no coverage estimate are dropped first.
# NOTE(review): slices show each country's share of the summed coverage
# values, not the coverage itself -- confirm this is the intended reading.
TB_data_2023_pie <- TB_data_2023 %>%
  filter(!is.na(c_cdr))

pie(
  TB_data_2023_pie$c_cdr,
  labels = TB_data_2023_pie$country,
  main = "TB Treatment Coverage by Country (2023)",
  col = rainbow(nrow(TB_data_2023_pie))
)
For my ninth figure, I am going to create a Choropleth Map: TB incidence rate (per 100,000 population) in 2023 for Southeast Asian countries.
# Load the world country polygons as an sf object.
world <- ne_countries(scale = "medium", returnclass = "sf")

# Keep only the country and incidence columns, then rename the three countries
# so that they match the names used in the map data's `admin` column.
# To check for further mismatches, compare:
#   unique(TB_data_2023_map$country)
#   unique(world$admin)
TB_data_2023_map <- TB_data_2023 %>%
  select(country, e_inc_100k) %>%
  mutate(
    country = recode(
      country,
      "Lao People's Democratic Republic" = "Laos",
      "Viet Nam" = "Vietnam",
      "Brunei Darussalam" = "Brunei"
    )
  )

# Keep the polygons of the selected countries and attach their incidence.
map_data <- world %>%
  filter(admin %in% TB_data_2023_map$country) %>%
  left_join(TB_data_2023_map, by = c("admin" = "country"))

# Centroid of each country's geometry, used as the label anchor point.
centroids <- st_centroid(map_data$geometry)

# Map data with the centroid coordinates appended as columns X and Y.
label_data <- cbind(map_data, st_coordinates(centroids))

# Figure 9: choropleth of the 2023 incidence with a name label per country.
ggplot(map_data) +
  geom_sf(aes(fill = e_inc_100k), colour = "white") +
  geom_text(
    aes(x = X, y = Y, label = admin),
    data = label_data,
    size = 3,
    colour = "black"
  ) +
  scale_fill_viridis_c(option = "C", name = "TB Incidence\n(per 100,000)") +
  theme_minimal() +
  labs(
    title = "TB Incidence Rate in ASEAN Countries (2023)",
    caption = "Data source: WHO TB Estimates",
    x = "",
    y = ""
  )
For my tenth figure, I am going to create a Heat map: TB Incidence Rate (per 100,000 population) across Southeast Asian countries over time (2000-2023).
# Figure 10: heat map of TB incidence, one tile per country and year.
ggplot(data = TB_data, aes(x = year, y = country, fill = e_inc_100k)) +
  # White borders visually separate neighbouring tiles.
  geom_tile(color = "white") +
  scale_fill_distiller(
    name = "TB Incidence\n(per 100k)",
    palette = "YlOrRd",
    # scale_fill_distiller() defaults to direction = -1, which reverses the
    # palette so the highest incidence is drawn in the palest yellow.
    # direction = 1 keeps the usual reading: darker red means higher incidence.
    direction = 1
  ) +
  labs(
    title = "TB Incidence Rate in ASEAN Countries Over Time",
    x = "Year",
    y = "Country"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
For my eleventh figure, I am going to show the relationships between multiple numerical variables in the dataset using the ggpairs() function.
# Take the 2023 indicators of interest and drop rows with any missing value.
TB_ggpairs_data <- TB_data_2023 %>%
  select(country, e_inc_100k, e_mort_100k, c_cdr, e_pop_num) %>%
  na.omit()

# Figure 11: pairwise plot matrix of the numeric indicators. `country` is an
# identifier rather than a measurement, so it is left out of the matrix.
TB_ggpairs_data %>%
  select(-country) %>%
  ggpairs(title = "Pairwise Relationships in TB Indicators (ASEAN, 2023)")