library(tidyverse)

# Load the source data from the working directory. The rest of this script
# reads the columns `year`, `overall_score` and `region` from this data frame.
dataset <- read.csv("dataset.csv")

# 1. Global Trend ----

# This line chart shows how the global average statistical capacity score
# changed from 2016 to 2023. The score rose from 58.6 to 69.7, an 11-point
# gain, indicating steady progress in countries' ability to collect and use
# data.

# Average overall score per year from 2016 onward, rounded to one decimal.
# Rows with a missing score are dropped before the mean is taken.
yearly_avg <- dataset %>%
  filter(year >= 2016, !is.na(overall_score)) %>%
  group_by(year) %>%
  summarise(avg_score = round(mean(overall_score), 1))

# Plot: line chart of the yearly average score, with a whole-number label
# above each point. The plot is stored in a variable so that ggsave() receives
# it explicitly instead of depending on last_plot(), which is global session
# state and can silently point at a different plot.
trend_plot <- ggplot(yearly_avg, aes(x = year, y = avg_score)) +
  geom_line(color = "#1E90C0", linewidth = 1.5) +
  geom_point(color = "#1E90C0", size = 3) +
  # Labels are rounded to whole numbers; vjust lifts them above the points.
  geom_text(aes(label = round(avg_score, 0)),
            vjust = -1.2, size = 3.5, color = "#1A2B3C") +
  # One tick per year across the reporting window.
  scale_x_continuous(breaks = 2016:2023) +
  # Fixed 50-80 window with a tick every 5 points.
  scale_y_continuous(limits = c(50, 80), breaks = seq(50, 80, 5)) +
  labs(
    title   = "Global Statistical Capacity Has Steadily Increased Over Time",
    x       = "Year",
    y       = "Average Statistical Capacity Score (0-100)",
    caption = "Source: World Bank | Average overall score across all countries, 2016-2023"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title         = element_text(face = "bold", size = 14, color = "#0A2342"),
    axis.title         = element_text(size = 11, color = "#1A2B3C"),
    axis.text          = element_text(size = 10, color = "#1A2B3C"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor   = element_blank(),
    plot.caption       = element_text(face = "italic", color = "gray50", hjust = 0)
  )

# Evaluated at top level so the chart is still displayed when run
# interactively, exactly as the unassigned ggplot() expression was.
trend_plot

# Write the slide image (8 x 4.5 in at 300 dpi).
ggsave(
  "slide5_trend.png",
  plot = trend_plot,
  width = 8, height = 4.5, dpi = 300
)

# 2. Growth Acceleration After 2019 ----

# This bar chart compares two phases of growth. The blue bars (2016-2019) show
# slower progress (+4.5 points), while the green bars (2020-2023) show faster
# progress (+6.6 points), suggesting a structural shift in global data
# capacity.

# Tag each year with its growth phase: anything after 2019 belongs to the
# accelerated phase, everything else to the slow phase.
yearly_avg <- yearly_avg %>%
  mutate(
    phase = if_else(
      year > 2019,
      "2020-2023 (Accelerated Growth)",
      "2016-2019 (Slow Growth)"
    )
  )

# Plot: one bar per year, coloured by growth phase. The plot is stored in a
# variable so that ggsave() receives it explicitly instead of depending on
# last_plot().
accel_plot <- ggplot(
  yearly_avg,
  aes(x = factor(year), y = avg_score, fill = phase)
) +
  geom_col() +
  scale_fill_manual(values = c(
    "2016-2019 (Slow Growth)"        = "#1E90C0",
    "2020-2023 (Accelerated Growth)" = "#028A5B"
  )) +
  scale_y_continuous(breaks = seq(50, 75, 5)) +
  # Zoom the view to 50-75 at the coordinate level. Scale limits combined with
  # oob = squish rewrote the bar baselines (0 -> 50) and would silently cap
  # any average above 75; coord_cartesian() leaves the data untouched.
  # NOTE(review): the y axis does not start at zero, so bar heights overstate
  # the relative differences between years.
  coord_cartesian(ylim = c(50, 75)) +
  labs(
    title   = "Growth Accelerated Sharply After 2019 - A Structural Shift in Progress",
    x       = "Year",
    y       = "Average Statistical Capacity Score (0-100)",
    fill    = NULL,
    caption = "Source: World Bank | Blue = 2016-2019 (slow phase) | Green = 2020-2023 (accelerated phase)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title         = element_text(face = "bold", size = 13, color = "#0A2342"),
    axis.title         = element_text(size = 11, color = "#1A2B3C"),
    axis.text          = element_text(size = 10, color = "#1A2B3C"),
    legend.position    = "bottom",
    panel.grid.major.x = element_blank(),
    panel.grid.minor   = element_blank(),
    plot.caption       = element_text(face = "italic", color = "gray50", hjust = 0)
  )

# Evaluated at top level so the chart is still displayed when run
# interactively, exactly as the unassigned ggplot() expression was.
accel_plot

# Write the slide image (8 x 4.5 in at 300 dpi).
ggsave(
  "slide6_acceleration.png",
  plot = accel_plot,
  width = 8, height = 4.5, dpi = 300
)

# 3. Regional Differences ----

# This horizontal bar chart compares average statistical capacity scores
# across 7 world regions in 2023. North America and Europe lead with scores
# above 84, while Sub-Saharan Africa scores just 60. A 33-point gap separates
# the top and bottom regions, highlighting persistent inequality in data
# readiness across regions.

# Mean 2023 score per region (one decimal), with a performance tier derived
# from that rounded mean.
regional_avg <- dataset %>%
  filter(year == 2023) %>%
  filter(!is.na(overall_score)) %>%
  group_by(region) %>%
  summarise(avg_score = round(mean(overall_score), 1)) %>%
  mutate(
    tier = case_when(
      avg_score < 65  ~ "Lagging regions",   # below 65
      avg_score >= 80 ~ "High performers",   # 80 and above
      TRUE            ~ "Mid-tier regions"   # everything else
    )
  )

# Plot: horizontal bars of the 2023 regional averages, ordered by score and
# coloured by tier. The plot is stored in a variable so that ggsave() receives
# it explicitly instead of depending on last_plot().
regions_plot <- ggplot(
  regional_avg,
  aes(x = avg_score, y = reorder(region, avg_score), fill = tier)
) +
  geom_col() +
  # Whole-number labels; the negative hjust places them just past the bar end.
  geom_text(aes(label = round(avg_score, 0)),
            hjust = -0.3, size = 3.5, color = "#1A2B3C") +
  scale_fill_manual(values = c(
    "High performers"  = "#028A5B",
    "Mid-tier regions" = "#1E90C0",
    "Lagging regions"  = "#E07B39"
  )) +
  # The upper limit of 105 leaves room for labels beyond the longest bar.
  scale_x_continuous(limits = c(0, 105), breaks = seq(0, 100, 10)) +
  # NOTE(review): the "33-point gap" in the caption is hard-coded text, not
  # computed from regional_avg - confirm it still matches the data.
  labs(
    title   = "Some Regions Improved Faster - Sub-Saharan Africa Still Lags Behind",
    x       = "Average Statistical Capacity Score (0-100)",
    y       = NULL,
    fill    = NULL,
    caption = "Source: World Bank | 2023 data\nA 33-point gap separates the top and bottom regions."
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title         = element_text(face = "bold", size = 13, color = "#0A2342"),
    axis.title.x       = element_text(size = 11, color = "#1A2B3C"),
    axis.text          = element_text(size = 10, color = "#1A2B3C"),
    legend.position    = "right",
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    plot.caption       = element_text(face = "italic", color = "gray50", hjust = 0)
  )

# Evaluated at top level so the chart is still displayed when run
# interactively, exactly as the unassigned ggplot() expression was.
regions_plot

# Write the slide image (8 x 4.5 in at 300 dpi).
ggsave(
  "slide7_regions.png",
  plot = regions_plot,
  width = 8, height = 4.5, dpi = 300
)