df <- read.csv("ican-icar-2025-v1.csv")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.2.0
## ✔ ggplot2   4.0.2     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.1     ✔ tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggplotify)
library(ggrepel)
library(tidyverse)
# Graph of Mathematics Minimum Proficiency Levels (Age 10)
df |>
  filter(
    CountryName %in% c("Kenya", "Mali", "Mozambique", "Senegal", "Tanzania", "Uganda"),
    ch02 == 10
  ) |>
  drop_na(MPLMath, HHWeightProvided) |>
  group_by(CountryName) |>
  summarise(
    pct = sum(MPLMath * HHWeightProvided) / sum(HHWeightProvided)
  ) |>
  mutate(CountryName = fct_reorder(CountryName, pct)) |>
  ggplot(aes(x = pct, y = CountryName)) +
  geom_col() +
  scale_x_continuous(labels = scales::percent) +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    panel.grid = element_blank()
  )

african_countries <- c("Kenya", "Mali", "Mozambique", "Senegal", "Tanzania", "Uganda")

plot_data <- df |>
  filter(
    CountryName %in% african_countries,
    ch02 == 10
  ) |>
  drop_na(MPLMath, MPLReading, HHWeightProvided) |>
  group_by(CountryName) |>
  summarise(
    math = sum(MPLMath * HHWeightProvided) / sum(HHWeightProvided),
    reading = sum(MPLReading * HHWeightProvided) / sum(HHWeightProvided)
  ) |>
  pivot_longer(
    cols = c(math, reading),
    names_to = "subject",
    values_to = "pct"
  ) |>
  mutate(
    subject = recode(subject,
                     math = "Mathematics",
                     reading = "Reading")
  )

order_df <- plot_data |>
  filter(subject == "Mathematics") |>
  arrange(pct)

plot_data |>
  mutate(CountryName = factor(CountryName, levels = order_df$CountryName)) |>
  ggplot(aes(x = pct, y = CountryName, fill = subject)) +
  geom_col(position = position_dodge(width = 0.7), width = 0.6) +
  geom_text(
    aes(label = paste0(round(pct * 100), "%")),
    position = position_dodge(width = 0.7),
    hjust = -0.2,
    fontface = "bold",
    size = 3.5
  ) +
  scale_fill_manual(
    values = c("Mathematics" = "#2171B5", "Reading" = "#238B45"),
    name = NULL
  ) +
  scale_x_continuous(
    labels = scales::percent,
    expand = expansion(mult = c(0, 0.15))
  ) +
  labs(
    title = "Numeracy and reading proficiency across African countries",
    subtitle = "Weighted % of 10-year-old children meeting minimum proficiency"
  ) +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_text(face = "bold"),
    legend.position = "top"
  )

math_lang_summary <- df %>%
  filter(
    CountryName %in% c("Kenya", "Mali", "Mozambique", "Senegal", "Tanzania", "Uganda"),
    ch02 == 10,
    !is.na(HHWeightProvided)
  ) %>%
  group_by(CountryName) %>%
  summarise(
    Mathematics = weighted.mean(MPLMath, w = HHWeightProvided, na.rm = TRUE),
    Reading = weighted.mean(MPLReading, w = HHWeightProvided, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(Mathematics, Reading),
    names_to = "subject",
    values_to = "proficiency_rate"
  )

country_order <- math_lang_summary %>%
  filter(subject == "Mathematics") %>%
  arrange(proficiency_rate) %>%
  pull(CountryName)

math_lang_summary <- math_lang_summary %>%
  mutate(CountryName = factor(CountryName, levels = country_order))

ggplot(math_lang_summary,
       aes(x = proficiency_rate, y = CountryName, fill = subject)) +
  geom_col(
    data = ~ dplyr::filter(.x, CountryName != "Kenya"),
    position = position_dodge(width = 0.7),
    width = 0.6,
    alpha = 0.35
  ) +
  geom_col(
    data = ~ dplyr::filter(.x, CountryName == "Kenya"),
    position = position_dodge(width = 0.7),
    width = 0.6,
    alpha = 1
  ) +
  geom_text(
    aes(label = paste0(round(proficiency_rate * 100), "%")),
    position = position_dodge(width = 0.7),
    hjust = -0.2,
    size = 3.5,
    fontface = "bold"
  ) +
  scale_fill_manual(
    values = c("Mathematics" = "#2171B5", "Reading" = "#238B45"),
    name = NULL
  ) +
  scale_x_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, 1),
    expand = expansion(mult = c(0, 0.15))
  ) +
  labs(
    title = "Numeracy and reading proficiency among 10-year-olds",
    subtitle = "Weighted proportion meeting MPL",
    caption = "Source: ICAN-ICAR 2025"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    legend.position = "top",
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank()
  )

african_countries <- c("Kenya", "Mali", "Mozambique", "Senegal", "Tanzania", "Uganda")

plot_data <- df |>
  filter(
    CountryName %in% african_countries,
    ch02 == 10
  ) |>
  drop_na(MPLMath, MPLReading, HHWeightProvided) |>
  group_by(CountryName) |>
  summarise(
    math = sum(MPLMath * HHWeightProvided) / sum(HHWeightProvided),
    reading = sum(MPLReading * HHWeightProvided) / sum(HHWeightProvided)
  ) |>
  pivot_longer(
    cols = c(math, reading),
    names_to = "subject",
    values_to = "pct"
  ) |>
  mutate(
    subject = recode(subject,
                     math = "Mathematics",
                     reading = "Reading")
  )

# Order countries by Mathematics (lowest to highest)
order_df <- plot_data |>
  filter(subject == "Mathematics") |>
  arrange(pct)

plot_data |>
  mutate(CountryName = factor(CountryName, levels = order_df$CountryName)) |>
  ggplot(aes(x = pct, y = CountryName, fill = subject)) +
  geom_col(position = position_dodge(width = 0.7), width = 0.6) +
  geom_text(
    aes(label = paste0(round(pct * 100), "%")),
    position = position_dodge(width = 0.7),
    hjust = -0.2,
    fontface = "bold",
    size = 3.5
  ) +
  scale_fill_manual(
    values = c("Mathematics" = "#2171B5", "Reading" = "#238B45"),
    name = NULL
  ) +
  scale_x_continuous(
    labels = scales::percent,
    expand = expansion(mult = c(0, 0.15))
  ) +
  labs(
    title = "Numeracy and reading proficiency across African countries",
    subtitle = "Weighted % of 10-year-old children meeting minimum proficiency"
  ) +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_text(face = "bold"),
    legend.position = "top"
  )

math_lang_summary <- df %>%
  filter(
    CountryName %in% c("Kenya", "Mali", "Mozambique", "Senegal", "Tanzania", "Uganda"),
    ch02 == 10,
    !is.na(HHWeightProvided)
  ) %>%
  group_by(CountryName) %>%
  summarise(
    Mathematics = weighted.mean(MPLMath,    w = HHWeightProvided, na.rm = TRUE),
    Reading     = weighted.mean(MPLReading, w = HHWeightProvided, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(Mathematics, Reading),
    names_to  = "subject",
    values_to = "proficiency_rate"
  )

# Order countries by Mathematics
country_order <- math_lang_summary %>%
  filter(subject == "Mathematics") %>%
  arrange(proficiency_rate) %>%
  pull(CountryName)

math_lang_summary <- math_lang_summary %>%
  mutate(CountryName = factor(CountryName, levels = country_order))

ggplot(math_lang_summary,
       aes(x = proficiency_rate, y = CountryName, fill = subject)) +
  
  # Faded bars (non-Kenya)
  geom_col(
    data = ~ dplyr::filter(.x, CountryName != "Kenya"),
    position = position_dodge(width = 0.7),
    width = 0.6,
    alpha = 0.35
  ) +
  
  # Highlighted Kenya bars
  geom_col(
    data = ~ dplyr::filter(.x, CountryName == "Kenya"),
    position = position_dodge(width = 0.7),
    width = 0.6,
    alpha = 1
  ) +
  
  geom_text(
    aes(label = paste0(round(proficiency_rate * 100), "%")),
    position  = position_dodge(width = 0.7),
    hjust = -0.2,
    size = 3.5,
    fontface = "bold"
  ) +
  
  scale_fill_manual(
    values = c("Mathematics" = "#2171B5", "Reading" = "#238B45"),
    name   = NULL
  ) +
  
  scale_x_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, 1),
    expand = expansion(mult = c(0, 0.15))
  ) +
  
  labs(
    title    = "Numeracy and reading proficiency among 10-year-olds",
    subtitle = "Weighted proportion of children meeting Minimum Proficiency Level, age 10",
    x        = "Proportion meeting MPL",
    y        = NULL,
    caption  = "Source: ICAN-ICAR 2025. African countries only. Estimates are weighted proportions."
  ) +
  
  theme_minimal(base_size = 11) +
  theme(
    legend.position    = "top",
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank()
  )

plot_data <- df %>%
  filter(
    CountryName %in% c("Kenya", "Tanzania", "Uganda"),
    ch06a >= 1 & ch06a <= 9,
    !is.na(MPLMath), !is.na(MPLReading), !is.na(HHWeightProvided)
  ) %>%
  group_by(CountryName, ch06a) %>%
  summarise(
    Mathematics = weighted.mean(MPLMath,    w = HHWeightProvided),
    Reading     = weighted.mean(MPLReading, w = HHWeightProvided),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(Mathematics, Reading),
    names_to  = "subject",
    values_to = "proportion"
  )

ggplot(plot_data,
       aes(x = ch06a, y = proportion,
           colour = CountryName,
           linetype = subject)) +
  geom_line(linewidth = 0.8) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_x_continuous(breaks = 1:9) +
  theme_minimal() +
  theme(
    axis.title = element_blank()
  )

# Data prep
plot_data <- df %>%
  filter(
    CountryName %in% c("Kenya", "Tanzania", "Uganda"),
    ch06a >= 1 & ch06a <= 9,
    !is.na(MPLMath), !is.na(MPLReading), !is.na(HHWeightProvided)
  ) %>%
  group_by(CountryName, ch06a) %>%
  summarise(
    Mathematics = weighted.mean(MPLMath,    w = HHWeightProvided),
    Reading     = weighted.mean(MPLReading, w = HHWeightProvided),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(Mathematics, Reading),
    names_to  = "subject",
    values_to = "proportion"
  ) %>%
  mutate(
    line_colour = if_else(subject == "Reading", "#C0392B", "#BDC3C7"),
    line_type   = case_when(
      CountryName == "Uganda"   ~ "solid",
      CountryName == "Kenya"    ~ "dotted",
      CountryName == "Tanzania" ~ "dashed"
    ),
    line_width  = if_else(CountryName == "Uganda" & subject == "Reading", 1.4, 0.7),
    line_alpha  = if_else(CountryName == "Uganda" & subject == "Reading", 1, 0.55),
    line_label  = paste(CountryName, subject)
  )

# End-of-line labels
end_labels <- plot_data %>% filter(ch06a == 9)

# Grade 5 gap
grade5_reading <- plot_data %>%
  filter(
    ch06a == 5,
    subject == "Reading",
    CountryName %in% c("Uganda", "Kenya")
  ) %>%
  select(CountryName, proportion) %>%
  pivot_wider(names_from = CountryName, values_from = proportion)

gap_pct   <- round(abs(grade5_reading$Kenya - grade5_reading$Uganda) * 100)
uganda_g5 <- grade5_reading$Uganda
kenya_g5  <- grade5_reading$Kenya

uganda_reading <- ggplot(
  plot_data,
  aes(x = ch06a, y = proportion,
      colour    = line_colour,
      linetype  = line_type,
      linewidth = line_width,
      alpha     = line_alpha,
      group     = line_label)
) +
  geom_line() +
  
  # Direct labels (always visible, not faded)
  geom_text_repel(
    data = end_labels,
    inherit.aes = FALSE,
    aes(x = ch06a, y = proportion, label = line_label),
    colour = "black",
    hjust = 0,
    direction = "y",
    nudge_x = 0.6,
    xlim = c(9.5, 13),
    size = 3,
    fontface = "bold",
    segment.size = 0.3,
    force = 2
  ) +
  
  # Bracket
  annotate("segment",
           x = 5, xend = 5,
           y = uganda_g5, yend = kenya_g5,
           colour = "#C0392B", linewidth = 0.5) +
  annotate("segment",
           x = 4.85, xend = 5.15,
           y = uganda_g5, yend = uganda_g5,
           colour = "#C0392B", linewidth = 0.5) +
  annotate("segment",
           x = 4.85, xend = 5.15,
           y = kenya_g5, yend = kenya_g5,
           colour = "#C0392B", linewidth = 0.5) +
  
  annotate("text",
           x = 5.3,
           y = (uganda_g5 + kenya_g5) / 2,
           label = paste0(
             gap_pct,
             " percentage point gap between Uganda and Kenya Reading at Grade 5"
           ),
           colour = "#C0392B",
           fontface = "italic",
           size = 2.8,
           hjust = 0) +
  
  scale_colour_identity() +
  scale_linetype_identity() +
  scale_linewidth_identity() +
  scale_alpha_identity() +
  
  scale_x_continuous(
    breaks = 1:9,
    limits = c(1, 13),
    expand = expansion(mult = c(0.02, 0))
  ) +
  
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  
  labs(
    title = "Uganda’s reading gap: a pattern the data raises but cannot explain",
    subtitle = "Maths proficiency is broadly similar across Kenya, Tanzania and Uganda. Yet by Grade 9, Uganda’s reading rate is much lower than that of its neighbours — raising questions about what drives this gap.",
    x = "Grade",
    y = "Proportion meeting Minimum Proficiency Level",
    caption = "Source: ICAN-ICAR 2025. Estimates are weighted proportions."
  ) +
  
  coord_cartesian(clip = "off") +
  
  theme_minimal(base_size = 11) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(size = 9, colour = "grey40"),
    plot.caption = element_text(size = 8, colour = "grey60"),
    panel.grid.minor = element_blank(),
    plot.margin = margin(10, 80, 10, 10)  # space for labels
  )

uganda_reading

# --- Overall Kenya average ---
kenya_overall_rate <- df %>%
  filter(CountryName == "Kenya",
         !is.na(MPLMath), !is.na(HHWeightProvided)) %>%
  summarise(rate = weighted.mean(MPLMath, w = HHWeightProvided)) %>%
  pull(rate)

# --- Gender ---
kenya_gender <- df %>%
  filter(CountryName == "Kenya",
         !is.na(MPLMath), !is.na(HHWeightProvided), !is.na(ch03)) %>%
  mutate(label = if_else(ch03 == 1, "Female", "Male")) %>%
  group_by(label) %>%
  summarise(
    prof_rate = weighted.mean(MPLMath, w = HHWeightProvided),
    headcount = sum(HHWeightProvided),
    .groups = "drop"
  ) %>%
  mutate(category = "Gender")

# --- Location ---
kenya_location <- df %>%
  filter(CountryName == "Kenya",
         !is.na(MPLMath), !is.na(HHWeightProvided),
         Location %in% c("Rural", "Urban")) %>%
  group_by(label = Location) %>%
  summarise(
    prof_rate = weighted.mean(MPLMath, w = HHWeightProvided),
    headcount = sum(HHWeightProvided),
    .groups = "drop"
  ) %>%
  mutate(category = "Location")

# --- Enrolment ---
kenya_enrol <- df %>%
  filter(CountryName == "Kenya",
         !is.na(MPLMath), !is.na(HHWeightProvided),
         !is.na(EnrolmentStatus)) %>%
  group_by(label = EnrolmentStatus) %>%
  summarise(
    prof_rate = weighted.mean(MPLMath, w = HHWeightProvided),
    headcount = sum(HHWeightProvided),
    .groups = "drop"
  ) %>%
  mutate(category = "Enrolment")

# --- Grade ---
kenya_grade <- df %>%
  filter(CountryName == "Kenya",
         !is.na(MPLMath), !is.na(HHWeightProvided),
         !is.na(ch06a), ch06a >= 0, ch06a <= 10) %>%
  mutate(label = if_else(ch06a == 0, "Pre-Primary", paste0("Grade ", ch06a))) %>%
  group_by(label, ch06a) %>%
  summarise(
    prof_rate = weighted.mean(MPLMath, w = HHWeightProvided),
    headcount = sum(HHWeightProvided),
    .groups = "drop"
  ) %>%
  arrange(ch06a) %>%
  mutate(category = "Grade") %>%
  select(-ch06a)

# --- Age group ---
kenya_age <- df %>%
  filter(CountryName == "Kenya",
         !is.na(MPLMath), !is.na(HHWeightProvided),
         !is.na(ch02)) %>%
  mutate(label = case_when(
    ch02 >= 5  & ch02 <= 7  ~ "5-7 yrs",
    ch02 >= 8  & ch02 <= 10 ~ "8-10 yrs",
    ch02 >= 11 & ch02 <= 13 ~ "11-13 yrs",
    ch02 >= 14 & ch02 <= 16 ~ "14-16 yrs"
  )) %>%
  filter(!is.na(label)) %>%
  group_by(label) %>%
  summarise(
    prof_rate = weighted.mean(MPLMath, w = HHWeightProvided),
    headcount = sum(HHWeightProvided),
    .groups = "drop"
  ) %>%
  mutate(
    label = factor(label, levels = c("5-7 yrs", "8-10 yrs", "11-13 yrs", "14-16 yrs"))
  ) %>%
  arrange(label) %>%
  mutate(
    label = as.character(label),
    category = "Age Group"
  )

# --- Combine ---
kenya_plot_data <- bind_rows(
  kenya_gender, kenya_location, kenya_enrol,
  kenya_grade, kenya_age
)

kenya_plot_data$label <- factor(
  kenya_plot_data$label,
  levels = unique(kenya_plot_data$label)
)

# --- Domain structure ---
kenya_domain_breaks <- kenya_plot_data %>%
  mutate(x_pos = as.numeric(label)) %>%
  group_by(category) %>%
  summarise(x_end = max(x_pos), .groups = "drop") %>%
  filter(x_end < max(as.numeric(kenya_plot_data$label)))

kenya_domain_labels <- kenya_plot_data %>%
  mutate(x_pos = as.numeric(label)) %>%
  group_by(category) %>%
  summarise(x_mid = mean(x_pos), .groups = "drop")

# --- Plot ---
kenya_bubble <- ggplot(
  kenya_plot_data,
  aes(x = as.numeric(label), y = prof_rate)
) +
  
  geom_vline(
    xintercept = kenya_domain_breaks$x_end + 0.5,
    colour = "grey80",
    linewidth = 0.4
  ) +
  
  geom_hline(
    yintercept = kenya_overall_rate,
    linetype = "dashed",
    colour = "grey40",
    linewidth = 0.6
  ) +
  
  annotate(
    "text",
    x = 1,
    y = kenya_overall_rate + 0.03,
    label = paste0("Kenya average: ", round(kenya_overall_rate * 100), "%"),
    hjust = 0,
    size = 3.2,
    colour = "grey30",
    fontface = "italic"
  ) +
  
  annotate(
    "text",
    x = kenya_domain_labels$x_mid,
    y = 1.08,
    label = kenya_domain_labels$category,
    fontface = "bold",
    colour = "grey40",
    size = 3
  ) +
  
  geom_point(
    aes(size = headcount, fill = category),
    shape = 21,
    colour = "white",
    stroke = 0.8,
    alpha = 0.85
  ) +
  
  geom_text(
    aes(label = paste0(round(prof_rate * 100), "%")),
    colour = "white",
    fontface = "bold",
    size = 2.8
  ) +
  
  scale_fill_manual(
    values = c(
      "Gender"    = "#2171B5",
      "Location"  = "#E6AB02",
      "Enrolment" = "#7570B3",
      "Grade"     = "#1B9E77",
      "Age Group" = "#D95F02"
    ),
    name = "Domain",
    guide = guide_legend(order = 1)
  ) +
  
  scale_size_continuous(
    range = c(4, 20),
    labels = comma,
    name = "Number of children (weighted)",
    guide = guide_legend(
      override.aes = list(fill = "grey70", colour = "white"),
      order = 2
    )
  ) +
  
  scale_x_continuous(
    breaks = seq_along(levels(kenya_plot_data$label)),
    labels = levels(kenya_plot_data$label),
    expand = expansion(add = 0.6)
  ) +
  
  scale_y_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, 1.12),
    breaks = seq(0, 1, 0.2)
  ) +
  
  coord_cartesian(clip = "off") +
  
  labs(
    title = "Numeracy proficiency by child characteristics — Kenya",
    subtitle = "Bubble size = weighted total number of children. Dashed line = Kenya average.",
    x = NULL,
    y = NULL,
    caption = "Source: ICAN-ICAR 2025, Kenya. Estimates are weighted proportions."
  ) +
  
  theme_minimal(base_size = 11) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.3),
    legend.position = "bottom",
    legend.box = "horizontal"
  )

kenya_bubble

# --- Overall Mali rate ---
mali_overall_rate <- df %>%
  filter(CountryName == "Mali",
         !is.na(MPLMath), !is.na(HHWeightProvided)) %>%
  summarise(rate = weighted.mean(MPLMath, w = HHWeightProvided)) %>%
  pull(rate)

# --- Build all groups (same as yours) ---
mali_gender <- df %>%
  filter(CountryName == "Mali",
         !is.na(MPLMath), !is.na(HHWeightProvided), !is.na(ch03)) %>%
  mutate(label = if_else(ch03 == 1, "Female", "Male")) %>%
  group_by(label) %>%
  summarise(
    prof_rate = weighted.mean(MPLMath, w = HHWeightProvided),
    headcount = sum(HHWeightProvided),
    .groups = "drop"
  ) %>%
  mutate(category = "Gender")

mali_location <- df %>%
  filter(CountryName == "Mali",
         !is.na(MPLMath), !is.na(HHWeightProvided),
         Location %in% c("Rural", "Urban")) %>%
  group_by(label = Location) %>%
  summarise(
    prof_rate = weighted.mean(MPLMath, w = HHWeightProvided),
    headcount = sum(HHWeightProvided),
    .groups = "drop"
  ) %>%
  mutate(category = "Location")

mali_enrol <- df %>%
  filter(CountryName == "Mali",
         !is.na(MPLMath), !is.na(HHWeightProvided),
         !is.na(EnrolmentStatus)) %>%
  group_by(label = EnrolmentStatus) %>%
  summarise(
    prof_rate = weighted.mean(MPLMath, w = HHWeightProvided),
    headcount = sum(HHWeightProvided),
    .groups = "drop"
  ) %>%
  mutate(category = "Enrolment")

mali_grade <- df %>%
  filter(CountryName == "Mali",
         !is.na(MPLMath), !is.na(HHWeightProvided),
         !is.na(ch06a), ch06a >= 0, ch06a <= 10) %>%
  mutate(label = if_else(ch06a == 0, "Pre-Primary", paste0("Grade ", ch06a))) %>%
  group_by(label, ch06a) %>%
  summarise(
    prof_rate = weighted.mean(MPLMath, w = HHWeightProvided),
    headcount = sum(HHWeightProvided),
    .groups = "drop"
  ) %>%
  arrange(ch06a) %>%
  mutate(category = "Grade") %>%
  select(-ch06a)

mali_age <- df %>%
  filter(CountryName == "Mali",
         !is.na(MPLMath), !is.na(HHWeightProvided),
         !is.na(ch02)) %>%
  mutate(label = case_when(
    ch02 >= 5  & ch02 <= 7  ~ "5-7 yrs",
    ch02 >= 8  & ch02 <= 10 ~ "8-10 yrs",
    ch02 >= 11 & ch02 <= 13 ~ "11-13 yrs",
    ch02 >= 14 & ch02 <= 16 ~ "14-16 yrs"
  )) %>%
  filter(!is.na(label)) %>%
  group_by(label) %>%
  summarise(
    prof_rate = weighted.mean(MPLMath, w = HHWeightProvided),
    headcount = sum(HHWeightProvided),
    .groups = "drop"
  ) %>%
  mutate(
    label = factor(label, levels = c("5-7 yrs", "8-10 yrs", "11-13 yrs", "14-16 yrs"))
  ) %>%
  arrange(label) %>%
  mutate(label = as.character(label),
         category = "Age Group")

# --- Combine (FIXED ordering) ---
mali_plot_data <- bind_rows(
  mali_gender, mali_location, mali_enrol,
  mali_grade, mali_age
)

mali_plot_data$label <- factor(
  mali_plot_data$label,
  levels = unique(mali_plot_data$label)
)

# --- Domain structure ---
mali_domain_breaks <- mali_plot_data %>%
  mutate(x_pos = as.numeric(label)) %>%
  group_by(category) %>%
  summarise(x_end = max(x_pos), .groups = "drop") %>%
  filter(x_end < max(as.numeric(mali_plot_data$label)))

mali_domain_labels <- mali_plot_data %>%
  mutate(x_pos = as.numeric(label)) %>%
  group_by(category) %>%
  summarise(x_mid = mean(x_pos), .groups = "drop")

# --- Plot ---
mali_bubble <- ggplot(
  mali_plot_data,
  aes(x = as.numeric(label), y = prof_rate)
) +
  
  geom_vline(
    xintercept = mali_domain_breaks$x_end + 0.5,
    colour = "grey80",
    linewidth = 0.4
  ) +
  
  geom_hline(
    yintercept = mali_overall_rate,
    linetype = "dashed",
    colour = "grey40",
    linewidth = 0.6
  ) +
  
  annotate(
    "text",
    x = 1,
    y = mali_overall_rate + 0.03,
    label = paste0("Mali average: ", round(mali_overall_rate * 100), "%"),
    hjust = 0,
    size = 3.2,
    colour = "grey30",
    fontface = "italic"
  ) +
  
  annotate(
    "text",
    x = mali_domain_labels$x_mid,
    y = 1.08,
    label = mali_domain_labels$category,
    fontface = "bold",
    colour = "grey40",
    size = 3
  ) +
  
  geom_point(
    aes(size = headcount, fill = category),
    shape = 21,
    colour = "white",
    stroke = 0.8,
    alpha = 0.85
  ) +
  
  geom_text(
    aes(label = paste0(round(prof_rate * 100), "%")),
    colour = "white",
    fontface = "bold",
    size = 2.8
  ) +
  
  scale_fill_manual(
    values = c(
      "Gender"    = "#2171B5",
      "Location"  = "#E6AB02",
      "Enrolment" = "#7570B3",
      "Grade"     = "#1B9E77",
      "Age Group" = "#D95F02"
    ),
    name = "Domain",
    guide = guide_legend(order = 1)
  ) +
  
  scale_size_continuous(
    range = c(4, 20),
    labels = comma,
    name = "Number of children (weighted)",
    guide = guide_legend(
      override.aes = list(fill = "grey70", colour = "white"),
      order = 2
    )
  ) +
  
  scale_x_continuous(
    breaks = seq_along(levels(mali_plot_data$label)),
    labels = levels(mali_plot_data$label),
    expand = expansion(add = 0.6)
  ) +
  
  scale_y_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, 1.12),
    breaks = seq(0, 1, 0.2)
  ) +
  
  coord_cartesian(clip = "off") +
  
  labs(
    title = "Numeracy proficiency by child characteristics — Mali",
    subtitle = "Bubble size = weighted total number of children. Dashed line = Mali average.",
    x = NULL,
    y = NULL,
    caption = "Source: ICAN-ICAR 2025, Mali. Estimates are weighted proportions."
  ) +
  
  theme_minimal(base_size = 11) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.3),
    legend.position = "bottom",
    legend.box = "horizontal"
  )

mali_bubble