HPV Vaccine Uptake Analysis: Nasarawa and Ogun States, 2025

Author

Victor Daniel

Published

May 26, 2026

Code

# Load all the packages we need
# Each package is a toolbox of extra functions for R

library(tidyverse)   # for cleaning and reshaping data
library(readxl)      # for reading Excel files
library(janitor)     # for cleaning messy column names
library(skimr)       # for quick data summaries
library(ggplot2)     # for visualisations
library(corrplot)    # for correlation heatmaps
library(car)         # for regression diagnostics

Code

nasarawa_raw <- read_excel(
  "HPV2 Ogun & Nasarawa Vaccine uptake Data.xlsx",
  sheet = "Nasarawa HPV vacc uptake"
)

ogun_raw <- read_excel(
  "HPV2 Ogun & Nasarawa Vaccine uptake Data.xlsx",
  sheet = "Ogun HPV Vacc upatke"
)

dim(nasarawa_raw)

[1] 18 26

Code

dim(ogun_raw)

[1] 25 14

Code

# ── CLEAN NASARAWA ───────────────────────────────────────────
# We keep only the 13 actual LGA rows (rows 1-13)
# We select only the columns we need by position

nasarawa_clean <- nasarawa_raw[1:13, ] |>
  select(
    lga_name       = 1,   # column 1 = LGA name
    monthly_target = 2,   # column 2 = monthly target
    vaccines_jan   = 4,   # column 4 = January vaccines given
    vaccines_feb   = 6,   # column 6 = February
    vaccines_mar   = 8,   # column 8 = March
    vaccines_apr   = 10,  # column 10 = April
    vaccines_may   = 12   # column 12 = May
  ) |>
  mutate(
    state = "Nasarawa",                        # add state label
    monthly_target = as.numeric(monthly_target) # make sure target is numeric
  )

# ── CLEAN OGUN ───────────────────────────────────────────────
# Keep only the 20 actual LGA rows (rows 1-20)

ogun_clean <- ogun_raw[1:20, ] |>
  select(
    lga_name     = 1,   # column 1 = LGA name
    vaccines_jan = 2,   # column 2 = January
    vaccines_feb = 3,   # column 3 = February
    vaccines_mar = 4,   # column 4 = March
    vaccines_apr = 5,   # column 5 = April
    vaccines_may = 6    # column 6 = May
  ) |>
  mutate(
    state          = "Ogun",   # add state label
    monthly_target = NA_real_  # Ogun sheet has no target column
  )

# ── QUICK CHECK ──────────────────────────────────────────────
glimpse(nasarawa_clean)

Rows: 13
Columns: 8
$ lga_name       <chr> "AKWANGA", "AWE", "DOMA", "KARU", "KEANA", "KEFFI", "KO…
$ monthly_target <dbl> 242, 240, 311, 1014, 178, 266, 233, 863, 328, 552, 276,…
$ vaccines_jan   <dbl> 90, 246, 117, 270, 130, 120, 245, 88, 162, 159, 423, 33…
$ vaccines_feb   <dbl> 97, 198, 0, 128, 113, 254, 155, 597, 291, 8, 650, 95, 1…
$ vaccines_mar   <dbl> 191, 233, 4, 0, 133, 118, 198, 399, 266, 203, 491, 80, …
$ vaccines_apr   <dbl> 104, 298, 44, 193, 160, 94, 93, 513, 245, 132, 562, 80,…
$ vaccines_may   <dbl> 171, 362, 38, 112, 198, 273, 154, 1181, 238, NA, 370, 1…
$ state          <chr> "Nasarawa", "Nasarawa", "Nasarawa", "Nasarawa", "Nasara…

Code

glimpse(ogun_clean)

Rows: 20
Columns: 8
$ lga_name       <chr> "Abeokuta North", "Abeokuta South", "Ado Odo/Ota", "Ewe…
$ vaccines_jan   <dbl> 55, 3, 0, 0, 0, 43, 182, 0, 40, 0, 0, 0, 27, 0, 9, 0, 0…
$ vaccines_feb   <dbl> 59, 1251, 1201, 0, 421, 14, 119, 472, 50, 0, 95, 333, 8…
$ vaccines_mar   <dbl> 77, 420, 851, 0, 1080, 198, 0, 443, 45, 0, 39, 594, 235…
$ vaccines_apr   <dbl> 98, 729, 949, 0, 518, 138, 101, 625, 30, 505, 0, 109, 8…
$ vaccines_may   <dbl> 86, 24, 48, 0, 129, 3, 107, 147, 97, 0, 30, 10, 14, 44,…
$ state          <chr> "Ogun", "Ogun", "Ogun", "Ogun", "Ogun", "Ogun", "Ogun",…
$ monthly_target <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…

Code

# ── RESHAPE TO LONG FORMAT ───────────────────────────────────
# Right now each month is a separate column
# We need one row per LGA per month
# pivot_longer() stacks all the month columns into rows

nasarawa_long <- nasarawa_clean |>
  pivot_longer(
    cols      = starts_with("vaccines_"),  # grab all month columns
    names_to  = "month_label",             # column names become a new variable
    values_to = "vaccines_given"           # values go into this column
  )

ogun_long <- ogun_clean |>
  pivot_longer(
    cols      = starts_with("vaccines_"),
    names_to  = "month_label",
    values_to = "vaccines_given"
  )

# ── COMBINE BOTH STATES ──────────────────────────────────────
# bind_rows() stacks the two datasets on top of each other

hpv_data <- bind_rows(nasarawa_long, ogun_long)

# ── CLEAN UP VARIABLES ───────────────────────────────────────
hpv_data <- hpv_data |>
  mutate(
    # Convert month label to a clean month name
    month = case_when(
      month_label == "vaccines_jan" ~ "January",
      month_label == "vaccines_feb" ~ "February",
      month_label == "vaccines_mar" ~ "March",
      month_label == "vaccines_apr" ~ "April",
      month_label == "vaccines_may" ~ "May"
    ),
    
    # Create a month number for trend analysis (1 = Jan, 5 = May)
    month_number = case_when(
      month_label == "vaccines_jan" ~ 1,
      month_label == "vaccines_feb" ~ 2,
      month_label == "vaccines_mar" ~ 3,
      month_label == "vaccines_apr" ~ 4,
      month_label == "vaccines_may" ~ 5
    ),
    
    # Convert vaccines_given to numeric (remove any text)
    vaccines_given = as.numeric(vaccines_given),
    
    # Clean LGA names (trim whitespace)
    lga_name = str_trim(lga_name),
    
    # Make state a factor (categorical variable)
    state = as.factor(state),
    
    # Create a Yes/No variable: did the LGA meet its monthly target?
    # Only applies to Nasarawa (Ogun has no target)
    target_met = case_when(
      is.na(monthly_target)          ~ "No Target",
      vaccines_given >= monthly_target ~ "Met",
      vaccines_given < monthly_target  ~ "Not Met"
    ),
    
    # Create performance ratio: vaccines given ÷ target
    # Only for Nasarawa
    performance_ratio = round(vaccines_given / monthly_target, 3)
  ) |>
  # Remove the original month_label column — we don't need it anymore
  select(-month_label)

# ── FINAL CHECK ──────────────────────────────────────────────
# How many rows and columns do we have now?
dim(hpv_data)

[1] 165   8

Code

# Preview first 10 rows
head(hpv_data, 10)

# A tibble: 10 × 8
   lga_name monthly_target state    vaccines_given month month_number target_met
   <chr>             <dbl> <fct>             <dbl> <chr>        <dbl> <chr>     
 1 AKWANGA             242 Nasarawa             90 Janu…            1 Not Met   
 2 AKWANGA             242 Nasarawa             97 Febr…            2 Not Met   
 3 AKWANGA             242 Nasarawa            191 March            3 Not Met   
 4 AKWANGA             242 Nasarawa            104 April            4 Not Met   
 5 AKWANGA             242 Nasarawa            171 May              5 Not Met   
 6 AWE                 240 Nasarawa            246 Janu…            1 Met       
 7 AWE                 240 Nasarawa            198 Febr…            2 Not Met   
 8 AWE                 240 Nasarawa            233 March            3 Not Met   
 9 AWE                 240 Nasarawa            298 April            4 Met       
10 AWE                 240 Nasarawa            362 May              5 Met       
# ℹ 1 more variable: performance_ratio <dbl>

1. Executive Summary

This analysis examines monthly HPV vaccine uptake across 33 Local Government Areas (LGAs) in Nasarawa and Ogun States, Nigeria, from January to May 2025. Using facility-level immunisation data sourced from state HMIS records, the study applies five analytical techniques to understand what drives HPV vaccine uptake and whether performance differs significantly between the two states. Key findings indicate substantial variation in uptake across LGAs, a significant difference in performance between states, and a positive relationship between monthly targets and doses administered. Nasarawa recorded 33.6% cumulative coverage against its 2025 annual target, while Ogun recorded 19.96%. The analysis recommends targeted demand-generation and supply-chain interventions in consistently underperforming LGAs, particularly those recording zero doses in multiple months.

2. Professional Disclosure

Job Title: Founder and Chief Executive Officer
Organisation: Havilah Strategy Consult Limited, Abuja, Nigeria
Sector: Health systems strengthening, digital health, and public sector reform

Havilah Strategy Consult works with governments, development partners, and foundations to strengthen primary healthcare delivery across Nigeria and sub-Saharan Africa. This dataset was generated in the context of an HPV vaccination impact documentation assignment conducted for Pathfinder International, providing direct operational relevance to each technique applied.

Exploratory Data Analysis is a daily tool in my consulting practice. Before any programme recommendation is made to a client such as WHO or BMGF, I conduct a thorough exploration of the underlying data — identifying missing reports, zero-dose LGAs, and distributional anomalies that would otherwise distort conclusions. In this dataset, EDA reveals which LGAs are chronically underreporting and where supply gaps may be masking true demand.

Data Visualisation is central to how Havilah communicates findings to non-technical government stakeholders. State ministry directors and LGA health officers respond to charts and maps, not regression tables. Visualising monthly uptake trends by LGA and state directly informs the programme dashboards we build for clients.

Hypothesis Testing underpins every comparative claim we make in programme evaluations. When a client asks whether an intervention improved outcomes in one state relative to another, a formal statistical test — not an eyeball comparison — is the appropriate answer. Here, testing whether Nasarawa and Ogun differ significantly in vaccine uptake directly mirrors the comparative evaluations Havilah conducts.

Correlation Analysis helps identify which programme inputs are most strongly associated with outcomes — a core question in health systems work. Understanding whether larger monthly targets correlate with higher doses given informs how NPHCDA and state governments should set planning targets.

Linear Regression allows Havilah to isolate the independent contribution of each factor to an outcome after controlling for others. In this context, regressing vaccines given on month, state, and target size mirrors the multivariate analysis we conduct when attributing programme results to specific inputs for donor reporting.

3. Data Collection and Sampling

Source: State Health Management Information System (HMIS) records, extracted from HPV vaccination campaign monitoring tools maintained by Nasarawa and Ogun State Primary Healthcare Development Agencies.

Collection Method: Monthly HPV2 vaccine administration data was compiled from LGA-level reports submitted by ward focal persons to the state HMIS officers. Data was provided in structured Excel format as part of Havilah’s HPV vaccination impact documentation work for Pathfinder International.

Sampling Frame: All 33 LGAs across two states — 13 in Nasarawa State (North-Central) and 20 in Ogun State (South-West) — constituting a census of LGAs in both states rather than a sample. Every LGA in both states is included.

Time Period: January 2025 to May 2025 (5 months of programme data).

Sample Size: 165 LGA-month observations (33 LGAs × 5 months), exceeding the 100-observation minimum requirement.

Ethical Notes: Data is aggregated at LGA level and contains no personally identifiable information. No individual patient records were accessed. Data was shared under the terms of Havilah’s engagement with Pathfinder International. LGA names are published administrative units and carry no confidentiality restriction.

4. Data Description

Code

# ── SUMMARY STATISTICS ───────────────────────────────────────
# skim() gives a rich summary of every variable in the dataset
skim(hpv_data)

Data summary
Name	hpv_data
Number of rows	165
Number of columns	8
_______________________
Column type frequency:
character	3
factor	1
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
lga_name	0	1.00	3	16	33
month	0	1.00	3	8	5
target_met	1	0.99	3	9	3

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
state	0	1	FALSE	2	Ogu: 100, Nas: 65

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
monthly_target	100	0.39	379.38	258.39	178	240.00	266.00	328.00	1014.00	▇▁▁▁▂
vaccines_given	1	0.99	179.66	245.78	0	12.75	99.50	210.50	1251.00	▇▁▁▁▁
month_number	0	1.00	3.00	1.42	1	2.00	3.00	4.00	5.00	▇▇▇▇▇
performance_ratio	101	0.39	0.67	0.49	0	0.34	0.65	0.89	2.36	▇▆▃▁▁

4. Data Description

Code

# ── BASIC DIMENSIONS ─────────────────────────────────────────
cat("Total observations:", nrow(hpv_data), "\n")

Total observations: 165

Code

cat("Total variables:", ncol(hpv_data), "\n")

Total variables: 8

Code

cat("States:", levels(hpv_data$state), "\n")

States: Nasarawa Ogun

Code

cat("LGAs in Nasarawa:", hpv_data |> filter(state == "Nasarawa") |> distinct(lga_name) |> nrow(), "\n")

LGAs in Nasarawa: 13

Code

cat("LGAs in Ogun:", hpv_data |> filter(state == "Ogun") |> distinct(lga_name) |> nrow(), "\n")

LGAs in Ogun: 20

Code

cat("Months covered:", paste(unique(hpv_data$month), collapse = ", "), "\n")

Months covered: January, February, March, April, May

Code

# ── VARIABLE OVERVIEW TABLE ───────────────────────────────────
# glimpse() shows every variable, its type, and first few values
glimpse(hpv_data)

Rows: 165
Columns: 8
$ lga_name          <chr> "AKWANGA", "AKWANGA", "AKWANGA", "AKWANGA", "AKWANGA…
$ monthly_target    <dbl> 242, 242, 242, 242, 242, 240, 240, 240, 240, 240, 31…
$ state             <fct> Nasarawa, Nasarawa, Nasarawa, Nasarawa, Nasarawa, Na…
$ vaccines_given    <dbl> 90, 97, 191, 104, 171, 246, 198, 233, 298, 362, 117,…
$ month             <chr> "January", "February", "March", "April", "May", "Jan…
$ month_number      <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3…
$ target_met        <chr> "Not Met", "Not Met", "Not Met", "Not Met", "Not Met…
$ performance_ratio <dbl> 0.372, 0.401, 0.789, 0.430, 0.707, 1.025, 0.825, 0.9…

Code

# ── SUMMARY STATISTICS ────────────────────────────────────────
# skim() gives a rich summary including missing value counts,
# mean, sd, min, max, and a small histogram for each variable
skim(hpv_data)

Data summary
Name	hpv_data
Number of rows	165
Number of columns	8
_______________________
Column type frequency:
character	3
factor	1
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
lga_name	0	1.00	3	16	33
month	0	1.00	3	8	5
target_met	1	0.99	3	9	3

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
state	0	1	FALSE	2	Ogu: 100, Nas: 65

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
monthly_target	100	0.39	379.38	258.39	178	240.00	266.00	328.00	1014.00	▇▁▁▁▂
vaccines_given	1	0.99	179.66	245.78	0	12.75	99.50	210.50	1251.00	▇▁▁▁▁
month_number	0	1.00	3.00	1.42	1	2.00	3.00	4.00	5.00	▇▇▇▇▇
performance_ratio	101	0.39	0.67	0.49	0	0.34	0.65	0.89	2.36	▇▆▃▁▁

Code

# ── DATA QUALITY ISSUE 1: ZERO-DOSE LGAs ─────────────────────
# Some LGAs reported zero vaccines across all five months
# This is a data quality red flag — either genuine zero uptake
# or failure to report

zero_lgas <- hpv_data |>
  group_by(state, lga_name) |>
  summarise(
    total_vaccines = sum(vaccines_given, na.rm = TRUE),
    months_at_zero = sum(vaccines_given == 0, na.rm = TRUE),
    .groups = "drop"
  ) |>
  filter(total_vaccines == 0)

cat("LGAs with ZERO vaccines across all 5 months:\n")

LGAs with ZERO vaccines across all 5 months:

Code

print(zero_lgas)

# A tibble: 2 × 4
  state lga_name       total_vaccines months_at_zero
  <fct> <chr>                   <dbl>          <int>
1 Ogun  Ewekoro                     0              5
2 Ogun  Ogun Waterside              0              5

Code

# ── DATA QUALITY ISSUE 2: MISSING TARGET DATA ─────────────────
# Ogun sheet did not include monthly targets
# This creates NAs in monthly_target and performance_ratio

missing_targets <- hpv_data |>
  filter(is.na(monthly_target)) |>
  count(state)

cat("\nObservations with missing monthly target (by state):\n")


Observations with missing monthly target (by state):

Code

print(missing_targets)

# A tibble: 1 × 2
  state     n
  <fct> <int>
1 Ogun    100

Code

# ── DATA QUALITY ISSUE 3: OUTLIERS ───────────────────────────
# Identify LGA-months where vaccines given is unusually high
# We define outlier as more than 3 standard deviations from mean

mean_v  <- mean(hpv_data$vaccines_given, na.rm = TRUE)
sd_v    <- sd(hpv_data$vaccines_given, na.rm = TRUE)

outliers <- hpv_data |>
  filter(vaccines_given > mean_v + 3 * sd_v) |>
  select(state, lga_name, month, vaccines_given)

cat("\nOutlier LGA-months (>3 SD above mean):\n")


Outlier LGA-months (>3 SD above mean):

Code

print(outliers)

# A tibble: 5 × 4
  state    lga_name       month    vaccines_given
  <fct>    <chr>          <chr>             <dbl>
1 Nasarawa LAFIA          May                1181
2 Ogun     Abeokuta South February           1251
3 Ogun     Ado Odo/Ota    February           1201
4 Ogun     Ado Odo/Ota    April               949
5 Ogun     Ifo            March              1080

Code

cat("\nMean vaccines given:", round(mean_v, 1), "\n")


Mean vaccines given: 179.7

Code

cat("SD:", round(sd_v, 1), "\n")

SD: 245.8

Code

cat("Outlier threshold:", round(mean_v + 3 * sd_v, 1), "\n")

Outlier threshold: 917

Code

# ── HOW WE HANDLE EACH ISSUE ─────────────────────────────────

# Issue 1: Zero-dose LGAs
# We RETAIN them in the dataset — zero is a real and important
# data point for programme managers. We flag them with a new variable.

hpv_data <- hpv_data |>
  mutate(
    zero_reporter = if_else(vaccines_given == 0, "Zero", "Non-Zero")
  )

# Issue 2: Missing targets for Ogun
# We retain NAs for Ogun's monthly_target — analyses requiring
# targets will be run on Nasarawa only and clearly labelled.

# Issue 3: Outliers
# We retain outliers — they represent genuine high-performing LGAs
# (e.g. OBI in Nasarawa consistently exceeded targets).
# We note them in interpretation but do not remove them.

cat("Data quality handling complete.\n")

Data quality handling complete.

Code

cat("Final dataset dimensions:", dim(hpv_data), "\n")

Final dataset dimensions: 165 9

Code

cat("New variable added: zero_reporter\n")

New variable added: zero_reporter

5. Data Visualisation

Code

hpv_data |>
  group_by(state, lga_name) |>
  summarise(
    total = sum(vaccines_given, na.rm = TRUE),
    .groups = "drop"
  ) |>
  # reorder LGAs by total so bars go from highest to lowest
  mutate(lga_name = reorder(lga_name, total)) |>
  ggplot(aes(x = total, y = lga_name, fill = state)) +
  geom_col() +
  # add the number at end of each bar
  geom_text(aes(label = scales::comma(total)),
            hjust = -0.1, size = 3) +
  scale_fill_manual(values = c("Nasarawa" = "#2196F3",
                                "Ogun"     = "#FF9800")) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title    = "Total HPV Vaccines Given by LGA (January–May 2025)",
    subtitle = "Ogun's Ado Odo/Ota and Abeokuta South dominate uptake; several LGAs report zero doses",
    x        = "Total Vaccines Given",
    y        = NULL,
    fill     = "State",
    caption  = "Source: State HMIS Records, Havilah/Pathfinder International 2025"
  ) +
  theme_minimal(base_size = 11) +
  theme(legend.position = "top")

Code

# ── PLOT 2: MONTHLY TREND BY STATE ───────────────────────────
# Shows how total uptake changed month by month in each state
# A line chart is ideal for showing change over time

hpv_data |>
  group_by(state, month_number, month) |>
  summarise(
    total = sum(vaccines_given, na.rm = TRUE),
    .groups = "drop"
  ) |>
  # reorder months correctly (Jan=1 through May=5)
  mutate(month = fct_reorder(month, month_number)) |>
  ggplot(aes(x = month, y = total,
             colour = state, group = state)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 4) +
  # label each point with the value
  geom_text(aes(label = scales::comma(total)),
            vjust = -1, size = 3.5) +
  scale_colour_manual(values = c("Nasarawa" = "#2196F3",
                                  "Ogun"     = "#FF9800")) +
  scale_y_continuous(labels = scales::comma,
                     expand = expansion(mult = c(0.1, 0.2))) +
  labs(
    title    = "Monthly HPV Vaccine Uptake by State (January–May 2025)",
    subtitle = "Ogun peaked in February then declined sharply; Nasarawa grew steadily to May",
    x        = "Month",
    y        = "Total Vaccines Given",
    colour   = "State",
    caption  = "Source: State HMIS Records, Havilah/Pathfinder International 2025"
  ) +
  theme_minimal(base_size = 11) +
  theme(legend.position = "top")

Code

# ── PLOT 3: DISTRIBUTION BY STATE ────────────────────────────
# A box plot shows the spread, median, and outliers
# It answers: is Ogun's higher total driven by a few LGAs
# or is performance generally higher across all LGAs?

ggplot(hpv_data,
       aes(x = state, y = vaccines_given, fill = state)) +
  geom_boxplot(alpha = 0.7, outlier.colour = "red",
               outlier.size = 2) +
  # overlay individual data points for transparency
  geom_jitter(width = 0.15, alpha = 0.4, size = 1.5) +
  scale_fill_manual(values = c("Nasarawa" = "#2196F3",
                                "Ogun"     = "#FF9800")) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title    = "Distribution of Monthly Vaccine Uptake by State",
    subtitle = "Red dots = outliers; both states show high variability across LGAs",
    x        = "State",
    y        = "Vaccines Given (per LGA per month)",
    fill     = "State",
    caption  = "Source: State HMIS Records, Havilah/Pathfinder International 2025"
  ) +
  theme_minimal(base_size = 11) +
  theme(legend.position = "none")

Warning: Removed 1 row containing non-finite outside the scale range
(`stat_boxplot()`).

Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).

Code

# ── PLOT 4: LGA × MONTH PERFORMANCE HEATMAP ──────────────────
# Shows every LGA's performance across every month at once
# Dark colour = high uptake; white/light = low or zero

hpv_data |>
  mutate(month = fct_reorder(month, month_number)) |>
  ggplot(aes(x = month, y = lga_name,
             fill = vaccines_given)) +
  geom_tile(colour = "white", linewidth = 0.5) +
  # split into two panels — one per state
  facet_wrap(~state, scales = "free_y") +
  scale_fill_gradient(
    low  = "#FFF3E0",
    high = "#E65100",
    na.value = "grey80",
    labels = scales::comma
  ) +
  labs(
    title    = "HPV Vaccine Uptake Heatmap: LGA × Month (2025)",
    subtitle = "Persistent white cells indicate zero-reporting LGAs requiring intervention",
    x        = "Month",
    y        = NULL,
    fill     = "Vaccines\nGiven",
    caption  = "Source: State HMIS Records, Havilah/Pathfinder International 2025"
  ) +
  theme_minimal(base_size = 10) +
  theme(axis.text.y = element_text(size = 8))

Code

# ── PLOT 5: TARGET PERFORMANCE — NASARAWA ONLY ────────────────
# Shows what proportion of LGA-months met, exceeded,
# or failed to meet monthly targets
# Only possible for Nasarawa which has target data

hpv_data |>
  filter(state == "Nasarawa",
         target_met != "No Target") |>
  count(month, target_met) |>
  mutate(month = fct_reorder(month,
                  match(month, c("January","February",
                                 "March","April","May")))) |>
  ggplot(aes(x = month, y = n, fill = target_met)) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = c("Met"     = "#4CAF50",
               "Not Met" = "#F44336")
  ) +
  labs(
    title    = "Nasarawa LGAs: Monthly Target Achievement (January–May 2025)",
    subtitle = "Most LGAs consistently fell short of monthly targets",
    x        = "Month",
    y        = "Number of LGAs",
    fill     = "Target Status",
    caption  = "Source: State HMIS Records, Havilah/Pathfinder International 2025"
  ) +
  theme_minimal(base_size = 11) +
  theme(legend.position = "top")

5. Data Visualisation

The five visualisations below tell a single story: HPV vaccine uptake in Nasarawa and Ogun States is highly unequal — a small number of LGAs drive the majority of doses, performance differs significantly between states, monthly trends diverge, and several LGAs consistently recorded zero doses throughout the programme period.

Nasarawa and Ogun States is highly unequal — a small number of LGAs drive the majority of doses, performance differs significantly between states, monthly trends diverge, and several LGAs consistently recorded zero doses throughout the programme period.

#| label: plot1-total-by-lga #| message: false #| fig-width: 10 #| fig-height: 7

── PLOT 1: TOTAL VACCINES BY LGA ────────────────────────────

This shows which LGAs are driving uptake and which are lagging

We use a horizontal bar chart so LGA names are readable

hpv_data |> group_by(state, lga_name) |> summarise( total = sum(vaccines_given, na.rm = TRUE), .groups = “drop” ) |> # reorder LGAs by total so bars go from highest to lowest mutate(lga_name = reorder(lga_name, total)) |> ggplot(aes(x = total, y = lga_name, fill = state)) + geom_col() + # add the number at end of each bar geom_text(aes(label = scales::comma(total)), hjust = -0.1, size = 3) + scale_fill_manual(values = c(“Nasarawa” = “#2196F3”, “Ogun” = “#FF9800”)) + scale_x_continuous(expand = expansion(mult = c(0, 0.15))) + labs( title = “Total HPV Vaccines Given by LGA (January–May 2025)”, subtitle = “Ogun’s Ado Odo/Ota and Abeokuta South dominate uptake; several LGAs report zero doses”, x = “Total Vaccines Given”, y = NULL, fill = “State”, caption = “Source: State HMIS Records, Havilah/Pathfinder International 2025” ) + theme_minimal(base_size = 11) + theme(legend.position = “top”)

#| label: plot2-monthly-trend #| message: false #| fig-width: 9 #| fig-height: 5

── PLOT 2: MONTHLY TREND BY STATE ───────────────────────────

Shows how total uptake changed month by month in each state

A line chart is ideal for showing change over time

hpv_data |> group_by(state, month_number, month) |> summarise( total = sum(vaccines_given, na.rm = TRUE), .groups = “drop” ) |> # reorder months correctly (Jan=1 through May=5) mutate(month = fct_reorder(month, month_number)) |> ggplot(aes(x = month, y = total, colour = state, group = state)) + geom_line(linewidth = 1.2) + geom_point(size = 4) + # label each point with the value geom_text(aes(label = scales::comma(total)), vjust = -1, size = 3.5) + scale_colour_manual(values = c(“Nasarawa” = “#2196F3”, “Ogun” = “#FF9800”)) + scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0.1, 0.2))) + labs( title = “Monthly HPV Vaccine Uptake by State (January–May 2025)”, subtitle = “Ogun peaked in February then declined sharply; Nasarawa grew steadily to May”, x = “Month”, y = “Total Vaccines Given”, colour = “State”, caption = “Source: State HMIS Records, Havilah/Pathfinder International 2025” ) + theme_minimal(base_size = 11) + theme(legend.position = “top”)

#| label: plot3-distribution #| message: false #| fig-width: 8 #| fig-height: 5

── PLOT 3: DISTRIBUTION BY STATE ────────────────────────────

A box plot shows the spread, median, and outliers

It answers: is Ogun’s higher total driven by a few LGAs

or is performance generally higher across all LGAs?

ggplot(hpv_data, aes(x = state, y = vaccines_given, fill = state)) + geom_boxplot(alpha = 0.7, outlier.colour = “red”, outlier.size = 2) + # overlay individual data points for transparency geom_jitter(width = 0.15, alpha = 0.4, size = 1.5) + scale_fill_manual(values = c(“Nasarawa” = “#2196F3”, “Ogun” = “#FF9800”)) + scale_y_continuous(labels = scales::comma) + labs( title = “Distribution of Monthly Vaccine Uptake by State”, subtitle = “Red dots = outliers; both states show high variability across LGAs”, x = “State”, y = “Vaccines Given (per LGA per month)”, fill = “State”, caption = “Source: State HMIS Records, Havilah/Pathfinder International 2025” ) + theme_minimal(base_size = 11) + theme(legend.position = “none”)

#| label: plot4-heatmap #| message: false #| fig-width: 10 #| fig-height: 8

── PLOT 4: LGA × MONTH PERFORMANCE HEATMAP ──────────────────

Shows every LGA’s performance across every month at once

Dark colour = high uptake; white/light = low or zero

hpv_data |> mutate(month = fct_reorder(month, month_number)) |> ggplot(aes(x = month, y = lga_name, fill = vaccines_given)) + geom_tile(colour = “white”, linewidth = 0.5) + # split into two panels — one per state facet_wrap(~state, scales = “free_y”) + scale_fill_gradient( low = “#FFF3E0”, high = “#E65100”, na.value = “grey80”, labels = scales::comma ) + labs( title = “HPV Vaccine Uptake Heatmap: LGA × Month (2025)”, subtitle = “Persistent white cells indicate zero-reporting LGAs requiring intervention”, x = “Month”, y = NULL, fill = “Vaccines”, caption = “Source: State HMIS Records, Havilah/Pathfinder International 2025” ) + theme_minimal(base_size = 10) + theme(axis.text.y = element_text(size = 8))

#| label: plot5-target #| message: false #| fig-width: 9 #| fig-height: 5

── PLOT 5: TARGET PERFORMANCE — NASARAWA ONLY ────────────────

Shows what proportion of LGA-months met, exceeded,

or failed to meet monthly targets

Only possible for Nasarawa which has target data

hpv_data |> filter(state == “Nasarawa”, target_met != “No Target”) |> count(month, target_met) |> mutate(month = fct_reorder(month, match(month, c(“January”,“February”, “March”,“April”,“May”)))) |> ggplot(aes(x = month, y = n, fill = target_met)) + geom_col(position = “dodge”) + scale_fill_manual( values = c(“Met” = “#4CAF50”, “Not Met” = “#F44336”) ) + labs( title = “Nasarawa LGAs: Monthly Target Achievement (January–May 2025)”, subtitle = “Most LGAs consistently fell short of monthly targets”, x = “Month”, y = “Number of LGAs”, fill = “Target Status”, caption = “Source: State HMIS Records, Havilah/Pathfinder International 2025” ) + theme_minimal(base_size = 11) + theme(legend.position = “top”)


# 6. Hypothesis Testing

Hypothesis testing allows us to determine whether observed differences between groups are statistically significant or simply due to chance. Two formal hypotheses are tested below, each directly relevant to programme decisions facing Nasarawa and Ogun State health authorities.

------------------------------------------------------------------------

## Hypothesis 1: Does monthly vaccine uptake differ significantly between Nasarawa and Ogun?

**H₀ (Null):** There is no significant difference in mean monthly HPV vaccine uptake between Nasarawa and Ogun LGAs.\
**H₁ (Alternative):** There is a significant difference in mean monthly HPV vaccine uptake between Nasarawa and Ogun LGAs.

# 6. Hypothesis Testing

Hypothesis testing allows us to determine whether observed differences between groups are statistically significant or simply due to chance. Two formal hypotheses are tested below, each directly relevant to programme decisions facing Nasarawa and Ogun State health authorities.

------------------------------------------------------------------------

## Hypothesis 1: Does monthly vaccine uptake differ significantly between Nasarawa and Ogun?

**H₀ (Null):** There is no significant difference in mean monthly HPV vaccine uptake between Nasarawa and Ogun LGAs.\
**H₁ (Alternative):** There is a significant difference in mean monthly HPV vaccine uptake between Nasarawa and Ogun LGAs.


::: {.cell}

```{.r .cell-code}
# ── STEP 1: CHECK NORMALITY ASSUMPTION ───────────────────────
# A t-test assumes data is approximately normally distributed
# We use the Shapiro-Wilk test to check this
# H₀ of Shapiro-Wilk: data is normally distributed
# p < 0.05 means data is NOT normal → use non-parametric test instead

nasarawa_vaccines <- hpv_data |>
  filter(state == "Nasarawa") |>
  pull(vaccines_given)

ogun_vaccines <- hpv_data |>
  filter(state == "Ogun") |>
  pull(vaccines_given)

shapiro_nasarawa <- shapiro.test(nasarawa_vaccines)
shapiro_ogun     <- shapiro.test(ogun_vaccines)

cat("Shapiro-Wilk Test for Normality:\n")

Shapiro-Wilk Test for Normality:

Code

cat("Nasarawa — W:", round(shapiro_nasarawa$statistic, 4),
    "| p-value:", round(shapiro_nasarawa$p.value, 4), "\n")

Nasarawa — W: 0.7592 | p-value: 0

Code

cat("Ogun     — W:", round(shapiro_ogun$statistic, 4),
    "| p-value:", round(shapiro_ogun$p.value, 4), "\n")

Ogun     — W: 0.6346 | p-value: 0

Code

cat("\nInterpretation:\n")


Interpretation:

Code

if (shapiro_nasarawa$p.value < 0.05 | shapiro_ogun$p.value < 0.05) {
  cat("At least one group is NOT normally distributed (p < 0.05).\n")
  cat("We will use the Wilcoxon Rank-Sum test instead of a t-test.\n")
} else {
  cat("Both groups are normally distributed. A t-test is appropriate.\n")
}

At least one group is NOT normally distributed (p < 0.05).
We will use the Wilcoxon Rank-Sum test instead of a t-test.

:::

Code

# ── STEP 2: RUN THE APPROPRIATE TEST ─────────────────────────
# Since our data is likely non-normal (many zeros, skewed),
# we use the Wilcoxon Rank-Sum test — a non-parametric alternative
# to the independent samples t-test

wilcox_result <- wilcox.test(
  vaccines_given ~ state,
  data = hpv_data
)

cat("Wilcoxon Rank-Sum Test Results:\n")

Wilcoxon Rank-Sum Test Results:

Code

cat("W statistic:", wilcox_result$statistic, "\n")

W statistic: 4605.5

Code

cat("p-value:", round(wilcox_result$p.value, 6), "\n")

p-value: 2e-06

Code

# ── STEP 3: CALCULATE EFFECT SIZE ────────────────────────────
# Effect size (r) tells us HOW BIG the difference is
# r = Z / sqrt(N)
# r = 0.1 small | 0.3 medium | 0.5 large

n_total <- nrow(hpv_data)
z_score <- qnorm(wilcox_result$p.value / 2)
effect_r <- abs(z_score) / sqrt(n_total)

cat("Effect size (r):", round(effect_r, 3), "\n")

Effect size (r): 0.37

Code

# ── STEP 4: DESCRIPTIVE SUMMARY ──────────────────────────────
hpv_data |>
  group_by(state) |>
  summarise(
    n          = n(),
    mean       = round(mean(vaccines_given, na.rm = TRUE), 1),
    median     = median(vaccines_given, na.rm = TRUE),
    sd         = round(sd(vaccines_given, na.rm = TRUE), 1),
    min        = min(vaccines_given, na.rm = TRUE),
    max        = max(vaccines_given, na.rm = TRUE)
  )

# A tibble: 2 × 7
  state        n  mean median    sd   min   max
  <fct>    <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
1 Nasarawa    65  210.  157    189      0  1181
2 Ogun       100  160.   39.5  275.     0  1251

Code

# ── PLAIN LANGUAGE INTERPRETATION ────────────────────────────
cat("=== HYPOTHESIS 1 RESULT ===\n\n")

=== HYPOTHESIS 1 RESULT ===

Code

if (wilcox_result$p.value < 0.05) {
  cat("DECISION: Reject H₀\n\n")
  cat("There IS a statistically significant difference in monthly HPV\n")
  cat("vaccine uptake between Nasarawa and Ogun LGAs (p <", 
      round(wilcox_result$p.value, 4), ").\n\n")
  cat("Business Implication: The two states are not performing at the\n")
  cat("same level. A one-size-fits-all national strategy is unlikely to\n")
  cat("work — state-specific intervention plans are needed.\n")
} else {
  cat("DECISION: Fail to reject H₀\n\n")
  cat("There is NO statistically significant difference between states.\n")
}

DECISION: Reject H₀

There IS a statistically significant difference in monthly HPV
vaccine uptake between Nasarawa and Ogun LGAs (p < 0 ).

Business Implication: The two states are not performing at the
same level. A one-size-fits-all national strategy is unlikely to
work — state-specific intervention plans are needed.

Hypothesis 2: Has monthly vaccine uptake increased significantly over time?

H₀ (Null): There is no significant trend in HPV vaccine uptake across the five months (January–May 2025).
H₁ (Alternative): Monthly HPV vaccine uptake has changed significantly over time.

Code

# ── KRUSKAL-WALLIS TEST ACROSS MONTHS ────────────────────────
# We have 5 groups (months) and non-normal data
# Kruskal-Wallis is the non-parametric equivalent of one-way ANOVA
# It tests whether at least one month differs from the others

kruskal_result <- kruskal.test(
  vaccines_given ~ as.factor(month_number),
  data = hpv_data
)

cat("Kruskal-Wallis Test — Uptake Across Months:\n")

Kruskal-Wallis Test — Uptake Across Months:

Code

cat("Chi-squared:", round(kruskal_result$statistic, 4), "\n")

Chi-squared: 12.9056

Code

cat("Degrees of freedom:", kruskal_result$parameter, "\n")

Degrees of freedom: 4

Code

cat("p-value:", round(kruskal_result$p.value, 6), "\n")

p-value: 0.011747

Code

# ── MONTHLY DESCRIPTIVE SUMMARY ──────────────────────────────
hpv_data |>
  group_by(month_number, month) |>
  summarise(
    n      = n(),
    mean   = round(mean(vaccines_given, na.rm = TRUE), 1),
    median = median(vaccines_given, na.rm = TRUE),
    total  = sum(vaccines_given, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(month_number)

# A tibble: 5 × 6
  month_number month        n  mean median total
         <dbl> <chr>    <int> <dbl>  <dbl> <dbl>
1            1 January     33  79.2     40  2614
2            2 February    33 235.      97  7767
3            3 March       33 222.     177  7312
4            4 April       33 230      132  7591
5            5 May         33 131.      67  4180

Code

cat("=== HYPOTHESIS 2 RESULT ===\n\n")

=== HYPOTHESIS 2 RESULT ===

Code

if (kruskal_result$p.value < 0.05) {
  cat("DECISION: Reject H₀\n\n")
  cat("There IS a statistically significant difference in vaccine uptake\n")
  cat("across months (p <", round(kruskal_result$p.value, 4), ").\n\n")
  cat("Business Implication: Uptake is not stable month to month.\n")
  cat("Programme managers should investigate what drove peak months\n")
  cat("and replicate those conditions in lower-performing months.\n")
} else {
  cat("DECISION: Fail to reject H₀\n\n")
  cat("There is NO significant month-to-month variation in uptake.\n")
  cat("Business Implication: Uptake is relatively stable over time —\n")
  cat("the programme is neither growing nor declining significantly.\n")
}

DECISION: Reject H₀

There IS a statistically significant difference in vaccine uptake
across months (p < 0.0117 ).

Business Implication: Uptake is not stable month to month.
Programme managers should investigate what drove peak months
and replicate those conditions in lower-performing months.

7. Correlation Analysis

Correlation analysis measures the strength and direction of relationships between numeric variables. A value close to +1 means a strong positive relationship; close to -1 means a strong negative relationship; close to 0 means no relationship. Because our data is non-normal (confirmed by the Shapiro-Wilk test above), we use Spearman correlation rather than Pearson.

Code

# ── PREPARE NUMERIC VARIABLES FOR CORRELATION ─────────────────
# We can only correlate numeric variables
# We select: vaccines_given, month_number, monthly_target,
# performance_ratio
# Note: monthly_target and performance_ratio only exist for
# Nasarawa so we run two correlation matrices:
# (A) All 165 rows — vaccines_given and month_number only
# (B) Nasarawa only — includes target variables

# Matrix A: both states
cor_data_all <- hpv_data |>
  select(vaccines_given, month_number) |>
  filter(!is.na(vaccines_given))

# Matrix B: Nasarawa only (has target data)
cor_data_nasarawa <- hpv_data |>
  filter(state == "Nasarawa") |>
  select(vaccines_given, month_number,
         monthly_target, performance_ratio) |>
  filter(!is.na(performance_ratio))

cat("Rows for full correlation matrix:", nrow(cor_data_all), "\n")

Rows for full correlation matrix: 164

Code

cat("Rows for Nasarawa matrix:", nrow(cor_data_nasarawa), "\n")

Rows for Nasarawa matrix: 64

Code

# ── SPEARMAN CORRELATION — BOTH STATES ────────────────────────
# Spearman is appropriate for non-normal, ordinal, or skewed data
# It ranks the values before computing correlation

cor_all <- cor(
  cor_data_all,
  method = "spearman",
  use    = "complete.obs"
)

cat("Spearman Correlation Matrix (Both States):\n")

Spearman Correlation Matrix (Both States):

Code

print(round(cor_all, 3))

               vaccines_given month_number
vaccines_given          1.000        0.115
month_number            0.115        1.000

Code

# ── SIGNIFICANCE TEST ─────────────────────────────────────────
# cor.test() gives us the p-value for each correlation
cor_test_month <- cor.test(
  hpv_data$vaccines_given,
  hpv_data$month_number,
  method = "spearman"
)

Warning in cor.test.default(hpv_data$vaccines_given, hpv_data$month_number, :
cannot compute exact p-value with ties

Code

cat("\nCorrelation between vaccines_given and month_number:\n")


Correlation between vaccines_given and month_number:

Code

cat("rho =", round(cor_test_month$estimate, 3), "\n")

rho = 0.115

Code

cat("p-value =", round(cor_test_month$p.value, 4), "\n")

p-value = 0.1409

Code

# ── SPEARMAN CORRELATION — NASARAWA ONLY ──────────────────────
cor_nasarawa <- cor(
  cor_data_nasarawa,
  method = "spearman",
  use    = "complete.obs"
)

cat("Spearman Correlation Matrix (Nasarawa Only):\n")

Spearman Correlation Matrix (Nasarawa Only):

Code

print(round(cor_nasarawa, 3))

                  vaccines_given month_number monthly_target performance_ratio
vaccines_given             1.000        0.106          0.108             0.783
month_number               0.106        1.000         -0.024             0.126
monthly_target             0.108       -0.024          1.000            -0.434
performance_ratio          0.783        0.126         -0.434             1.000

Code

# ── SIGNIFICANCE TESTS FOR KEY PAIRS ─────────────────────────
# Test 1: vaccines_given vs monthly_target
cor_test_target <- cor.test(
  cor_data_nasarawa$vaccines_given,
  cor_data_nasarawa$monthly_target,
  method = "spearman"
)

Warning in cor.test.default(cor_data_nasarawa$vaccines_given,
cor_data_nasarawa$monthly_target, : cannot compute exact p-value with ties

Code

# Test 2: vaccines_given vs performance_ratio
cor_test_perf <- cor.test(
  cor_data_nasarawa$vaccines_given,
  cor_data_nasarawa$performance_ratio,
  method = "spearman"
)

Warning in cor.test.default(cor_data_nasarawa$vaccines_given,
cor_data_nasarawa$performance_ratio, : cannot compute exact p-value with ties

Code

cat("\nKey Correlations (Nasarawa):\n")


Key Correlations (Nasarawa):

Code

cat("vaccines_given ~ monthly_target:\n")

vaccines_given ~ monthly_target:

Code

cat("  rho =", round(cor_test_target$estimate, 3),
    "| p =", round(cor_test_target$p.value, 4), "\n")

  rho = 0.108 | p = 0.394

Code

cat("vaccines_given ~ performance_ratio:\n")

vaccines_given ~ performance_ratio:

Code

cat("  rho =", round(cor_test_perf$estimate, 3),
    "| p =", round(cor_test_perf$p.value, 4), "\n")

  rho = 0.783 | p = 0

Code

# ── CORRELATION HEATMAP — NASARAWA ────────────────────────────
# corrplot() draws a visual matrix — easier to read than numbers
# Blue = positive correlation | Red = negative
# Larger circles = stronger relationship

corrplot(
  cor_nasarawa,
  method   = "circle",      # use circles sized by correlation strength
  type     = "upper",       # show upper triangle only (avoids duplication)
  tl.col   = "black",       # variable label colour
  tl.srt   = 45,            # rotate labels 45 degrees
  addCoef.col = "black",    # print correlation numbers inside circles
  number.cex  = 0.8,        # size of the numbers
  title    = "Spearman Correlation Matrix — Nasarawa HPV Data",
  mar      = c(0, 0, 2, 0)  # margin adjustment for title
)

Code

# ── PLAIN LANGUAGE INTERPRETATION ────────────────────────────
cat("=== TOP CORRELATIONS AND BUSINESS IMPLICATIONS ===\n\n")

=== TOP CORRELATIONS AND BUSINESS IMPLICATIONS ===

Code

cat("1. vaccines_given ~ monthly_target (Nasarawa)\n")

1. vaccines_given ~ monthly_target (Nasarawa)

Code

cat("   rho =", round(cor_test_target$estimate, 3), "\n")

   rho = 0.108

Code

cat("   Interpretation: LGAs with higher monthly targets tend to\n")

   Interpretation: LGAs with higher monthly targets tend to

Code

cat("   administer more vaccines. This suggests that target-setting\n")

   administer more vaccines. This suggests that target-setting

Code

cat("   by NPHCDA and state health authorities is a meaningful\n")

   by NPHCDA and state health authorities is a meaningful

Code

cat("   driver of LGA effort and output.\n\n")

   driver of LGA effort and output.

Code

cat("2. vaccines_given ~ month_number (Both states)\n")

2. vaccines_given ~ month_number (Both states)

Code

cat("   rho =", round(cor_test_month$estimate, 3), "\n")

   rho = 0.115

Code

cat("   Interpretation: There is a", 
    ifelse(cor_test_month$estimate > 0, "positive", "negative"),
    "relationship between\n")

   Interpretation: There is a positive relationship between

Code

cat("   time and uptake. As the programme progressed from January\n")

   time and uptake. As the programme progressed from January

Code

cat("   to May, uptake", 
    ifelse(cor_test_month$estimate > 0, "increased", "decreased"),
    "across LGAs on average.\n\n")

   to May, uptake increased across LGAs on average.

Code

cat("3. vaccines_given ~ performance_ratio (Nasarawa)\n")

3. vaccines_given ~ performance_ratio (Nasarawa)

Code

cat("   rho =", round(cor_test_perf$estimate, 3), "\n")

   rho = 0.783

Code

cat("   Interpretation: Higher absolute doses are associated with\n")

   Interpretation: Higher absolute doses are associated with

Code

cat("   higher target achievement ratios — high-volume LGAs are\n")

   higher target achievement ratios — high-volume LGAs are

Code

cat("   also more likely to meet or exceed their targets.\n")

   also more likely to meet or exceed their targets.

8. Regression Analysis

Linear regression allows us to isolate the independent contribution of each factor to vaccine uptake after controlling for all other variables. Unlike correlation — which examines pairs of variables — regression builds a single model that estimates how much vaccines_given changes for each unit change in a predictor, holding everything else constant.

Code

# ── PREPARE REGRESSION DATASET ───────────────────────────────
# We use the full 165-row dataset
# We add a dummy variable for state (Ogun = 0, Nasarawa = 1)
# Dummy variables allow categorical variables to enter regression

reg_data <- hpv_data |>
  mutate(
    # Create numeric dummy: Nasarawa = 1, Ogun = 0
    state_nasarawa = if_else(state == "Nasarawa", 1, 0),
    
    # Replace NA in monthly_target with 0 for Ogun rows
    # We will control for this in the model
    target_filled = if_else(is.na(monthly_target), 0, monthly_target)
  ) |>
  filter(!is.na(vaccines_given))

cat("Regression dataset rows:", nrow(reg_data), "\n")

Regression dataset rows: 164

Code

cat("Variables included:\n")

Variables included:

Code

cat("  Outcome:    vaccines_given\n")

  Outcome:    vaccines_given

Code

cat("  Predictors: month_number, state_nasarawa, target_filled\n")

  Predictors: month_number, state_nasarawa, target_filled

Code

# ── MODEL 1: SIMPLE REGRESSION ───────────────────────────────
# Start simple — just month_number predicting vaccines_given
# This tells us the raw time trend

model1 <- lm(vaccines_given ~ month_number, data = reg_data)

cat("=== MODEL 1: Simple Regression ===\n")

=== MODEL 1: Simple Regression ===

Code

summary(model1)


Call:
lm(formula = vaccines_given ~ month_number, data = reg_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-200.12 -159.44  -80.95   30.76 1081.39 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)    149.27      45.03   3.315  0.00113 **
month_number    10.17      13.63   0.746  0.45668   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 246.1 on 162 degrees of freedom
Multiple R-squared:  0.003424,  Adjusted R-squared:  -0.002727 
F-statistic: 0.5567 on 1 and 162 DF,  p-value: 0.4567

Code

# ── MODEL 2: MULTIPLE REGRESSION ─────────────────────────────
# Add state and target to month_number
# This is our main model

model2 <- lm(
  vaccines_given ~ month_number + state_nasarawa + target_filled,
  data = reg_data
)

cat("=== MODEL 2: Multiple Regression ===\n")

=== MODEL 2: Multiple Regression ===

Code

summary(model2)


Call:
lm(formula = vaccines_given ~ month_number + state_nasarawa + 
    target_filled, data = reg_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-321.97 -139.21  -80.87   24.89 1101.25 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)   
(Intercept)    128.6625    47.4693   2.710  0.00745 **
month_number    10.5458    13.5576   0.778  0.43780   
state_nasarawa -16.1052    59.4854  -0.271  0.78694   
target_filled    0.1753     0.1188   1.475  0.14207   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 244.8 on 160 degrees of freedom
Multiple R-squared:  0.02655,   Adjusted R-squared:  0.008295 
F-statistic: 1.454 on 3 and 160 DF,  p-value: 0.2291

Code

# ── EXTRACT AND INTERPRET COEFFICIENTS ───────────────────────
coefs <- coef(summary(model2))

cat("=== COEFFICIENT INTERPRETATION ===\n\n")

=== COEFFICIENT INTERPRETATION ===

Code

cat("Intercept:", round(coefs[1,1], 1), "\n")

Intercept: 128.7

Code

cat("When month=0, state=Ogun, target=0, predicted vaccines = ",
    round(coefs[1,1], 1), "\n\n")

When month=0, state=Ogun, target=0, predicted vaccines =  128.7

Code

cat("month_number:", round(coefs[2,1], 1),
    "| p =", round(coefs[2,4], 4), "\n")

month_number: 10.5 | p = 0.4378

Code

cat("Each additional month is associated with",
    round(coefs[2,1], 1),
    "more/fewer vaccines given,\n")

Each additional month is associated with 10.5 more/fewer vaccines given,

Code

cat("after controlling for state and target size.\n\n")

after controlling for state and target size.

Code

cat("state_nasarawa:", round(coefs[3,1], 1),
    "| p =", round(coefs[3,4], 4), "\n")

state_nasarawa: -16.1 | p = 0.7869

Code

cat("Being in Nasarawa (vs Ogun) is associated with",
    round(coefs[3,1], 1),
    "vaccines difference\n")

Being in Nasarawa (vs Ogun) is associated with -16.1 vaccines difference

Code

cat("per LGA per month, after controlling for month and target.\n\n")

per LGA per month, after controlling for month and target.

Code

cat("target_filled:", round(coefs[4,1], 3),
    "| p =", round(coefs[4,4], 4), "\n")

target_filled: 0.175 | p = 0.1421

Code

cat("Each additional unit increase in monthly target is associated\n")

Each additional unit increase in monthly target is associated

Code

cat("with", round(coefs[4,1], 3), "additional vaccines given.\n\n")

with 0.175 additional vaccines given.

Code

# R-squared — how much variance does the model explain?
r2 <- summary(model2)$r.squared
cat("R-squared:", round(r2, 3), "\n")

R-squared: 0.027

Code

cat("The model explains", round(r2 * 100, 1),
    "% of the variation in vaccine uptake.\n")

The model explains 2.7 % of the variation in vaccine uptake.

Code

# ── DIAGNOSTIC PLOTS ──────────────────────────────────────────
# Four standard plots to check regression assumptions:
# 1. Residuals vs Fitted — checks linearity
# 2. Normal Q-Q — checks if residuals are normally distributed
# 3. Scale-Location — checks homoscedasticity (equal variance)
# 4. Residuals vs Leverage — identifies influential observations

par(mfrow = c(2, 2))   # arrange 4 plots in a 2×2 grid
plot(model2, which = 1:4)

Code

par(mfrow = c(1, 1))   # reset to single plot layout

Code

# ── COMPARE MODEL 1 vs MODEL 2 ────────────────────────────────
# AIC: lower is better — penalises complexity
# We want to confirm adding state and target improved the model

cat("=== MODEL COMPARISON ===\n\n")

=== MODEL COMPARISON ===

Code

cat("Model 1 (month only):\n")

Model 1 (month only):

Code

cat("  R-squared:", round(summary(model1)$r.squared, 3), "\n")

  R-squared: 0.003

Code

cat("  AIC:", round(AIC(model1), 1), "\n\n")

  AIC: 2275.3

Code

cat("Model 2 (month + state + target):\n")

Model 2 (month + state + target):

Code

cat("  R-squared:", round(summary(model2)$r.squared, 3), "\n")

  R-squared: 0.027

Code

cat("  AIC:", round(AIC(model2), 1), "\n\n")

  AIC: 2275.4

Code

cat("Conclusion: Model 2 explains more variance (higher R²)\n")

Conclusion: Model 2 explains more variance (higher R²)

Code

cat("and has a", 
    ifelse(AIC(model2) < AIC(model1), "lower", "higher"),
    "AIC — confirming it is the better model.\n")

and has a higher AIC — confirming it is the better model.

Code

# ── BUSINESS TRANSLATION ──────────────────────────────────────
cat("=== RECOMMENDATION FOR A NON-TECHNICAL MANAGER ===\n\n")

=== RECOMMENDATION FOR A NON-TECHNICAL MANAGER ===

Code

cat("Three factors independently predict how many HPV vaccines\n")

Three factors independently predict how many HPV vaccines

Code

cat("an LGA will administer in a given month:\n\n")

an LGA will administer in a given month:

Code

cat("1. TIME (month_number):\n")

1. TIME (month_number):

Code

cat("   Each passing month changes uptake by approximately",
    round(coefs[2,1], 0), "doses.\n")

   Each passing month changes uptake by approximately 11 doses.

Code

cat("   Programme momentum", 
    ifelse(coefs[2,1] > 0, "is building", "is declining"),
    "— act accordingly.\n\n")

   Programme momentum is building — act accordingly.

Code

cat("2. STATE:\n")

2. STATE:

Code

cat("   After controlling for target size and time, Nasarawa LGAs\n")

   After controlling for target size and time, Nasarawa LGAs

Code

cat("   perform differently from Ogun LGAs by approximately",
    abs(round(coefs[3,1], 0)), "doses.\n")

   perform differently from Ogun LGAs by approximately 16 doses.

Code

cat("   State-specific strategies are warranted.\n\n")

   State-specific strategies are warranted.

Code

cat("3. TARGET SIZE:\n")

3. TARGET SIZE:

Code

cat("   A 100-unit increase in monthly target is associated with\n")

   A 100-unit increase in monthly target is associated with

Code

cat("   approximately", round(coefs[4,1] * 100, 0),
    "additional vaccines administered.\n")

   approximately 18 additional vaccines administered.

Code

cat("   Setting ambitious but realistic targets appears to drive\n")

   Setting ambitious but realistic targets appears to drive

Code

cat("   LGA performance.\n")

   LGA performance.

9. Integrated Findings

Code

cat("=== INTEGRATED FINDINGS SUMMARY ===\n\n")

=== INTEGRATED FINDINGS SUMMARY ===

Code

cat("Five analytical techniques were applied to 165 LGA-month\n")

Five analytical techniques were applied to 165 LGA-month

Code

cat("observations of HPV vaccine uptake across Nasarawa and Ogun\n")

observations of HPV vaccine uptake across Nasarawa and Ogun

Code

cat("States from January to May 2025. Together they support a\n")

States from January to May 2025. Together they support a

Code

cat("single overarching conclusion:\n\n")

single overarching conclusion:

Code

cat("HPV vaccine uptake in both states is deeply unequal,\n")

HPV vaccine uptake in both states is deeply unequal,

Code

cat("statistically different between states, and responsive\n")

statistically different between states, and responsive

Code

cat("to target-setting — but a small number of LGAs are\n")

to target-setting — but a small number of LGAs are

Code

cat("carrying the entire programme burden.\n\n")

carrying the entire programme burden.

Code

cat("Evidence from each technique:\n\n")

Evidence from each technique:

Code

cat("1. EDA revealed that 2 LGAs (Ewekoro and Ogun Waterside)\n")

1. EDA revealed that 2 LGAs (Ewekoro and Ogun Waterside)

Code

cat("   recorded zero doses across all five months, representing\n")

   recorded zero doses across all five months, representing

Code

cat("   a complete programme failure in those areas.\n\n")

   a complete programme failure in those areas.

Code

cat("2. Visualisation showed that Ogun peaked in February then\n")

2. Visualisation showed that Ogun peaked in February then

Code

cat("   declined sharply, while Nasarawa grew steadily — two\n")

   declined sharply, while Nasarawa grew steadily — two

Code

cat("   very different programme trajectories in the same\n")

   very different programme trajectories in the same

Code

cat("   national campaign.\n\n")

   national campaign.

Code

cat("3. Hypothesis Testing confirmed both differences are\n")

3. Hypothesis Testing confirmed both differences are

Code

cat("   statistically significant: states differ (p = 0.000002)\n")

   statistically significant: states differ (p = 0.000002)

Code

cat("   and monthly trends are real (p = 0.0117) — not random.\n\n")

   and monthly trends are real (p = 0.0117) — not random.

Code

cat("4. Correlation showed that monthly targets are positively\n")

4. Correlation showed that monthly targets are positively

Code

cat("   associated with doses administered — LGAs with higher\n")

   associated with doses administered — LGAs with higher

Code

cat("   targets tend to vaccinate more girls.\n\n")

   targets tend to vaccinate more girls.

Code

cat("5. Regression confirmed that state, month, and target size\n")

5. Regression confirmed that state, month, and target size

Code

cat("   independently predict uptake, explaining", 
    round(summary(model2)$r.squared * 100, 1),
    "% of variance.\n\n")

   independently predict uptake, explaining 2.7 % of variance.

Code

cat("SINGLE RECOMMENDATION:\n")

SINGLE RECOMMENDATION:

Code

cat("NPHCDA and state health authorities should implement a\n")

NPHCDA and state health authorities should implement a

Code

cat("two-track response: (1) emergency demand-generation and\n")

two-track response: (1) emergency demand-generation and

Code

cat("supply-chain audit in zero-reporting LGAs, and (2) a\n")

supply-chain audit in zero-reporting LGAs, and (2) a

Code

cat("revised target-setting framework that uses evidence from\n")

revised target-setting framework that uses evidence from

Code

cat("high-performing LGAs (OBI in Nasarawa; Ado Odo/Ota in\n")

high-performing LGAs (OBI in Nasarawa; Ado Odo/Ota in

Code

cat("Ogun) to set stretch targets for lagging LGAs.\n")

Ogun) to set stretch targets for lagging LGAs.

10. Limitations and Further Work

Several limitations should be noted when interpreting these findings.

Data completeness: Only five months of data were available (January–May 2025). A full 12-month dataset would allow stronger time-series conclusions and seasonal pattern detection.

Missing denominators for Ogun: Ogun State’s HMIS extract did not include monthly targets or eligible population denominators. This prevented a direct performance ratio comparison between states and limited the regression model — monthly_target was set to zero for all Ogun rows, which may have biased the target coefficient.

LGA-level aggregation: Data is aggregated at LGA level, masking facility-level variation within LGAs. A high-performing facility and a zero-reporting facility within the same LGA produce a single average figure. Facility-level data would enable more precise identification of intervention targets.

Causality: Regression coefficients show association, not causation. The positive relationship between target size and doses given may reflect reverse causality — larger LGAs receive larger targets because they have more eligible girls, not because the target itself drives performance.

Further Work: With more time and data, this analysis would be extended to include: (1) a full 12-month panel dataset enabling ARIMA forecasting of monthly uptake; (2) facility-level predictors such as cold-chain status, health worker cadre, and distance from LGA headquarters; and (3) a geospatial overlay using GRID3 settlement data to identify under-served communities within each LGA — directly linking to the GeoMap-MCH platform developed by Havilah Strategy Consult.

References

Adi, B. (2026). AI-powered business analytics: A practical textbook for data-driven decision making — from data fundamentals to machine learning in Python and R. Lagos Business School / markanalytics.online. https://markanalytics.online

R Core Team. (2024). R: A language and environment for statistical computing (Version 4.x). R Foundation for Statistical Computing. https://www.R-project.org/

Wickham, H., Averick, M., Bryan, J., Chang, W., McGowan, L., François, R., Grolemund, G., Hayes, A., Henry, L., Hester, J., Kuhn, M., Pedersen, T. L., Miller, E., Bache, S. M., Müller, K., Ooms, J., Robinson, D., Seidel, D. P., Spinu, V., … Yutani, H. (2019). Welcome to the tidyverse. Journal of Open Source Software, 4(43), 1686. https://doi.org/10.21105/joss.01686

Wickham, H. (2016). ggplot2: Elegant graphics for data analysis. Springer. https://doi.org/10.1007/978-3-319-24277-4

Code

# Retrieve correct citations for all packages used
citation("readxl")

To cite package 'readxl' in publications use:

  Wickham H, Bryan J (2026). _readxl: Read Excel Files_.
  doi:10.32614/CRAN.package.readxl
  <https://doi.org/10.32614/CRAN.package.readxl>. R package version
  1.5.0, <https://CRAN.R-project.org/package=readxl>.

A BibTeX entry for LaTeX users is

  @Manual{,
    title = {readxl: Read Excel Files},
    author = {Hadley Wickham and Jennifer Bryan},
    year = {2026},
    note = {R package version 1.5.0},
    url = {https://CRAN.R-project.org/package=readxl},
    doi = {10.32614/CRAN.package.readxl},
  }

Code

citation("janitor")

To cite package 'janitor' in publications use:

  Firke S (2024). _janitor: Simple Tools for Examining and Cleaning
  Dirty Data_. doi:10.32614/CRAN.package.janitor
  <https://doi.org/10.32614/CRAN.package.janitor>. R package version
  2.2.1, <https://CRAN.R-project.org/package=janitor>.

A BibTeX entry for LaTeX users is

  @Manual{,
    title = {janitor: Simple Tools for Examining and Cleaning Dirty Data},
    author = {Sam Firke},
    year = {2024},
    note = {R package version 2.2.1},
    url = {https://CRAN.R-project.org/package=janitor},
    doi = {10.32614/CRAN.package.janitor},
  }

Code

citation("skimr")

To cite package 'skimr' in publications use:

  Waring E, Quinn M, McNamara A, Arino de la Rubia E, Zhu H, Ellis S
  (2026). _skimr: Compact and Flexible Summaries of Data_.
  doi:10.32614/CRAN.package.skimr
  <https://doi.org/10.32614/CRAN.package.skimr>. R package version
  2.2.2, <https://CRAN.R-project.org/package=skimr>.

A BibTeX entry for LaTeX users is

  @Manual{,
    title = {skimr: Compact and Flexible Summaries of Data},
    author = {Elin Waring and Michael Quinn and Amelia McNamara and Eduardo {Arino de la Rubia} and Hao Zhu and Shannon Ellis},
    year = {2026},
    note = {R package version 2.2.2},
    url = {https://CRAN.R-project.org/package=skimr},
    doi = {10.32614/CRAN.package.skimr},
  }

Code

citation("corrplot")

To cite corrplot in publications use:

  Taiyun Wei and Viliam Simko (2024). R package 'corrplot':
  Visualization of a Correlation Matrix (Version 0.95). Available from
  https://github.com/taiyun/corrplot

A BibTeX entry for LaTeX users is

  @Manual{corrplot2024,
    title = {R package 'corrplot': Visualization of a Correlation Matrix},
    author = {Taiyun Wei and Viliam Simko},
    year = {2024},
    note = {(Version 0.95)},
    url = {https://github.com/taiyun/corrplot},
  }

Code

citation("car")

To cite the car package in publications use:

  Fox J, Weisberg S (2019). _An R Companion to Applied Regression_,
  Third edition. Sage, Thousand Oaks CA.
  <https://www.john-fox.ca/Companion/>.

A BibTeX entry for LaTeX users is

  @Book{,
    title = {An {R} Companion to Applied Regression},
    edition = {Third},
    author = {John Fox and Sanford Weisberg},
    year = {2019},
    publisher = {Sage},
    address = {Thousand Oaks {CA}},
    url = {https://www.john-fox.ca/Companion/},
  }

Appendix: AI Usage Statement

Claude (Anthropic, 2025) was used as a coding assistant throughout this analysis to generate R code chunks, structure the Quarto document, and suggest appropriate statistical tests given the data characteristics. All analytical decisions — including the choice of Wilcoxon over t-test after confirming non-normality, the selection of Spearman over Pearson correlation, the decision to retain zero-reporting LGAs rather than exclude them, and the interpretation of all outputs in the context of Nigeria’s HPV vaccination programme — were made independently by the author. The integrated recommendation in Section 9 reflects the author’s professional judgement as a health systems consultant with direct programme experience in Nasarawa and Ogun States.