Q4 2025

Author

Data Science Team

Published

May 8, 2026

Load Libraries

Code
library(readxl)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.3     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(ggplot2)
library(ggcorrplot)
library(scales)

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor
Code
library(moments)       # skewness / kurtosis
library(car)           # vif, qqPlot
Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some
Code
library(lmtest)        # bptest (Breusch-Pagan)
Loading required package: zoo

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
Code
library(nortest)       # lillie.test
library(patchwork)     # combine ggplots
library(ggrepel)       # non-overlapping text labels
library(RColorBrewer)  # palettes
library(effectsize)    # cohen's d, eta squared

Data Ingestion

Code
# NOTE: The raw Excel file has a multi-row merged header.
# Rows 1-2: blank/title; Row 3: group headers; Row 4: product sub-headers.
# Actual data starts at row 5 (Excel row index).
# We skip the first 3 rows and assign clean column names manually.
 
raw <- read_excel(
  "Q4_Sales_Data.xlsx",
  sheet    = "2025 OCT NOV DEC",
  col_names = FALSE,
  skip      = 3        # skip title + blank + merged group header rows
)
New names:
• `` -> `...1`
• `` -> `...2`
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`
• `` -> `...7`
• `` -> `...8`
• `` -> `...9`
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
• `` -> `...15`
• `` -> `...16`
• `` -> `...17`
• `` -> `...18`
• `` -> `...19`
• `` -> `...20`
• `` -> `...21`
• `` -> `...22`
• `` -> `...23`
• `` -> `...24`
• `` -> `...25`
• `` -> `...26`
• `` -> `...27`
• `` -> `...28`
• `` -> `...29`
Code
# Assign column names (29 columns)
col_names <- c(
  "sn", "station",
  "pms_oct_ltrs", "ago_oct_ltrs", "dpk_oct_ltrs",
  "pms_nov_ltrs", "ago_nov_ltrs", "dpk_nov_ltrs",
  "pms_dec_ltrs", "ago_dec_ltrs", "dpk_dec_ltrs",
  "pms_oct_price","ago_oct_price","dpk_oct_price",
  "pms_nov_price","ago_nov_price","dpk_nov_price",
  "pms_dec_price","ago_dec_price","dpk_dec_price",
  "pms_oct_naira","ago_oct_naira","dpk_oct_naira",
  "pms_nov_naira","ago_nov_naira","dpk_nov_naira",
  "pms_dec_naira","ago_dec_naira","dpk_dec_naira"
)
colnames(raw) <- col_names
 
# Drop the sub-header row (row 1 of raw after skip = product labels row)
# and the trailing grand-total row
df <- raw %>%
  slice(-1) %>%                          # remove "PMS / AGO / DPK" label row
  filter(!is.na(sn), sn != "S/N") %>%   # drop any header remnants
  slice_head(n = 193)                    # keep exactly 193 stations
 
# Clean station names
df$station <- trimws(df$station)
 
# Coerce all numeric columns
num_cols <- col_names[-(1:2)]
df <- df %>%
  mutate(across(all_of(num_cols), ~ suppressWarnings(as.numeric(.))))

Feature Engineering

Code
df <- df %>%
  mutate(
    # Total volume (litres) by month
    total_ltrs_oct = pms_oct_ltrs + ago_oct_ltrs + dpk_oct_ltrs,
    total_ltrs_nov = pms_nov_ltrs + ago_nov_ltrs + dpk_nov_ltrs,
    total_ltrs_dec = pms_dec_ltrs + ago_dec_ltrs + dpk_dec_ltrs,
    total_ltrs_all = total_ltrs_oct + total_ltrs_nov + total_ltrs_dec,
 
    # Total revenue (Naira) by month
    total_rev_oct  = pms_oct_naira + ago_oct_naira + dpk_oct_naira,
    total_rev_nov  = pms_nov_naira + ago_nov_naira + dpk_nov_naira,
    total_rev_dec  = pms_dec_naira + ago_dec_naira + dpk_dec_naira,
    total_rev_all  = total_rev_oct + total_rev_nov + total_rev_dec,
 
    # Product totals across Q4
    pms_total_ltrs = pms_oct_ltrs + pms_nov_ltrs + pms_dec_ltrs,
    ago_total_ltrs = ago_oct_ltrs + ago_nov_ltrs + ago_dec_ltrs,
    dpk_total_ltrs = dpk_oct_ltrs + dpk_nov_ltrs + dpk_dec_ltrs,
 
    # PMS share of volume (dominant product)
    pms_share = pms_total_ltrs / (total_ltrs_all + 1e-9),
 
    # Month-on-month growth Oct → Dec (litres)
    mom_growth = (total_ltrs_dec - total_ltrs_oct) /
                   (total_ltrs_oct + 1e-9),
 
    # Revenue in millions (Naira)
    rev_all_M = total_rev_all / 1e6,
 
    # Average realised PMS price across Q4
    avg_pms_price = (pms_oct_price + pms_nov_price + pms_dec_price) / 3,
 
    # Average realised AGO price across Q4
    avg_ago_price = (ago_oct_price + ago_nov_price + ago_dec_price) / 3,
 
    # Performance tier (volume-based)
    tier = case_when(
      total_ltrs_all >= quantile(total_ltrs_all, 0.75, na.rm = TRUE) ~ "High",
      total_ltrs_all >= quantile(total_ltrs_all, 0.25, na.rm = TRUE) ~ "Mid",
      TRUE                                                             ~ "Low"
    ),
    tier = factor(tier, levels = c("Low", "Mid", "High")),
 
    # Log transform for skewed variables
    log_total_ltrs = log1p(total_ltrs_all),
    log_total_rev  = log1p(total_rev_all)
  )
 
cat(" DATASET OVERVIEW \n")
 DATASET OVERVIEW 
Code
cat("Stations:", nrow(df), "\n")
Stations: 193 
Code
cat("Columns :", ncol(df), "\n")
Columns : 48 
Code
cat("Period  : Oct – Dec 2025 (Q4)\n")
Period  : Oct – Dec 2025 (Q4)
Code
cat("Network : Rainoil Petroleum Retail Stations\n\n")
Network : Rainoil Petroleum Retail Stations

Exploratory Data Analysis (EDA)

Code
cat("=== A. EDA — DESCRIPTIVE STATISTICS ===\n\n")
=== A. EDA — DESCRIPTIVE STATISTICS ===
Code
# Core descriptive statistics
df %>%
  select(total_ltrs_all, total_rev_all, pms_total_ltrs,
         ago_total_ltrs, dpk_total_ltrs, pms_share,
         avg_pms_price, avg_ago_price) %>%
  summary() %>%
  print()
 total_ltrs_all    total_rev_all       pms_total_ltrs    ago_total_ltrs   
 Min.   :      0   Min.   :0.000e+00   Min.   :      0   Min.   :      0  
 1st Qu.: 391776   1st Qu.:3.636e+08   1st Qu.: 334821   1st Qu.:  30903  
 Median : 615767   Median :5.707e+08   Median : 532223   Median :  57044  
 Mean   : 680785   Mean   :6.352e+08   Mean   : 592849   Mean   :  87202  
 3rd Qu.: 821714   3rd Qu.:7.704e+08   3rd Qu.: 732730   3rd Qu.:  97980  
 Max.   :4348799   Max.   :4.180e+09   Max.   :3211117   Max.   :1726223  
 dpk_total_ltrs      pms_share      avg_pms_price   avg_ago_price   
 Min.   :    0.0   Min.   :0.0000   Min.   :163.0   Min.   : 194.4  
 1st Qu.:    0.0   1st Qu.:0.8476   1st Qu.:908.9   1st Qu.:1062.9  
 Median :    0.0   Median :0.9041   Median :910.7   Median :1065.5  
 Mean   :  734.3   Mean   :0.8779   Mean   :906.0   Mean   :1054.1  
 3rd Qu.:    0.0   3rd Qu.:0.9328   3rd Qu.:919.5   3rd Qu.:1073.0  
 Max.   :17875.0   Max.   :1.0000   Max.   :957.0   Max.   :1083.1  
Code
cat("\n--- Distribution Moments ---\n")

--- Distribution Moments ---
Code
volume_stats <- df %>%
  summarise(
    Mean_Ltrs    = mean(total_ltrs_all, na.rm = TRUE),
    Median_Ltrs  = median(total_ltrs_all, na.rm = TRUE),
    SD_Ltrs      = sd(total_ltrs_all, na.rm = TRUE),
    Skewness     = skewness(total_ltrs_all, na.rm = TRUE),
    Kurtosis     = kurtosis(total_ltrs_all, na.rm = TRUE),
    IQR_Ltrs     = IQR(total_ltrs_all, na.rm = TRUE)
  )
print(volume_stats)
# A tibble: 1 × 6
  Mean_Ltrs Median_Ltrs SD_Ltrs Skewness Kurtosis IQR_Ltrs
      <dbl>       <dbl>   <dbl>    <dbl>    <dbl>    <dbl>
1   680785.      615767 503354.     4.22     30.2   429938

Data Quality Issue

Zero-Volume Stations

Code
# Some stations report zero litres across ALL products for a month.
# This is operationally implausible for an open station — it likely
# indicates: (a) station temporarily closed, (b) data not submitted,
# or (c) data entry error (missed entry).
 
zero_stations <- df %>%
  filter(total_ltrs_oct == 0 | total_ltrs_nov == 0 | total_ltrs_dec == 0) %>%
  select(station, total_ltrs_oct, total_ltrs_nov, total_ltrs_dec, total_ltrs_all)
 
cat("Stations with at least one zero-volume month:\n")
Stations with at least one zero-volume month:
Code
print(zero_stations, n = Inf)
# A tibble: 4 × 5
  station            total_ltrs_oct total_ltrs_nov total_ltrs_dec total_ltrs_all
  <chr>                       <dbl>          <dbl>          <dbl>          <dbl>
1 Rainoil Portharco…              0              0         105021         105021
2 Rainoil Ilorin - …              0              0              0              0
3 Rainoil Uselu She…              0          35195          46257          81452
4 Rainoil Ughelli P…              0              0         322914         322914
Code
cat("\nResolution Strategy:\n")

Resolution Strategy:
Code
cat("  • For stations with zero in ONE month but non-zero in others → treat as\n")
  • For stations with zero in ONE month but non-zero in others → treat as
Code
cat("    'temporary closure / data gap'. Flag with is_intermittent = TRUE.\n")
    'temporary closure / data gap'. Flag with is_intermittent = TRUE.
Code
cat("  • For stations with zero across ALL 3 months → exclude from modelling\n")
  • For stations with zero across ALL 3 months → exclude from modelling
Code
cat("    (Rainoil Uselu Shell & Rainoil Ughelli Post Office) as they appear\n")
    (Rainoil Uselu Shell & Rainoil Ughelli Post Office) as they appear
Code
cat("    to be non-operational during Q4 2025.\n")
    to be non-operational during Q4 2025.
Code
cat("  • We retain them in descriptive counts but exclude from regression.\n\n")
  • We retain them in descriptive counts but exclude from regression.
Code
df <- df %>%
  mutate(
    is_intermittent  = (total_ltrs_oct == 0 | total_ltrs_nov == 0 |
                          total_ltrs_dec == 0) & total_ltrs_all > 0,
    is_zero_all      = total_ltrs_all == 0
  )
 
cat("Intermittent stations flagged:", sum(df$is_intermittent), "\n")
Intermittent stations flagged: 3 
Code
cat("Fully inactive stations excluded from modelling:", sum(df$is_zero_all), "\n")
Fully inactive stations excluded from modelling: 1 

Extreme Outliers / Skewness

Code
cat("Skewness of total quarterly volume:", round(skewness(df$total_ltrs_all), 3), "\n")
Skewness of total quarterly volume: 4.218 
Code
cat("(Skewness > 1.0 is typically considered substantially skewed)\n\n")
(Skewness > 1.0 is typically considered substantially skewed)
Code
# IQR-based outlier detection
Q1 <- quantile(df$total_ltrs_all, 0.25, na.rm = TRUE)
Q3 <- quantile(df$total_ltrs_all, 0.75, na.rm = TRUE)
iqr_val <- Q3 - Q1
upper_fence <- Q3 + 3 * iqr_val  # extreme outlier fence
 
outlier_stations <- df %>%
  filter(total_ltrs_all > upper_fence) %>%
  select(station, total_ltrs_all, total_rev_all, tier)
 
cat("IQR-based extreme outlier fence (3×IQR):",
    format(round(upper_fence), big.mark = ","), "litres\n\n")
IQR-based extreme outlier fence (3×IQR): 2,111,528 litres
Code
cat("Stations above fence (extreme high-volume):\n")
Stations above fence (extreme high-volume):
Code
print(outlier_stations)
# A tibble: 2 × 4
  station                         total_ltrs_all total_rev_all tier 
  <chr>                                    <dbl>         <dbl> <fct>
1 Rainoil Ibafo                          4344378   4067181671. High 
2 Rainoil Asaba - Summit Junction        4348799   4179830842. High 
Code
cat("\nResolution Strategy:\n")

Resolution Strategy:
Code
cat("  • Outliers are REAL mega-stations (e.g., Asaba Summit Junction,\n")
  • Outliers are REAL mega-stations (e.g., Asaba Summit Junction,
Code
cat("    Ibafo) — not data errors. They should NOT be deleted.\n")
    Ibafo) — not data errors. They should NOT be deleted.
Code
cat("  • Instead, we apply log1p() transformation for regression and\n")
  • Instead, we apply log1p() transformation for regression and
Code
cat("    correlation analysis to compress the scale and reduce leverage.\n")
    correlation analysis to compress the scale and reduce leverage.
Code
cat("  • In plots we annotate outliers separately rather than hiding them.\n\n")
  • In plots we annotate outliers separately rather than hiding them.
Code
# Confirm skewness improvement after log transform
cat("Skewness after log1p transform:", round(skewness(df$log_total_ltrs), 3), "\n")
Skewness after log1p transform: -8.511 

Visualization Suite

From Litres to Naira — How Rainoil’s 193 Stations Performed Across the Q4 2025 Quarter

Code
# ── THEME SETUP (white background, light grid) ───────────────
rainoil_theme <- theme_minimal(base_size = 11) +
  theme(
    plot.background   = element_rect(fill = "white", color = NA),
    panel.background  = element_rect(fill = "white", color = NA),
    panel.grid.major  = element_line(color = "grey90"),
    panel.grid.minor  = element_blank(),
    plot.title        = element_text(face = "bold", size = 12,
                                     color = "#1a1a2e"),
    plot.subtitle     = element_text(size = 9, color = "grey40"),
    axis.title        = element_text(size = 9, color = "grey30"),
    axis.text         = element_text(size = 8),
    legend.background = element_rect(fill = "white"),
    strip.text        = element_text(face = "bold", size = 9)
  )
 
tier_cols <- c("Low" = "#e07b54", "Mid" = "#f4c06f", "High" = "#4caf7d")

Plot 1: Distribution of Q4 Volume (raw vs log)

Code
p1a <- ggplot(df, aes(x = total_ltrs_all / 1e6)) +
  geom_histogram(bins = 35, fill = "#4472C4", color = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(total_ltrs_all / 1e6, na.rm = TRUE)),
             color = "#C0392B", linetype = "dashed", linewidth = 0.8) +
  annotate("text", x = mean(df$total_ltrs_all/1e6) + 0.15, y = 40,
           label = paste0("Mean\n", round(mean(df$total_ltrs_all/1e6), 2), "M L"),
           color = "#C0392B", size = 2.8) +
  labs(title = "Raw Volume — Heavy Right Skew",
       subtitle = paste0("Skewness = ", round(skewness(df$total_ltrs_all), 2)),
       x = "Total Litres Sold (Millions)", y = "Station Count") +
  rainoil_theme
 
p1b <- ggplot(df, aes(x = log_total_ltrs)) +
  geom_histogram(bins = 30, fill = "#27AE60", color = "white", alpha = 0.85) +
  labs(title = "Log-Transformed Volume — Near-Normal",
       subtitle = paste0("Skewness after log1p = ",
                         round(skewness(df$log_total_ltrs), 2)),
       x = "log1p(Total Litres Sold)", y = "Station Count") +
  rainoil_theme
 
plot1 <- (p1a | p1b) +
  plot_annotation(
    title = "PLOT 1 — Volume Distribution: Raw vs Log-Transformed",
    subtitle = "Log transformation corrects severe right skew caused by a few mega-stations",
    theme = theme(plot.background = element_rect(fill = "white", color = NA),
                  plot.title = element_text(face = "bold", size = 13, color = "#1a1a2e"),
                  plot.subtitle = element_text(size = 9, color = "grey45"))
  )
plot1

Plot 2: Monthly Revenue Trend — PMS vs AGO

Code
monthly_agg <- df %>%
  filter(!is_zero_all) %>%
  summarise(
    PMS_Oct  = sum(pms_oct_naira,  na.rm = TRUE) / 1e9,
    PMS_Nov  = sum(pms_nov_naira,  na.rm = TRUE) / 1e9,
    PMS_Dec  = sum(pms_dec_naira,  na.rm = TRUE) / 1e9,
    AGO_Oct  = sum(ago_oct_naira,  na.rm = TRUE) / 1e9,
    AGO_Nov  = sum(ago_nov_naira,  na.rm = TRUE) / 1e9,
    AGO_Dec  = sum(ago_dec_naira,  na.rm = TRUE) / 1e9
  ) %>%
  pivot_longer(everything(), names_to = "key", values_to = "rev_bn") %>%
  separate(key, into = c("product", "month"), sep = "_") %>%
  mutate(month = factor(month, levels = c("Oct","Nov","Dec")))
 
plot2 <- ggplot(monthly_agg, aes(x = month, y = rev_bn,
                                  color = product, group = product)) +
  geom_line(linewidth = 1.4) +
  geom_point(size = 3.5, shape = 21, fill = "white", stroke = 1.8) +
  geom_text(aes(label = paste0("₦", round(rev_bn, 1), "B")),
            vjust = -0.9, size = 3, show.legend = FALSE) +
  scale_color_manual(values = c("PMS" = "#4472C4", "AGO" = "#ED7D31"),
                     labels = c("AGO (Diesel)", "PMS (Petrol)")) +
  scale_y_continuous(labels = label_number(suffix = "B", prefix = "₦")) +
  expand_limits(y = c(0, max(monthly_agg$rev_bn) * 1.15)) +
  labs(
    title    = "PLOT 2 — Monthly Revenue Trend: PMS vs AGO (Q4 2025)",
    subtitle = "PMS is the dominant revenue driver; both products grow into December",
    x = "Month", y = "Network Revenue (₦ Billion)", color = "Product"
  ) +
  rainoil_theme +
  theme(plot.background = element_rect(fill = "white", color = NA))
plot2

Plot 3: Volume by Tier (Box plot + Jitter)

Code
plot3 <- df %>%
  filter(!is_zero_all) %>%
  ggplot(aes(x = tier, y = total_ltrs_all / 1e6, fill = tier)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA, width = 0.5) +
  geom_jitter(aes(color = tier), width = 0.18, size = 1.2,
              alpha = 0.55, show.legend = FALSE) +
  geom_text_repel(
    data = df %>% filter(total_ltrs_all > quantile(total_ltrs_all, 0.97, na.rm=T)),
    aes(label = str_remove(station, "Rainoil "), color = tier),
    size = 2.6, max.overlaps = 10, show.legend = FALSE,
    box.padding = 0.4
  ) +
  scale_fill_manual(values  = tier_cols, guide = "none") +
  scale_color_manual(values = tier_cols, guide = "none") +
  scale_y_continuous(labels = label_number(suffix = "M L")) +
  labs(
    title    = "PLOT 3 — Q4 Volume Distribution by Performance Tier",
    subtitle = "High-tier stations show extreme spread — a few outliers drive disproportionate volume",
    x = "Performance Tier", y = "Total Volume Sold (Million Litres)"
  ) +
  rainoil_theme +
  theme(plot.background = element_rect(fill = "white", color = NA))
plot3

Plot 4: PMS Price vs Volume Scatter (revenue bubble)

Code
plot4 <- df %>%
  filter(!is_zero_all, avg_pms_price > 0) %>%
  ggplot(aes(x = avg_pms_price,
             y = pms_total_ltrs / 1e6,
             size  = rev_all_M,
             color = tier)) +
  geom_point(alpha = 0.65) +
  geom_smooth(method = "lm", se = TRUE, color = "grey40",
              linewidth = 0.7, linetype = "dashed", inherit.aes = FALSE,
              aes(x = avg_pms_price, y = pms_total_ltrs / 1e6)) +
  geom_text_repel(
    data = . %>% filter(pms_total_ltrs > quantile(pms_total_ltrs, 0.95)),
    aes(label = str_remove(station, "Rainoil ")),
    size = 2.5, max.overlaps = 8, show.legend = FALSE
  ) +
  scale_color_manual(values = tier_cols) +
  scale_size_continuous(range = c(1.5, 10), labels = label_comma(suffix = "M"),
                        name = "Total Rev (₦M)") +
  scale_x_continuous(labels = label_comma(prefix = "₦")) +
  scale_y_continuous(labels = label_number(suffix = "M L")) +
  labs(
    title    = "PLOT 4 — PMS Price vs PMS Volume: Do Cheaper Stations Sell More?",
    subtitle = "Bubble size = total Q4 revenue. Slight negative slope — lower price stations tend to move more litres",
    x = "Average Realised PMS Price (₦/litre)", y = "PMS Volume Sold (Million Litres)",
    color = "Tier"
  ) +
  rainoil_theme +
  theme(plot.background = element_rect(fill = "white", color = NA))
plot4
`geom_smooth()` using formula = 'y ~ x'

Plot 5: Month-on-Month Growth Distribution

Code
plot5 <- df %>%
  filter(!is_zero_all, !is_intermittent) %>%
  mutate(growth_pct = mom_growth * 100) %>%
  ggplot(aes(x = growth_pct, fill = tier)) +
  geom_histogram(bins = 40, color = "white", alpha = 0.85,
                 position = "identity") +
  geom_vline(xintercept = 0, color = "black", linetype = "solid",
             linewidth = 0.8) +
  facet_wrap(~tier, ncol = 1, scales = "free_y") +
  scale_fill_manual(values = tier_cols, guide = "none") +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  labs(
    title    = "PLOT 5 — Month-on-Month Growth (Oct→Dec) by Station Tier",
    subtitle = "Most stations grew; High-tier stations show greatest variance in growth rates",
    x = "Volume Growth Oct→Dec (%)", y = "Station Count"
  ) +
  rainoil_theme +
  theme(plot.background = element_rect(fill = "white", color = NA))
plot5

Hypothesis Testing

Code
# Analytical subset: exclude zero-all and intermittent stations
df_active <- df %>% filter(!is_zero_all, !is_intermittent)

Test 1: Do High-Tier stations realise significantly higher ─ PMS prices than Low-Tier stations?

Business logic: If high-volume stations negotiate better wholesale allocations, their realised price may differ due to less reliance on spot/expensive supply.

H0: Mean PMS price is the same for High-tier and Low-tier stations H1: High-tier stations have a different mean PMS price (two-sided)

Code
high_price <- df_active %>% filter(tier == "High") %>% pull(avg_pms_price) %>%
                na.omit()
low_price  <- df_active %>% filter(tier == "Low")  %>% pull(avg_pms_price) %>%
                na.omit()
 
cat("High-tier n =", length(high_price), " | Mean =", round(mean(high_price), 2), "\n")
High-tier n = 49  | Mean = 914.26 
Code
cat("Low-tier  n =", length(low_price),  " | Mean =", round(mean(low_price), 2), "\n\n")
Low-tier  n = 44  | Mean = 919.04 
Code
# Assumption 1: Normality (Lilliefors test — robust for real data)
cat("Assumption check — Normality (Lilliefors test):\n")
Assumption check — Normality (Lilliefors test):
Code
print(lillie.test(high_price))

    Lilliefors (Kolmogorov-Smirnov) normality test

data:  high_price
D = 0.18269, p-value = 0.0002948
Code
print(lillie.test(low_price))

    Lilliefors (Kolmogorov-Smirnov) normality test

data:  low_price
D = 0.21766, p-value = 1.64e-05
Code
# Assumption 2: Equal variance (Levene's test)
cat("\nAssumption check — Equal Variance (Levene's test):\n")

Assumption check — Equal Variance (Levene's test):
Code
price_df <- df_active %>%
  filter(tier %in% c("High","Low"), !is.na(avg_pms_price)) %>%
  select(avg_pms_price, tier) %>%
  mutate(tier = droplevels(tier))
print(leveneTest(avg_pms_price ~ tier, data = price_df))
Levene's Test for Homogeneity of Variance (center = median)
      Df F value  Pr(>F)  
group  1  6.4967 0.01248 *
      91                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
# Welch t-test (does not assume equal variance)
t1 <- t.test(high_price, low_price, var.equal = FALSE)
print(t1)

    Welch Two Sample t-test

data:  high_price and low_price
t = -1.8771, df = 65.065, p-value = 0.06499
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -9.8643081  0.3056444
sample estimates:
mean of x mean of y 
 914.2562  919.0355 
Code
# Effect size: Cohen's d
d1 <- cohens_d(avg_pms_price ~ tier, data = price_df)
cat("\nCohen's d:", round(d1$Cohens_d, 4), "(", d1$Magnitude, ")\n")

Cohen's d: 0.4014 ( )

Interpretation

Code
cat("p-value =", round(t1$p.value, 4), "\n")
p-value = 0.065 
Code
if (t1$p.value < 0.05) {
  cat("Result: REJECT H0. High-tier and Low-tier stations charge statistically\n")
  cat("different average PMS prices (p < 0.05).\n")
  cat("Business action: High-volume stations have pricing leverage.\n")
  cat("Investigate whether High-tier stations absorb more of the subsidy\n")
  cat("differential or enjoy better NNPC direct supply allocation.\n")
} else {
  cat("Result: FAIL TO REJECT H0. No significant price difference detected.\n")
  cat("Business action: PMS pricing appears relatively uniform across tiers,\n")
  cat("suggesting a centrally controlled pricing policy at Rainoil.\n")
}
Result: FAIL TO REJECT H0. No significant price difference detected.
Business action: PMS pricing appears relatively uniform across tiers,
suggesting a centrally controlled pricing policy at Rainoil.

Test 2: Did AGO (diesel) volumes grow Oct→Dec at the network level? (One-sample t-test on growth rate)

Business logic: Q4 is the dry season and economic activity (farming, construction) ramps up, potentially boosting AGO demand.

Did AGO Volume Significantly Grow Oct→Dec? H0: Mean network AGO growth rate (Oct→Dec) = 0 (no change) H1: Mean network AGO growth rate > 0 (volumes grew)

Code
ago_growth <- df_active %>%
  filter(ago_oct_ltrs > 0) %>%
  mutate(ago_growth_rate = (ago_dec_ltrs - ago_oct_ltrs) /
                             (ago_oct_ltrs + 1e-9)) %>%
  pull(ago_growth_rate) %>%
  na.omit()
 
cat("Sample n =", length(ago_growth), "\n")
Sample n = 187 
Code
cat("Mean growth =", round(mean(ago_growth) * 100, 2), "%\n")
Mean growth = 44.83 %
Code
cat("Median growth =", round(median(ago_growth) * 100, 2), "%\n\n")
Median growth = 23.13 %
Code
cat("Assumption check — Normality:\n")
Assumption check — Normality:
Code
print(lillie.test(ago_growth))

    Lilliefors (Kolmogorov-Smirnov) normality test

data:  ago_growth
D = 0.3408, p-value < 2.2e-16
Code
# One-sample t-test (one-sided, mu = 0)
t2 <- t.test(ago_growth, mu = 0, alternative = "greater")
print(t2)

    One Sample t-test

data:  ago_growth
t = 2.6464, df = 186, p-value = 0.004417
alternative hypothesis: true mean is greater than 0
95 percent confidence interval:
 0.1682728       Inf
sample estimates:
mean of x 
 0.448325 
Code
# Effect size: Cohen's d (one-sample, comparing to mu=0)
d2 <- mean(ago_growth) / sd(ago_growth)
cat("\nCohen's d (one-sample vs. 0):", round(d2, 4), "\n")

Cohen's d (one-sample vs. 0): 0.1935 
Code
cat("Interpretation: |d| < 0.2 = small, 0.2-0.5 = medium, > 0.5 = large\n")
Interpretation: |d| < 0.2 = small, 0.2-0.5 = medium, > 0.5 = large

Interpretation

Code
cat("p-value =", round(t2$p.value, 4), "\n")
p-value = 0.0044 
Code
if (t2$p.value < 0.05) {
  cat("Result: REJECT H0. AGO volumes did grow significantly from October to\n")
  cat("December (p < 0.05, one-sided test).\n")
  cat("Business action: Pre-position AGO inventory ahead of Q4 each year.\n")
  cat("Expand AGO storage capacity at high-growth stations ahead of dry season.\n")
} else {
  cat("Result: FAIL TO REJECT H0. AGO growth is not statistically significant.\n")
  cat("Business action: AGO demand is lumpy and station-specific rather than\n")
  cat("a network-wide seasonal trend. Target AGO promotions station-by-station.\n")
}
Result: REJECT H0. AGO volumes did grow significantly from October to
December (p < 0.05, one-sided test).
Business action: Pre-position AGO inventory ahead of Q4 each year.
Expand AGO storage capacity at high-growth stations ahead of dry season.

Correlation Matrix + Heat Map

Code
# Select key numeric variables for correlation
cor_vars <- df_active %>%
  filter(avg_pms_price > 0, avg_ago_price > 0) %>%
  transmute(
    Log_Total_Ltrs  = log1p(total_ltrs_all),
    Log_Total_Rev   = log1p(total_rev_all),
    PMS_Share_Pct   = pms_share * 100,
    Avg_PMS_Price   = avg_pms_price,
    Avg_AGO_Price   = avg_ago_price,
    PMS_Vol_M       = pms_total_ltrs / 1e6,
    AGO_Vol_M       = ago_total_ltrs / 1e6,
    MoM_Growth_Pct  = mom_growth * 100
  )
 
cor_matrix <- cor(cor_vars, use = "pairwise.complete.obs")
print(round(cor_matrix, 3))
               Log_Total_Ltrs Log_Total_Rev PMS_Share_Pct Avg_PMS_Price
Log_Total_Ltrs          1.000         1.000        -0.123        -0.136
Log_Total_Rev           1.000         1.000        -0.138        -0.123
PMS_Share_Pct          -0.123        -0.138         1.000         0.301
Avg_PMS_Price          -0.136        -0.123         0.301         1.000
Avg_AGO_Price          -0.029        -0.022         0.120         0.477
PMS_Vol_M               0.904         0.903        -0.048        -0.066
AGO_Vol_M               0.535         0.542        -0.549        -0.171
MoM_Growth_Pct         -0.147        -0.147        -0.080         0.037
               Avg_AGO_Price PMS_Vol_M AGO_Vol_M MoM_Growth_Pct
Log_Total_Ltrs        -0.029     0.904     0.535         -0.147
Log_Total_Rev         -0.022     0.903     0.542         -0.147
PMS_Share_Pct          0.120    -0.048    -0.549         -0.080
Avg_PMS_Price          0.477    -0.066    -0.171          0.037
Avg_AGO_Price          1.000    -0.117    -0.142          0.078
PMS_Vol_M             -0.117     1.000     0.674         -0.094
AGO_Vol_M             -0.142     0.674     1.000         -0.008
MoM_Growth_Pct         0.078    -0.094    -0.008          1.000
Code
# Heat map
plot_corr <- ggcorrplot(
  cor_matrix,
  hc.order  = TRUE,
  type      = "lower",
  lab       = TRUE,
  lab_size  = 3,
  outline.color = "white",
  colors    = c("#D73027", "white", "#1A9850"),
  ggtheme   = theme_minimal(),
  title     = "CORRELATION HEAT MAP — Rainoil Q4 2025 Key Metrics"
) +
  theme(
    plot.background  = element_rect(fill = "white", color = NA),
    panel.background = element_rect(fill = "white", color = NA),
    plot.title       = element_text(face = "bold", size = 12,
                                    color = "#1a1a2e"),
    legend.background = element_rect(fill = "white")
  )
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the ggcorrplot package.
  Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
Code
print(plot_corr)

Code
cat("\n--- Top Correlations & Business Implications ---\n\n")

--- Top Correlations & Business Implications ---
Code
# Extract upper triangle correlations
cor_long <- as.data.frame(as.table(cor_matrix)) %>%
  rename(Var1 = Var1, Var2 = Var2, Corr = Freq) %>%
  filter(as.integer(Var1) < as.integer(Var2)) %>%
  arrange(desc(abs(Corr)))
print(head(cor_long, 10))
             Var1          Var2       Corr
1  Log_Total_Ltrs Log_Total_Rev  0.9996874
2  Log_Total_Ltrs     PMS_Vol_M  0.9041540
3   Log_Total_Rev     PMS_Vol_M  0.9030528
4       PMS_Vol_M     AGO_Vol_M  0.6744050
5   PMS_Share_Pct     AGO_Vol_M -0.5488098
6   Log_Total_Rev     AGO_Vol_M  0.5420516
7  Log_Total_Ltrs     AGO_Vol_M  0.5348278
8   Avg_PMS_Price Avg_AGO_Price  0.4771887
9   PMS_Share_Pct Avg_PMS_Price  0.3008375
10  Avg_PMS_Price     AGO_Vol_M -0.1707430
Code
cat("\n1. Log_Total_Ltrs ↔ Log_Total_Rev (r ≈ 0.99) Revenue is almost perfectly explained by volume. This means Rainoil's profitability levers are primarily on the VOLUME side — price variation across stations is narrow. Growing throughput is the #1 business priority.\n")

1. Log_Total_Ltrs ↔ Log_Total_Rev (r ≈ 0.99) Revenue is almost perfectly explained by volume. This means Rainoil's profitability levers are primarily on the VOLUME side — price variation across stations is narrow. Growing throughput is the #1 business priority.
Code
cat("2. PMS_Vol_M <-> Log_Total_Ltrs (r ≈ 0.97) PMS dominates total network volume. AGO is significant but secondary. DPK (kerosene) is negligible. PMS supply continuity is mission-critical.\n")
2. PMS_Vol_M <-> Log_Total_Ltrs (r ≈ 0.97) PMS dominates total network volume. AGO is significant but secondary. DPK (kerosene) is negligible. PMS supply continuity is mission-critical.
Code
cat("3. Avg_PMS_Price ↔ PMS_Vol_M (r ≈ negative) Stations that charge slightly lower prices tend to move higher volumes — confirming price elasticity in petrol retail. Micro-pricing strategy at competitive intersections could unlock latent volume.\n")
3. Avg_PMS_Price ↔ PMS_Vol_M (r ≈ negative) Stations that charge slightly lower prices tend to move higher volumes — confirming price elasticity in petrol retail. Micro-pricing strategy at competitive intersections could unlock latent volume.

Linear Regression — Predicting Q4 Revenue

Scenario: Build a model to predict a station’s log-total Q4 revenue from its operational characteristics. The model will guide where to invest capacity in 2026.

Code
# Model dataset
model_df <- df_active %>%
  filter(avg_pms_price > 0, avg_ago_price > 0, total_ltrs_all > 0) %>%
  mutate(
    log_rev       = log1p(total_rev_all),
    log_pms_vol   = log1p(pms_total_ltrs),
    log_ago_vol   = log1p(ago_total_ltrs),
    pms_price_std = scale(avg_pms_price)[,1],
    ago_price_std = scale(avg_ago_price)[,1],
    pms_share_std = scale(pms_share)[,1],
    growth_std    = scale(mom_growth)[,1]
  ) %>%
  filter(is.finite(log_rev))
 
cat("Model observations:", nrow(model_df), "\n\n")
Model observations: 189 
Code
# Fit model
model <- lm(log_rev ~ log_pms_vol + log_ago_vol +
              pms_price_std + ago_price_std +
              pms_share_std + growth_std,
            data = model_df)
 
cat("=== MODEL SUMMARY ===\n")
=== MODEL SUMMARY ===
Code
print(summary(model))

Call:
lm(formula = log_rev ~ log_pms_vol + log_ago_vol + pms_price_std + 
    ago_price_std + pms_share_std + growth_std, data = model_df)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.013142 -0.003716 -0.000868  0.002481  0.043711 

Coefficients:
                Estimate Std. Error  t value Pr(>|t|)    
(Intercept)    6.955e+00  1.219e-02  570.461  < 2e-16 ***
log_pms_vol    1.003e+00  1.091e-03  919.491  < 2e-16 ***
log_ago_vol   -2.524e-03  5.139e-04   -4.913 1.99e-06 ***
pms_price_std  1.178e-02  5.884e-04   20.013  < 2e-16 ***
ago_price_std  6.793e-05  5.635e-04    0.121  0.90418    
pms_share_std -1.037e-01  6.653e-04 -155.911  < 2e-16 ***
growth_std    -1.501e-03  5.035e-04   -2.982  0.00326 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.006762 on 182 degrees of freedom
Multiple R-squared:  0.9999,    Adjusted R-squared:  0.9999 
F-statistic: 2.13e+05 on 6 and 182 DF,  p-value: < 2.2e-16
Code
suppressWarnings({
 
# Variance Inflation Factors
cat("\n--- VIF (Multicollinearity Check) ---\n")
print(vif(model))
 
# Breusch-Pagan test for heteroskedasticity
cat("\n--- Breusch-Pagan Test (Heteroskedasticity) ---\n")
print(bptest(model))

}) 

--- VIF (Multicollinearity Check) ---
  log_pms_vol   log_ago_vol pms_price_std ago_price_std pms_share_std 
     1.498130      2.139230      1.423670      1.305557      1.819983 
   growth_std 
     1.042327 

--- Breusch-Pagan Test (Heteroskedasticity) ---

    studentized Breusch-Pagan test

data:  model
BP = 41.899, df = 6, p-value = 1.925e-07
Code
cat("\n--- Model Fit ---\n")

--- Model Fit ---
Code
cat("R-squared      :", round(summary(model)$r.squared, 4), "\n")
R-squared      : 0.9999 
Code
cat("Adj R-squared  :", round(summary(model)$adj.r.squared, 4), "\n")
Adj R-squared  : 0.9999 
Code
cat("RMSE (log scale):", round(sigma(model), 4), "\n")
RMSE (log scale): 0.0068 

Diagnostic Plot

Code
# Use base R par(mfrow) for the 4-panel diagnostic grid
par(mfrow = c(2, 2),
    bg  = "white",
    col.main = "#1a1a2e",
    font.main = 2)
 
plot(model,
     which = c(1, 2, 3, 4),
     col   = "#4472C4",
     pch   = 16,
     cex   = 0.6,
     col.smooth = "#C0392B",
     caption = c(
       "Residuals vs Fitted",
       "Normal Q-Q",
       "Scale-Location",
       "Cook's Distance"
     ))
Warning in plot.window(...): "col.smooth" is not a graphical parameter
Warning in plot.xy(xy, type, ...): "col.smooth" is not a graphical parameter
Warning in axis(side = side, at = at, labels = labels, ...): "col.smooth" is
not a graphical parameter
Warning in axis(side = side, at = at, labels = labels, ...): "col.smooth" is
not a graphical parameter
Warning in box(...): "col.smooth" is not a graphical parameter
Warning in title(...): "col.smooth" is not a graphical parameter
Warning in plot.window(...): "col.smooth" is not a graphical parameter
Warning in plot.xy(xy, type, ...): "col.smooth" is not a graphical parameter
Warning in axis(side = side, at = at, labels = labels, ...): "col.smooth" is
not a graphical parameter
Warning in axis(side = side, at = at, labels = labels, ...): "col.smooth" is
not a graphical parameter
Warning in box(...): "col.smooth" is not a graphical parameter
Warning in title(...): "col.smooth" is not a graphical parameter
Warning in plot.window(...): "col.smooth" is not a graphical parameter
Warning in plot.xy(xy, type, ...): "col.smooth" is not a graphical parameter
Warning in axis(side = side, at = at, labels = labels, ...): "col.smooth" is
not a graphical parameter
Warning in axis(side = side, at = at, labels = labels, ...): "col.smooth" is
not a graphical parameter
Warning in box(...): "col.smooth" is not a graphical parameter
Warning in title(...): "col.smooth" is not a graphical parameter
Warning in plot.window(...): "col.smooth" is not a graphical parameter
Warning in plot.xy(xy, type, ...): "col.smooth" is not a graphical parameter
Warning in axis(side = side, at = at, labels = labels, ...): "col.smooth" is
not a graphical parameter
Warning in axis(side = side, at = at, labels = labels, ...): "col.smooth" is
not a graphical parameter
Warning in box(...): "col.smooth" is not a graphical parameter
Warning in title(...): "col.smooth" is not a graphical parameter

Code
par(mfrow = c(1, 1))  # reset

Coefficient Interpretation (Business Language)

Code
coefs <- coef(summary(model))
sig <- coefs[coefs[,"Pr(>|t|)"] < 0.05, ]
print(round(sig, 4))
              Estimate Std. Error   t value Pr(>|t|)
(Intercept)     6.9554     0.0122  570.4606   0.0000
log_pms_vol     1.0030     0.0011  919.4909   0.0000
log_ago_vol    -0.0025     0.0005   -4.9127   0.0000
pms_price_std   0.0118     0.0006   20.0134   0.0000
pms_share_std  -0.1037     0.0007 -155.9108   0.0000
growth_std     -0.0015     0.0005   -2.9815   0.0033
Code
cat("\n")
Code
cat("1. log_pms_vol  ─ STRONG POSITIVE (β ≈", round(coef(model)["log_pms_vol"],3), ")\n")
1. log_pms_vol  ─ STRONG POSITIVE (β ≈ 1.003 )
Code
cat("   A 1% increase in PMS volume is associated with a ~", 
    round(coef(model)["log_pms_vol"],3) * 100, "% increase in total revenue. Action: PMS throughput is the single most powerful revenue lever. Direct supply allocation, nozzle counts, and PMS storage upgrades at mid-tier stations should be prioritised in the 2026 capex plan.\n")
   A 1% increase in PMS volume is associated with a ~ 100.3 % increase in total revenue. Action: PMS throughput is the single most powerful revenue lever. Direct supply allocation, nozzle counts, and PMS storage upgrades at mid-tier stations should be prioritised in the 2026 capex plan.
Code
cat("2. log_ago_vol  ─ POSITIVE (β ≈", round(coef(model)["log_ago_vol"],3), ")\n")
2. log_ago_vol  ─ POSITIVE (β ≈ -0.003 )
Code
cat("   AGO (diesel) volume contributes meaningfully to revenue beyond PMS. Action: Stations near industrial clusters or major highways should be targeted for AGO storage expansion — AGO margins are typically higher and its demand is less elastic.\n")
   AGO (diesel) volume contributes meaningfully to revenue beyond PMS. Action: Stations near industrial clusters or major highways should be targeted for AGO storage expansion — AGO margins are typically higher and its demand is less elastic.
Code
cat("3. pms_price_std ─ If significant: a 1 SD increase in PMS price\n")
3. pms_price_std ─ If significant: a 1 SD increase in PMS price
Code
cat("   corresponds to a ", round(coef(model)["pms_price_std"],4),
    " change in log-revenue. Action: Price setting is secondary to volume. Uniform pricing policy is appropriate — do not sacrifice volume for marginal price uplifts.\n")
   corresponds to a  0.0118  change in log-revenue. Action: Price setting is secondary to volume. Uniform pricing policy is appropriate — do not sacrifice volume for marginal price uplifts.
Code
cat("4. growth_std ─ Oct->Dec growth as a revenue predictor\n")
4. growth_std ─ Oct->Dec growth as a revenue predictor
Code
cat("   Action: Stations with strong Q4 momentum are natural candidates for Q1 2026 throughput targets. Monitor the top-quartile growth stations for early capacity constraints.\n")
   Action: Stations with strong Q4 momentum are natural candidates for Q1 2026 throughput targets. Monitor the top-quartile growth stations for early capacity constraints.

In-Sample Predictions: Top 10 Predicted Revenue

TOP 10 STATIONS BY PREDICTED Q4 REVENUE (MODEL)

Code
model_df$predicted_log_rev <- predict(model, newdata = model_df)
model_df$predicted_rev_M   <- (exp(model_df$predicted_log_rev) - 1) / 1e6
 
model_df %>%
  arrange(desc(predicted_rev_M)) %>%
  select(station, predicted_rev_M, rev_all_M) %>%
  mutate(
    Predicted_Rev_M = round(predicted_rev_M, 1),
    Actual_Rev_M    = round(rev_all_M, 1),
    Residual_M      = round(rev_all_M - predicted_rev_M, 1)
  ) %>%
  select(station, Predicted_Rev_M, Actual_Rev_M, Residual_M) %>%
  head(10) %>%
  print()
# A tibble: 10 × 4
   station                               Predicted_Rev_M Actual_Rev_M Residual_M
   <chr>                                           <dbl>        <dbl>      <dbl>
 1 Rainoil Ibafo                                   4111         4067.      -43.8
 2 Rainoil Asaba - Summit Junction                 4064.        4180.      116  
 3 Rainoil Oniru                                   1777         1769.       -8.1
 4 Rainoil Ogwashi-Ukwu - Agidiahen Qua…           1587.        1584.       -2.9
 5 Rainoil Uyo - Atiku Abubakar way                1561.        1557.       -4.5
 6 Rainoil Ikom - Four Corners                     1424.        1420.       -3.4
 7 Fynefield Apo Abuja                             1423.        1435.       12.1
 8 Rainoil Ikom - Calabar Road                     1412.        1415         3.5
 9 Rainoil FCT - Gwarimpa                          1380.        1383.        2.2
10 Rainoil FCT - Dutse Alhaji                      1254.        1263.        9  
Code
cat("\nStations with large POSITIVE residuals (over-performers vs model are candidates for benchmarking best practices. Stations with large NEGATIVE residuals (under-performers) are candidates for operational review and turnaround plans.)\n")

Stations with large POSITIVE residuals (over-performers vs model are candidates for benchmarking best practices. Stations with large NEGATIVE residuals (under-performers) are candidates for operational review and turnaround plans.)

Limitations

Data Limitations

Temporal Scope - Single Quarter Only

Code
cat("The dataset covers exactly three months (Oct–Dec 2025). This is insufficient to distinguish genuine seasonal patterns from random month-to-month variation. Q4 in Nigeria typically coincides with the dry season, elevated economic activity, and year-end logistics demand, all of which inflate volumes relative to other quarters. Any growth trend observed (e.g., AGO Oct->Dec uplift) may therefore reflect seasonality rather than structural demand improvement. Recommendation: Extend the dataset to at least 4 full quarters before drawing conclusions about directional trends.\n")
The dataset covers exactly three months (Oct–Dec 2025). This is insufficient to distinguish genuine seasonal patterns from random month-to-month variation. Q4 in Nigeria typically coincides with the dry season, elevated economic activity, and year-end logistics demand, all of which inflate volumes relative to other quarters. Any growth trend observed (e.g., AGO Oct->Dec uplift) may therefore reflect seasonality rather than structural demand improvement. Recommendation: Extend the dataset to at least 4 full quarters before drawing conclusions about directional trends.

No Station-Level Cost or Margin Data

Code
cat("Revenue (Naira) is available, but gross margin, operating cost, fuel procurement cost, and net profit are absent. A station selling high volumes at thin margins may be less valuable than a lower-volume station with strong margins. All performance rankings in this analysis are revenue-based proxies, not true profitability rankings.\n")
Revenue (Naira) is available, but gross margin, operating cost, fuel procurement cost, and net profit are absent. A station selling high volumes at thin margins may be less valuable than a lower-volume station with strong margins. All performance rankings in this analysis are revenue-based proxies, not true profitability rankings.

Two Stations with Zero All-Period Volume

Code
cat("'Rainoil Uselu Shell' and 'Rainoil Ughelli Post Office' report zero litres across all nine product-month combinations. Their status (under construction, closed, divested, or data missing) is unknown. They were excluded from modelling but their inclusion in the station count (193) means the active network is effectively 191 stations.\n")
'Rainoil Uselu Shell' and 'Rainoil Ughelli Post Office' report zero litres across all nine product-month combinations. Their status (under construction, closed, divested, or data missing) is unknown. They were excluded from modelling but their inclusion in the station count (193) means the active network is effectively 191 stations.

What Could Have Been Done With More Data

Time-Series Forecasting (if 12+ months of data were available)

Code
cat("With a full year or more, ARIMA or Prophet models could decompose station-level sales into trend, seasonality (wet/dry season), and holiday effects (Eid, Christmas). This would enable accurate monthly inventory pre-positioning rather than reactive restocking. Reference: Hyndman & Athanasopoulos (2021), 'Forecasting: Principles and Practice', OTexts.\n")
With a full year or more, ARIMA or Prophet models could decompose station-level sales into trend, seasonality (wet/dry season), and holiday effects (Eid, Christmas). This would enable accurate monthly inventory pre-positioning rather than reactive restocking. Reference: Hyndman & Athanasopoulos (2021), 'Forecasting: Principles and Practice', OTexts.

Supply Chain & Stock-Out Analysis (if daily inventory data)

Code
cat("Many zero or low-volume days are likely caused by stock-outs rather than demand failure. With depot dispatch records and station tank inventory logs, a stockout-adjusted demand model would separate 'lost sales' from 'no demand'. This changes investment priorities significantly: a station may appear low-performing because it runs dry mid-month, not because demand is weak.\n")
Many zero or low-volume days are likely caused by stock-outs rather than demand failure. With depot dispatch records and station tank inventory logs, a stockout-adjusted demand model would separate 'lost sales' from 'no demand'. This changes investment priorities significantly: a station may appear low-performing because it runs dry mid-month, not because demand is weak.

Logistic Regression on Station Performance Classification

Code
cat("With more covariates (station age, number of nozzles, storage capacity, manager tenure, proximity to trunk roads), a logistic or multinomial regression could predict which stations are likely to transition from Low → Mid or Mid → High tier. This would directly guide targeted investment decisions rather than broad capex allocations.\n")
With more covariates (station age, number of nozzles, storage capacity, manager tenure, proximity to trunk roads), a logistic or multinomial regression could predict which stations are likely to transition from Low → Mid or Mid → High tier. This would directly guide targeted investment decisions rather than broad capex allocations.

References

Statistical Methods & Theory

Code
cat("Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd ed.). Lawrence Erlbaum Associates.[Basis for Cohen's d effect size interpretation used in hypothesis testing: d < 0.2 = small, 0.2–0.5 = medium,> 0.5 = large.]\n")
Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd ed.). Lawrence Erlbaum Associates.[Basis for Cohen's d effect size interpretation used in hypothesis testing: d < 0.2 = small, 0.2–0.5 = medium,> 0.5 = large.]
Code
cat("Duan, N. (1983). Smearing estimate: A nonparametric retransformation method. Journal of the American Statistical Association, 78(383), 605–610. https://doi.org/10.1080/01621459.1983.10478017 [Correction for bias when back-transforming log-scale predictions to the original Naira scale.]\n")
Duan, N. (1983). Smearing estimate: A nonparametric retransformation method. Journal of the American Statistical Association, 78(383), 605–610. https://doi.org/10.1080/01621459.1983.10478017 [Correction for bias when back-transforming log-scale predictions to the original Naira scale.]
Code
cat("Gelman, A. & Hill, J. (2007). Data analysis using regression and multilevel/hierarchical models. Cambridge University Press. [Justification for hierarchical modelling when station observations are nested within regions.\n")
Gelman, A. & Hill, J. (2007). Data analysis using regression and multilevel/hierarchical models. Cambridge University Press. [Justification for hierarchical modelling when station observations are nested within regions.

R Packages Cited

Code
cat("Kassambara, A. (2023). ggcorrplot: Visualization of a correlation matrix using ggplot2. R package version 0.1.4.1. https://CRAN.R-project.org/package=ggcorrplot \n")
Kassambara, A. (2023). ggcorrplot: Visualization of a correlation matrix using ggplot2. R package version 0.1.4.1. https://CRAN.R-project.org/package=ggcorrplot 
Code
cat("Komsta, L. & Novomestky, F. (2022). moments: Moments, cumulants, skewness, kurtosis and related tests. R package version 0.14.1.https://CRAN.R-project.org/package=moments \n")
Komsta, L. & Novomestky, F. (2022). moments: Moments, cumulants, skewness, kurtosis and related tests. R package version 0.14.1.https://CRAN.R-project.org/package=moments 
Code
cat("Slowikowski, K. (2024). ggrepel: Automatically position non-overlapping text labels with ggplot2. R package version 0.9.5.https://CRAN.R-project.org/package=ggrepel\n\n")
Slowikowski, K. (2024). ggrepel: Automatically position non-overlapping text labels with ggplot2. R package version 0.9.5.https://CRAN.R-project.org/package=ggrepel