#' Title : Q3 - Flexible Profile Class Analysis (Q1 + Q2 for ANY PC)
#' Author: Vibhash
#' Date : 2026-04
# ==== SECTION 1 — Working Directory ================================
# setwd("C:/Users/.../SSE_Exercise")
# getwd()
# ==== SECTION 2 — Libraries ========================================
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
library(lubridate)
library(ggplot2)
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
library(fpp2) # ts(), autoplot(), stl forecast
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching packages ──────────────────────────────────────────── fpp2 2.5.1 ──
## ✔ forecast 8.21 ✔ expsmooth 2.3
## ✔ fma 2.5
## Warning: package 'fma' was built under R version 4.2.3
## Warning: package 'expsmooth' was built under R version 4.2.3
##
library(tseries) # adf.test()
library(zoo) # rollmean()
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Excel serial <-> Date helper (handles POSIXct / Date / numeric / string)
to_date <- function(x) {
if (inherits(x, "Date")) return(as.Date(x))
if (inherits(x, "POSIXct") || inherits(x, "POSIXt")) return(as.Date(x))
n <- suppressWarnings(as.numeric(x))
if (all(is.na(n))) return(as.Date(x))
as.Date(n, origin = "1899-12-30")
}
# ==== SECTION 3 — Input Configuration ==============================
file_name <- "SSEEX1.xlsx"
sheet_name <- "Sheet1"
DAYS_YR <- 365
# ==== SECTION 4 — Custom Theme =====================================
my_theme <- theme(
plot.background = element_rect(fill = "white", color = "white"),
panel.background = element_rect(fill = "white", color = "gray60"),
panel.grid.major = element_line(color = "gray90"),
panel.grid.minor = element_blank(),
legend.background = element_rect(fill = "white", color = "white"),
legend.key = element_rect(fill = "white", colour = "white"),
plot.title = element_text(face = "bold")
)
# ==== SECTION 5 — Data Loading & Cleaning ==========================
raw <- read_excel(file_name, sheet = sheet_name)
names(raw) <- trimws(names(raw))
rdb <- raw %>%
slice(-1) %>%
mutate(
Profile = as.numeric(Profile),
`Annual Consumption` = as.numeric(`Annual Consumption`),
Margin = as.numeric(Margin),
Term = as.numeric(Term),
Start_Date = to_date(Start_Date),
End_Date = to_date(End_Date),
DateCosted = to_date(DateCosted),
Market = toupper(trimws(as.character(Market))),
Flexi = trimws(as.character(Flexi)),
Route = trimws(as.character(Route))
)
# ==== SECTION 6 — Core Helper: FY overlap days ====================
# Inclusive day-count of overlap between contract window and FY window.
fy_overlap_days <- function(start, end, fy_start, fy_end) {
if (is.na(start) || is.na(end)) return(0L)
win_start <- pmax(start, fy_start)
win_end <- pmin(end, fy_end)
if (win_end < win_start) return(0L)
as.integer(as.numeric(win_end - win_start) + 1)
}
# ==== SECTION 7 — Flexible Function ===============================
# One call runs the Q1 (FY consumption) + Q2 (margin trend EDA) pipeline
# for any (market, profile_class, FY window). Plots render to the Plots pane.
analyse_profile_class <- function(df,
profile_class,
market = "NHH",
fy_start = "2017-04-01",
fy_end = "2018-03-31") {
fy_s <- as.Date(fy_start)
fy_e <- as.Date(fy_end)
market <- toupper(market)
label_txt <- paste0(market, " - Profile Class ", profile_class)
# -------- filter slice ------------------------------------------
sub <- df %>%
filter(Market == market, Profile == profile_class)
if (nrow(sub) == 0) {
cat(sprintf("[%s] no rows match - nothing to analyse.\n", label_txt))
return(invisible(NULL))
}
# -------- Q1: FY CONSUMPTION ------------------------------------
cons <- sub %>%
rowwise() %>%
mutate(
Daily_Consumption = `Annual Consumption` / DAYS_YR,
FY_Days = fy_overlap_days(Start_Date, End_Date, fy_s, fy_e),
FY_Consumption = Daily_Consumption * FY_Days
) %>%
ungroup()
cat("=================================================================\n")
cat(sprintf("%s FY %s -> %s\n", label_txt, fy_s, fy_e))
cat("=================================================================\n")
cat(sprintf("Contracts (matching) : %d\n", nrow(cons)))
cat(sprintf("Contracts overlapping FY : %d\n", sum(cons$FY_Days > 0)))
cat(sprintf("Total FY contract-days : %s\n",
format(sum(cons$FY_Days), big.mark = ",")))
cat(sprintf("TOTAL FY consumption (kWh) : %s\n",
format(round(sum(cons$FY_Consumption, na.rm = TRUE), 2), big.mark = ",")))
cat(sprintf(" (GWh) : %.4f\n",
sum(cons$FY_Consumption, na.rm = TRUE) / 1e6))
# -------- Q2: MARGIN TREND EDA ----------------------------------
pc <- sub %>%
filter(!is.na(Margin), !is.na(DateCosted)) %>%
arrange(DateCosted)
if (nrow(pc) < 3) {
cat("\n[Margin trend skipped - fewer than 3 contracts with margin data.]\n")
return(invisible(list(consumption = cons)))
}
# Section 6 — Descriptive stats
cat("\nDescriptive stats -", label_txt, "\n")
cat(sprintf("Rows (with margin) : %d\n", nrow(pc)))
cat(sprintf("DateCosted range : %s -> %s\n",
min(pc$DateCosted), max(pc$DateCosted)))
cat(sprintf("Margin mean / median : %.4f / %.4f\n",
mean(pc$Margin), median(pc$Margin)))
cat(sprintf("Margin std / min / max : %.4f / %.4f / %.4f\n",
sd(pc$Margin), min(pc$Margin), max(pc$Margin)))
# Section 7 — Raw scatter
p1 <- ggplot(pc, aes(DateCosted, Margin)) +
geom_point(alpha = 0.35, size = 1.6, color = "steelblue") +
labs(title = paste("Raw margin vs DateCosted -", label_txt),
x = "DateCosted", y = "Margin (p/kWh)") +
my_theme
print(p1)
# Section 8 — Monthly aggregation
monthly <- pc %>%
mutate(month = floor_date(DateCosted, "month")) %>%
group_by(month) %>%
summarise(
mean_margin = mean(Margin),
median_margin = median(Margin),
count = n(),
vol_weighted = sum(Margin * `Annual Consumption`) /
sum(`Annual Consumption`),
.groups = "drop"
)
# Fill missing months so rolling / ts() behave well
full_months <- tibble(month = seq(min(monthly$month),
max(monthly$month),
by = "month"))
monthly <- full_months %>% left_join(monthly, by = "month")
p2 <- ggplot(monthly, aes(month)) +
geom_line (aes(y = mean_margin, color = "Mean")) +
geom_point(aes(y = mean_margin, color = "Mean")) +
geom_line (aes(y = median_margin, color = "Median"), alpha = 0.7) +
geom_line (aes(y = vol_weighted, color = "Vol-weighted"), alpha = 0.7) +
scale_color_manual(values = c(Mean = "steelblue",
Median = "darkorange",
`Vol-weighted` = "forestgreen")) +
labs(title = paste("Monthly margin -", label_txt),
x = "Month", y = "Margin (p/kWh)", color = "") +
my_theme
print(p2)
p2b <- ggplot(monthly, aes(month, count)) +
geom_col(fill = "steelblue", alpha = 0.7) +
labs(title = paste("Contracts priced per month -", label_txt),
x = "Month", y = "Count") + my_theme
print(p2b)
# Section 9 — Rolling mean
monthly <- monthly %>%
mutate(
roll3 = rollmean(mean_margin, k = 3, fill = NA, align = "center"),
roll6 = rollmean(mean_margin, k = 6, fill = NA, align = "center")
)
p3 <- ggplot(monthly, aes(month)) +
geom_line(aes(y = mean_margin, color = "Monthly mean"), alpha = 0.4) +
geom_line(aes(y = roll3, color = "3-mo rolling")) +
geom_line(aes(y = roll6, color = "6-mo rolling")) +
scale_color_manual(values = c(`Monthly mean` = "gray50",
`3-mo rolling` = "steelblue",
`6-mo rolling` = "firebrick")) +
labs(title = paste("Rolling mean -", label_txt),
x = "Month", y = "Margin (p/kWh)", color = "") +
my_theme
print(p3)
# Section 10 — Seasonality (by Month + YoY)
pc <- pc %>% mutate(Year = year(DateCosted), Month = month(DateCosted))
p4 <- ggplot(pc, aes(factor(Month), Margin)) +
geom_boxplot(fill = "lightblue", outlier.size = 0.8) +
labs(title = paste("Margin by month of year -", label_txt),
x = "Month", y = "Margin (p/kWh)") + my_theme
print(p4)
yoy <- pc %>%
group_by(Year, Month) %>%
summarise(mean_margin = mean(Margin), .groups = "drop") %>%
mutate(Year = factor(Year))
p4b <- ggplot(yoy, aes(Month, mean_margin, color = Year, group = Year)) +
geom_line() + geom_point() +
scale_x_continuous(breaks = 1:12) +
labs(title = paste("YoY overlay of monthly mean margin -", label_txt),
x = "Month", y = "Margin (p/kWh)") + my_theme
print(p4b)
# Section 11 — STL decomposition
ts_vals <- monthly$mean_margin
if (any(is.na(ts_vals))) ts_vals <- zoo::na.approx(ts_vals, na.rm = FALSE)
ts_vals[is.na(ts_vals)] <- mean(ts_vals, na.rm = TRUE) # edge fill
start_yr <- year(min(monthly$month))
start_mo <- month(min(monthly$month))
margin_ts <- ts(ts_vals, frequency = 12, start = c(start_yr, start_mo))
if (length(margin_ts) >= 24) {
stl_fit <- stl(margin_ts, t.window = 13, s.window = "periodic", robust = TRUE)
plot(stl_fit, main = paste("STL decomposition -", label_txt))
} else {
cat("STL decomposition skipped - need >=24 months.\n")
}
# Section 12 — ADF stationarity test
cat("\n=================================================================\n")
cat("Stationarity & autocorrelation -", label_txt, "\n")
cat("=================================================================\n")
if (length(margin_ts) >= 12) {
adf_res <- suppressWarnings(
tseries::adf.test(margin_ts, alternative = "stationary"))
cat(sprintf("ADF statistic : %.4f\n", unname(adf_res$statistic)))
cat(sprintf("ADF p-value : %.4f --> %s\n",
adf_res$p.value,
ifelse(adf_res$p.value < 0.05, "stationary",
"NON-stationary (trend / unit root likely)")))
} else {
cat("ADF test skipped - series too short.\n")
}
# Section 13 — ACF / PACF
par(mfrow = c(1, 2))
acf (margin_ts, lag.max = min(24, length(margin_ts) - 1),
main = paste("ACF -", label_txt))
pacf(margin_ts, lag.max = min(24, length(margin_ts) - 1),
main = paste("PACF -", label_txt))
par(mfrow = c(1, 1))
# Section 14 — OLS linear trend
monthly_lm <- monthly %>% filter(!is.na(mean_margin)) %>%
mutate(t = row_number())
fit <- lm(mean_margin ~ t, data = monthly_lm)
coef_fit <- coef(fit)
p_val <- summary(fit)$coefficients["t", "Pr(>|t|)"]
r2 <- summary(fit)$r.squared
cat("\nOLS on monthly mean margin:\n")
cat(sprintf(" slope : %+.5f p/kWh per month (~%+.4f p/kWh per year)\n",
coef_fit["t"], coef_fit["t"] * 12))
cat(sprintf(" R-squared : %.3f\n", r2))
cat(sprintf(" p-value : %.4f --> %s\n", p_val,
ifelse(p_val < 0.05, "significant trend",
"no significant linear trend")))
p5 <- ggplot(monthly_lm, aes(month, mean_margin)) +
geom_line(alpha = 0.6) + geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE, color = "firebrick", linetype = "dashed") +
labs(title = sprintf("Linear trend fit - %s (slope=%+.4f, p=%.3f)",
label_txt, coef_fit["t"], p_val),
x = "Month", y = "Margin (p/kWh)") + my_theme
print(p5)
# Section 15 — Segmentation
pc_seg <- pc %>%
mutate(Term_bucket = cut(Term, breaks = c(0, 12, 24, 36, 120),
labels = c("<=12m", "13-24m", "25-36m", ">36m")))
p6 <- ggplot(pc_seg, aes(Flexi, Margin)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Margin by Flexi/Fixed", x = "", y = "Margin (p/kWh)") + my_theme
p7 <- ggplot(pc_seg, aes(Route, Margin)) +
geom_boxplot(fill = "lightgreen") +
labs(title = "Margin by Route", x = "", y = "Margin (p/kWh)") + my_theme
p8 <- ggplot(pc_seg %>% filter(!is.na(Term_bucket)), aes(Term_bucket, Margin)) +
geom_boxplot(fill = "wheat") +
labs(title = "Margin by Contract Term", x = "", y = "Margin (p/kWh)") + my_theme
if (requireNamespace("patchwork", quietly = TRUE)) {
library(patchwork)
print(p6 + p7 + p8)
} else {
print(p6)
print(p7)
print(p8)
}
cat("\nMean margin by segment:\n\nFlexi:\n")
print(pc_seg %>% group_by(Flexi) %>%
summarise(mean = round(mean(Margin), 4), n = n()))
cat("\nRoute:\n")
print(pc_seg %>% group_by(Route) %>%
summarise(mean = round(mean(Margin), 4), n = n()))
cat("\nTerm bucket:\n")
print(pc_seg %>% filter(!is.na(Term_bucket)) %>%
group_by(Term_bucket) %>%
summarise(mean = round(mean(Margin), 4), n = n()))
# Return the computed objects invisibly so downstream code can use them
invisible(list(
consumption = cons,
monthly = monthly,
slope_per_mo = unname(coef_fit["t"]),
ols_p = unname(p_val),
ols_r2 = r2,
total_fy_kwh = sum(cons$FY_Consumption, na.rm = TRUE),
n_contracts = nrow(cons),
n_in_fy = sum(cons$FY_Days > 0)
))
}
# ==== SECTION 8 — Demo: run PC1 and PC2 ============================
res_pc1 <- analyse_profile_class(rdb, profile_class = 1)
## =================================================================
## NHH - Profile Class 1 FY 2017-04-01 -> 2018-03-31
## =================================================================
## Contracts (matching) : 306
## Contracts overlapping FY : 285
## Total FY contract-days : 70,239
## TOTAL FY consumption (kWh) : 1,120,143
## (GWh) : 1.1201
##
## Descriptive stats - NHH - Profile Class 1
## Rows (with margin) : 306
## DateCosted range : 2013-07-29 -> 2018-03-27
## Margin mean / median : 0.2331 / 0.1770
## Margin std / min / max : 0.3108 / -1.2745 / 1.7342

## Warning: Removed 10 rows containing missing values (`geom_point()`).

## Warning: Removed 10 rows containing missing values (`position_stack()`).

## Warning: Removed 16 rows containing missing values (`geom_line()`).
## Warning: Removed 19 rows containing missing values (`geom_line()`).



##
## =================================================================
## Stationarity & autocorrelation - NHH - Profile Class 1
## =================================================================
## ADF statistic : -3.7823
## ADF p-value : 0.0258 --> stationary

##
## OLS on monthly mean margin:
## slope : -0.00038 p/kWh per month (~-0.0046 p/kWh per year)
## R-squared : 0.002
## p-value : 0.7717 --> no significant linear trend
## `geom_smooth()` using formula = 'y ~ x'




##
## Mean margin by segment:
##
## Flexi:
## # A tibble: 1 × 3
## Flexi mean n
## <chr> <dbl> <int>
## 1 Fixed 0.233 306
##
## Route:
## # A tibble: 2 × 3
## Route mean n
## <chr> <dbl> <int>
## 1 DIRECT 0.346 22
## 2 TPI 0.224 284
##
## Term bucket:
## # A tibble: 4 × 3
## Term_bucket mean n
## <fct> <dbl> <int>
## 1 <=12m 0.221 107
## 2 13-24m 0.266 102
## 3 25-36m 0.196 82
## 4 >36m 0.295 15
res_pc2 <- analyse_profile_class(rdb, profile_class = 2)
## =================================================================
## NHH - Profile Class 2 FY 2017-04-01 -> 2018-03-31
## =================================================================
## Contracts (matching) : 68
## Contracts overlapping FY : 66
## Total FY contract-days : 17,372
## TOTAL FY consumption (kWh) : 420,651.6
## (GWh) : 0.4207
##
## Descriptive stats - NHH - Profile Class 2
## Rows (with margin) : 68
## DateCosted range : 2014-04-09 -> 2018-03-19
## Margin mean / median : 0.2896 / 0.1792
## Margin std / min / max : 0.4784 / -1.0057 / 3.0141

## Warning: Removed 11 rows containing missing values (`geom_point()`).

## Warning: Removed 11 rows containing missing values (`position_stack()`).

## Warning: Removed 14 rows containing missing values (`geom_line()`).
## Warning: Removed 31 rows containing missing values (`geom_line()`).



##
## =================================================================
## Stationarity & autocorrelation - NHH - Profile Class 2
## =================================================================
## ADF statistic : -4.5758
## ADF p-value : 0.0100 --> stationary

##
## OLS on monthly mean margin:
## slope : +0.00447 p/kWh per month (~+0.0536 p/kWh per year)
## R-squared : 0.019
## p-value : 0.4106 --> no significant linear trend
## `geom_smooth()` using formula = 'y ~ x'




##
## Mean margin by segment:
##
## Flexi:
## # A tibble: 1 × 3
## Flexi mean n
## <chr> <dbl> <int>
## 1 Fixed 0.290 68
##
## Route:
## # A tibble: 2 × 3
## Route mean n
## <chr> <dbl> <int>
## 1 DIRECT 0.634 4
## 2 TPI 0.268 64
##
## Term bucket:
## # A tibble: 4 × 3
## Term_bucket mean n
## <fct> <dbl> <int>
## 1 <=12m 0.284 27
## 2 13-24m 0.436 23
## 3 25-36m 0.100 17
## 4 >36m 0.265 1
# Side-by-side comparison
comp <- tibble(
`Profile Class` = c(1, 2),
`Contracts` = c(res_pc1$n_contracts, res_pc2$n_contracts),
`In FY17/18` = c(res_pc1$n_in_fy, res_pc2$n_in_fy),
`FY Consumption (kWh)` = round(c(res_pc1$total_fy_kwh, res_pc2$total_fy_kwh), 2),
`FY Consumption (GWh)` = round(c(res_pc1$total_fy_kwh, res_pc2$total_fy_kwh) / 1e6, 4),
`OLS slope (p/mo)` = round(c(res_pc1$slope_per_mo, res_pc2$slope_per_mo), 5),
`OLS p-value` = round(c(res_pc1$ols_p, res_pc2$ols_p), 4)
)
cat("\n=================================================================\n")
##
## =================================================================
cat("COMPARISON - PC1 vs PC2 (NHH, FY 2017/18)\n")
## COMPARISON - PC1 vs PC2 (NHH, FY 2017/18)
cat("=================================================================\n")
## =================================================================
print(comp)
## # A tibble: 2 × 7
## `Profile Class` Contracts `In FY17/18` `FY Consumption (kWh)`
## <dbl> <int> <int> <dbl>
## 1 1 306 285 1120143.
## 2 2 68 66 420652.
## # ℹ 3 more variables: `FY Consumption (GWh)` <dbl>, `OLS slope (p/mo)` <dbl>,
## # `OLS p-value` <dbl>
cat("\nDone. All plots rendered to the RStudio Plots pane — use the\n")
##
## Done. All plots rendered to the RStudio Plots pane — use the
cat("arrows at the top of the Plots pane to flick through them.\n")
## arrows at the top of the Plots pane to flick through them.