setwd("C:/Users/Vibha/OneDrive/Desktop")
getwd()
## [1] "C:/Users/Vibha/OneDrive/Desktop"
#' Title : Q2 - NHH Profile Class 1 Margin Trend Analysis
#' Author: Vibhash
#' Date  : 2026-04
#'
#' Margin (p/kWh) is set at DateCosted. We treat DateCosted as the time axis
#' and run a full time-series EDA — plots are rendered interactively to the
#' RStudio 'Plots' pane (nothing is saved to disk):
#'   1. Descriptive stats
#'   2. Raw scatter
#'   3. Monthly aggregation (mean, median, count, volume-weighted mean)
#'   4. Rolling 3/6-month mean
#'   5. Month-of-year seasonality + YoY overlay
#'   6. STL decomposition
#'   7. Stationarity (ADF) + ACF/PACF
#'   8. Segmentation (Flexi, Route, Term bucket)
#'   9. OLS linear trend fit"
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
library(lubridate)
library(ggplot2)
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
library(fpp2)          # ts(), autoplot(), stl forecast
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## ── Attaching packages ──────────────────────────────────────────── fpp2 2.5.1 ──
## ✔ forecast  8.21     ✔ expsmooth 2.3 
## ✔ fma       2.5
## Warning: package 'fma' was built under R version 4.2.3
## Warning: package 'expsmooth' was built under R version 4.2.3
## 
library(tseries)       # adf.test()
library(zoo)           # rollmean()
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
to_date <- function(x) {
  if (inherits(x, "Date"))                         return(as.Date(x))
  if (inherits(x, "POSIXct") || inherits(x, "POSIXt")) return(as.Date(x))
  n <- suppressWarnings(as.numeric(x))
  if (all(is.na(n))) return(as.Date(x))
  as.Date(n, origin = "1899-12-30")
}
file_date  <- format(Sys.Date(), "%Y%m%d")
file_name  <- "SSEEX1.xlsx"
sheet_name <- "Sheet1"

MARKET_F   <- "NHH"
PROFILE_F  <- 1

out_dir    <- file.path(getwd(), "q2_plots")
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)
my_theme <- theme(
  plot.background   = element_rect(fill = "white", color = "white"),
  panel.background  = element_rect(fill = "white", color = "gray60"),
  panel.grid.major  = element_line(color = "gray90"),
  panel.grid.minor  = element_blank(),
  legend.background = element_rect(fill = "white", color = "white"),
  legend.key        = element_rect(fill = "white", colour = "white"),
  plot.title        = element_text(face = "bold")
)
raw <- read_excel(file_name, sheet = sheet_name)

names(raw) <- trimws(names(raw))

rdb <- raw %>%
  slice(-1) %>%                                      # drop description row
  mutate(
    Profile            = as.numeric(Profile),
    `Annual Consumption` = as.numeric(`Annual Consumption`),
    Margin             = as.numeric(Margin),
    Term               = as.numeric(Term),
    Start_Date = as.Date(as.numeric(Start_Date), origin = "1899-12-30"),
    End_Date   = as.Date(as.numeric(End_Date),   origin = "1899-12-30"),
    DateCosted = as.Date(as.numeric(DateCosted), origin = "1899-12-30"),
    Market             = toupper(trimws(as.character(Market)))
  )

pc1 <- rdb %>%
  filter(Market == MARKET_F, Profile == PROFILE_F) %>%
  filter(!is.na(Margin), !is.na(DateCosted)) %>%
  arrange(DateCosted)

label_txt <- paste0(MARKET_F, " - Profile Class ", PROFILE_F)
cat("=================================================================\n")
## =================================================================
cat("Descriptive stats -", label_txt, "\n")
## Descriptive stats - NHH - Profile Class 1
cat("=================================================================\n")
## =================================================================
cat(sprintf("Rows                   : %d\n", nrow(pc1)))
## Rows                   : 306
cat(sprintf("DateCosted range       : %s -> %s\n",
            min(pc1$DateCosted), max(pc1$DateCosted)))
## DateCosted range       : 2013-07-29 -> 2018-03-27
cat(sprintf("Margin mean / median   : %.4f / %.4f\n",
            mean(pc1$Margin), median(pc1$Margin)))
## Margin mean / median   : 0.2331 / 0.1770
cat(sprintf("Margin std / min / max : %.4f / %.4f / %.4f\n",
            sd(pc1$Margin), min(pc1$Margin), max(pc1$Margin)))
## Margin std / min / max : 0.3108 / -1.2745 / 1.7342
# ==== SECTION 7 — Raw Scatter ======================================
p1 <- ggplot(pc1, aes(DateCosted, Margin)) +
  geom_point(alpha = 0.35, size = 1.6, color = "steelblue") +
  labs(title = paste("Raw margin vs DateCosted -", label_txt),
       x = "DateCosted", y = "Margin (p/kWh)") +
  my_theme
p1 <- ggplot(pc1, aes(DateCosted, Margin)) +
  geom_point(alpha = 0.35, size = 1.6, color = "steelblue") +
  labs(title = paste("Raw margin vs DateCosted -", label_txt),
       x = "DateCosted", y = "Margin (p/kWh)") +
  my_theme
print(p1)

monthly <- pc1 %>%
  mutate(month = floor_date(DateCosted, "month")) %>%
  group_by(month) %>%
  summarise(
    mean_margin   = mean(Margin),
    median_margin = median(Margin),
    count         = n(),
    vol_weighted  = sum(Margin * `Annual Consumption`) /
                    sum(`Annual Consumption`),
    .groups = "drop"
  )
full_months <- tibble(month = seq(min(monthly$month),
                                  max(monthly$month),
                                  by = "month"))
monthly <- full_months %>% left_join(monthly, by = "month")

p2 <- ggplot(monthly, aes(month)) +
  geom_line (aes(y = mean_margin,   color = "Mean"))           +
  geom_point(aes(y = mean_margin,   color = "Mean"))           +
  geom_line (aes(y = median_margin, color = "Median"), alpha = 0.7) +
  geom_line (aes(y = vol_weighted,  color = "Vol-weighted"), alpha = 0.7) +
  scale_color_manual(values = c(Mean = "steelblue",
                                Median = "darkorange",
                                `Vol-weighted` = "forestgreen")) +
  labs(title = paste("Monthly margin -", label_txt),
       x = "Month", y = "Margin (p/kWh)", color = "") +
  my_theme
print(p2)
## Warning: Removed 10 rows containing missing values (`geom_point()`).

p2b <- ggplot(monthly, aes(month, count)) +
  geom_col(fill = "steelblue", alpha = 0.7) +
  labs(title = paste("Contracts priced per month -", label_txt),
       x = "Month", y = "Count") + my_theme
print(p2b)
## Warning: Removed 10 rows containing missing values (`position_stack()`).

monthly <- monthly %>%
  mutate(
    roll3 = rollmean(mean_margin, k = 3, fill = NA, align = "center"),
    roll6 = rollmean(mean_margin, k = 6, fill = NA, align = "center")
  )

p3 <- ggplot(monthly, aes(month)) +
  geom_line(aes(y = mean_margin, color = "Monthly mean"), alpha = 0.4) +
  geom_line(aes(y = roll3,       color = "3-mo rolling"))              +
  geom_line(aes(y = roll6,       color = "6-mo rolling"))              +
  scale_color_manual(values = c(`Monthly mean` = "gray50",
                                `3-mo rolling` = "steelblue",
                                `6-mo rolling` = "firebrick")) +
  labs(title = paste("Rolling mean -", label_txt),
       x = "Month", y = "Margin (p/kWh)", color = "") +
  my_theme
print(p3)
## Warning: Removed 16 rows containing missing values (`geom_line()`).
## Warning: Removed 19 rows containing missing values (`geom_line()`).

pc1 <- pc1 %>% mutate(Year = year(DateCosted), Month = month(DateCosted))

p4 <- ggplot(pc1, aes(factor(Month), Margin)) +
  geom_boxplot(fill = "lightblue", outlier.size = 0.8) +
  labs(title = paste("Margin by month of year -", label_txt),
       x = "Month", y = "Margin (p/kWh)") + my_theme
print(p4)

yoy <- pc1 %>%
  group_by(Year, Month) %>%
  summarise(mean_margin = mean(Margin), .groups = "drop") %>%
  mutate(Year = factor(Year))

p4b <- ggplot(yoy, aes(Month, mean_margin, color = Year, group = Year)) +
  geom_line() + geom_point() +
  scale_x_continuous(breaks = 1:12) +
  labs(title = paste("YoY overlay of monthly mean margin -", label_txt),
       x = "Month", y = "Margin (p/kWh)") + my_theme
print(p4b)

ts_vals <- monthly$mean_margin
if (any(is.na(ts_vals))) ts_vals <- zoo::na.approx(ts_vals, na.rm = FALSE)
ts_vals[is.na(ts_vals)] <- mean(ts_vals, na.rm = TRUE)   # edge fill

start_yr <- year(min(monthly$month))
start_mo <- month(min(monthly$month))
margin_ts <- ts(ts_vals, frequency = 12, start = c(start_yr, start_mo))

if (length(margin_ts) >= 24) {
  stl_fit <- stl(margin_ts, t.window = 13, s.window = "periodic", robust = TRUE)

  plot(stl_fit, main = paste("STL decomposition -", label_txt))
} else {
  cat("STL decomposition skipped - need >=24 months.\n")
}

# ==== SECTION 12 — ADF Stationarity Test ==========================
cat("\n=================================================================\n")
## 
## =================================================================
cat("Stationarity & autocorrelation -", label_txt, "\n")
## Stationarity & autocorrelation - NHH - Profile Class 1
cat("=================================================================\n")
## =================================================================
adf_res <- tseries::adf.test(margin_ts, alternative = "stationary")
cat(sprintf("ADF statistic : %.4f\n", unname(adf_res$statistic)))
## ADF statistic : -3.7823
cat(sprintf("ADF p-value   : %.4f  -->  %s\n",
            adf_res$p.value,
            ifelse(adf_res$p.value < 0.05, "stationary",
                   "NON-stationary (trend / unit root likely)")))
## ADF p-value   : 0.0258  -->  stationary
par(mfrow = c(1, 2))
acf (margin_ts, lag.max = 24, main = paste("ACF -",  label_txt))
pacf(margin_ts, lag.max = 24, main = paste("PACF -", label_txt))

par(mfrow = c(1, 1))
monthly_lm <- monthly %>% filter(!is.na(mean_margin)) %>%
  mutate(t = row_number())
fit <- lm(mean_margin ~ t, data = monthly_lm)
coef_fit <- coef(fit)
p_val    <- summary(fit)$coefficients["t", "Pr(>|t|)"]
r2       <- summary(fit)$r.squared

cat("\nOLS on monthly mean margin:\n")
## 
## OLS on monthly mean margin:
cat(sprintf("  slope     : %+.5f p/kWh per month  (~%+.4f p/kWh per year)\n",
            coef_fit["t"], coef_fit["t"] * 12))
##   slope     : -0.00038 p/kWh per month  (~-0.0046 p/kWh per year)
cat(sprintf("  R-squared : %.3f\n", r2))
##   R-squared : 0.002
cat(sprintf("  p-value   : %.4f  -->  %s\n", p_val,
            ifelse(p_val < 0.05, "significant trend",
                   "no significant linear trend")))
##   p-value   : 0.7717  -->  no significant linear trend
p5 <- ggplot(monthly_lm, aes(month, mean_margin)) +
  geom_line(alpha = 0.6) + geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "firebrick", linetype = "dashed") +
  labs(title = sprintf("Linear trend fit - %s  (slope=%+.4f, p=%.3f)",
                       label_txt, coef_fit["t"], p_val),
       x = "Month", y = "Margin (p/kWh)") + my_theme
print(p5)
## `geom_smooth()` using formula = 'y ~ x'

pc1_seg <- pc1 %>%
  mutate(Term_bucket = cut(Term, breaks = c(0, 12, 24, 36, 120),
                           labels = c("<=12m", "13-24m", "25-36m", ">36m")))

p6 <- ggplot(pc1_seg, aes(Flexi, Margin)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Margin by Flexi/Fixed", x = "", y = "Margin (p/kWh)") + my_theme

p7 <- ggplot(pc1_seg, aes(Route, Margin)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Margin by Route", x = "", y = "Margin (p/kWh)") + my_theme

p8 <- ggplot(pc1_seg %>% filter(!is.na(Term_bucket)), aes(Term_bucket, Margin)) +
  geom_boxplot(fill = "wheat") +
  labs(title = "Margin by Contract Term", x = "", y = "Margin (p/kWh)") + my_theme
if (requireNamespace("patchwork", quietly = TRUE)) {
  library(patchwork)
  print(p6 + p7 + p8)
} else {
  print(p6)
  print(p7)
  print(p8)
}

cat("\nMean margin by segment:\n\nFlexi:\n")
## 
## Mean margin by segment:
## 
## Flexi:
print(pc1_seg %>% group_by(Flexi) %>%
        summarise(mean = round(mean(Margin), 4), n = n()))
## # A tibble: 1 × 3
##   Flexi  mean     n
##   <chr> <dbl> <int>
## 1 Fixed 0.233   306
cat("\nRoute:\n")
## 
## Route:
print(pc1_seg %>% group_by(Route) %>%
        summarise(mean = round(mean(Margin), 4), n = n()))
## # A tibble: 2 × 3
##   Route   mean     n
##   <chr>  <dbl> <int>
## 1 DIRECT 0.346    22
## 2 TPI    0.224   284
cat("\nTerm bucket:\n")
## 
## Term bucket:
print(pc1_seg %>% filter(!is.na(Term_bucket)) %>%
        group_by(Term_bucket) %>%
        summarise(mean = round(mean(Margin), 4), n = n()))
## # A tibble: 4 × 3
##   Term_bucket  mean     n
##   <fct>       <dbl> <int>
## 1 <=12m       0.221   107
## 2 13-24m      0.266   102
## 3 25-36m      0.196    82
## 4 >36m        0.295    15
cat("\nDone. All plots rendered to the RStudio Plots pane — use the\n")
## 
## Done. All plots rendered to the RStudio Plots pane — use the
cat("arrows at the top of the Plots pane to flick through them.\n")
## arrows at the top of the Plots pane to flick through them.