#' Title : Q3 - Flexible Profile Class Analysis (Q1 + Q2 for ANY PC)
#' Author: Vibhash
#' Date  : 2026-04

# ==== SECTION 1 — Working Directory ================================
# setwd("C:/Users/.../SSE_Exercise")
# getwd()

# ==== SECTION 2 — Libraries ========================================
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
library(lubridate)
library(ggplot2)
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
library(fpp2)          # ts(), autoplot(), stl forecast
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## ── Attaching packages ──────────────────────────────────────────── fpp2 2.5.1 ──
## ✔ forecast  8.21     ✔ expsmooth 2.3 
## ✔ fma       2.5
## Warning: package 'fma' was built under R version 4.2.3
## Warning: package 'expsmooth' was built under R version 4.2.3
## 
library(tseries)       # adf.test()
library(zoo)           # rollmean()
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Excel serial <-> Date helper (handles POSIXct / Date / numeric / string)
to_date <- function(x) {
  if (inherits(x, "Date"))                         return(as.Date(x))
  if (inherits(x, "POSIXct") || inherits(x, "POSIXt")) return(as.Date(x))
  n <- suppressWarnings(as.numeric(x))
  if (all(is.na(n))) return(as.Date(x))
  as.Date(n, origin = "1899-12-30")
}

# ==== SECTION 3 — Input Configuration ==============================
file_name  <- "SSEEX1.xlsx"
sheet_name <- "Sheet1"

DAYS_YR    <- 365

# ==== SECTION 4 — Custom Theme =====================================
my_theme <- theme(
  plot.background   = element_rect(fill = "white", color = "white"),
  panel.background  = element_rect(fill = "white", color = "gray60"),
  panel.grid.major  = element_line(color = "gray90"),
  panel.grid.minor  = element_blank(),
  legend.background = element_rect(fill = "white", color = "white"),
  legend.key        = element_rect(fill = "white", colour = "white"),
  plot.title        = element_text(face = "bold")
)

# ==== SECTION 5 — Data Loading & Cleaning ==========================
raw <- read_excel(file_name, sheet = sheet_name)
names(raw) <- trimws(names(raw))

rdb <- raw %>%
  slice(-1) %>%
  mutate(
    Profile              = as.numeric(Profile),
    `Annual Consumption` = as.numeric(`Annual Consumption`),
    Margin               = as.numeric(Margin),
    Term                 = as.numeric(Term),
    Start_Date           = to_date(Start_Date),
    End_Date             = to_date(End_Date),
    DateCosted           = to_date(DateCosted),
    Market               = toupper(trimws(as.character(Market))),
    Flexi                = trimws(as.character(Flexi)),
    Route                = trimws(as.character(Route))
  )

# ==== SECTION 6 — Core Helper: FY overlap days ====================
# Inclusive day-count of overlap between contract window and FY window.
fy_overlap_days <- function(start, end, fy_start, fy_end) {
  if (is.na(start) || is.na(end)) return(0L)
  win_start <- pmax(start, fy_start)
  win_end   <- pmin(end,   fy_end)
  if (win_end < win_start) return(0L)
  as.integer(as.numeric(win_end - win_start) + 1)
}

# ==== SECTION 7 — Flexible Function ===============================
# One call runs the Q1 (FY consumption) + Q2 (margin trend EDA) pipeline
# for any (market, profile_class, FY window). Plots render to the Plots pane.
analyse_profile_class <- function(df,
                                  profile_class,
                                  market   = "NHH",
                                  fy_start = "2017-04-01",
                                  fy_end   = "2018-03-31") {

  fy_s      <- as.Date(fy_start)
  fy_e      <- as.Date(fy_end)
  market    <- toupper(market)
  label_txt <- paste0(market, " - Profile Class ", profile_class)

  # -------- filter slice ------------------------------------------
  sub <- df %>%
    filter(Market == market, Profile == profile_class)

  if (nrow(sub) == 0) {
    cat(sprintf("[%s] no rows match - nothing to analyse.\n", label_txt))
    return(invisible(NULL))
  }

  # -------- Q1: FY CONSUMPTION ------------------------------------
  cons <- sub %>%
    rowwise() %>%
    mutate(
      Daily_Consumption = `Annual Consumption` / DAYS_YR,
      FY_Days           = fy_overlap_days(Start_Date, End_Date, fy_s, fy_e),
      FY_Consumption    = Daily_Consumption * FY_Days
    ) %>%
    ungroup()

  cat("=================================================================\n")
  cat(sprintf("%s   FY %s -> %s\n", label_txt, fy_s, fy_e))
  cat("=================================================================\n")
  cat(sprintf("Contracts (matching)       : %d\n", nrow(cons)))
  cat(sprintf("Contracts overlapping FY   : %d\n", sum(cons$FY_Days > 0)))
  cat(sprintf("Total FY contract-days     : %s\n",
              format(sum(cons$FY_Days), big.mark = ",")))
  cat(sprintf("TOTAL FY consumption (kWh) : %s\n",
              format(round(sum(cons$FY_Consumption, na.rm = TRUE), 2), big.mark = ",")))
  cat(sprintf("                     (GWh) : %.4f\n",
              sum(cons$FY_Consumption, na.rm = TRUE) / 1e6))

  # -------- Q2: MARGIN TREND EDA ----------------------------------
  pc <- sub %>%
    filter(!is.na(Margin), !is.na(DateCosted)) %>%
    arrange(DateCosted)

  if (nrow(pc) < 3) {
    cat("\n[Margin trend skipped - fewer than 3 contracts with margin data.]\n")
    return(invisible(list(consumption = cons)))
  }

  # Section 6 — Descriptive stats
  cat("\nDescriptive stats -", label_txt, "\n")
  cat(sprintf("Rows (with margin)     : %d\n", nrow(pc)))
  cat(sprintf("DateCosted range       : %s -> %s\n",
              min(pc$DateCosted), max(pc$DateCosted)))
  cat(sprintf("Margin mean / median   : %.4f / %.4f\n",
              mean(pc$Margin), median(pc$Margin)))
  cat(sprintf("Margin std / min / max : %.4f / %.4f / %.4f\n",
              sd(pc$Margin), min(pc$Margin), max(pc$Margin)))

  # Section 7 — Raw scatter
  p1 <- ggplot(pc, aes(DateCosted, Margin)) +
    geom_point(alpha = 0.35, size = 1.6, color = "steelblue") +
    labs(title = paste("Raw margin vs DateCosted -", label_txt),
         x = "DateCosted", y = "Margin (p/kWh)") +
    my_theme
  print(p1)

  # Section 8 — Monthly aggregation
  monthly <- pc %>%
    mutate(month = floor_date(DateCosted, "month")) %>%
    group_by(month) %>%
    summarise(
      mean_margin   = mean(Margin),
      median_margin = median(Margin),
      count         = n(),
      vol_weighted  = sum(Margin * `Annual Consumption`) /
                      sum(`Annual Consumption`),
      .groups = "drop"
    )

  # Fill missing months so rolling / ts() behave well
  full_months <- tibble(month = seq(min(monthly$month),
                                    max(monthly$month),
                                    by = "month"))
  monthly <- full_months %>% left_join(monthly, by = "month")

  p2 <- ggplot(monthly, aes(month)) +
    geom_line (aes(y = mean_margin,   color = "Mean"))           +
    geom_point(aes(y = mean_margin,   color = "Mean"))           +
    geom_line (aes(y = median_margin, color = "Median"), alpha = 0.7) +
    geom_line (aes(y = vol_weighted,  color = "Vol-weighted"), alpha = 0.7) +
    scale_color_manual(values = c(Mean = "steelblue",
                                  Median = "darkorange",
                                  `Vol-weighted` = "forestgreen")) +
    labs(title = paste("Monthly margin -", label_txt),
         x = "Month", y = "Margin (p/kWh)", color = "") +
    my_theme
  print(p2)

  p2b <- ggplot(monthly, aes(month, count)) +
    geom_col(fill = "steelblue", alpha = 0.7) +
    labs(title = paste("Contracts priced per month -", label_txt),
         x = "Month", y = "Count") + my_theme
  print(p2b)

  # Section 9 — Rolling mean
  monthly <- monthly %>%
    mutate(
      roll3 = rollmean(mean_margin, k = 3, fill = NA, align = "center"),
      roll6 = rollmean(mean_margin, k = 6, fill = NA, align = "center")
    )

  p3 <- ggplot(monthly, aes(month)) +
    geom_line(aes(y = mean_margin, color = "Monthly mean"), alpha = 0.4) +
    geom_line(aes(y = roll3,       color = "3-mo rolling"))              +
    geom_line(aes(y = roll6,       color = "6-mo rolling"))              +
    scale_color_manual(values = c(`Monthly mean` = "gray50",
                                  `3-mo rolling` = "steelblue",
                                  `6-mo rolling` = "firebrick")) +
    labs(title = paste("Rolling mean -", label_txt),
         x = "Month", y = "Margin (p/kWh)", color = "") +
    my_theme
  print(p3)

  # Section 10 — Seasonality (by Month + YoY)
  pc <- pc %>% mutate(Year = year(DateCosted), Month = month(DateCosted))

  p4 <- ggplot(pc, aes(factor(Month), Margin)) +
    geom_boxplot(fill = "lightblue", outlier.size = 0.8) +
    labs(title = paste("Margin by month of year -", label_txt),
         x = "Month", y = "Margin (p/kWh)") + my_theme
  print(p4)

  yoy <- pc %>%
    group_by(Year, Month) %>%
    summarise(mean_margin = mean(Margin), .groups = "drop") %>%
    mutate(Year = factor(Year))

  p4b <- ggplot(yoy, aes(Month, mean_margin, color = Year, group = Year)) +
    geom_line() + geom_point() +
    scale_x_continuous(breaks = 1:12) +
    labs(title = paste("YoY overlay of monthly mean margin -", label_txt),
         x = "Month", y = "Margin (p/kWh)") + my_theme
  print(p4b)

  # Section 11 — STL decomposition
  ts_vals <- monthly$mean_margin
  if (any(is.na(ts_vals))) ts_vals <- zoo::na.approx(ts_vals, na.rm = FALSE)
  ts_vals[is.na(ts_vals)] <- mean(ts_vals, na.rm = TRUE)   # edge fill

  start_yr <- year(min(monthly$month))
  start_mo <- month(min(monthly$month))
  margin_ts <- ts(ts_vals, frequency = 12, start = c(start_yr, start_mo))

  if (length(margin_ts) >= 24) {
    stl_fit <- stl(margin_ts, t.window = 13, s.window = "periodic", robust = TRUE)
    plot(stl_fit, main = paste("STL decomposition -", label_txt))
  } else {
    cat("STL decomposition skipped - need >=24 months.\n")
  }

  # Section 12 — ADF stationarity test
  cat("\n=================================================================\n")
  cat("Stationarity & autocorrelation -", label_txt, "\n")
  cat("=================================================================\n")
  if (length(margin_ts) >= 12) {
    adf_res <- suppressWarnings(
      tseries::adf.test(margin_ts, alternative = "stationary"))
    cat(sprintf("ADF statistic : %.4f\n", unname(adf_res$statistic)))
    cat(sprintf("ADF p-value   : %.4f  -->  %s\n",
                adf_res$p.value,
                ifelse(adf_res$p.value < 0.05, "stationary",
                       "NON-stationary (trend / unit root likely)")))
  } else {
    cat("ADF test skipped - series too short.\n")
  }

  # Section 13 — ACF / PACF
  par(mfrow = c(1, 2))
  acf (margin_ts, lag.max = min(24, length(margin_ts) - 1),
       main = paste("ACF -",  label_txt))
  pacf(margin_ts, lag.max = min(24, length(margin_ts) - 1),
       main = paste("PACF -", label_txt))
  par(mfrow = c(1, 1))

  # Section 14 — OLS linear trend
  monthly_lm <- monthly %>% filter(!is.na(mean_margin)) %>%
    mutate(t = row_number())
  fit      <- lm(mean_margin ~ t, data = monthly_lm)
  coef_fit <- coef(fit)
  p_val    <- summary(fit)$coefficients["t", "Pr(>|t|)"]
  r2       <- summary(fit)$r.squared

  cat("\nOLS on monthly mean margin:\n")
  cat(sprintf("  slope     : %+.5f p/kWh per month  (~%+.4f p/kWh per year)\n",
              coef_fit["t"], coef_fit["t"] * 12))
  cat(sprintf("  R-squared : %.3f\n", r2))
  cat(sprintf("  p-value   : %.4f  -->  %s\n", p_val,
              ifelse(p_val < 0.05, "significant trend",
                     "no significant linear trend")))

  p5 <- ggplot(monthly_lm, aes(month, mean_margin)) +
    geom_line(alpha = 0.6) + geom_point(alpha = 0.6) +
    geom_smooth(method = "lm", se = FALSE, color = "firebrick", linetype = "dashed") +
    labs(title = sprintf("Linear trend fit - %s  (slope=%+.4f, p=%.3f)",
                         label_txt, coef_fit["t"], p_val),
         x = "Month", y = "Margin (p/kWh)") + my_theme
  print(p5)

  # Section 15 — Segmentation
  pc_seg <- pc %>%
    mutate(Term_bucket = cut(Term, breaks = c(0, 12, 24, 36, 120),
                             labels = c("<=12m", "13-24m", "25-36m", ">36m")))

  p6 <- ggplot(pc_seg, aes(Flexi, Margin)) +
    geom_boxplot(fill = "lightblue") +
    labs(title = "Margin by Flexi/Fixed", x = "", y = "Margin (p/kWh)") + my_theme

  p7 <- ggplot(pc_seg, aes(Route, Margin)) +
    geom_boxplot(fill = "lightgreen") +
    labs(title = "Margin by Route", x = "", y = "Margin (p/kWh)") + my_theme

  p8 <- ggplot(pc_seg %>% filter(!is.na(Term_bucket)), aes(Term_bucket, Margin)) +
    geom_boxplot(fill = "wheat") +
    labs(title = "Margin by Contract Term", x = "", y = "Margin (p/kWh)") + my_theme

  if (requireNamespace("patchwork", quietly = TRUE)) {
    library(patchwork)
    print(p6 + p7 + p8)
  } else {
    print(p6)
    print(p7)
    print(p8)
  }

  cat("\nMean margin by segment:\n\nFlexi:\n")
  print(pc_seg %>% group_by(Flexi) %>%
          summarise(mean = round(mean(Margin), 4), n = n()))
  cat("\nRoute:\n")
  print(pc_seg %>% group_by(Route) %>%
          summarise(mean = round(mean(Margin), 4), n = n()))
  cat("\nTerm bucket:\n")
  print(pc_seg %>% filter(!is.na(Term_bucket)) %>%
          group_by(Term_bucket) %>%
          summarise(mean = round(mean(Margin), 4), n = n()))

  # Return the computed objects invisibly so downstream code can use them
  invisible(list(
    consumption  = cons,
    monthly      = monthly,
    slope_per_mo = unname(coef_fit["t"]),
    ols_p        = unname(p_val),
    ols_r2       = r2,
    total_fy_kwh = sum(cons$FY_Consumption, na.rm = TRUE),
    n_contracts  = nrow(cons),
    n_in_fy      = sum(cons$FY_Days > 0)
  ))
}

# ==== SECTION 8 — Demo: run PC1 and PC2 ============================
res_pc1 <- analyse_profile_class(rdb, profile_class = 1)
## =================================================================
## NHH - Profile Class 1   FY 2017-04-01 -> 2018-03-31
## =================================================================
## Contracts (matching)       : 306
## Contracts overlapping FY   : 285
## Total FY contract-days     : 70,239
## TOTAL FY consumption (kWh) : 1,120,143
##                      (GWh) : 1.1201
## 
## Descriptive stats - NHH - Profile Class 1 
## Rows (with margin)     : 306
## DateCosted range       : 2013-07-29 -> 2018-03-27
## Margin mean / median   : 0.2331 / 0.1770
## Margin std / min / max : 0.3108 / -1.2745 / 1.7342

## Warning: Removed 10 rows containing missing values (`geom_point()`).

## Warning: Removed 10 rows containing missing values (`position_stack()`).

## Warning: Removed 16 rows containing missing values (`geom_line()`).
## Warning: Removed 19 rows containing missing values (`geom_line()`).

## 
## =================================================================
## Stationarity & autocorrelation - NHH - Profile Class 1 
## =================================================================
## ADF statistic : -3.7823
## ADF p-value   : 0.0258  -->  stationary

## 
## OLS on monthly mean margin:
##   slope     : -0.00038 p/kWh per month  (~-0.0046 p/kWh per year)
##   R-squared : 0.002
##   p-value   : 0.7717  -->  no significant linear trend
## `geom_smooth()` using formula = 'y ~ x'

## 
## Mean margin by segment:
## 
## Flexi:
## # A tibble: 1 × 3
##   Flexi  mean     n
##   <chr> <dbl> <int>
## 1 Fixed 0.233   306
## 
## Route:
## # A tibble: 2 × 3
##   Route   mean     n
##   <chr>  <dbl> <int>
## 1 DIRECT 0.346    22
## 2 TPI    0.224   284
## 
## Term bucket:
## # A tibble: 4 × 3
##   Term_bucket  mean     n
##   <fct>       <dbl> <int>
## 1 <=12m       0.221   107
## 2 13-24m      0.266   102
## 3 25-36m      0.196    82
## 4 >36m        0.295    15
res_pc2 <- analyse_profile_class(rdb, profile_class = 2)
## =================================================================
## NHH - Profile Class 2   FY 2017-04-01 -> 2018-03-31
## =================================================================
## Contracts (matching)       : 68
## Contracts overlapping FY   : 66
## Total FY contract-days     : 17,372
## TOTAL FY consumption (kWh) : 420,651.6
##                      (GWh) : 0.4207
## 
## Descriptive stats - NHH - Profile Class 2 
## Rows (with margin)     : 68
## DateCosted range       : 2014-04-09 -> 2018-03-19
## Margin mean / median   : 0.2896 / 0.1792
## Margin std / min / max : 0.4784 / -1.0057 / 3.0141

## Warning: Removed 11 rows containing missing values (`geom_point()`).

## Warning: Removed 11 rows containing missing values (`position_stack()`).

## Warning: Removed 14 rows containing missing values (`geom_line()`).
## Warning: Removed 31 rows containing missing values (`geom_line()`).

## 
## =================================================================
## Stationarity & autocorrelation - NHH - Profile Class 2 
## =================================================================
## ADF statistic : -4.5758
## ADF p-value   : 0.0100  -->  stationary

## 
## OLS on monthly mean margin:
##   slope     : +0.00447 p/kWh per month  (~+0.0536 p/kWh per year)
##   R-squared : 0.019
##   p-value   : 0.4106  -->  no significant linear trend
## `geom_smooth()` using formula = 'y ~ x'

## 
## Mean margin by segment:
## 
## Flexi:
## # A tibble: 1 × 3
##   Flexi  mean     n
##   <chr> <dbl> <int>
## 1 Fixed 0.290    68
## 
## Route:
## # A tibble: 2 × 3
##   Route   mean     n
##   <chr>  <dbl> <int>
## 1 DIRECT 0.634     4
## 2 TPI    0.268    64
## 
## Term bucket:
## # A tibble: 4 × 3
##   Term_bucket  mean     n
##   <fct>       <dbl> <int>
## 1 <=12m       0.284    27
## 2 13-24m      0.436    23
## 3 25-36m      0.100    17
## 4 >36m        0.265     1
# Side-by-side comparison
comp <- tibble(
  `Profile Class`        = c(1, 2),
  `Contracts`            = c(res_pc1$n_contracts,       res_pc2$n_contracts),
  `In FY17/18`           = c(res_pc1$n_in_fy,           res_pc2$n_in_fy),
  `FY Consumption (kWh)` = round(c(res_pc1$total_fy_kwh, res_pc2$total_fy_kwh), 2),
  `FY Consumption (GWh)` = round(c(res_pc1$total_fy_kwh, res_pc2$total_fy_kwh) / 1e6, 4),
  `OLS slope (p/mo)`     = round(c(res_pc1$slope_per_mo, res_pc2$slope_per_mo), 5),
  `OLS p-value`          = round(c(res_pc1$ols_p,        res_pc2$ols_p), 4)
)

cat("\n=================================================================\n")
## 
## =================================================================
cat("COMPARISON - PC1 vs PC2  (NHH, FY 2017/18)\n")
## COMPARISON - PC1 vs PC2  (NHH, FY 2017/18)
cat("=================================================================\n")
## =================================================================
print(comp)
## # A tibble: 2 × 7
##   `Profile Class` Contracts `In FY17/18` `FY Consumption (kWh)`
##             <dbl>     <int>        <int>                  <dbl>
## 1               1       306          285               1120143.
## 2               2        68           66                420652.
## # ℹ 3 more variables: `FY Consumption (GWh)` <dbl>, `OLS slope (p/mo)` <dbl>,
## #   `OLS p-value` <dbl>
cat("\nDone. All plots rendered to the RStudio Plots pane — use the\n")
## 
## Done. All plots rendered to the RStudio Plots pane — use the
cat("arrows at the top of the Plots pane to flick through them.\n")
## arrows at the top of the Plots pane to flick through them.