R Statistical Programming Code

Author

Troy Hanlon and Garrett Barnachez

#clear the workspace
rm(list = ls()) #clear environment
gc()            #clear unused memory

          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells  608533 32.5    1380852 73.8         NA   715625 38.3
Vcells 1125889  8.6    8388608 64.0      24576  2010544 15.4

dev.off         #clear charts

function (which = dev.cur()) 
{
    if (which == 1) 
        stop("cannot shut down device 1 (the null device)")
    .External(C_devoff, as.integer(which))
    dev.cur()
}
<bytecode: 0x12643a4e8>
<environment: namespace:grDevices>

cat("\f")     #clear console

# Set working directory
setwd("~/Desktop/BC/Summer 2025/ADEC7320/Final Project")

#Data provided in Google Drive link above
# 1. Load and process Natural Gas Data
gas_data_raw <- read_csv("Natural_Gas_Consumption_by_End_Use.csv", skip = 6)

gas_data <- gas_data_raw %>%
  rename(
    Commercial_Mcf = `Natural Gas Deliveries to Commercial Consumers (Including Vehicle Fuel through 1996) in Massachusetts MMcf`,
    Residential_Mcf = `Massachusetts Natural Gas Residential Consumption MMcf`,
    Industrial_Mcf = `Massachusetts Natural Gas Industrial Consumption MMcf`
  ) %>%
  filter(!is.na(Month)) %>%
  mutate(
    Month = my(Month),
    Year = year(Month),
    Month_Num = month(Month),
    Date = make_date(Year, Month_Num, 1)  # Explicit Date creation
  )

gas_long <- gas_data %>%
  pivot_longer(cols = c(Residential_Mcf, Commercial_Mcf, Industrial_Mcf),
               names_to = "Sector",
               values_to = "Consumption_Mcf") %>%
  mutate(
    Sector = str_remove(Sector, "_Mcf"),
    Fuel_Type = "Natural_Gas"
  )

# 2. Load and process Electricity Data
raw_data <- read_excel("sales_revenue.xlsx", col_names = FALSE)
cat("First 3 rows of raw data:\n")

First 3 rows of raw data:

print(head(raw_data, 3))

# A tibble: 3 × 24
  ...1  ...2  ...3  ...4   ...5  ...6  ...7  ...8  ...9  ...10 ...11 ...12 ...13
  <chr> <chr> <chr> <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 <NA>  <NA>  <NA>  <NA>   RESI… <NA>  <NA>  <NA>  COMM… <NA>  <NA>  <NA>  INDU…
2 <NA>  <NA>  <NA>  <NA>   Reve… Sales Cust… Price Reve… Sales Cust… Price Reve…
3 Year  Month State Data … Thou… Mega… Count Cent… Thou… Mega… Count Cent… Thou…
# ℹ 11 more variables: ...14 <chr>, ...15 <chr>, ...16 <chr>, ...17 <chr>,
#   ...18 <chr>, ...19 <chr>, ...20 <chr>, ...21 <chr>, ...22 <chr>,
#   ...23 <chr>, ...24 <chr>

elec_ma <- read_excel("sales_revenue.xlsx", col_names = FALSE) %>%
  select(
    Year = ...1,
    Month = ...2,
    Resi_Sales_MWH = ...6,
    Com_Sales_MWH = ...10,
    Ind_Sales_MWH = ...14
  ) %>%
  filter(!Year %in% c("Year", "YEAR", "year"),
         !is.na(Year),
         !is.na(Month)) %>%
  mutate(
    Year = as.numeric(Year),
    Month = as.numeric(Month),
    Resi_Sales_MWH = as.numeric(gsub(",", "", Resi_Sales_MWH)),
    Com_Sales_MWH = as.numeric(gsub(",", "", Com_Sales_MWH)),
    Ind_Sales_MWH = as.numeric(gsub(",", "", Ind_Sales_MWH)),
    Date = make_date(Year, Month, 1)  # Explicit Date creation
  )

# Aggregate electricity data to monthly
elec_ma <- elec_ma %>%
  group_by(Year, Month) %>%
  summarise(
    Resi_Sales_MWH = sum(Resi_Sales_MWH, na.rm = TRUE),
    Com_Sales_MWH = sum(Com_Sales_MWH, na.rm = TRUE),
    Ind_Sales_MWH = sum(Ind_Sales_MWH, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(Date = make_date(Year, Month, 1))

elec_long <- elec_ma %>%
  pivot_longer(
    cols = c(Resi_Sales_MWH, Com_Sales_MWH, Ind_Sales_MWH),
    names_to = "Sector",
    values_to = "Consumption_MWh"
  ) %>%
  mutate(
    Sector = case_when(
      Sector == "Resi_Sales_MWH" ~ "Residential",
      Sector == "Com_Sales_MWH" ~ "Commercial",
      Sector == "Ind_Sales_MWH" ~ "Industrial"
    ),
    Fuel_Type = "Electricity",
    Month_Num = Month
  )

# 3. Load and process Weather Data
weather <- read_excel("weather_vars.xlsx") %>%
  filter(!is.na(Month), !is.na(Year)) %>%
  mutate(
    Days_in_Month = days_in_month(ymd(paste(Year, Month, "01", sep="-"))),
    HDD = pmax(65 - `Avg Temp (°F)`, 0) * Days_in_Month,
    CDD = pmax(`Avg Temp (°F)` - 65, 0) * Days_in_Month,
    Date = make_date(Year, Month, 1),
    Avg_Temp_F = `Avg Temp (°F)`,
    Humidity_Pct = `Humidity (%)`
  )

# Filter data to 2010-2024 range
start_year <- 2010
end_year <- 2024

weather_filt <- weather %>%
  filter(Year >= start_year & Year <= end_year)

gas_filt <- gas_long %>%
  filter(Year >= start_year & Year <= end_year)

elec_filt <- elec_long %>%
  filter(Year >= start_year & Year <= end_year)

# Combine energy datasets with consistent Date column
energy_data <- bind_rows(
  gas_filt %>% select(Year, Month_Num, Sector, Fuel_Type, Consumption = Consumption_Mcf, Date),
  elec_filt %>% select(Year, Month_Num, Sector, Fuel_Type, Consumption = Consumption_MWh, Date)
)

# Create comprehensive merged dataset
merged_data <- energy_data %>%
  left_join(weather_filt %>% select(Date, Avg_Temp_F, Humidity_Pct, HDD, CDD), 
            by = "Date") %>%
  mutate(
    Season = case_when(
      Month_Num %in% c(12, 1, 2) ~ "Winter",
      Month_Num %in% c(3, 4, 5) ~ "Spring",
      Month_Num %in% c(6, 7, 8) ~ "Summer",
      Month_Num %in% c(9, 10, 11) ~ "Fall"
    ),
    Log_Consumption = log(Consumption + 1)  # Add log-transformed consumption
  )

# Final validation
cat("\nMerged Data Sample:\n")


Merged Data Sample:

print(head(merged_data))

# A tibble: 6 × 12
   Year Month_Num Sector      Fuel_Type   Consumption Date       Avg_Temp_F
  <dbl>     <dbl> <chr>       <chr>             <dbl> <date>          <dbl>
1  2024        12 Residential Natural_Gas       19957 2024-12-01       35.1
2  2024        12 Commercial  Natural_Gas       13458 2024-12-01       35.1
3  2024        12 Industrial  Natural_Gas        4399 2024-12-01       35.1
4  2024        11 Residential Natural_Gas       10070 2024-11-01       48.9
5  2024        11 Commercial  Natural_Gas        8536 2024-11-01       48.9
6  2024        11 Industrial  Natural_Gas        3691 2024-11-01       48.9
# ℹ 5 more variables: Humidity_Pct <dbl>, HDD <dbl>, CDD <dbl>, Season <chr>,
#   Log_Consumption <dbl>

cat("\nMissing values by column:\n")


Missing values by column:

print(colSums(is.na(merged_data)))

           Year       Month_Num          Sector       Fuel_Type     Consumption 
              0               0               0               0               0 
           Date      Avg_Temp_F    Humidity_Pct             HDD             CDD 
              0               0               0               0               0 
         Season Log_Consumption 
              0               0

cat("\nNumber of observations by sector and fuel type:\n")


Number of observations by sector and fuel type:

print(table(merged_data$Fuel_Type, merged_data$Sector))

             
              Commercial Industrial Residential
  Electricity        180        180         180
  Natural_Gas        180        180         180

# --- build data (same as before) ---
summary_stats <- merged_data %>%
  group_by(Fuel_Type, Sector) %>%
  summarise(
    N = n(),
    Mean_Consumption = mean(Consumption, na.rm = TRUE),
    SD_Consumption   = sd(Consumption,   na.rm = TRUE),
    Min_Consumption  = min(Consumption,  na.rm = TRUE),
    Max_Consumption  = max(Consumption,  na.rm = TRUE),
    Mean_Log_Consumption = mean(Log_Consumption, na.rm = TRUE),
    SD_Log_Consumption   = sd(Log_Consumption,   na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  mutate(
    across(c(Mean_Consumption, SD_Consumption, Min_Consumption, Max_Consumption),
           ~ number(.x, big.mark = ",", accuracy = 0.1)),
    Mean_Log_Consumption = round(Mean_Log_Consumption, 2),
    SD_Log_Consumption   = round(SD_Log_Consumption, 2)
  )

# --- print table: fit-to-width + multipage + smaller font ---
kable(
  summary_stats,
  caption   = "Summary Statistics by Fuel Type and Sector",
  col.names = c("Fuel Type","Sector","N","Mean","SD","Min","Max","Mean (log)","SD (log)"),
  booktabs  = TRUE,
  longtable = TRUE,                 # allow page breaks
  align     = "llrrrrrrr"
) %>%
  kable_styling(
    full_width   = FALSE,
    font_size    = 9,               # slightly smaller
    latex_options = c("hold_position","repeat_header","scale_down")
    # scale_down shrinks to fit \textwidth if still too wide
  )

Summary Statistics by Fuel Type and Sector
Fuel Type	Sector	N	Mean	SD	Min	Max	Mean (log)	SD (log)
Electricity	Commercial	180	113,038,501.7	10,277,404.2	91,405,772.8	138,935,749.8	18.54	0.09
Electricity	Industrial	180	82,796,205.6	4,248,999.9	72,791,102.4	93,672,698.2	18.23	0.05
Electricity	Residential	180	119,623,275.4	20,561,601.8	88,046,437.0	167,108,141.9	18.59	0.17
Natural_Gas	Commercial	180	8,487.1	4,932.2	2,007.0	20,316.0	8.87	0.61
Natural_Gas	Industrial	180	3,832.0	1,224.6	2,054.0	6,391.0	8.20	0.32
Natural_Gas	Residential	180	10,177.7	7,342.5	2,158.0	26,339.0	8.93	0.81

invisible(NULL)  # prevent accidental printing of last value in chunk

# ---- Weather summary (same idea) ----
weather_summary <- weather_filt %>%
  summarise(
    `Mean Temp (°F)` = round(mean(`Avg Temp (°F)`, na.rm = TRUE), 1),
    `SD Temp (°F)`   = round(sd(`Avg Temp (°F)`,   na.rm = TRUE), 1),
    `Mean Hum (%)`   = round(mean(`Humidity (%)`,  na.rm = TRUE), 1),
    `SD Hum (%)`     = round(sd(`Humidity (%)`,    na.rm = TRUE), 1),
    `Mean HDD`       = round(mean(HDD,             na.rm = TRUE), 1),
    `Mean CDD`       = round(mean(CDD,             na.rm = TRUE), 1)
  )

kable(
  weather_summary,
  caption   = "Weather Variables Summary Statistics (2010–2024)",
  booktabs  = TRUE,
  longtable = FALSE,
  align     = "rrrrrr"
) %>%
  kable_styling(full_width = FALSE, font_size = 9)

Weather Variables Summary Statistics (2010–2024)
Mean Temp (°F)	SD Temp (°F)	Mean Hum (%)	SD Hum (%)	Mean HDD	Mean CDD
52.8	15.4	71.4	5.1	426.7	59.4

invisible(NULL)

# Time series plots for Consumption and Log_Consumption
p1 <- merged_data %>%
  filter(Fuel_Type == "Natural_Gas") %>%
  ggplot(aes(x = Date, y = Consumption, color = Sector)) +
  geom_line(size = 1) +
  labs(title = "Natural Gas Consumption by Sector (2010-2024)",
       x = "Date", y = "Consumption (MMcf)") +
  theme_minimal() +
  theme(legend.position = "bottom")

p1_log <- merged_data %>%
  filter(Fuel_Type == "Natural_Gas") %>%
  ggplot(aes(x = Date, y = Log_Consumption, color = Sector)) +
  geom_line(size = 1) +
  labs(title = "Log Natural Gas Consumption by Sector (2010-2024)",
       x = "Date", y = "Log Consumption") +
  theme_minimal() +
  theme(legend.position = "bottom")

p2 <- merged_data %>%
  filter(Fuel_Type == "Electricity") %>%
  ggplot(aes(x = Date, y = Consumption, color = Sector)) +
  geom_line(size = 1) +
  labs(title = "Electricity Consumption by Sector (2010-2024)",
       x = "Date", y = "Consumption (MWh)") +
  theme_minimal() +
  theme(legend.position = "bottom")

p2_log <- merged_data %>%
  filter(Fuel_Type == "Electricity") %>%
  ggplot(aes(x = Date, y = Log_Consumption, color = Sector)) +
  geom_line(size = 1) +
  labs(title = "Log Electricity Consumption by Sector (2010-2024)",
       x = "Date", y = "Log Consumption") +
  theme_minimal() +
  theme(legend.position = "bottom")

grid.arrange(p1, p1_log, p2, p2_log, ncol = 2, nrow = 2)

# Weather time series
p3 <- weather_filt %>%
  ggplot(aes(x = Date, y = `Avg Temp (°F)`)) +
  geom_line(color = "red", size = 1) +
  labs(title = "Average Monthly Temperature (2010-2024)",
       x = "Date", y = "Temperature (°F)") +
  theme_minimal()

p4 <- weather_filt %>%
  select(Date, HDD, CDD) %>%
  pivot_longer(cols = c(HDD, CDD), names_to = "Variable", values_to = "Value") %>%
  ggplot(aes(x = Date, y = Value, color = Variable)) +
  geom_line(size = 1) +
  labs(title = "Heating and Cooling Degree Days (2010-2024)",
       x = "Date", y = "Value") +
  theme_minimal() +
  theme(legend.position = "bottom")

grid.arrange(p3, p4, ncol = 1)

# Correlation analysis for natural gas
gas_corr_data <- merged_data %>%
  filter(Fuel_Type == "Natural_Gas") %>%
  select(Log_Consumption, Avg_Temp_F, Humidity_Pct, HDD, CDD) %>%
  na.omit()

cor_matrix_gas <- cor(gas_corr_data)

# Correlation analysis for electricity  
elec_corr_data <- merged_data %>%
  filter(Fuel_Type == "Electricity") %>%
  select(Log_Consumption, Avg_Temp_F, Humidity_Pct, HDD, CDD) %>%
  na.omit()

cor_matrix_elec <- cor(elec_corr_data)

par(mfrow = c(1, 2))
corrplot(cor_matrix_gas, method = "color", type = "upper", 
         title = "Natural Gas Correlations (Log-Consumption)", mar = c(0,0,2,0))
corrplot(cor_matrix_elec, method = "color", type = "upper",
         title = "Electricity Correlations (Log-Consumption)", mar = c(0,0,2,0))

library(forecast)
library(ggplot2)
library(knitr)

# Initialize list to store models and diagnostics
gas_ts_models <- list()
arima_diagnostics <- list()

cat("Checking Natural Gas Data Quality:\n")

Checking Natural Gas Data Quality:

for(sector in c("Residential", "Commercial", "Industrial")) {
  sector_data <- merged_data %>%
    filter(Fuel_Type == "Natural_Gas", Sector == sector) %>%
    mutate(
      Trend = 1:n(),
      Log_Consumption = log(Consumption + 1)
    ) %>%
    select(Date, Consumption, Log_Consumption, HDD, CDD, Humidity_Pct, Trend) %>%
    na.omit()
  
  cat("\nSector:", sector, "\n")
  cat("Rows:", nrow(sector_data), "\n")
  cat("Missing values:", sum(is.na(sector_data)), "\n")
  cat("Log_Consumption range:", range(sector_data$Log_Consumption, na.rm = TRUE), "\n")
  adf_result <- adf.test(sector_data$Log_Consumption)
  cat("ADF Test (Log_Consumption, p-value):", adf_result$p.value, "\n")
  
  # Linear model
  linear_model <- lm(Log_Consumption ~ HDD + CDD + Humidity_Pct + Trend,
                     data = sector_data)
  
  # Time series with exogenous regressors
  ts_data <- ts(sector_data$Log_Consumption, frequency = 12, start = c(2010, 1))
  xreg_matrix <- as.matrix(sector_data[, c("HDD", "CDD", "Humidity_Pct", "Trend")])
  
  arima_model <- tryCatch(
    {
      auto.arima(ts_data, xreg = xreg_matrix, seasonal = TRUE, 
                 max.p = 2, max.q = 2, max.d = 1, trace = FALSE, 
                 d = ifelse(adf_result$p.value > 0.05, 1, 0))
    },
    error = function(e) {
      message("Error fitting ARIMA for ", sector, ": ", e$message)
      return(NULL)
    }
  )
  
  gas_ts_models[[sector]] <- list(
    Linear = linear_model,
    ARIMA = arima_model,
    TS_Data = ts_data,
    Xreg = xreg_matrix
  )
  
  # ---- Plot Actual vs Fitted ----
  if (!is.null(arima_model)) {
    fitted_vals <- fitted(arima_model)
    plot(ts_data, type="l", col="black", lwd=2,
         main=paste("Actual vs Fitted -", sector),
         ylab="Log Consumption")
    lines(fitted_vals, col="blue", lwd=2)
    legend("topleft", legend=c("Actual","Fitted"), col=c("black","blue"), lty=1, bty="n")
  }
  
  # ---- Plot Actual vs Predicted (12-step forecast) ----
  if (!is.null(arima_model)) {
    h <- 12
    last_xreg <- xreg_matrix[nrow(xreg_matrix), , drop=FALSE]
    future_xreg <- last_xreg[rep(1, h), ]
    
    fc <- forecast(arima_model, h=h, xreg=future_xreg)
    autoplot(fc) +
      autolayer(ts_data, series="Actual") +
      labs(title=paste("Actual vs Predicted (12-month forecast) -", sector),
           y="Log Consumption") -> p
    print(p)
  }
}


Sector: Residential 
Rows: 180 
Missing values: 0 
Log_Consumption range: 7.6774 10.17884

ADF Test (Log_Consumption, p-value): 0.01


Sector: Commercial 
Rows: 180 
Missing values: 0 
Log_Consumption range: 7.604894 9.919213

ADF Test (Log_Consumption, p-value): 0.01


Sector: Industrial 
Rows: 180 
Missing values: 0 
Log_Consumption range: 7.628031 8.762802

ADF Test (Log_Consumption, p-value): 0.01

# ---- Coefficient Table ----
library(modelsummary)

msummary(
  list(
    "Residential" = gas_ts_models$Residential$Linear,
    "Commercial"  = gas_ts_models$Commercial$Linear,
    "Industrial"  = gas_ts_models$Industrial$Linear
  ),
  fmt = 3,
  stars = TRUE,
   title = "Natural Gas Consumption Models (Linear, Log-Transformed)",
  gof_omit = "IC|Log|Adj"  # cleaner output
)

Natural Gas Consumption Models (Linear, Log-Transformed)
	Residential	Commercial	Industrial
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001
(Intercept)	8.442***	8.503***	8.073***
	(0.182)	(0.164)	(0.108)
HDD	0.002***	0.001***	0.001***
	(0.000)	(0.000)	(0.000)
CDD	-0.001***	-0.000***	-0.000**
	(0.000)	(0.000)	(0.000)
Humidity_Pct	-0.004	0.000	-0.002
	(0.002)	(0.002)	(0.001)
Trend	0.000+	-0.003***	-0.000
	(0.000)	(0.000)	(0.000)
Num.Obs.	180	180	180
R2	0.965	0.950	0.920
F	1198.346	824.085	506.411
RMSE	0.15	0.14	0.09

library(modelsummary)
library(knitr)

elec_ts_models <- list()
elec_arima_diagnostics <- list()
elec_linear_models <- list()   # store linear models for modelsummary

cat("Checking Electricity Data Quality:\n")

Checking Electricity Data Quality:

for(sector in c("Residential", "Commercial", "Industrial")) {
  sector_data <- merged_data %>%
    filter(Fuel_Type == "Electricity", Sector == sector) %>%
    mutate(
      Trend = 1:n(),
      Log_Consumption = log(Consumption + 1)
    ) %>%
    select(Date, Consumption, Log_Consumption, HDD, CDD, Humidity_Pct, Trend) %>%
    na.omit()
  
  cat("\nSector:", sector, "\n")
  cat("Rows:", nrow(sector_data), "\n")
  cat("Missing values:", sum(is.na(sector_data)), "\n")
  adf_result <- adf.test(sector_data$Log_Consumption)
  cat("ADF Test p-value:", adf_result$p.value, "\n")
  
  # Linear regression
  linear_model <- lm(Log_Consumption ~ HDD + CDD + Humidity_Pct + Trend,
                    data = sector_data)
  
  ts_data <- ts(sector_data$Log_Consumption, frequency = 12, start = c(2010, 1))
  xreg_matrix <- as.matrix(sector_data[, c("HDD", "CDD", "Humidity_Pct", "Trend")])
  
  arima_model <- tryCatch(
    {
      auto.arima(ts_data, xreg = xreg_matrix, seasonal = TRUE,
                 max.p = 2, max.q = 2, max.d = 1,
                 d = ifelse(adf_result$p.value > 0.05, 1, 0))
    },
    error = function(e) {
      message("Error fitting ARIMA for ", sector, ": ", e$message)
      return(NULL)
    }
  )
  
  # Store models
  elec_ts_models[[sector]] <- list(Linear = linear_model, ARIMA = arima_model)
  elec_linear_models[[sector]] <- linear_model
  
  if (!is.null(arima_model)) {
    elec_arima_diagnostics[[sector]] <- data.frame(
      Sector = sector,
      Order = paste(arima_model$arma[1:3], collapse = ","),
      AIC = arima_model$aic,
      BIC = BIC(arima_model),
      row.names = NULL
    )
  }
}


Sector: Residential 
Rows: 180 
Missing values: 0

ADF Test p-value: 0.01 

Sector: Commercial 
Rows: 180 
Missing values: 0

ADF Test p-value: 0.01 

Sector: Industrial 
Rows: 180 
Missing values: 0

ADF Test p-value: 0.01

# === Linear Regression Table 
modelsummary(
  elec_linear_models,
  gof_omit = "AIC|BIC|Log.Lik|F",
  stars = TRUE,
  title = "Electricity Consumption Models (Linear, Log-Transformed)"
)

Electricity Consumption Models (Linear, Log-Transformed)
	Residential	Commercial	Industrial
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001
(Intercept)	18.161***	18.344***	18.151***
	(0.125)	(0.055)	(0.033)
HDD	0.000***	-0.000***	-0.000***
	(0.000)	(0.000)	(0.000)
CDD	0.001***	0.001***	0.000***
	(0.000)	(0.000)	(0.000)
Humidity_Pct	0.003*	0.002**	0.001*
	(0.002)	(0.001)	(0.000)
Trend	0.000*	0.000***	0.000***
	(0.000)	(0.000)	(0.000)
Num.Obs.	180	180	180
R2	0.619	0.739	0.711
R2 Adj.	0.611	0.733	0.705
RMSE	0.10	0.05	0.03

# === ARIMA Summary Table ===
if (length(elec_arima_diagnostics) > 0) {
  arima_table <- do.call(rbind, elec_arima_diagnostics)
  kable(arima_table, digits = 3, caption = "Electricity Consumption ARIMA Diagnostics")
}

Electricity Consumption ARIMA Diagnostics
	Sector	Order	AIC	BIC
Residential	Residential	1,1,0	-454.790	-422.860
Commercial	Commercial	0,0,0	-660.408	-638.057
Industrial	Industrial	1,0,1	-861.864	-836.321

# Function for linear model diagnostics (unchanged)
diagnostic_plots_linear <- function(model, title_text) {
  old_par <- par(mfrow = c(2, 2),  
                 mar = c(4, 4, 3, 2),  
                 oma = c(0, 0, 2, 0),  
                 cex.main = 1.1,       
                 font.main = 1,        
                 family = "sans")      
  
  res <- resid(model)
  fit <- fitted(model)
  
  plot(fit, res,
       main = "Residuals vs Fitted",
       xlab = "Fitted Values (Log-Consumption)",
       ylab = "Residuals",
       pch = 19,          
       col = "#1E88E5",   
       cex = 0.7)         
  abline(h = 0, col = "#FF5252", lwd = 2) 
  grid(col = "#E0E0E0", lty = "dotted")
  
  qqnorm(res,
         main = "Normal Q-Q Plot",
         pch = 19,
         col = "#1E88E5")
  qqline(res, col = "#FF5252", lwd = 2)
  grid(col = "#E0E0E0", lty = "dotted")
  
  hist(res, 
       main = "Residual Distribution",
       xlab = "Residuals (Log-Consumption)",
       col = "#1E88E5",
       border = "white",
       freq = FALSE)
  curve(dnorm(x, mean(res), sd(res)), 
        add = TRUE, 
        col = "#FF5252", 
        lwd = 2)
  grid(col = "#E0E0E0", lty = "dotted")
  
  acf(res,
      main = "ACF of Residuals",
      col = "#1E88E5",
      lwd = 2)
  grid(col = "#E0E0E0", lty = "dotted")
  
  mtext(title_text,
        outer = TRUE,
        cex = 1.3,
        font = 2,
        col = "#333333")
  
  par(old_par)
}

# Function for ARIMA diagnostics
diagnostic_plots_arima <- function(model, title_text) {
  old_par <- par(mfrow = c(2, 2),  
                 mar = c(4, 4, 3, 2),  
                 oma = c(0, 0, 2, 0),  
                 cex.main = 1.1,       
                 font.main = 1,        
                 family = "sans")      
  
  res <- resid(model)
  fit <- fitted(model)
  
  plot(fit, res,
       main = "Residuals vs Fitted",
       xlab = "Fitted Values (Consumption)",
       ylab = "Residuals",
       pch = 19,          
       col = "#1E88E5",   
       cex = 0.7)         
  abline(h = 0, col = "#FF5252", lwd = 2) 
  grid(col = "#E0E0E0", lty = "dotted")
  
  qqnorm(res,
         main = "Normal Q-Q Plot",
         pch = 19,
         col = "#1E88E5")
  qqline(res, col = "#FF5252", lwd = 2)
  grid(col = "#E0E0E0", lty = "dotted")
  
  hist(res, 
       main = "Residual Distribution",
       xlab = "Residuals (Consumption)",
       col = "#1E88E5",
       border = "white",
       freq = FALSE)
  curve(dnorm(x, mean(res), sd(res)), 
        add = TRUE, 
        col = "#FF5252", 
        lwd = 2)
  grid(col = "#E0E0E0", lty = "dotted")
  
  acf(res,
      main = "ACF of Residuals",
      col = "#1E88E5",
      lwd = 2)
  grid(col = "#E0E0E0", lty = "dotted")
  
  mtext(title_text,
        outer = TRUE,
        cex = 1.3,
        font = 2,
        col = "#333333")
  
  par(old_par)
}

run_diagnostics <- function(linear_model, arima_model, sector_name, fuel_type) {
  cat("\n", rep("=", 50), "\n", sep="")
  cat("DIAGNOSTICS FOR:", sector_name, "\n")
  cat(rep("=", 50), "\n\n")
  
  # Linear Model Diagnostics
  cat("LINEAR MODEL (LOG-TRANSFORMED):\n")
  print(summary(linear_model))
  
  res_linear <- resid(linear_model)
  
  cat("\nNormality Tests (Linear):\n")
  if(length(res_linear) <= 5000) {
    sw_test <- shapiro.test(res_linear)
    cat("Shapiro-Wilk: p =", format.pval(sw_test$p.value, digits=3), "\n")
  }
  ad_test <- ad.test(res_linear)
  cat("Anderson-Darling: p =", format.pval(ad_test$p.value, digits=3), "\n")
  
  bp_test <- bptest(linear_model)
  cat("\nHeteroskedasticity Test (Linear):\n")
  cat("Breusch-Pagan: p =", format.pval(bp_test$p.value, digits=3), "\n")
  
  dw_test <- dwtest(linear_model)
  cat("\nAutocorrelation Test (Linear):\n")
  cat("Durbin-Watson: p =", format.pval(dw_test$p.value, digits=3), "\n")
  
  cat("\nMulticollinearity Check (VIF):\n")
  vif_values <- vif(linear_model)
  print(vif_values)
  
  diagnostic_plots_linear(linear_model, paste(fuel_type, "-", toupper(sector_name), "(Linear, Log-Transformed)"))
  
  # ARIMA Model Diagnostics
  cat("\nARIMA MODEL:\n")
  if (!is.null(arima_model)) {
    print(arima_model)
    cat("AIC:", arima_model$aic, "\n")
    cat("BIC:", BIC(arima_model), "\n")
    
    res_arima <- resid(arima_model)
    
    cat("\nNormality Tests (ARIMA):\n")
    if(length(res_arima) <= 5000) {
      sw_test_arima <- shapiro.test(res_arima)
      cat("Shapiro-Wilk: p =", format.pval(sw_test_arima$p.value, digits=3), "\n")
    }
    ad_test_arima <- ad.test(res_arima)
    cat("Anderson-Darling: p =", format.pval(ad_test_arima$p.value, digits=3), "\n")
    
    cat("\nAutocorrelation Test (ARIMA):\n")
    lb_test <- Box.test(res_arima, lag = 12, type = "Ljung-Box")
    cat("Ljung-Box: p =", format.pval(lb_test$p.value, digits=3), "\n")
    
    diagnostic_plots_arima(arima_model, paste(fuel_type, "-", toupper(sector_name), "(ARIMA)"))
  } else {
    cat("ARIMA fitting failed for ", sector_name, "\n")
  }
}

cat("\n\nELECTRICITY MODEL DIAGNOSTICS\n")



ELECTRICITY MODEL DIAGNOSTICS

for(sector in c("Residential", "Commercial", "Industrial")) {
  run_diagnostics(elec_ts_models[[sector]]$Linear, 
                  elec_ts_models[[sector]]$ARIMA, 
                  paste("ELECTRICITY -", toupper(sector)), 
                  "ELECTRICITY")
}


==================================================
DIAGNOSTICS FOR: ELECTRICITY - RESIDENTIAL 
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 

LINEAR MODEL (LOG-TRANSFORMED):

Call:
lm(formula = Log_Consumption ~ HDD + CDD + Humidity_Pct + Trend, 
    data = sector_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.19860 -0.08742 -0.00779  0.07776  0.23363 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.816e+01  1.255e-01 144.743  < 2e-16 ***
HDD          1.635e-04  2.658e-05   6.150 5.11e-09 ***
CDD          1.442e-03  8.960e-05  16.092  < 2e-16 ***
Humidity_Pct 3.312e-03  1.662e-03   1.993   0.0478 *  
Trend        3.622e-04  1.515e-04   2.390   0.0179 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1055 on 175 degrees of freedom
Multiple R-squared:  0.6193,    Adjusted R-squared:  0.6106 
F-statistic: 71.17 on 4 and 175 DF,  p-value: < 2.2e-16


Normality Tests (Linear):
Shapiro-Wilk: p = 0.0079 
Anderson-Darling: p = 0.0293 

Heteroskedasticity Test (Linear):
Breusch-Pagan: p = 0.167 

Autocorrelation Test (Linear):
Durbin-Watson: p = 0.0139 

Multicollinearity Check (VIF):
         HDD          CDD Humidity_Pct        Trend 
    1.746997     1.567780     1.143861     1.002831


ARIMA MODEL:
Series: ts_data 
Regression with ARIMA(1,0,1)(0,0,2)[12] errors 

Coefficients:
         ar1     ma1    sma1    sma2  intercept    HDD    CDD  Humidity_Pct
      0.1644  0.5653  0.8048  0.5811    18.4556  1e-04  5e-04         4e-04
s.e.  0.1306  0.0754  0.0896  0.0700     0.0749  1e-04  1e-04         8e-04
      Trend
      3e-04
s.e.  4e-04

sigma^2 = 0.004077:  log likelihood = 237.4
AIC=-454.79   AICc=-453.49   BIC=-422.86
AIC: -454.7901 
BIC: -422.8605 

Normality Tests (ARIMA):
Shapiro-Wilk: p = 0.435 
Anderson-Darling: p = 0.279 

Autocorrelation Test (ARIMA):
Ljung-Box: p = 9.19e-10


==================================================
DIAGNOSTICS FOR: ELECTRICITY - COMMERCIAL 
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 

LINEAR MODEL (LOG-TRANSFORMED):

Call:
lm(formula = Log_Consumption ~ HDD + CDD + Humidity_Pct + Trend, 
    data = sector_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.16926 -0.03067  0.00426  0.03052  0.10575 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)   1.834e+01  5.470e-02 335.356  < 2e-16 ***
HDD          -5.635e-05  1.159e-05  -4.862 2.58e-06 ***
CDD           5.158e-04  3.906e-05  13.204  < 2e-16 ***
Humidity_Pct  2.298e-03  7.245e-04   3.172  0.00179 ** 
Trend         2.670e-04  6.607e-05   4.042 7.93e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.04599 on 175 degrees of freedom
Multiple R-squared:  0.7393,    Adjusted R-squared:  0.7333 
F-statistic: 124.1 on 4 and 175 DF,  p-value: < 2.2e-16


Normality Tests (Linear):
Shapiro-Wilk: p = 0.034 
Anderson-Darling: p = 0.399 

Heteroskedasticity Test (Linear):
Breusch-Pagan: p = 0.0403 

Autocorrelation Test (Linear):
Durbin-Watson: p = 0.000307 

Multicollinearity Check (VIF):
         HDD          CDD Humidity_Pct        Trend 
    1.746997     1.567780     1.143861     1.002831


ARIMA MODEL:
Series: ts_data 
Regression with ARIMA(0,0,0)(0,0,1)[12] errors 

Coefficients:
        sma1  intercept     HDD    CDD  Humidity_Pct  Trend
      0.6076    18.4628  -1e-04  4e-04         7e-04  3e-04
s.e.  0.0657     0.0386   1e-04  1e-04         5e-04  1e-04

sigma^2 = 0.001386:  log likelihood = 337.2
AIC=-660.41   AICc=-659.76   BIC=-638.06
AIC: -660.4079 
BIC: -638.0572 

Normality Tests (ARIMA):
Shapiro-Wilk: p = 0.000106 
Anderson-Darling: p = 0.02 

Autocorrelation Test (ARIMA):
Ljung-Box: p = 4.58e-07


==================================================
DIAGNOSTICS FOR: ELECTRICITY - INDUSTRIAL 
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 

LINEAR MODEL (LOG-TRANSFORMED):

Call:
lm(formula = Log_Consumption ~ HDD + CDD + Humidity_Pct + Trend, 
    data = sector_data)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.108120 -0.013179  0.002707  0.017720  0.069334 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)   1.815e+01  3.302e-02 549.761  < 2e-16 ***
HDD          -5.922e-05  6.995e-06  -8.466 9.96e-15 ***
CDD           1.954e-04  2.358e-05   8.288 2.92e-14 ***
Humidity_Pct  1.060e-03  4.373e-04   2.425   0.0163 *  
Trend         1.967e-04  3.988e-05   4.932 1.88e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.02776 on 175 degrees of freedom
Multiple R-squared:  0.7115,    Adjusted R-squared:  0.7049 
F-statistic: 107.9 on 4 and 175 DF,  p-value: < 2.2e-16


Normality Tests (Linear):
Shapiro-Wilk: p = 1.85e-05 
Anderson-Darling: p = 0.00114 

Heteroskedasticity Test (Linear):
Breusch-Pagan: p = 0.0232 

Autocorrelation Test (Linear):
Durbin-Watson: p = 0.00974 

Multicollinearity Check (VIF):
         HDD          CDD Humidity_Pct        Trend 
    1.746997     1.567780     1.143861     1.002831


ARIMA MODEL:
Series: ts_data 
Regression with ARIMA(1,0,0)(1,0,0)[12] errors 

Coefficients:
         ar1    sar1  intercept    HDD    CDD  Humidity_Pct  Trend
      0.7013  0.7988    18.1868  0e+00  0e+00         1e-04  4e-04
s.e.  0.1760  0.2535     0.0682  2e-04  4e-04         2e-04  7e-04

sigma^2 = 0.0004321:  log likelihood = 438.93
AIC=-861.86   AICc=-861.02   BIC=-836.32
AIC: -861.8643 
BIC: -836.3206 

Normality Tests (ARIMA):
Shapiro-Wilk: p = 5.54e-07 
Anderson-Darling: p = 0.00181 

Autocorrelation Test (ARIMA):
Ljung-Box: p = 0.0759

cat("\n\nNATURAL GAS MODEL DIAGNOSTICS\n")



NATURAL GAS MODEL DIAGNOSTICS

for(sector in c("Residential", "Commercial", "Industrial")) {
  run_diagnostics(gas_ts_models[[sector]]$Linear, 
                  gas_ts_models[[sector]]$ARIMA, 
                  paste("NATURAL GAS -", toupper(sector)), 
                  "NATURAL GAS")
}


==================================================
DIAGNOSTICS FOR: NATURAL GAS - RESIDENTIAL 
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 

LINEAR MODEL (LOG-TRANSFORMED):

Call:
lm(formula = Log_Consumption ~ HDD + CDD + Humidity_Pct + Trend, 
    data = sector_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.34684 -0.11369 -0.00225  0.11342  0.39431 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)   8.442e+00  1.822e-01  46.324  < 2e-16 ***
HDD           1.828e-03  3.883e-05  47.070  < 2e-16 ***
CDD          -1.046e-03  1.309e-04  -7.993 1.71e-13 ***
Humidity_Pct -3.785e-03  2.427e-03  -1.559    0.121    
Trend         3.965e-04  2.214e-04   1.791    0.075 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1541 on 175 degrees of freedom
Multiple R-squared:  0.9648,    Adjusted R-squared:  0.964 
F-statistic:  1198 on 4 and 175 DF,  p-value: < 2.2e-16


Normality Tests (Linear):
Shapiro-Wilk: p = 0.438 
Anderson-Darling: p = 0.502 

Heteroskedasticity Test (Linear):
Breusch-Pagan: p = 0.532 

Autocorrelation Test (Linear):
Durbin-Watson: p = 4.44e-11 

Multicollinearity Check (VIF):
         HDD          CDD Humidity_Pct        Trend 
    1.746997     1.567780     1.143861     1.002831


ARIMA MODEL:
Series: ts_data 
Regression with ARIMA(1,0,0)(0,0,2)[12] errors 

Coefficients:
         ar1    sma1    sma2  intercept     HDD     CDD  Humidity_Pct  Trend
      0.4943  0.4858  0.3568     7.9503  0.0017  -6e-04        0.0029  6e-04
s.e.  0.0746  0.0856  0.0977     0.1520  0.0001   2e-04        0.0018  5e-04

sigma^2 = 0.01363:  log likelihood = 132.7
AIC=-247.41   AICc=-246.35   BIC=-218.67
AIC: -247.4087 
BIC: -218.6721 

Normality Tests (ARIMA):
Shapiro-Wilk: p = 0.671 
Anderson-Darling: p = 0.923 

Autocorrelation Test (ARIMA):
Ljung-Box: p = 0.682


==================================================
DIAGNOSTICS FOR: NATURAL GAS - COMMERCIAL 
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 

LINEAR MODEL (LOG-TRANSFORMED):

Call:
lm(formula = Log_Consumption ~ HDD + CDD + Humidity_Pct + Trend, 
    data = sector_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.38438 -0.08730  0.00006  0.10139  0.37204 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)   8.503e+00  1.644e-01  51.726  < 2e-16 ***
HDD           1.417e-03  3.503e-05  40.441  < 2e-16 ***
CDD          -4.680e-04  1.181e-04  -3.964 0.000107 ***
Humidity_Pct  3.952e-04  2.190e-03   0.180 0.856972    
Trend        -2.657e-03  1.997e-04 -13.306  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.139 on 175 degrees of freedom
Multiple R-squared:  0.9496,    Adjusted R-squared:  0.9484 
F-statistic: 824.1 on 4 and 175 DF,  p-value: < 2.2e-16


Normality Tests (Linear):
Shapiro-Wilk: p = 0.24 
Anderson-Darling: p = 0.082 

Heteroskedasticity Test (Linear):
Breusch-Pagan: p = 0.619 

Autocorrelation Test (Linear):
Durbin-Watson: p = 3.47e-15 

Multicollinearity Check (VIF):
         HDD          CDD Humidity_Pct        Trend 
    1.746997     1.567780     1.143861     1.002831


ARIMA MODEL:
Series: ts_data 
Regression with ARIMA(1,0,1)(1,0,0)[12] errors 

Coefficients:
         ar1      ma1    sar1  intercept     HDD     CDD  Humidity_Pct    Trend
      0.8237  -0.4364  0.3627     8.5391  0.0014  -5e-04       -0.0002  -0.0024
s.e.  0.0798   0.1432  0.3379     0.5227  0.0010   3e-04        0.0018   0.0007

sigma^2 = 0.01135:  log likelihood = 150.61
AIC=-283.23   AICc=-282.17   BIC=-254.49
AIC: -283.227 
BIC: -254.4904 

Normality Tests (ARIMA):
Shapiro-Wilk: p = 0.0776 
Anderson-Darling: p = 0.00666 

Autocorrelation Test (ARIMA):
Ljung-Box: p = 0.959


==================================================
DIAGNOSTICS FOR: NATURAL GAS - INDUSTRIAL 
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 

LINEAR MODEL (LOG-TRANSFORMED):

Call:
lm(formula = Log_Consumption ~ HDD + CDD + Humidity_Pct + Trend, 
    data = sector_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.23264 -0.05550  0.00032  0.05636  0.29753 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)   8.073e+00  1.076e-01  75.026  < 2e-16 ***
HDD           7.325e-04  2.293e-05  31.948  < 2e-16 ***
CDD          -2.296e-04  7.727e-05  -2.971  0.00339 ** 
Humidity_Pct -2.126e-03  1.433e-03  -1.483  0.13977    
Trend        -2.098e-04  1.307e-04  -1.606  0.11018    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.09098 on 175 degrees of freedom
Multiple R-squared:  0.9205,    Adjusted R-squared:  0.9187 
F-statistic: 506.4 on 4 and 175 DF,  p-value: < 2.2e-16


Normality Tests (Linear):
Shapiro-Wilk: p = 0.923 
Anderson-Darling: p = 0.967 

Heteroskedasticity Test (Linear):
Breusch-Pagan: p = 0.868 

Autocorrelation Test (Linear):
Durbin-Watson: p = 1.4e-07 

Multicollinearity Check (VIF):
         HDD          CDD Humidity_Pct        Trend 
    1.746997     1.567780     1.143861     1.002831


ARIMA MODEL:
Series: ts_data 
Regression with ARIMA(2,0,0)(0,0,2)[12] errors 

Coefficients:
         ar1     ar2    sma1    sma2  intercept    HDD     CDD  Humidity_Pct
      0.2786  0.2029  0.1710  0.2517     8.0072  7e-04  -2e-04       -0.0012
s.e.  0.0781  0.0744  0.0766  0.0789     0.1010  1e-04   1e-04        0.0013
       Trend
      -1e-04
s.e.   3e-04

sigma^2 = 0.006431:  log likelihood = 202.39
AIC=-384.78   AICc=-383.48   BIC=-352.85
AIC: -384.7827 
BIC: -352.8532 

Normality Tests (ARIMA):
Shapiro-Wilk: p = 0.618 
Anderson-Darling: p = 0.528 

Autocorrelation Test (ARIMA):
Ljung-Box: p = 0.559