#clear the workspace
rm(list = ls()) #clear environment
gc() #clear unused memory
dev.off #clear charts
cat("\f") #clear consoleR Statistical Programming Code
# Set working directory
setwd("~/Desktop/BC/Summer 2025/ADEC7320/Final Project")
#Data provided in Google Drive link above
# 1. Load and process Natural Gas Data
gas_data_raw <- read_csv("Natural_Gas_Consumption_by_End_Use.csv", skip = 6)
gas_data <- gas_data_raw %>%
rename(
Commercial_Mcf = `Natural Gas Deliveries to Commercial Consumers (Including Vehicle Fuel through 1996) in Massachusetts MMcf`,
Residential_Mcf = `Massachusetts Natural Gas Residential Consumption MMcf`,
Industrial_Mcf = `Massachusetts Natural Gas Industrial Consumption MMcf`
) %>%
filter(!is.na(Month)) %>%
mutate(
Month = my(Month),
Year = year(Month),
Month_Num = month(Month),
Date = make_date(Year, Month_Num, 1) # Explicit Date creation
)
gas_long <- gas_data %>%
pivot_longer(cols = c(Residential_Mcf, Commercial_Mcf, Industrial_Mcf),
names_to = "Sector",
values_to = "Consumption_Mcf") %>%
mutate(
Sector = str_remove(Sector, "_Mcf"),
Fuel_Type = "Natural_Gas"
)
# 2. Load and process Electricity Data
raw_data <- read_excel("sales_revenue.xlsx", col_names = FALSE)
cat("First 3 rows of raw data:\n")
print(head(raw_data, 3))
elec_ma <- read_excel("sales_revenue.xlsx", col_names = FALSE) %>%
select(
Year = ...1,
Month = ...2,
Resi_Sales_MWH = ...6,
Com_Sales_MWH = ...10,
Ind_Sales_MWH = ...14
) %>%
filter(!Year %in% c("Year", "YEAR", "year"),
!is.na(Year),
!is.na(Month)) %>%
mutate(
Year = as.numeric(Year),
Month = as.numeric(Month),
Resi_Sales_MWH = as.numeric(gsub(",", "", Resi_Sales_MWH)),
Com_Sales_MWH = as.numeric(gsub(",", "", Com_Sales_MWH)),
Ind_Sales_MWH = as.numeric(gsub(",", "", Ind_Sales_MWH)),
Date = make_date(Year, Month, 1) # Explicit Date creation
)
# Aggregate electricity data to monthly
elec_ma <- elec_ma %>%
group_by(Year, Month) %>%
summarise(
Resi_Sales_MWH = sum(Resi_Sales_MWH, na.rm = TRUE),
Com_Sales_MWH = sum(Com_Sales_MWH, na.rm = TRUE),
Ind_Sales_MWH = sum(Ind_Sales_MWH, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(Date = make_date(Year, Month, 1))
elec_long <- elec_ma %>%
pivot_longer(
cols = c(Resi_Sales_MWH, Com_Sales_MWH, Ind_Sales_MWH),
names_to = "Sector",
values_to = "Consumption_MWh"
) %>%
mutate(
Sector = case_when(
Sector == "Resi_Sales_MWH" ~ "Residential",
Sector == "Com_Sales_MWH" ~ "Commercial",
Sector == "Ind_Sales_MWH" ~ "Industrial"
),
Fuel_Type = "Electricity",
Month_Num = Month
)
# 3. Load and process Weather Data
weather <- read_excel("weather_vars.xlsx") %>%
filter(!is.na(Month), !is.na(Year)) %>%
mutate(
Days_in_Month = days_in_month(ymd(paste(Year, Month, "01", sep="-"))),
HDD = pmax(65 - `Avg Temp (°F)`, 0) * Days_in_Month,
CDD = pmax(`Avg Temp (°F)` - 65, 0) * Days_in_Month,
Date = make_date(Year, Month, 1),
Avg_Temp_F = `Avg Temp (°F)`,
Humidity_Pct = `Humidity (%)`
)
# Filter data to 2010-2024 range
start_year <- 2010
end_year <- 2024
weather_filt <- weather %>%
filter(Year >= start_year & Year <= end_year)
gas_filt <- gas_long %>%
filter(Year >= start_year & Year <= end_year)
elec_filt <- elec_long %>%
filter(Year >= start_year & Year <= end_year)
# Combine energy datasets with consistent Date column
energy_data <- bind_rows(
gas_filt %>% select(Year, Month_Num, Sector, Fuel_Type, Consumption = Consumption_Mcf, Date),
elec_filt %>% select(Year, Month_Num, Sector, Fuel_Type, Consumption = Consumption_MWh, Date)
)
# Create comprehensive merged dataset
merged_data <- energy_data %>%
left_join(weather_filt %>% select(Date, Avg_Temp_F, Humidity_Pct, HDD, CDD),
by = "Date") %>%
mutate(
Season = case_when(
Month_Num %in% c(12, 1, 2) ~ "Winter",
Month_Num %in% c(3, 4, 5) ~ "Spring",
Month_Num %in% c(6, 7, 8) ~ "Summer",
Month_Num %in% c(9, 10, 11) ~ "Fall"
),
Log_Consumption = log(Consumption + 1) # Add log-transformed consumption
)
# Final validation
cat("\nMerged Data Sample:\n")
print(head(merged_data))
cat("\nMissing values by column:\n")
print(colSums(is.na(merged_data)))
cat("\nNumber of observations by sector and fuel type:\n")
print(table(merged_data$Fuel_Type, merged_data$Sector))# --- build data (same as before) ---
summary_stats <- merged_data %>%
group_by(Fuel_Type, Sector) %>%
summarise(
N = n(),
Mean_Consumption = mean(Consumption, na.rm = TRUE),
SD_Consumption = sd(Consumption, na.rm = TRUE),
Min_Consumption = min(Consumption, na.rm = TRUE),
Max_Consumption = max(Consumption, na.rm = TRUE),
Mean_Log_Consumption = mean(Log_Consumption, na.rm = TRUE),
SD_Log_Consumption = sd(Log_Consumption, na.rm = TRUE),
.groups = 'drop'
) %>%
mutate(
across(c(Mean_Consumption, SD_Consumption, Min_Consumption, Max_Consumption),
~ number(.x, big.mark = ",", accuracy = 0.1)),
Mean_Log_Consumption = round(Mean_Log_Consumption, 2),
SD_Log_Consumption = round(SD_Log_Consumption, 2)
)
# --- print table: fit-to-width + multipage + smaller font ---
kable(
summary_stats,
caption = "Summary Statistics by Fuel Type and Sector",
col.names = c("Fuel Type","Sector","N","Mean","SD","Min","Max","Mean (log)","SD (log)"),
booktabs = TRUE,
longtable = TRUE, # allow page breaks
align = "llrrrrrrr"
) %>%
kable_styling(
full_width = FALSE,
font_size = 9, # slightly smaller
latex_options = c("hold_position","repeat_header","scale_down")
# scale_down shrinks to fit \textwidth if still too wide
)
invisible(NULL) # prevent accidental printing of last value in chunk
# ---- Weather summary (same idea) ----
weather_summary <- weather_filt %>%
summarise(
`Mean Temp (°F)` = round(mean(`Avg Temp (°F)`, na.rm = TRUE), 1),
`SD Temp (°F)` = round(sd(`Avg Temp (°F)`, na.rm = TRUE), 1),
`Mean Hum (%)` = round(mean(`Humidity (%)`, na.rm = TRUE), 1),
`SD Hum (%)` = round(sd(`Humidity (%)`, na.rm = TRUE), 1),
`Mean HDD` = round(mean(HDD, na.rm = TRUE), 1),
`Mean CDD` = round(mean(CDD, na.rm = TRUE), 1)
)
kable(
weather_summary,
caption = "Weather Variables Summary Statistics (2010–2024)",
booktabs = TRUE,
longtable = FALSE,
align = "rrrrrr"
) %>%
kable_styling(full_width = FALSE, font_size = 9)
invisible(NULL)# Time series plots for Consumption and Log_Consumption
p1 <- merged_data %>%
filter(Fuel_Type == "Natural_Gas") %>%
ggplot(aes(x = Date, y = Consumption, color = Sector)) +
geom_line(size = 1) +
labs(title = "Natural Gas Consumption by Sector (2010-2024)",
x = "Date", y = "Consumption (MMcf)") +
theme_minimal() +
theme(legend.position = "bottom")
p1_log <- merged_data %>%
filter(Fuel_Type == "Natural_Gas") %>%
ggplot(aes(x = Date, y = Log_Consumption, color = Sector)) +
geom_line(size = 1) +
labs(title = "Log Natural Gas Consumption by Sector (2010-2024)",
x = "Date", y = "Log Consumption") +
theme_minimal() +
theme(legend.position = "bottom")
p2 <- merged_data %>%
filter(Fuel_Type == "Electricity") %>%
ggplot(aes(x = Date, y = Consumption, color = Sector)) +
geom_line(size = 1) +
labs(title = "Electricity Consumption by Sector (2010-2024)",
x = "Date", y = "Consumption (MWh)") +
theme_minimal() +
theme(legend.position = "bottom")
p2_log <- merged_data %>%
filter(Fuel_Type == "Electricity") %>%
ggplot(aes(x = Date, y = Log_Consumption, color = Sector)) +
geom_line(size = 1) +
labs(title = "Log Electricity Consumption by Sector (2010-2024)",
x = "Date", y = "Log Consumption") +
theme_minimal() +
theme(legend.position = "bottom")
grid.arrange(p1, p1_log, p2, p2_log, ncol = 2, nrow = 2)# Weather time series
p3 <- weather_filt %>%
ggplot(aes(x = Date, y = `Avg Temp (°F)`)) +
geom_line(color = "red", size = 1) +
labs(title = "Average Monthly Temperature (2010-2024)",
x = "Date", y = "Temperature (°F)") +
theme_minimal()
p4 <- weather_filt %>%
select(Date, HDD, CDD) %>%
pivot_longer(cols = c(HDD, CDD), names_to = "Variable", values_to = "Value") %>%
ggplot(aes(x = Date, y = Value, color = Variable)) +
geom_line(size = 1) +
labs(title = "Heating and Cooling Degree Days (2010-2024)",
x = "Date", y = "Value") +
theme_minimal() +
theme(legend.position = "bottom")
grid.arrange(p3, p4, ncol = 1)# Correlation analysis for natural gas
gas_corr_data <- merged_data %>%
filter(Fuel_Type == "Natural_Gas") %>%
select(Log_Consumption, Avg_Temp_F, Humidity_Pct, HDD, CDD) %>%
na.omit()
cor_matrix_gas <- cor(gas_corr_data)
# Correlation analysis for electricity
elec_corr_data <- merged_data %>%
filter(Fuel_Type == "Electricity") %>%
select(Log_Consumption, Avg_Temp_F, Humidity_Pct, HDD, CDD) %>%
na.omit()
cor_matrix_elec <- cor(elec_corr_data)
par(mfrow = c(1, 2))
corrplot(cor_matrix_gas, method = "color", type = "upper",
title = "Natural Gas Correlations (Log-Consumption)", mar = c(0,0,2,0))
corrplot(cor_matrix_elec, method = "color", type = "upper",
title = "Electricity Correlations (Log-Consumption)", mar = c(0,0,2,0))# Initialize list to store models and diagnostics
gas_ts_models <- list()
arima_diagnostics <- list()
cat("Checking Natural Gas Data Quality:\n")
for(sector in c("Residential", "Commercial", "Industrial")) {
sector_data <- merged_data %>%
filter(Fuel_Type == "Natural_Gas", Sector == sector) %>%
mutate(
Trend = 1:n(),
Log_Consumption = log(Consumption + 1)
) %>%
select(Date, Consumption, Log_Consumption, HDD, CDD, Humidity_Pct, Trend) %>%
na.omit()
cat("\nSector:", sector, "\n")
cat("Rows:", nrow(sector_data), "\n")
cat("Missing values:", sum(is.na(sector_data)), "\n")
cat("Log_Consumption range:", range(sector_data$Log_Consumption, na.rm = TRUE), "\n")
adf_result <- adf.test(sector_data$Log_Consumption)
cat("ADF Test (Log_Consumption, p-value):", adf_result$p.value, "\n")
linear_model <- lm(Log_Consumption ~ HDD + CDD + Humidity_Pct + Trend,
data = sector_data)
ts_data <- ts(sector_data$Log_Consumption, frequency = 12, start = c(2010, 1))
xreg_matrix <- as.matrix(sector_data[, c("HDD", "CDD", "Humidity_Pct", "Trend")])
if (any(is.na(xreg_matrix))) {
stop("Missing values in xreg_matrix for sector ", sector)
}
arima_model <- tryCatch(
{
auto.arima(ts_data, xreg = xreg_matrix, seasonal = TRUE,
max.p = 2, max.q = 2, max.d = 1, trace = TRUE,
d = ifelse(adf_result$p.value > 0.05, 1, 0))
},
error = function(e) {
message("Error fitting ARIMA for ", sector, ": ", e$message)
return(NULL)
}
)
gas_ts_models[[sector]] <- list(
Linear = linear_model,
ARIMA = arima_model
)
arima_diagnostics[[sector]] <- if (!is.null(arima_model)) {
list(
Order = arima_model$arma,
Coefficients = coef(arima_model),
AIC = arima_model$aic,
BIC = BIC(arima_model),
Residual_ACF = acf(resid(arima_model), plot = FALSE)$acf
)
} else {
list(Error = "ARIMA fitting failed")
}
}
cat("\nNatural Gas Consumption Models\n")
for(sector in names(gas_ts_models)) {
cat("\n=== ", sector, " Sector ===\n")
lin_summary <- summary(gas_ts_models[[sector]]$Linear)
cat("\nLinear Model Coefficients (Log-Transformed):\n")
print(coef(lin_summary))
cat("\nR-squared:", lin_summary$r.squared, "\n")
cat("\nARIMA Model:\n")
if (!is.null(gas_ts_models[[sector]]$ARIMA)) {
print(gas_ts_models[[sector]]$ARIMA)
cat("AIC:", gas_ts_models[[sector]]$ARIMA$aic, "\n")
cat("BIC:", BIC(gas_ts_models[[sector]]$ARIMA), "\n")
cat("Residual ACF (first 5 lags):\n")
print(head(arima_diagnostics[[sector]]$Residual_ACF, 5))
} else {
cat("ARIMA fitting failed for ", sector, "\n")
}
cat("\n--------------------------------\n")
}
coef_table <- do.call(rbind, lapply(names(gas_ts_models), function(sector) {
coefs <- coef(summary(gas_ts_models[[sector]]$Linear))
data.frame(
Sector = sector,
Term = rownames(coefs),
Estimate = coefs[,1],
Std.Error = coefs[,2],
p.value = coefs[,4],
row.names = NULL
)
}))
kable(coef_table, digits = 4,
caption = "Natural Gas Consumption Coefficients (Log-Transformed)")elec_ts_models <- list()
elec_arima_diagnostics <- list()
cat("Checking Electricity Data Quality:\n")
for(sector in c("Residential", "Commercial", "Industrial")) {
sector_data <- merged_data %>%
filter(Fuel_Type == "Electricity", Sector == sector) %>%
mutate(
Trend = 1:n(),
Log_Consumption = log(Consumption + 1)
) %>%
select(Date, Consumption, Log_Consumption, HDD, CDD, Humidity_Pct, Trend) %>%
na.omit()
cat("\nSector:", sector, "\n")
cat("Rows:", nrow(sector_data), "\n")
cat("Missing values:", sum(is.na(sector_data)), "\n")
cat("Log_Consumption range:", range(sector_data$Log_Consumption, na.rm = TRUE), "\n")
adf_result <- adf.test(sector_data$Log_Consumption)
cat("ADF Test (Log_Consumption, p-value):", adf_result$p.value, "\n")
linear_model <- lm(Log_Consumption ~ HDD + CDD + Humidity_Pct + Trend,
data = sector_data)
ts_data <- ts(sector_data$Log_Consumption, frequency = 12, start = c(2010, 1))
xreg_matrix <- as.matrix(sector_data[, c("HDD", "CDD", "Humidity_Pct", "Trend")])
if (any(is.na(xreg_matrix))) {
stop("Missing values in xreg_matrix for sector ", sector)
}
arima_model <- tryCatch(
{
auto.arima(ts_data, xreg = xreg_matrix, seasonal = TRUE,
max.p = 2, max.q = 2, max.d = 1, trace = TRUE,
d = ifelse(adf_result$p.value > 0.05, 1, 0))
},
error = function(e) {
message("Error fitting ARIMA for ", sector, ": ", e$message)
return(NULL)
}
)
elec_ts_models[[sector]] <- list(
Linear = linear_model,
ARIMA = arima_model
)
elec_arima_diagnostics[[sector]] <- if (!is.null(arima_model)) {
list(
Order = arima_model$arma,
Coefficients = coef(arima_model),
AIC = arima_model$aic,
BIC = BIC(arima_model),
Residual_ACF = acf(resid(arima_model), plot = FALSE)$acf
)
} else {
list(Error = "ARIMA fitting failed")
}
}
cat("\nElectricity Consumption Models\n")
for(sector in names(elec_ts_models)) {
cat("\n=== ", sector, " Sector ===\n")
lin_summary <- summary(elec_ts_models[[sector]]$Linear)
cat("\nLinear Model Coefficients (Log-Transformed):\n")
print(coef(lin_summary))
cat("\nR-squared:", lin_summary$r.squared, "\n")
cat("\nARIMA Model:\n")
if (!is.null(elec_ts_models[[sector]]$ARIMA)) {
print(elec_ts_models[[sector]]$ARIMA)
cat("AIC:", elec_ts_models[[sector]]$ARIMA$aic, "\n")
cat("BIC:", BIC(elec_ts_models[[sector]]$ARIMA), "\n")
cat("Residual ACF (first 5 lags):\n")
print(head(elec_arima_diagnostics[[sector]]$Residual_ACF, 5))
} else {
cat("ARIMA fitting failed for ", sector, "\n")
}
cat("\n--------------------------------\n")
}
elec_coef_table <- do.call(rbind, lapply(names(elec_ts_models), function(sector) {
coefs <- coef(summary(elec_ts_models[[sector]]$Linear))
data.frame(
Sector = sector,
Term = rownames(coefs),
Estimate = coefs[,1],
Std.Error = coefs[,2],
p.value = coefs[,4],
row.names = NULL
)
}))
kable(elec_coef_table, digits = 4,
caption = "Electricity Consumption Coefficients (Log-Transformed)")# Function for linear model diagnostics (unchanged)
diagnostic_plots_linear <- function(model, title_text) {
old_par <- par(mfrow = c(2, 2),
mar = c(4, 4, 3, 2),
oma = c(0, 0, 2, 0),
cex.main = 1.1,
font.main = 1,
family = "sans")
res <- resid(model)
fit <- fitted(model)
plot(fit, res,
main = "Residuals vs Fitted",
xlab = "Fitted Values (Log-Consumption)",
ylab = "Residuals",
pch = 19,
col = "#1E88E5",
cex = 0.7)
abline(h = 0, col = "#FF5252", lwd = 2)
grid(col = "#E0E0E0", lty = "dotted")
qqnorm(res,
main = "Normal Q-Q Plot",
pch = 19,
col = "#1E88E5")
qqline(res, col = "#FF5252", lwd = 2)
grid(col = "#E0E0E0", lty = "dotted")
hist(res,
main = "Residual Distribution",
xlab = "Residuals (Log-Consumption)",
col = "#1E88E5",
border = "white",
freq = FALSE)
curve(dnorm(x, mean(res), sd(res)),
add = TRUE,
col = "#FF5252",
lwd = 2)
grid(col = "#E0E0E0", lty = "dotted")
acf(res,
main = "ACF of Residuals",
col = "#1E88E5",
lwd = 2)
grid(col = "#E0E0E0", lty = "dotted")
mtext(title_text,
outer = TRUE,
cex = 1.3,
font = 2,
col = "#333333")
par(old_par)
}
# Function for ARIMA diagnostics
diagnostic_plots_arima <- function(model, title_text) {
old_par <- par(mfrow = c(2, 2),
mar = c(4, 4, 3, 2),
oma = c(0, 0, 2, 0),
cex.main = 1.1,
font.main = 1,
family = "sans")
res <- resid(model)
fit <- fitted(model)
plot(fit, res,
main = "Residuals vs Fitted",
xlab = "Fitted Values (Consumption)",
ylab = "Residuals",
pch = 19,
col = "#1E88E5",
cex = 0.7)
abline(h = 0, col = "#FF5252", lwd = 2)
grid(col = "#E0E0E0", lty = "dotted")
qqnorm(res,
main = "Normal Q-Q Plot",
pch = 19,
col = "#1E88E5")
qqline(res, col = "#FF5252", lwd = 2)
grid(col = "#E0E0E0", lty = "dotted")
hist(res,
main = "Residual Distribution",
xlab = "Residuals (Consumption)",
col = "#1E88E5",
border = "white",
freq = FALSE)
curve(dnorm(x, mean(res), sd(res)),
add = TRUE,
col = "#FF5252",
lwd = 2)
grid(col = "#E0E0E0", lty = "dotted")
acf(res,
main = "ACF of Residuals",
col = "#1E88E5",
lwd = 2)
grid(col = "#E0E0E0", lty = "dotted")
mtext(title_text,
outer = TRUE,
cex = 1.3,
font = 2,
col = "#333333")
par(old_par)
}
run_diagnostics <- function(linear_model, arima_model, sector_name, fuel_type) {
cat("\n", rep("=", 50), "\n", sep="")
cat("DIAGNOSTICS FOR:", sector_name, "\n")
cat(rep("=", 50), "\n\n")
# Linear Model Diagnostics
cat("LINEAR MODEL (LOG-TRANSFORMED):\n")
print(summary(linear_model))
res_linear <- resid(linear_model)
cat("\nNormality Tests (Linear):\n")
if(length(res_linear) <= 5000) {
sw_test <- shapiro.test(res_linear)
cat("Shapiro-Wilk: p =", format.pval(sw_test$p.value, digits=3), "\n")
}
ad_test <- ad.test(res_linear)
cat("Anderson-Darling: p =", format.pval(ad_test$p.value, digits=3), "\n")
bp_test <- bptest(linear_model)
cat("\nHeteroskedasticity Test (Linear):\n")
cat("Breusch-Pagan: p =", format.pval(bp_test$p.value, digits=3), "\n")
dw_test <- dwtest(linear_model)
cat("\nAutocorrelation Test (Linear):\n")
cat("Durbin-Watson: p =", format.pval(dw_test$p.value, digits=3), "\n")
cat("\nMulticollinearity Check (VIF):\n")
vif_values <- vif(linear_model)
print(vif_values)
diagnostic_plots_linear(linear_model, paste(fuel_type, "-", toupper(sector_name), "(Linear, Log-Transformed)"))
# ARIMA Model Diagnostics
cat("\nARIMA MODEL:\n")
if (!is.null(arima_model)) {
print(arima_model)
cat("AIC:", arima_model$aic, "\n")
cat("BIC:", BIC(arima_model), "\n")
res_arima <- resid(arima_model)
cat("\nNormality Tests (ARIMA):\n")
if(length(res_arima) <= 5000) {
sw_test_arima <- shapiro.test(res_arima)
cat("Shapiro-Wilk: p =", format.pval(sw_test_arima$p.value, digits=3), "\n")
}
ad_test_arima <- ad.test(res_arima)
cat("Anderson-Darling: p =", format.pval(ad_test_arima$p.value, digits=3), "\n")
cat("\nAutocorrelation Test (ARIMA):\n")
lb_test <- Box.test(res_arima, lag = 12, type = "Ljung-Box")
cat("Ljung-Box: p =", format.pval(lb_test$p.value, digits=3), "\n")
diagnostic_plots_arima(arima_model, paste(fuel_type, "-", toupper(sector_name), "(ARIMA)"))
} else {
cat("ARIMA fitting failed for ", sector_name, "\n")
}
}
cat("\n\nELECTRICITY MODEL DIAGNOSTICS\n")
for(sector in c("Residential", "Commercial", "Industrial")) {
run_diagnostics(elec_ts_models[[sector]]$Linear,
elec_ts_models[[sector]]$ARIMA,
paste("ELECTRICITY -", toupper(sector)),
"ELECTRICITY")
}
cat("\n\nNATURAL GAS MODEL DIAGNOSTICS\n")
for(sector in c("Residential", "Commercial", "Industrial")) {
run_diagnostics(gas_ts_models[[sector]]$Linear,
gas_ts_models[[sector]]$ARIMA,
paste("NATURAL GAS -", toupper(sector)),
"NATURAL GAS")
}