Predictive Analytics Discussion: Natural Gas Consumption

Author

Troy Hanlon

Load Libraries and Data

# Load required packages
library(tidyverse)
library(lubridate)

# Import the natural gas series (EIA_ng_ts.csv) into a data frame.
# NOTE(review): this absolute Windows path only resolves on the author's
# machine; point it at your own copy of the CSV before running.
input_path <- "C:\\Users\\Troy\\OneDrive\\Desktop\\pred_anal\\EIA_ng_ts.csv"
data <- read.csv(input_path)

# Parse the month/day/year text in Date into a proper Date, then derive the
# calendar month (as a factor, so it can act as a categorical predictor) and
# the calendar year from it
data <- data %>%
  mutate(Date = as.Date(Date, format = "%m/%d/%Y")) %>%
  mutate(
    Month = as.factor(month(Date)),
    Year = year(Date)
  )

# The second column holds the consumption values; give it a short, stable name
names(data)[2] <- "Consumption"

# Preview the first rows to confirm the parsed types and new columns
head(data)
        Date Consumption Month Year
1 1973-01-15      843900     1 1973
2 1973-02-15      747331     2 1973
3 1973-03-15      648504     3 1973
4 1973-04-15      465867     4 1973
5 1973-05-15      326313     5 1973
6 1973-06-15      207172     6 1973

Train / Test Split

# Sequential 80/20 split: the first 80% of rows (in file order) train the
# model and the remaining rows are held out for testing. Rows are not
# shuffled, so the test set is the most recent stretch of the series
# (assumes the CSV is already sorted by date -- TODO confirm).
n <- nrow(data)
train_size <- floor(0.8 * n)

# Build the row indices with seq_len() rather than the colon operator:
# 1:0 counts down to c(1, 0), which would silently place row 1 in both the
# training and the test set when one side of the split is empty.
train_rows <- seq_len(train_size)
test_rows <- train_size + seq_len(n - train_size)

train_data <- data[train_rows, ]
test_data <- data[test_rows, ]

Fit Linear Regression Model

# Fit an ordinary least-squares model of Consumption on Month and Year using
# the training rows only. Month is a factor, so lm() expands it into dummy
# variables with month 1 as the baseline absorbed into the intercept (the
# summary lists Month2..Month12); Year enters as a single numeric term, i.e.
# a linear trend across years.
model <- lm(Consumption ~ Month + Year, data = train_data)

# Print coefficient estimates, standard errors, and overall fit statistics.
# The call is echoed verbatim in the output, so keep the lm() expression
# above unchanged if the printed summary needs to stay reproducible.
summary(model)

Call:
lm(formula = Consumption ~ Month + Year, data = train_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-216554  -20518   -3620   22092  231997 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  656392.83  365975.61   1.794   0.0735 .  
Month2       -93894.00   10898.85  -8.615   <2e-16 ***
Month3      -236788.71   10898.85 -21.726   <2e-16 ***
Month4      -443996.86   10898.85 -40.738   <2e-16 ***
Month5      -608058.31   10898.85 -55.791   <2e-16 ***
Month6      -694312.79   10898.85 -63.705   <2e-16 ***
Month7      -724616.55   10898.85 -66.486   <2e-16 ***
Month8      -734760.98   10898.85 -67.416   <2e-16 ***
Month9      -723329.67   10898.85 -66.368   <2e-16 ***
Month10     -634053.38   10898.85 -58.176   <2e-16 ***
Month11     -444886.21   10898.85 -40.820   <2e-16 ***
Month12     -172957.38   10898.85 -15.869   <2e-16 ***
Year             99.45     183.54   0.542   0.5882    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 49940 on 491 degrees of freedom
Multiple R-squared:  0.9648,    Adjusted R-squared:  0.964 
F-statistic:  1123 on 12 and 491 DF,  p-value: < 2.2e-16

Predict on Test Data

# Score the held-out rows with the fitted model and store the forecasts next
# to the observed values in a new Predicted column
test_data[["Predicted"]] <- predict(model, newdata = test_data)

Plot Actual vs Predicted Values

# Overlay observed and model-predicted consumption across the test period.
# The colour aesthetic is mapped to constant labels so both lines share one
# legend; the actual colours are assigned in scale_color_manual().
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line width in favour
# of `linewidth`; switch once the minimum ggplot2 version is confirmed.
comparison_plot <- ggplot(test_data, aes(x = Date)) +
  geom_line(aes(y = Consumption, color = "Actual"), size = 1) +
  geom_line(
    aes(y = Predicted, color = "Predicted"),
    size = 1,
    linetype = "dashed"
  ) +
  scale_color_manual(values = c("Actual" = "blue", "Predicted" = "red")) +
  labs(
    title = "Actual vs. Predicted Natural Gas Consumption (Test Data)",
    x = "Date",
    y = "Consumption (MMcf)",
    color = "Series"
  ) +
  theme_minimal()

# Display the figure
comparison_plot

# Write that same figure to disk, naming the plot explicitly instead of
# relying on ggsave()'s implicit "last plot" default
ggsave(
  "actual_vs_predicted.png",
  plot = comparison_plot,
  width = 8,
  height = 5
)