1. Load Packages

2. Import Data

Household_Electric_Power_Consumption <- read.table("C:/Users/schic/OneDrive/Documents/Predictive Analytics and Forecasting/household_power_consumption.txt", header = TRUE, sep = ";")

The data set that I imported contains data on Individual Household Electric Power Consumption. This data set contains 2075259 measurements gathered in a house located in Sceaux (7km of Paris, France) between December 2006 and November 2010 (47 months). The variable that I will focus on for forecasting is “Global_active_power”, which is defined as ‘household global minute-averaged active power (in kilowatt)’. For Dynamic Regression modeling, I will create a variable that uses historical monthly average temperature as a key indicator of power consumption.

3. Data Exploration/Manipulation

a.Creating Monthly Average Consumption

#create data set with key variables
df2 <- Household_Electric_Power_Consumption %>%
  select(Date, Time, Global_active_power)

#turn Date variable into a date type
df2$Date <- as.Date(df2$Date, format = "%d/%m/%Y")

df2$Global_active_power <- as.numeric(df2$Global_active_power)

## Warning: NAs introduced by coercion

df2 <- df2 %>%
  mutate(YearMonth = floor_date(Date, "month"))

monthly_avg_power <- df2 %>%
  group_by(YearMonth) %>%
  summarise(TotalPower = sum(Global_active_power, na.rm = TRUE), Count = n(), MonthlyAvgPower = TotalPower / Count )

b. Graphing Historical Data

ggplot(monthly_avg_power, aes(x = YearMonth, y = MonthlyAvgPower)) +
  geom_line() +
  labs( x = 'Year-Month', y = "Average Active Power Consumption (Kilowatt) ", title = "Monthly-Averaged Active Power Consumption (Kilowatt) from January 2007 to October 2010") +
  theme_minimal() + 
  scale_x_date(date_breaks = "4 months", date_labels = "%b-%y") + theme(plot.title = element_text(size = 9),  # Change font size of plot title
    axis.title = element_text(size = 7),  # Change font size of axis titles
    axis.text = element_text(size = 7))

From these historical graphs, we can see that the Monthly-Average Active Power Consumption data contains seasonality, as there are Troughs in Active Power Consumption in the summer months and Peaks in Active Power Consumption in the Winter Months. Next, I will move into analyzing the decomposition of the data to identify the use of multiplicative or additive methods. I am assuming, due to the visual nature of historical data, that the additive methods make the most sense.

c. Creating Monthly Average Temp Variable

monthly_avg_power <- monthly_avg_power %>%
  filter(YearMonth >= '2007-01-01' & YearMonth <= '2010-10-01')

monthly_avg_power$Avg_Temp <- case_when(
  year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 1 ~ 45.26,
  year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 2 ~ 46.24,
   year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 3 ~ 45.83,
  year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 4 ~ 59.17,
   year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 5 ~ 59.51,
  year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 6 ~ 64.69,
   year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 7 ~ 64.76,
  year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 8 ~ 63.67,
   year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 9 ~ 58.69,
  year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 10 ~ 51.85,
   year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 11 ~ 43.96,
  year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 12 ~ 38.89,
  
   year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 1 ~ 43.13,
  year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 2 ~ 43.34,
   year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 3 ~ 45.1,
  year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 4 ~ 49.53,
   year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 5 ~ 62.1,
  year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 6 ~ 64.39,
  year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 7 ~ 67.5,
  year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 8 ~ 66.1,
  year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 9 ~ 58.18,
  year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 10 ~ 49.41,
  year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 11 ~ 44.99,
  year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 12 ~ 37.01,
  
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 1 ~ 34.26,
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 2 ~ 39.06,
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 3 ~ 45.21,
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 4 ~ 54.5,
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 5 ~ 59.16,
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 6 ~ 63.65,
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 7 ~ 67.55,
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 8 ~ 70.07,
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 9 ~ 61.73,
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 10 ~ 53.56,
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 11 ~ 49.87,
  year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 12 ~ 38.72,
  
  year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 1 ~ 33,
  year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 2 ~ 38.8,
  year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 3 ~ 45.15,
  year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 4 ~ 52.53,
  year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 5 ~ 55.59,
  year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 6 ~ 65.43,
  year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 7 ~ 70.6,
  year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 8 ~ 65.07,
  year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 9 ~ 59.38,
  year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 10 ~ 52.3)

Avg_Temp <- c(43.87, 31.17, 40.2, 42.39, 48.1,57.98)
YearMonth <- as.Date(c("2010-11-01", "2010-12-01", "2011-01-01", "2011-02-01","2011-03-01","2011-04-01"))

Future_Weather <- data.frame(YearMonth, Avg_Temp)
combined_df <- bind_rows(Future_Weather, monthly_avg_power)

d.Arranging Data Into Training and Testing Sets

#create training set that uses all but last 6 months
Training_Data <- monthly_avg_power %>%
  filter(YearMonth >= '2007-01-01' & YearMonth <= '2010-4-01' )

Testing_Data <- monthly_avg_power %>%
  filter(YearMonth >= '2010-05-01' & YearMonth <= '2010-10-01')

4. Modeling/Forecasting

a. ETS (Holt-Winters’ Methods)

#Create Time Series Object
Avg_AP_TS <- ts(monthly_avg_power$MonthlyAvgPower, start = c(2007, 01), frequency = 12)

Avg_AP_TS2 <- ts(monthly_avg_power$MonthlyAvgPower, start = c(2007, 01), end = c(2010, 04), frequency = 12)

Weather_ts <- ts(combined_df$Avg_Temp, start = c(2007, 01), frequency = 12)

Weather_ts2 <- ts(combined_df$Avg_Temp, start = c(2007, 01), end = c(2010, 04), frequency = 12)

#Use best ETS models using Additive methods from week 3
A_HW <- ets(Avg_AP_TS2, model = "AAA", damped = FALSE)
print(summary(A_HW))

## ETS(A,A,A) 
## 
## Call:
##  ets(y = Avg_AP_TS2, model = "AAA", damped = FALSE) 
## 
##   Smoothing parameters:
##     alpha = 8e-04 
##     beta  = 1e-04 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 1.1146 
##     b = -0.0018 
##     s = 0.3493 0.2497 0.0474 -0.0977 -0.5157 -0.3905
##            -0.2217 -0.0754 -0.0456 0.1282 0.209 0.3631
## 
##   sigma:  0.1271
## 
##       AIC      AICc       BIC 
## -3.882942 23.935240 24.828009 
## 
## Training set error measures:
##                       ME       RMSE        MAE       MPE     MAPE      MASE
## Training set 0.002465139 0.09847264 0.07579298 -1.914256 9.280805 0.5859143
##                    ACF1
## Training set -0.1208645

FC2 <- forecast(A_HW, h = 6)
autoplot(Avg_AP_TS) + autolayer(FC2, color = "red") + geom_line(alpha = 0.9)  + ylab("Average Active Power Consumption (Kilowatt) ") + labs(title = "Additive Holt-Winters' Forecast")

Based on observations of the summary of the model, we can see that there is an RMSE = 0.098, MAE = 0.076, AICc = 23.94, and BIC = 24.83.

b. Auto-Arima

A.ARIMA <- auto.arima(Avg_AP_TS2)

FC3 <- forecast(A.ARIMA, h=6)

print(summary(A.ARIMA))

## Series: Avg_AP_TS2 
## ARIMA(0,0,0)(1,1,0)[12] 
## 
## Coefficients:
##          sar1
##       -0.6714
## s.e.   0.1530
## 
## sigma^2 = 0.01898:  log likelihood = 12.68
## AIC=-21.37   AICc=-20.89   BIC=-18.7
## 
## Training set error measures:
##                       ME      RMSE       MAE       MPE    MAPE      MASE
## Training set -0.02070775 0.1131912 0.0754779 -4.174647 9.27599 0.5834786
##                     ACF1
## Training set -0.03387432

print(summary(A_HW))

## ETS(A,A,A) 
## 
## Call:
##  ets(y = Avg_AP_TS2, model = "AAA", damped = FALSE) 
## 
##   Smoothing parameters:
##     alpha = 8e-04 
##     beta  = 1e-04 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 1.1146 
##     b = -0.0018 
##     s = 0.3493 0.2497 0.0474 -0.0977 -0.5157 -0.3905
##            -0.2217 -0.0754 -0.0456 0.1282 0.209 0.3631
## 
##   sigma:  0.1271
## 
##       AIC      AICc       BIC 
## -3.882942 23.935240 24.828009 
## 
## Training set error measures:
##                       ME       RMSE        MAE       MPE     MAPE      MASE
## Training set 0.002465139 0.09847264 0.07579298 -1.914256 9.280805 0.5859143
##                    ACF1
## Training set -0.1208645

autoplot(Avg_AP_TS) + autolayer(FC3, color = "purple") + geom_line(alpha = 0.9) + ylab("Average Active Power Consumption (Kilowatt) ") + labs(title = "Auto ARIMA Forecast")

In comparing the Additive Holt-Winters model to the Auto-ARIMA model, we can see that the forecast graphs seem to exhibit very similar results. In comparing both models visually, the ARIMA model is closer in estimating the start and end point estimates for the forecast, whereas the ETS model is better at estimating the trough value in the 6-month forecast. In addition, we can also see that the ME, RMSE, and MPE values for the training data are lower for the ETS model rather than the ARIMA model, however, each of the other error measures is lower for the ARIMA model (MAE, MAPE, MASE, and ACF1).

c. Dynamic Regression (ARIMA Model with External Regressor)

fit <- auto.arima(Avg_AP_TS2, xreg = Weather_ts2)

FC5 <- forecast(fit, xreg = Future_Weather$Avg_Temp, h=6)

print(summary(fit))

## Series: Avg_AP_TS2 
## Regression with ARIMA(1,0,0)(1,0,0)[12] errors 
## 
## Coefficients:
##          ar1   sar1  intercept    xreg
##       0.5029  0.410     0.7120  0.0075
## s.e.  0.1696  0.232     0.3075  0.0058
## 
## sigma^2 = 0.03546:  log likelihood = 10.89
## AIC=-11.78   AICc=-10.01   BIC=-3.33
## 
## Training set error measures:
##                       ME      RMSE       MAE       MPE     MAPE     MASE
## Training set -0.01089693 0.1786363 0.1328515 -6.985167 16.82508 1.027003
##                     ACF1
## Training set 0.009273251

autoplot(Avg_AP_TS) + autolayer(FC5, color = "orange") + geom_line(alpha = 0.9) + ylab("Average Active Power Consumption (Kilowatt) ") + labs(title = "Auto ARIMA w/ Regressor Forecast")

checkresiduals(fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,0,0)(1,0,0)[12] errors
## Q* = 9.3301, df = 6, p-value = 0.1558
## 
## Model df: 2.   Total lags used: 8

5. Accuracy

ETS_data <- FC2$mean
ETS_data <- data.frame(Testing_Data, ETS_data)

ETS_data$residuals <- ETS_data$MonthlyAvgPower - ETS_data$ETS_data

MSE_ets <- mean(ETS_data$residuals^2)

print(paste("MSE for ETS Model (Test Data):", MSE_ets))

## [1] "MSE for ETS Model (Test Data): 0.0104178809161307"

ARIMA_data <- FC3$mean
ARIMA_data <- data.frame(Testing_Data, ARIMA_data)

ARIMA_data$residuals <- ARIMA_data$MonthlyAvgPower - ARIMA_data$ARIMA_data

MSE_ARIMA <- mean(ARIMA_data$residuals^2)

print(paste("MSE for ARIMA Model (Test Data):", MSE_ARIMA))

## [1] "MSE for ARIMA Model (Test Data): 0.00655994575963929"

ARIMA_reg_data <- FC5$mean
ARIMA_reg_data <- data.frame(Testing_Data, ARIMA_reg_data)

ARIMA_reg_data$residuals <- ARIMA_reg_data$MonthlyAvgPower - ARIMA_reg_data$ARIMA_reg_data

MSE_ARIMA_reg <- mean(ARIMA_reg_data$residuals^2)

print(paste("MSE for ARIMA w/ Reg Model (Test Data):", MSE_ARIMA_reg))

## [1] "MSE for ARIMA w/ Reg Model (Test Data): 0.0401755558755118"

6. Comparison

Based on the associated models (ETS, ARIMA, ARIMA w/ Reg) test data MSE comparisons, we can see that the traditional ARIMA model is performing the best in this instance, as it has the lowest MSE value for comparison of forecasts and observed data, and it also has the lowest AICc and BIC comparatively, so it is the best-fit model with best predictive ability. I assume in this case scenario, that ARIMA w/ Reg did not perform as well because the Average Monthly Temperature was not the best-suited exogenous variable for predictions, or there needed to be more variables used in combination with average monthly temperature.

Please let me know if anyone has any good ideas on how to make the ARIMA w/ Reg model better in this instance.

Discussion Week4 Dynamic Regression

Daniel Schick

2024-04-03