Household_Electric_Power_Consumption <- read.table("C:/Users/schic/OneDrive/Documents/Predictive Analytics and Forecasting/household_power_consumption.txt", header = TRUE, sep = ";")
The data set that I imported contains data on Individual Household Electric Power Consumption. This data set contains 2075259 measurements gathered in a house located in Sceaux (7km of Paris, France) between December 2006 and November 2010 (47 months). The variable that I will focus on for forecasting is “Global_active_power”, which is defined as ‘household global minute-averaged active power (in kilowatt)’. For Dynamic Regression modeling, I will create a variable that uses historical monthly average temperature as a key indicator of power consumption.
#create data set with key variables
df2 <- Household_Electric_Power_Consumption %>%
select(Date, Time, Global_active_power)
#turn Date variable into a date type
df2$Date <- as.Date(df2$Date, format = "%d/%m/%Y")
df2$Global_active_power <- as.numeric(df2$Global_active_power)
## Warning: NAs introduced by coercion
df2 <- df2 %>%
mutate(YearMonth = floor_date(Date, "month"))
monthly_avg_power <- df2 %>%
group_by(YearMonth) %>%
summarise(TotalPower = sum(Global_active_power, na.rm = TRUE), Count = n(), MonthlyAvgPower = TotalPower / Count )
ggplot(monthly_avg_power, aes(x = YearMonth, y = MonthlyAvgPower)) +
geom_line() +
labs( x = 'Year-Month', y = "Average Active Power Consumption (Kilowatt) ", title = "Monthly-Averaged Active Power Consumption (Kilowatt) from January 2007 to October 2010") +
theme_minimal() +
scale_x_date(date_breaks = "4 months", date_labels = "%b-%y") + theme(plot.title = element_text(size = 9), # Change font size of plot title
axis.title = element_text(size = 7), # Change font size of axis titles
axis.text = element_text(size = 7))
From these historical graphs, we can see that the Monthly-Average Active Power Consumption data contains seasonality, as there are Troughs in Active Power Consumption in the summer months and Peaks in Active Power Consumption in the Winter Months. Next, I will move into analyzing the decomposition of the data to identify the use of multiplicative or additive methods. I am assuming, due to the visual nature of historical data, that the additive methods make the most sense.
monthly_avg_power <- monthly_avg_power %>%
filter(YearMonth >= '2007-01-01' & YearMonth <= '2010-10-01')
monthly_avg_power$Avg_Temp <- case_when(
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 1 ~ 45.26,
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 2 ~ 46.24,
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 3 ~ 45.83,
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 4 ~ 59.17,
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 5 ~ 59.51,
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 6 ~ 64.69,
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 7 ~ 64.76,
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 8 ~ 63.67,
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 9 ~ 58.69,
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 10 ~ 51.85,
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 11 ~ 43.96,
year(monthly_avg_power$YearMonth) == 2007 & month(monthly_avg_power$YearMonth) == 12 ~ 38.89,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 1 ~ 43.13,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 2 ~ 43.34,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 3 ~ 45.1,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 4 ~ 49.53,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 5 ~ 62.1,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 6 ~ 64.39,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 7 ~ 67.5,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 8 ~ 66.1,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 9 ~ 58.18,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 10 ~ 49.41,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 11 ~ 44.99,
year(monthly_avg_power$YearMonth) == 2008 & month(monthly_avg_power$YearMonth) == 12 ~ 37.01,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 1 ~ 34.26,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 2 ~ 39.06,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 3 ~ 45.21,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 4 ~ 54.5,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 5 ~ 59.16,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 6 ~ 63.65,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 7 ~ 67.55,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 8 ~ 70.07,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 9 ~ 61.73,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 10 ~ 53.56,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 11 ~ 49.87,
year(monthly_avg_power$YearMonth) == 2009 & month(monthly_avg_power$YearMonth) == 12 ~ 38.72,
year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 1 ~ 33,
year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 2 ~ 38.8,
year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 3 ~ 45.15,
year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 4 ~ 52.53,
year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 5 ~ 55.59,
year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 6 ~ 65.43,
year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 7 ~ 70.6,
year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 8 ~ 65.07,
year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 9 ~ 59.38,
year(monthly_avg_power$YearMonth) == 2010 & month(monthly_avg_power$YearMonth) == 10 ~ 52.3)
Avg_Temp <- c(43.87, 31.17, 40.2, 42.39, 48.1,57.98)
YearMonth <- as.Date(c("2010-11-01", "2010-12-01", "2011-01-01", "2011-02-01","2011-03-01","2011-04-01"))
Future_Weather <- data.frame(YearMonth, Avg_Temp)
combined_df <- bind_rows(Future_Weather, monthly_avg_power)
#create training set that uses all but last 6 months
Training_Data <- monthly_avg_power %>%
filter(YearMonth >= '2007-01-01' & YearMonth <= '2010-4-01' )
Testing_Data <- monthly_avg_power %>%
filter(YearMonth >= '2010-05-01' & YearMonth <= '2010-10-01')
#Create Time Series Object
Avg_AP_TS <- ts(monthly_avg_power$MonthlyAvgPower, start = c(2007, 01), frequency = 12)
Avg_AP_TS2 <- ts(monthly_avg_power$MonthlyAvgPower, start = c(2007, 01), end = c(2010, 04), frequency = 12)
Weather_ts <- ts(combined_df$Avg_Temp, start = c(2007, 01), frequency = 12)
Weather_ts2 <- ts(combined_df$Avg_Temp, start = c(2007, 01), end = c(2010, 04), frequency = 12)
#Use best ETS models using Additive methods from week 3
A_HW <- ets(Avg_AP_TS2, model = "AAA", damped = FALSE)
print(summary(A_HW))
## ETS(A,A,A)
##
## Call:
## ets(y = Avg_AP_TS2, model = "AAA", damped = FALSE)
##
## Smoothing parameters:
## alpha = 8e-04
## beta = 1e-04
## gamma = 1e-04
##
## Initial states:
## l = 1.1146
## b = -0.0018
## s = 0.3493 0.2497 0.0474 -0.0977 -0.5157 -0.3905
## -0.2217 -0.0754 -0.0456 0.1282 0.209 0.3631
##
## sigma: 0.1271
##
## AIC AICc BIC
## -3.882942 23.935240 24.828009
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002465139 0.09847264 0.07579298 -1.914256 9.280805 0.5859143
## ACF1
## Training set -0.1208645
FC2 <- forecast(A_HW, h = 6)
autoplot(Avg_AP_TS) + autolayer(FC2, color = "red") + geom_line(alpha = 0.9) + ylab("Average Active Power Consumption (Kilowatt) ") + labs(title = "Additive Holt-Winters' Forecast")
Based on observations of the summary of the model, we can see that there is an RMSE = 0.098, MAE = 0.076, AICc = 23.94, and BIC = 24.83.
A.ARIMA <- auto.arima(Avg_AP_TS2)
FC3 <- forecast(A.ARIMA, h=6)
print(summary(A.ARIMA))
## Series: Avg_AP_TS2
## ARIMA(0,0,0)(1,1,0)[12]
##
## Coefficients:
## sar1
## -0.6714
## s.e. 0.1530
##
## sigma^2 = 0.01898: log likelihood = 12.68
## AIC=-21.37 AICc=-20.89 BIC=-18.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.02070775 0.1131912 0.0754779 -4.174647 9.27599 0.5834786
## ACF1
## Training set -0.03387432
print(summary(A_HW))
## ETS(A,A,A)
##
## Call:
## ets(y = Avg_AP_TS2, model = "AAA", damped = FALSE)
##
## Smoothing parameters:
## alpha = 8e-04
## beta = 1e-04
## gamma = 1e-04
##
## Initial states:
## l = 1.1146
## b = -0.0018
## s = 0.3493 0.2497 0.0474 -0.0977 -0.5157 -0.3905
## -0.2217 -0.0754 -0.0456 0.1282 0.209 0.3631
##
## sigma: 0.1271
##
## AIC AICc BIC
## -3.882942 23.935240 24.828009
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002465139 0.09847264 0.07579298 -1.914256 9.280805 0.5859143
## ACF1
## Training set -0.1208645
autoplot(Avg_AP_TS) + autolayer(FC3, color = "purple") + geom_line(alpha = 0.9) + ylab("Average Active Power Consumption (Kilowatt) ") + labs(title = "Auto ARIMA Forecast")
In comparing the Additive Holt-Winters model to the Auto-ARIMA model, we can see that the forecast graphs seem to exhibit very similar results. In comparing both models visually, the ARIMA model is closer in estimating the start and end point estimates for the forecast, whereas the ETS model is better at estimating the trough value in the 6-month forecast. In addition, we can also see that the ME, RMSE, and MPE values for the training data are lower for the ETS model rather than the ARIMA model, however, each of the other error measures is lower for the ARIMA model (MAE, MAPE, MASE, and ACF1).
fit <- auto.arima(Avg_AP_TS2, xreg = Weather_ts2)
FC5 <- forecast(fit, xreg = Future_Weather$Avg_Temp, h=6)
print(summary(fit))
## Series: Avg_AP_TS2
## Regression with ARIMA(1,0,0)(1,0,0)[12] errors
##
## Coefficients:
## ar1 sar1 intercept xreg
## 0.5029 0.410 0.7120 0.0075
## s.e. 0.1696 0.232 0.3075 0.0058
##
## sigma^2 = 0.03546: log likelihood = 10.89
## AIC=-11.78 AICc=-10.01 BIC=-3.33
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.01089693 0.1786363 0.1328515 -6.985167 16.82508 1.027003
## ACF1
## Training set 0.009273251
autoplot(Avg_AP_TS) + autolayer(FC5, color = "orange") + geom_line(alpha = 0.9) + ylab("Average Active Power Consumption (Kilowatt) ") + labs(title = "Auto ARIMA w/ Regressor Forecast")
checkresiduals(fit)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(1,0,0)(1,0,0)[12] errors
## Q* = 9.3301, df = 6, p-value = 0.1558
##
## Model df: 2. Total lags used: 8
ETS_data <- FC2$mean
ETS_data <- data.frame(Testing_Data, ETS_data)
ETS_data$residuals <- ETS_data$MonthlyAvgPower - ETS_data$ETS_data
MSE_ets <- mean(ETS_data$residuals^2)
print(paste("MSE for ETS Model (Test Data):", MSE_ets))
## [1] "MSE for ETS Model (Test Data): 0.0104178809161307"
ARIMA_data <- FC3$mean
ARIMA_data <- data.frame(Testing_Data, ARIMA_data)
ARIMA_data$residuals <- ARIMA_data$MonthlyAvgPower - ARIMA_data$ARIMA_data
MSE_ARIMA <- mean(ARIMA_data$residuals^2)
print(paste("MSE for ARIMA Model (Test Data):", MSE_ARIMA))
## [1] "MSE for ARIMA Model (Test Data): 0.00655994575963929"
ARIMA_reg_data <- FC5$mean
ARIMA_reg_data <- data.frame(Testing_Data, ARIMA_reg_data)
ARIMA_reg_data$residuals <- ARIMA_reg_data$MonthlyAvgPower - ARIMA_reg_data$ARIMA_reg_data
MSE_ARIMA_reg <- mean(ARIMA_reg_data$residuals^2)
print(paste("MSE for ARIMA w/ Reg Model (Test Data):", MSE_ARIMA_reg))
## [1] "MSE for ARIMA w/ Reg Model (Test Data): 0.0401755558755118"
Based on the associated models (ETS, ARIMA, ARIMA w/ Reg) test data MSE comparisons, we can see that the traditional ARIMA model is performing the best in this instance, as it has the lowest MSE value for comparison of forecasts and observed data, and it also has the lowest AICc and BIC comparatively, so it is the best-fit model with best predictive ability. I assume in this case scenario, that ARIMA w/ Reg did not perform as well because the Average Monthly Temperature was not the best-suited exogenous variable for predictions, or there needed to be more variables used in combination with average monthly temperature.
Please let me know if anyone has any good ideas on how to make the ARIMA w/ Reg model better in this instance.