#import .txt file
Household_Electric_Power_Consumption <- read.table("C:/Users/schic/OneDrive/Documents/Predictive Analytics and Forecasting/household_power_consumption.txt", header = TRUE, sep = ";")
The data set that I imported contains measurements of individual household electric power consumption: 2,075,259 minute-level observations gathered in a house located in Sceaux (7 km from Paris, France) between December 2006 and November 2010 (47 months). The variable that I will focus on for forecasting is “Global_active_power”, defined as the ‘household global minute-averaged active power (in kilowatt)’.
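As a quick sanity check on the import (a minimal sketch; the column names come from the file’s header row):
#confirm dimensions and peek at the key columns
dim(Household_Electric_Power_Consumption)
head(Household_Electric_Power_Consumption[, c("Date", "Time", "Global_active_power")])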
#Checking for missing observations; missmap() comes from the Amelia package
missmap(Household_Electric_Power_Consumption)
In this case there are a few missing observations; however, we do not need to worry about them, as they do not occur in the variables we are concerned with, Date and Global_active_power.
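To see where the gaps are, we can tally missing markers per column. Note that in the raw file missing measurements are recorded as the character “?” rather than NA, so the check below (a small sketch) counts both:
#count NA and "?" entries in each column
sapply(Household_Electric_Power_Consumption, function(x) sum(is.na(x) | x == "?"))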
Since the data is recorded at one-minute intervals for each date, I will aggregate it to monthly averages for the analysis.
#create data set with key variables
df2 <- Household_Electric_Power_Consumption %>%
select(Date, Time, Global_active_power)
#turn Date variable into a date type
df2$Date <- as.Date(df2$Date, format = "%d/%m/%Y")
df2$Global_active_power <- as.numeric(df2$Global_active_power) #the "?" missing-value markers coerce to NA, producing the warning below
## Warning: NAs introduced by coercion
df2 <- df2 %>%
mutate(YearMonth = floor_date(Date, "month"))
monthly_avg_power <- df2 %>%
group_by(YearMonth) %>%
summarise(TotalPower = sum(Global_active_power, na.rm = TRUE),
          Count = n(), #all minutes in the month, including those with missing readings
          MonthlyAvgPower = TotalPower / Count)
After creating monthly averages for ‘Global_active_power’, I decided it would be best to drop the first and last months, as these are partial months whose averages are based on fewer observations.
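A quick look at the Count column computed above confirms that the first and last months are partial:
#the first and last months have far fewer minute-level observations
head(monthly_avg_power[, c("YearMonth", "Count")], 2)
tail(monthly_avg_power[, c("YearMonth", "Count")], 2)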
monthly_avg_power <- monthly_avg_power %>%
filter(YearMonth >= '2007-01-01' & YearMonth <= '2010-10-01')
#create training set that uses all but last 6 months
Training_Data <- monthly_avg_power %>%
filter(YearMonth >= '2007-01-01' & YearMonth <= '2010-04-01')
Testing_Data <- monthly_avg_power %>%
filter(YearMonth >= '2010-05-01' & YearMonth <= '2010-10-01')
After arranging the data into training and testing sets, I graph the historical data to examine trend and seasonality visually.
ggplot(monthly_avg_power, aes(x = YearMonth, y = MonthlyAvgPower)) +
geom_line() +
labs(x = 'Year-Month', y = "Average Active Power Consumption (Kilowatt)",
     title = "Monthly-Averaged Active Power Consumption (Kilowatt) from January 2007 to October 2010") +
theme_minimal() +
scale_x_date(date_breaks = "4 months", date_labels = "%b-%y") +
theme(plot.title = element_text(size = 9), # Change font size of plot title
      axis.title = element_text(size = 7), # Change font size of axis titles
      axis.text = element_text(size = 7))
ggplot(Training_Data, aes(x = YearMonth, y = MonthlyAvgPower)) +
geom_line() +
labs(x = 'Year-Month', y = "Average Active Power Consumption (Kilowatt)",
     title = "Monthly-Averaged Active Power Consumption (Kilowatt) from January 2007 to April 2010") +
theme_minimal() +
scale_x_date(date_breaks = "4 months", date_labels = "%b-%y") +
theme(plot.title = element_text(size = 9), # Change font size of plot title
      axis.title = element_text(size = 7), # Change font size of axis titles
      axis.text = element_text(size = 7))
ggplot(Testing_Data, aes(x = YearMonth, y = MonthlyAvgPower)) +
geom_line() +
labs(x = 'Year-Month', y = "Average Active Power Consumption (Kilowatt)",
     title = "Monthly-Averaged Active Power Consumption (Kilowatt) from May 2010 to October 2010") +
theme_minimal() +
scale_x_date(date_breaks = "1 month", date_labels = "%b-%y") +
theme(plot.title = element_text(size = 9), # Change font size of plot title
      axis.title = element_text(size = 7), # Change font size of axis titles
      axis.text = element_text(size = 7))
From these historical graphs, we can see that the monthly-averaged active power consumption data contains seasonality, with troughs in the summer months and peaks in the winter months. Next, I will analyze the decomposition of the data to choose between additive and multiplicative methods. Because the seasonal swings appear roughly constant in size across the series, I expect the additive methods to make the most sense.
#Create time series objects: the full series and a training series that ends in April 2010
Avg_AP_TS <- ts(monthly_avg_power$MonthlyAvgPower, start = c(2007, 1), frequency = 12)
Avg_AP_TS2 <- ts(monthly_avg_power$MonthlyAvgPower, start = c(2007, 1), end = c(2010, 4), frequency = 12)
Additive <- decompose(Avg_AP_TS, type = "additive")
plot(Additive)
Multiplicative <- decompose(Avg_AP_TS, type = 'multiplicative')
plot(Multiplicative)
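As a rough numerical complement to the plots (a heuristic, not a formal test), we can compare the spread of the remainder component under each decomposition. The additive remainder is on the data’s scale, while the multiplicative remainder is a ratio around one, so I rescale the latter by the series mean before comparing:
#rough comparison of remainder spread on a common scale
sd(Additive$random, na.rm = TRUE)
sd(Multiplicative$random - 1, na.rm = TRUE) * mean(Avg_AP_TS)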
#Create ETS models using additive and multiplicative methods, both damped and undamped
DM_HW <- ets(Avg_AP_TS2, model = "MAM", damped = TRUE)
M_HW <- ets(Avg_AP_TS2, model = "MAM", damped = FALSE)
print(summary(DM_HW))
## ETS(M,Ad,M)
##
## Call:
## ets(y = Avg_AP_TS2, model = "MAM", damped = TRUE)
##
## Smoothing parameters:
## alpha = 0.0036
## beta = 1e-04
## gamma = 1e-04
## phi = 0.9716
##
## Initial states:
## l = 1.1374
## b = -0.0042
## s = 1.3064 1.2119 1.0426 0.903 0.5871 0.6394
## 0.8022 0.933 0.9655 1.1279 1.1913 1.2896
##
## sigma: 0.1587
##
## AIC AICc BIC
## 18.97451 51.54594 49.37434
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.003789111 0.1011423 0.07315255 -2.614924 9.384364 0.5655026
## ACF1
## Training set -0.102809
print(summary(M_HW))
## ETS(M,A,M)
##
## Call:
## ets(y = Avg_AP_TS2, model = "MAM", damped = FALSE)
##
## Smoothing parameters:
## alpha = 4e-04
## beta = 2e-04
## gamma = 2e-04
##
## Initial states:
## l = 1.1283
## b = -0.0025
## s = 1.3045 1.2023 1.0536 0.9082 0.585 0.6381
## 0.7917 0.913 0.9666 1.1233 1.2273 1.2864
##
## sigma: 0.1556
##
## AIC AICc BIC
## 17.03794 44.85612 45.74889
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.004094527 0.1018132 0.07409908 -2.517714 9.44016 0.5728197
## ACF1
## Training set -0.1201447
DA_HW <- ets(Avg_AP_TS2, model = "AAA", damped = TRUE)
A_HW <- ets(Avg_AP_TS2, model = "AAA", damped = FALSE)
print(summary(DA_HW))
## ETS(A,Ad,A)
##
## Call:
## ets(y = Avg_AP_TS2, model = "AAA", damped = TRUE)
##
## Smoothing parameters:
## alpha = 1e-04
## beta = 1e-04
## gamma = 1e-04
## phi = 0.9754
##
## Initial states:
## l = 1.1242
## b = -0.0029
## s = 0.3456 0.2476 0.0416 -0.0901 -0.5181 -0.3846
## -0.2184 -0.0675 -0.0592 0.1351 0.2189 0.349
##
## sigma: 0.1297
##
## AIC AICc BIC
## -1.96359 30.60784 28.43624
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.001303291 0.09837342 0.07581442 -2.303495 9.313437 0.5860801
## ACF1
## Training set -0.1195183
print(summary(A_HW))
## ETS(A,A,A)
##
## Call:
## ets(y = Avg_AP_TS2, model = "AAA", damped = FALSE)
##
## Smoothing parameters:
## alpha = 8e-04
## beta = 1e-04
## gamma = 1e-04
##
## Initial states:
## l = 1.1146
## b = -0.0018
## s = 0.3493 0.2497 0.0474 -0.0977 -0.5157 -0.3905
## -0.2217 -0.0754 -0.0456 0.1282 0.209 0.3631
##
## sigma: 0.1271
##
## AIC AICc BIC
## -3.882942 23.935240 24.828009
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002465139 0.09847264 0.07579298 -1.914256 9.280805 0.5859143
## ACF1
## Training set -0.1208645
Based on the summaries of the multiplicative and additive versions of the damped and regular Holt-Winters’ models, the additive models produce lower AICc and BIC values, which indicates a better fit. In choosing between the damped and regular additive methods, I would argue that the regular method works fine in this case: its AICc and BIC values are lower, and the differences in RMSE and MAE are very small, with the damped method posting a slightly lower RMSE (0.09837 vs. 0.09847) and the regular method a slightly lower MAE (0.07579 vs. 0.07581).
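To make that comparison easier to scan, the statistics quoted above can be gathered into one table (a convenience sketch; the aicc and bic components and the accuracy() helper come from the forecast package):
#gather fit statistics for the four ETS models
data.frame(Model = c("ETS(M,Ad,M)", "ETS(M,A,M)", "ETS(A,Ad,A)", "ETS(A,A,A)"),
           AICc = c(DM_HW$aicc, M_HW$aicc, DA_HW$aicc, A_HW$aicc),
           BIC = c(DM_HW$bic, M_HW$bic, DA_HW$bic, A_HW$bic),
           RMSE = c(accuracy(DM_HW)["Training set", "RMSE"],
                    accuracy(M_HW)["Training set", "RMSE"],
                    accuracy(DA_HW)["Training set", "RMSE"],
                    accuracy(A_HW)["Training set", "RMSE"]))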
FC1 <- forecast(DA_HW, h = 6)
FC2 <- forecast(A_HW, h = 6)
autoplot(Avg_AP_TS) +
autolayer(FC1, color = 'blue') +
geom_line(alpha = 0.9) +
ylab("Average Active Power Consumption (Kilowatt)") +
labs(title = "Damped Additive Holt-Winters' Forecast")
autoplot(Avg_AP_TS) +
autolayer(FC2, color = "red") +
geom_line(alpha = 0.9) +
ylab("Average Active Power Consumption (Kilowatt)") +
labs(title = "Additive Holt-Winters' Forecast")
Visually, the two models’ forecasts are very close, so I will continue with the regular (undamped) model rather than the damped model.
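A quick numeric check bears this out (a sketch; the point forecasts sit in the mean component of each forecast object):
#side-by-side point forecasts over the 6-month horizon
round(cbind(Damped = FC1$mean, Undamped = FC2$mean), 3)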
A.ARIMA <- auto.arima(Avg_AP_TS2)
FC3 <- forecast(A.ARIMA, h=6)
print(summary(A.ARIMA))
## Series: Avg_AP_TS2
## ARIMA(0,0,0)(1,1,0)[12]
##
## Coefficients:
## sar1
## -0.6714
## s.e. 0.1530
##
## sigma^2 = 0.01898: log likelihood = 12.68
## AIC=-21.37 AICc=-20.89 BIC=-18.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.02070775 0.1131912 0.0754779 -4.174647 9.27599 0.5834786
## ACF1
## Training set -0.03387432
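Before comparing forecasts, it is worth running residual diagnostics on the ARIMA fit; checkresiduals() from the forecast package plots the residuals and their ACF and reports a Ljung-Box test:
#residual diagnostics for the seasonal ARIMA model
checkresiduals(A.ARIMA)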
print(summary(A_HW))
## ETS(A,A,A)
##
## Call:
## ets(y = Avg_AP_TS2, model = "AAA", damped = FALSE)
##
## Smoothing parameters:
## alpha = 8e-04
## beta = 1e-04
## gamma = 1e-04
##
## Initial states:
## l = 1.1146
## b = -0.0018
## s = 0.3493 0.2497 0.0474 -0.0977 -0.5157 -0.3905
## -0.2217 -0.0754 -0.0456 0.1282 0.209 0.3631
##
## sigma: 0.1271
##
## AIC AICc BIC
## -3.882942 23.935240 24.828009
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002465139 0.09847264 0.07579298 -1.914256 9.280805 0.5859143
## ACF1
## Training set -0.1208645
autoplot(Avg_AP_TS) +
autolayer(FC3, color = "purple") +
geom_line(alpha = 0.9) +
ylab("Average Active Power Consumption (Kilowatt)") +
labs(title = "Auto ARIMA Forecast")
Comparing the additive Holt-Winters’ model to the auto.arima model, the forecast graphs look very similar. Visually, the ARIMA model comes closer to the start- and end-point estimates of the forecast horizon, whereas the ETS model better captures the trough within the 6-month forecast. The training-set ME, RMSE, and MPE are smaller in absolute value for the ETS model, while each of the other error measures (MAE, MAPE, MASE, and ACF1) favors the ARIMA model. Next, I will compare the accuracy of the forecasting models using the test data.
#attach the ETS point forecasts to the test months and compute residuals
ETS_data <- FC2$mean
ETS_data <- data.frame(Testing_Data, ETS_data)
ETS_data$residuals <- ETS_data$MonthlyAvgPower - ETS_data$ETS_data
MSE_ets <- mean(ETS_data$residuals^2)
print(paste("MSE for ETS Model (Test Data):", MSE_ets))
## [1] "MSE for ETS Model (Test Data): 0.0104178809161307"
#repeat for the ARIMA point forecasts
ARIMA_data <- FC3$mean
ARIMA_data <- data.frame(Testing_Data, ARIMA_data)
ARIMA_data$residuals <- ARIMA_data$MonthlyAvgPower - ARIMA_data$ARIMA_data
MSE_ARIMA <- mean(ARIMA_data$residuals^2)
print(paste("MSE for ARIMA Model (Test Data):", MSE_ARIMA))
## [1] "MSE for ARIMA Model (Test Data): 0.00655994575963929"
After calculating the MSE of each model’s forecast over the 6 months that were left out of fitting, I have found that the MSE for the ARIMA model is lower than that of the ETS model (0.0066 vs. 0.0104). Therefore, I would conclude that the ARIMA model performs better out of sample in this case. This is interesting to me, as I had assumed that the ETS model would perform better given the seasonality of the data.
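As a final visual check (a sketch; autolayer() with a series label comes from the forecast package), we can overlay both sets of point forecasts on the full series, where the last six months are the held-out actuals:
#overlay both point forecasts on the held-out months
autoplot(Avg_AP_TS) +
autolayer(FC2$mean, series = "ETS(A,A,A)") +
autolayer(FC3$mean, series = "ARIMA") +
ylab("Average Active Power Consumption (Kilowatt)") +
labs(title = "Hold-Out Comparison: ETS vs. Auto ARIMA")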