Dataset

Individual Household Electric Power Consumption

The dataset used consists of 2075259 measurements gathered in a house located in Sceaux (7km of Paris, France) between December 2006 and November 2010 (47 months). It measures the electric power consumption in one household with a one-minute sampling rate over a period of almost 4 years.

Import Packages & Dataset

# Load required libraries
library(dplyr)
library(naniar)
library(forecast)
library(fpp2)
library(ggplot2)
library(stargazer)
df <- read.table("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/week_3/household_power_consumption.txt", header = TRUE, sep = ";")

Data Cleaning

# Convert Date variable to Date format
df$Date <- as.Date(df$Date, format = "%d/%m/%Y")
# Replace '?' with 0 in the Global_active_power column
df$Global_active_power <- gsub("\\?", "0", df$Global_active_power)

# Convert Global_active_power column to numeric
df$Global_active_power <- as.numeric(df$Global_active_power)

# Check the structure again to ensure it's numeric
str(df$Global_active_power)
##  num [1:2075259] 4.22 5.36 5.37 5.39 3.67 ...
# Check for missing values in  time series data
missing_values <- is.na(df$Global_active_power)

# Count the number of missing values
count_missing <- sum(missing_values)

# Print the count of missing values
print(count_missing)
## [1] 0
# Extract year and month from the Date column
df$Year <- format(df$Date, "%Y")
df$Month <- format(df$Date, "%m")

# Aggregate the data by year and month, calculating the mean for each month
monthly_avg <- aggregate(Global_active_power ~ Year + Month, data = df, FUN = mean)

# Convert Year and Month back to factors
monthly_avg$Year <- factor(monthly_avg$Year)
monthly_avg$Month <- factor(monthly_avg$Month)

# View the first few rows of the monthly averages
head(monthly_avg)
##   Year Month Global_active_power
## 1 2007    01            1.545965
## 2 2008    01            1.459888
## 3 2009    01            1.410170
## 4 2010    01            1.330189
## 5 2007    02            1.401014
## 6 2008    02            1.181300
# Combine Year and Month columns into a single Date column
monthly_avg$Date <- as.Date(paste(monthly_avg$Year, monthly_avg$Month, "01", sep = "-"))

# Print the first few rows of the modified data frame
head(monthly_avg)
##   Year Month Global_active_power       Date
## 1 2007    01            1.545965 2007-01-01
## 2 2008    01            1.459888 2008-01-01
## 3 2009    01            1.410170 2009-01-01
## 4 2010    01            1.330189 2010-01-01
## 5 2007    02            1.401014 2007-02-01
## 6 2008    02            1.181300 2008-02-01
# Convert Date variable to Date format
monthly_avg$Date <- as.Date(monthly_avg$Date, format = "%Y/%m/%d")
# Convert the data frame to a time series object
ts_data <- ts(monthly_avg$Global_active_power, start = c(2007, 01), end = c(2010, 11), frequency = 12)

Modeling (Full Set)

# Build auto.arima model using the full time frame
auto_arima_model_full <- auto.arima(ts_data)
summary(auto_arima_model_full)
## Series: ts_data 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.4772
## s.e.   0.1136
## 
## sigma^2 = 0.03735:  log likelihood = 10.72
## AIC=-17.44   AICc=-17.16   BIC=-13.78
## 
## Training set error measures:
##                        ME      RMSE       MAE       MPE     MAPE      MASE
## Training set -0.003956713 0.1890935 0.1359252 -4.397992 15.88718 0.3320549
##                     ACF1
## Training set -0.03992884
checkresiduals(auto_arima_model_full)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 3.5366, df = 8, p-value = 0.8963
## 
## Model df: 1.   Total lags used: 9
ets_model_full <- ets(ts_data)
# Build ETS model using full data set
summary(ets_model_full)
## ETS(A,N,N) 
## 
## Call:
##  ets(y = ts_data) 
## 
##   Smoothing parameters:
##     alpha = 0.5149 
## 
##   Initial states:
##     l = 1.4827 
## 
##   sigma:  0.1932
## 
##      AIC     AICc      BIC 
## 30.39348 30.95162 35.94392 
## 
## Training set error measures:
##                        ME      RMSE       MAE       MPE     MAPE      MASE
## Training set -0.002016091 0.1890831 0.1366135 -4.283916 15.91921 0.3337364
##                    ACF1
## Training set -0.0355229

Modeling (Hold-out Set)

# Split data into train and test set 
train_data <- window(ts_data, end = c(2010, 06))
test_data <- window(ts_data, start = c(2010, 06), end = c(2010, 11))
# Build arima model using train data
auto_arima_model_train <- auto.arima(train_data)
summary(auto_arima_model_train)
## Series: train_data 
## ARIMA(1,1,0) 
## 
## Coefficients:
##           ar1
##       -0.4799
## s.e.   0.1345
## 
## sigma^2 = 0.02645:  log likelihood = 16.67
## AIC=-29.33   AICc=-29.02   BIC=-25.91
## 
## Training set error measures:
##                        ME      RMSE       MAE       MPE   MAPE      MASE
## Training set -0.006376161 0.1587096 0.1204931 -4.222958 15.321 0.3243359
##                    ACF1
## Training set 0.01035117
checkresiduals(auto_arima_model_train)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 3.1504, df = 7, p-value = 0.8708
## 
## Model df: 1.   Total lags used: 8
# Build ETS model using train data
ets_model_train <- ets(train_data)
summary(ets_model_train)
## ETS(A,N,N) 
## 
## Call:
##  ets(y = train_data) 
## 
##   Smoothing parameters:
##     alpha = 0.5793 
## 
##   Initial states:
##     l = 1.4949 
## 
##   sigma:  0.1671
## 
##      AIC     AICc      BIC 
## 10.61858 11.25016 15.83159 
## 
## Training set error measures:
##                        ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.007130097 0.163025 0.125479 -4.636938 16.15954 0.3377567
##                     ACF1
## Training set -0.09166243
# Forecast using auto.arima model trained on the training set
auto_arima_forecast_train <- forecast(auto_arima_model_train, h = 6)
autoplot(auto_arima_forecast_train)

# Forecast using ETS model trained on the training set
ets_forecast_train <- forecast(ets_model_train, h = 6)
autoplot(ets_forecast_train)

# Evaluate performance on test set for arima model
auto_arima_test_error <- sqrt(mean((test_data - auto_arima_forecast_train$mean)^2))

# Print test error
print(auto_arima_test_error)
## [1] 0.286645
# Evaluate performance on test set for ets model
ets_test_error <- sqrt(mean((test_data - ets_forecast_train$mean)^2))

# Print test error
print(ets_test_error)
## [1] 0.299635

Visual Comparison

# Plot forecast from both models in one plot
autoplot(train_data) +
  autolayer(forecast(ets_model_train, h=6),
    series="ETS", PI=FALSE) +
  autolayer(forecast(auto_arima_model_train, h=6),
    series="Auto Arima", PI=FALSE) +
  autolayer(test_data,
    series = "Actual") +
  ggtitle("Comparison between Forecasts for Global Active Power Consumption") +
  xlab("Year") + ylab("kilowatt") +
  guides(colour=guide_legend(title="Forecast"))

Analysis:

In terms of MAE and RMSE, the auto.arima model and ETS model are rather similar. In the full and hold-out set, auto.arima has the lower MAE and RMSE which indicates better performance. The reason for similar performance between the two models might be due to the characteristics of the data. Since the time series data does not exhibit strong seasonality or trend, the auto.arima and ETS model capture the underlying patterns similarly. By nature, auto.arima and ETS are relatively flexible in capturing various time series patterns. Hence, they are capable of adapting well and produce similar forecasts. The model specifications for auto.arima and ETS are (1,1,0) and (A, N, N) respectively. ARIMA(1,1,0) focuses on capturing linear dependencies and removing trend through differencing. It assumes that the time series data can be made stationary by differencing once. On the other hand, ETS models capture the underlying patterns in the time series data, including trend and seasonality, using exponential smoothing techniques. In an ETS(A,N,N) model, there is an additive error term (A) and no trend (N) or seasonality (N). Despite being based on different principles, both models make similar assumptions about the underlying structure of the data, such as the presence or absence of trend and seasonality.

Furthermore, auto.arima also has lower test error from the hold-out set when compared to the ETS model. Visually, the forecast from the ETS model seems to be almost horizontal while auto.arima still retains some pattern.

The ACF plots from the auto.arima models also seem to indicate that the time series is relatively stationary as it shows exponential decay which means that ARIMA might be more appropriate that ETS. This potentially explains the slightly better performance from ARIMA. ETS is also more complex than ARIMA. Therefore, auto.arima might be better at effectively capturing the underlying patterns of this time series as it is relatively simple.