Objective
Forecast electricity consumption (kW) for 17/2/2010 with and without using outdoor temperature
Information about Given Data File
- 3 columns: Timestamp, Power(kW), Temps(C)
- Total 4603 observations for Timestamp and Temps(C): Information from 1/1/2010 1:15 to 2/17/2010 23:45
- Total 4507 observations for Power(kW): Information from 1/1/2010 1:15 to 2/16/2010 23:45
Part 1: Forecast electricity consumption (kW) for 2/17/2010 without using outdoor temperature Here, we will forecast electricity consumption on 2/17/2010 by using only previous information of electricity consumption during 1/1/2010 to 2/16/2010.
#Download data file
library("readxl")
data <- read_excel("D:/Transition/DSTI/Main Program/Time Series Analysis/Project/Elec-train.xlsx")
head(data, 5) #Check data file
## # A tibble: 5 x 3
## Timestamp `Power (kW)` `Temp (C°)`
## <chr> <dbl> <dbl>
## 1 1/1/2010 1:15 165. 10.6
## 2 1/1/2010 1:30 152. 10.6
## 3 1/1/2010 1:45 147. 10.6
## 4 1/1/2010 2:00 154. 10.6
## 5 1/1/2010 2:15 154. 10.6
Create time series objects with hourly seasonal pattern.
Since our data are observed every 15 minutes, to make an unit as hour, our frequency = 4 (60 minutes/15 minutes)
(Note: Information from 1/1/2010 to 2/16/2010 (Row 1-4508) for electricity consumption (Column 2))
consumption <- ts(data[1:4507,2], frequency = 4, start=c(1,2))
head(consumption) #Check time series object
## Power (kW)
## [1,] 165.1
## [2,] 151.6
## [3,] 146.9
## [4,] 153.7
## [5,] 153.8
## [6,] 159.0
Plot data
library(forecast)
## Warning: package 'forecast' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggplot2)
autoplot(consumption) +
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
Separate data into train (80%) and test set (20%) for further model evaluation
(Time series object for electicity consumption has 4507 observations
Therefore, 80% for train set = 3607 observations and 20% for test set = 900 observations)
consum_train= window(consumption, start=c(1,2), end=c(902,4))
consum_test= window(consumption, start=c(903,1), end=c(1127,4))
autoplot(consum_train,series="Train set") +
autolayer(consum_test,series='Test set')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
Forecasting with Exponential Smoothing
First, we consider forecasting model that not concern seasonal pattern.
#Simple Exponential Smoothing (SES) : Forecasting with a constant
#auto alpha selection, alpha = NULL
consum_SE = ses(consum_train,h=900, alpha=NULL)
#Non seasonal HW : Forecasting with a linear trend (auto alpha and beta selection)
consum_NHW = holt(consum_train,h=900,alpha=NULL,beta=NULL)
#2 methods in the same graph
autoplot(consum_train,series="Train set") +
autolayer(consum_test,series='Test set')+
autolayer(consum_SE$mean,series='SES')+
autolayer(consum_NHW$mean,series='Non seasonal HW')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
The models are not so good.
Let forecast by using both seasonal and linear trend.
#Additive seasonal Holt-Winters model
consum_HW_add = hw(consum_train, seasonal='additive',h=900)
#Multiplicative seasonal Holt-Winters model
consum_HW_mul = hw(consum_train, seasonal='multiplicative',h=900)
#Damped additive seasonal Holt-Winters model
consum_DHW_add = hw(consum_train, seasonal='additive',h=900,damped=TRUE)
#Damped multiplicative seasonal Holt-Winters model
consum_DHW_mul = hw(consum_train, seasonal='multiplicative',h=900,damped=TRUE)
#Plot 4 models on the same graph
autoplot(consum_train,series="Train set") +
autolayer(consum_test,series='Test set')+
autolayer(consum_HW_add$mean,series='Additive seasonal HW')+
autolayer(consum_HW_mul$mean,series='Multiplicative seasonal HW')+
autolayer(consum_DHW_add$mean,series='Damped additive seasonal HW')+
autolayer(consum_DHW_mul$mean,series='Damped multiplicative seasonal HW')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
Still not so efficient.
Next, try to stabilize the variance by Box-Cox transformation in Additive HW model.
#Additive seasonal Holt-Winters model
consum_HW_addBC = hw(consum_train, seasonal='additive',h=900, lambda = 'auto' )
#Damped additive seasonal Holt-Winters model
consum_DHW_addBC = hw(consum_train, seasonal='additive',h=900,damped=TRUE, lambda = 'auto')
#Plot 2 models on the same graph
autoplot(consum_train,series="Train set") +
autolayer(consum_test,series='Test set')+
autolayer(consum_HW_addBC$mean,series='Box Cox + Additive seasonal HW')+
autolayer(consum_DHW_addBC$mean,series='Box Cox + Damped additive seasonal HW')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
The results show no significant improvement. However, we can compute root mean square error (RMSE) of each model.
print(sqrt(mean((consum_SE$mean-consum_test)^2)))
## [1] 60.76483
print(sqrt(mean((consum_NHW$mean-consum_test)^2)))
## [1] 64.57402
print(sqrt(mean((consum_HW_add$mean-consum_test)^2)))
## [1] 73.46383
print(sqrt(mean((consum_HW_mul$mean-consum_test)^2)))
## [1] 73.40462
print(sqrt(mean((consum_DHW_add$mean-consum_test)^2)))
## [1] 60.91925
print(sqrt(mean((consum_DHW_mul$mean-consum_test)^2)))
## [1] 60.48178
print(sqrt(mean((consum_HW_addBC$mean-consum_test)^2)))
## [1] 70.2003
print(sqrt(mean((consum_DHW_addBC$mean-consum_test)^2)))
## [1] 60.84712
The model with the lowest error so far is Damped multiplicative Holt-Winters, however; this is not because it is the best model (See that the prediction pattern is not correlate with pattern of test set).
This low error is just because our calculation base on mean different and Damped multiplicative Holt-Winters gives us the linear model that situates around the middle of the graph.
We should also consider that all alpha, beta, gamma and phi parameters used above are automatically chosen just to screen and see the pattern of each forecasting model.
Therefore, if we really want to compare between each model, the parameters should be fixed.
(For example, if you want to compare the effect of damped version to multiplicative Holt-Winters model, you should fix alpha, beta and gamma for both model (the only difference will be with or without phi parameter))
But we will not consider them now since clearly we cannot use any exponential smoothing for our forecast.
Forecasting with ARIMA
Let’s begin with automaticaly SARIMA model to see the possibility.
consum_SARIMA = auto.arima(consum_train)
pred_consum_SARIMA = forecast(consum_SARIMA,h=900)
autoplot(consum_train,series="Train set") +
autolayer(consum_test,series='Test set')+
autolayer(pred_consum_SARIMA,series='SARIMA',PI=FALSE)+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
Not perfect but seems better than before.
Let’s check model accuracy.
print(sqrt(mean((pred_consum_SARIMA$mean-consum_test)^2)))
## [1] 56.392
This model shows less error but the prediction pattern are still not so good.
We should concern model that more flexible like Neural Network Auto-Regression.
Neural Network Auto-Regression
#First, using automatic choice for parameters p and k
consum_train_NN = nnetar(consum_train)
pred_consum_train_NN = forecast(consum_train_NN, h = 900)
autoplot(consum_train,series="Train set") +
autolayer(consum_test,series='Test set')+
autolayer(pred_consum_train_NN$mean,series='Neural Network')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
#Check model accuracy
print(sqrt(mean((pred_consum_train_NN$mean-consum_test)^2)))
## [1] 77.39456
Even the error is higher than SARIMA model but the prediction pattern seems correct. This correlates with the information from auto-correlation function (acf) below.
See that auto-correlation declines slowly as the number of lags increases. This is a property of non-stationarity that will effect the efficiency of several forecasting models.
It also possible that our data might has no seasonal but cyclic pattern. In that case, they cannot be modelized by usual linear model.
#Auto-correlation pattern
ggAcf(consum_train)
Now, let’s verify information from Neural network model for further adjustment
#Check model information for further adjustment to improve the model
print(consum_train_NN)
## Series: consum_train
## Model: NNAR(35,1,18)[4]
## Call: nnetar(y = consum_train)
##
## Average of 20 networks, each of which is
## a 35-18-1 network with 667 weights
## options were - linear output units
##
## sigma^2 estimated as 48.94
Edit the model by adding more neuron and stabilize variance by Box-Cox transformation.
consum_train_NN2 = nnetar(consum_train,35,1,25,lambda='auto')
pred_consum_train_NN2 = forecast(consum_train_NN2, h = 900)
autoplot(consum_train,series="Train set") +
autolayer(consum_test,series='Test set')+
autolayer(pred_consum_train_NN2$mean,series='Neural Network')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
#Check model accuracy
print(sqrt(mean((pred_consum_train_NN2$mean-consum_test)^2)))
## [1] 81.31413
The error is so much lower. Seems like we finally found the best model! We can zoom in the prediction part to make it more clear.
autoplot(consum_test,series='Test set') +
autolayer(pred_consum_train_NN2$mean,series='Neural Network')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
Now we will predict electricity consumption of 17/2/2010 bases on the whole previous consumption information.
The prediction interval = 24 hr. of 17/2/2010, h =(24*60)/15 = 96 observations
consum_NN = nnetar(consumption,35,1,25,lambda='auto')
pred_consum_NN = forecast(consum_NN, h = 96)
autoplot(consumption,series="Power Consumption 1/1/2010 - 16/1/2010") +
autolayer(pred_consum_NN$mean,series='Neural Network Prediction for 17/2/2010')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
#Prediction results
Prediction = print(pred_consum_NN)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1128 160.3902 154.2735 161.2098 162.2114
## 1129 162.3355 161.1900 154.5925 157.0433
## 1130 155.6688 152.5407 158.1951 154.5137
## 1131 160.2907 160.8751 156.3013 166.3297
## 1132 162.7838 158.6948 155.8140 157.2775
## 1133 159.1186 155.9056 153.9054 158.7392
## 1134 159.2970 163.1473 169.6984 168.2918
## 1135 166.8973 166.0943 170.0809 174.6674
## 1136 191.0437 244.0679 240.1304 238.4002
## 1137 238.2083 244.4214 241.2692 242.4561
## 1138 245.9943 247.2206 249.2448 250.7079
## 1139 251.7902 252.2848 250.4387 252.3203
## 1140 253.5246 255.8408 258.4789 258.9882
## 1141 257.3126 258.3749 259.0013 259.3948
## 1142 258.4818 259.4822 261.0378 262.5532
## 1143 265.7573 267.0920 268.2749 266.2939
## 1144 268.9176 271.5304 276.1104 277.5529
## 1145 278.8578 281.0999 283.4875 285.5121
## 1146 287.3779 289.1794 291.9660 293.3308
## 1147 293.7680 295.2610 296.1586 296.8783
## 1148 298.4786 300.5746 301.7162 302.8413
## 1149 305.0192 306.8057 308.1541 309.4402
## 1150 310.7424 311.9118 313.4643 315.8942
## 1151 317.6970 318.4280 319.1631 319.9638
#Save prediction results to csv file
library("readr")
write_csv(Prediction,path="Prediction.csv")
## Warning: The `path` argument of `write_csv()` is deprecated as of readr 1.4.0.
## Please use the `file` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Now, we will move to second objective of this forecasting.
Part 2: Forecast electricity consumption (kW) for 2/17/2010 by using outdoor temperature Start by download data file and make time series object
temp <- ts(data[1:4507,3], frequency = 4, start=c(1,2))
head(temp) #Check time series object
## Qtr1 Qtr2 Qtr3 Qtr4
## 1 10.55556 10.55556 10.55556
## 2 10.55556 10.55556 10.55556
Extract data to make the regression and prediction
temp_forTRAIN=window(temp, start=c(1,2), end=c(902,4))
temp_forTEST=window(temp, start=c(903,1), end=c(1127,4))
Time series linear regression model First of all, we check the effect of temperature to electricity consumption
fit_train=tslm(consum_train~temp_forTRAIN)
summary(fit_train)
##
## Call:
## tslm(formula = consum_train ~ temp_forTRAIN)
##
## Residuals:
## Min 1Q Median 3Q Max
## -120.990 -42.802 3.352 42.980 111.862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 126.9146 3.5111 36.15 <2e-16 ***
## temp_forTRAIN 9.8478 0.3201 30.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.23 on 3605 degrees of freedom
## Multiple R-squared: 0.2079, Adjusted R-squared: 0.2077
## F-statistic: 946.4 on 1 and 3605 DF, p-value: < 2.2e-16
The effect of temperature to electricity consumption are statistically significant. We can add trend and seasonal pattern to this regression.
fit_train_TS=tslm(consum_train~temp_forTRAIN+trend+season)
summary(fit_train_TS)
##
## Call:
## tslm(formula = consum_train ~ temp_forTRAIN + trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -126.036 -42.603 2.417 43.367 122.385
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.364e+02 3.840e+00 35.518 <2e-16 ***
## temp_forTRAIN 1.052e+01 3.200e-01 32.860 <2e-16 ***
## trend -9.358e-03 8.189e-04 -11.427 <2e-16 ***
## season2 8.305e-01 2.372e+00 0.350 0.726
## season3 6.520e-01 2.372e+00 0.275 0.783
## season4 -2.761e-01 2.372e+00 -0.116 0.907
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.35 on 3601 degrees of freedom
## Multiple R-squared: 0.2357, Adjusted R-squared: 0.2347
## F-statistic: 222.1 on 5 and 3601 DF, p-value: < 2.2e-16
Seems like seasonal pattern play no role Let’s try to consider only trend.
fit_train_T=tslm(consum_train~temp_forTRAIN+trend)
summary(fit_train_T)
##
## Call:
## tslm(formula = consum_train ~ temp_forTRAIN + trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.686 -42.643 2.713 43.220 121.808
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.367e+02 3.554e+00 38.46 <2e-16 ***
## temp_forTRAIN 1.052e+01 3.199e-01 32.87 <2e-16 ***
## trend -9.358e-03 8.186e-04 -11.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.33 on 3604 degrees of freedom
## Multiple R-squared: 0.2357, Adjusted R-squared: 0.2352
## F-statistic: 555.6 on 2 and 3604 DF, p-value: < 2.2e-16
Compare all models
CV(fit_train)
## CV AIC AICc BIC AdjR2
## 2.625580e+03 2.840073e+04 2.840073e+04 2.841930e+04 2.077181e-01
CV(fit_train_TS)
## CV AIC AICc BIC AdjR2
## 2.539092e+03 2.827996e+04 2.827999e+04 2.832330e+04 2.346533e-01
CV(fit_train_T)
## CV AIC AICc BIC AdjR2
## 2.535074e+03 2.827426e+04 2.827427e+04 2.829902e+04 2.352279e-01
Model with temperature and trend has the lowest AIC and the highest Adjusted R squared. Therefore, I will choose this model for further step.
Time series linear regression model assume that the residuals are independent and identically distributed. So, we should check the residuals of our model first.
checkresiduals(fit_train_T,test="LB",plot=TRUE)
##
## Ljung-Box test
##
## data: Residuals from Linear regression model
## Q* = 17482, df = 5, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 8
Results show that the residual are dependent to each other.
Therefore, we cannot use this linear regression model.
The appropriate model in case the residual are not independent to each other are Dynamic regression model.
Dynamic regression model Let’s start with function with automatic selected parameters
fit_train_T_ar = auto.arima(consum_train,xreg=temp_forTRAIN)
#Check autocorrelation of residuals
checkresiduals(fit_train_T_ar,test="LB",plot=TRUE)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(2,1,2) errors
## Q* = 2.5285, df = 3, p-value = 0.4702
##
## Model df: 5. Total lags used: 8
See that all the auto-correlations of the residuals have been modelled with this model.
The model being validated, now we can forecast the test set.
predict_test_T = forecast(fit_train_T_ar,xreg=temp_forTEST, h=900)
#Compare the prediction wit Neuron network model(without temperature)
autoplot(consum_train,series="Train set") +
autolayer(consum_test,series='Test set')+
autolayer(pred_consum_train_NN2$mean,series='Neural Network')+
autolayer(predict_test_T$mean,series='Dynamic Regression with Temperature')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
Not so good.
However, the best model we have so far is Neural Network maybe we should try them with temperature variable.
consum_train_NN_T = nnetar(consum_train,35,1,25,lambda='auto',xreg=temp_forTRAIN)
pred_consum_train_NN_T = forecast(consum_train_NN_T, h = 900,xreg=temp_forTEST)
autoplot(consum_train,series="Train set") +
autolayer(consum_test,series='Test set')+
autolayer(pred_consum_train_NN_T$mean,series='Neural Network + Temperature')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
#Check model accuracy
print(sqrt(mean((pred_consum_train_NN_T$mean-consum_test)^2)))
## [1] 75.17165
We can zoom in the prediction.
autoplot(consum_test,series='Test set') +
autolayer(pred_consum_train_NN_T$mean,series='Neural Network + Temperature')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
However, let’s try forecast power consumption on 17th Feb based on temperature of that day.
#Extract temperature of 17th Feb to make new time series object for forecast
temp_17 <- ts(data[4509:4603,3], frequency = 4, start=c(1,2))
head(temp_17)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1 11.11111 11.11111 11.11111
## 2 11.11111 10.55556 10.55556
#Check time series object
consum_NN_T = nnetar(consumption,35,1,25,lambda='auto',xreg=temp)
pred_consum_NN_T = forecast(consum_NN_T, h = 96,xreg=temp_17)
autoplot(consumption,series="Power Consumption 1/1/2010 - 16/1/2010") +
autolayer(pred_consum_NN_T$mean,series='Neural Network Prediction using Temperature for 17/2/2010')+
ggtitle ('Electricity Consumption (kW) per hour') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
#Prediction results
Prediction_T = print(pred_consum_NN_T)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1128 161.5249 156.2582 161.1289 163.9018
## 1129 160.7935 159.5037 156.1569 159.1297
## 1130 156.6895 152.5025 156.7392 153.3954
## 1131 159.6009 160.7005 156.9769 164.3447
## 1132 162.1651 159.9572 161.2955 160.5852
## 1133 164.4584 156.8487 157.9112 163.4410
## 1134 161.4248 165.1150 174.6794 169.9885
## 1135 167.4362 166.0159 171.2971 177.6974
## 1136 193.9775 241.1559 238.2722 237.4495
## 1137 237.9434 242.5547 241.4337 242.7241
## 1138 247.2093 248.5491 250.3006 251.0837
## 1139 252.1104 252.7194 251.8022 254.1513
## 1140 255.5868 257.9694 260.4585 261.1169
## 1141 260.8893 262.3923 263.1312 263.5821
## 1142 261.3231 262.3366 266.0310 268.7922
## 1143 273.1082 273.9150 273.0055 271.6021
## 1144 275.3006 277.3659 283.0822 286.8318
## 1145 287.1274 289.9362 293.8535 296.4600
## 1146 299.9591 302.8361 305.0638 307.2993
## 1147 309.0560 310.9452 312.3079 313.5479
## 1148 314.5149 315.3531 315.5509 316.4828
## 1149 317.7608 318.0825 317.7087 316.6208
## 1150 314.6174 314.6247 315.0709 315.5089
## 1151 314.8383 313.5775 313.7556
#Save prediction results to csv file
library("readr")
write_csv(Prediction_T,path="Prediction with Temperature.csv")
We can compare both predictions in the same graph. They look not so diffirent.
autoplot(consumption,series="Power Consumption 1/1/2010 - 16/1/2010") +
autolayer(pred_consum_NN$mean,series='Neural Network')+
autolayer(pred_consum_NN_T$mean,series='Neural Network Prediction using Temperature')+
ggtitle ('Electricity Consumption (kW)') +
xlab('Time (hr)') +
ylab('Consumption (kW)')
Conclusion: The best forecast model we have is Neural Network auto-regression (NNAR) with outside temperature consideration. The main advantage of NNAR are more flexible and can modelized non-linear relation like in our case.