if (!require("xlsx")) install.packages("xlsx")
## Loading required package: xlsx
library(xlsx)
library(GGally)
## Loading required package: ggplot2
library(fpp2)
## Loading required package: forecast
## Loading required package: fma
##
## Attaching package: 'fma'
## The following object is masked from 'package:GGally':
##
## pigs
## Loading required package: expsmooth
library(seasonal)
library(readxl)
library(forecast)
library("zoo")
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(imputeTS)
##
## Attaching package: 'imputeTS'
## The following object is masked from 'package:zoo':
##
## na.locf
library(TSA)
##
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
Part A - ATM Forecast, ATM624Data.xlsx
In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable 'Cash' is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file. Please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast, which you will put in an Excel-readable file.
Solution
Let's load the data into an R data frame and perform some preliminary exploration.
ATM_data <- readxl::read_excel("C:/Users/Mezu/Documents/Data624/ATM624Data.xlsx")
head(ATM_data)
## # A tibble: 6 x 3
## DATE ATM Cash
## <dbl> <chr> <dbl>
## 1 39934 ATM1 96.0
## 2 39934 ATM2 107
## 3 39935 ATM1 82.0
## 4 39935 ATM2 89.0
## 5 39936 ATM1 85.0
## 6 39936 ATM2 90.0
str(ATM_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1474 obs. of 3 variables:
## $ DATE: num 39934 39934 39935 39935 39936 ...
## $ ATM : chr "ATM1" "ATM2" "ATM1" "ATM2" ...
## $ Cash: num 96 107 82 89 85 90 90 55 99 79 ...
#transform date field to correct format
ATM_data$DATE<- as.Date(ATM_data$DATE,origin = "1899-12-30")
head(ATM_data)
## # A tibble: 6 x 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96.0
## 2 2009-05-01 ATM2 107
## 3 2009-05-02 ATM1 82.0
## 4 2009-05-02 ATM2 89.0
## 5 2009-05-03 ATM1 85.0
## 6 2009-05-03 ATM2 90.0
summary(ATM_data)
## DATE ATM Cash
## Min. :2009-05-01 Length:1474 Min. : 0.0
## 1st Qu.:2009-08-01 Class :character 1st Qu.: 0.5
## Median :2009-11-01 Mode :character Median : 73.0
## Mean :2009-10-31 Mean : 155.6
## 3rd Qu.:2010-02-01 3rd Qu.: 114.0
## Max. :2010-05-14 Max. :10919.8
## NA's :19
Filter the data into four datasets, one for each ATM.
ATM1_data <- filter(ATM_data, ATM == "ATM1")
head(ATM1_data)
## # A tibble: 6 x 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96.0
## 2 2009-05-02 ATM1 82.0
## 3 2009-05-03 ATM1 85.0
## 4 2009-05-04 ATM1 90.0
## 5 2009-05-05 ATM1 99.0
## 6 2009-05-06 ATM1 88.0
str(ATM1_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 365 obs. of 3 variables:
## $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
## $ ATM : chr "ATM1" "ATM1" "ATM1" "ATM1" ...
## $ Cash: num 96 82 85 90 99 88 8 104 87 93 ...
summary(ATM1_data)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 1.00
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 73.00
## Median :2009-10-30 Mode :character Median : 91.00
## Mean :2009-10-30 Mean : 83.89
## 3rd Qu.:2010-01-29 3rd Qu.:108.00
## Max. :2010-04-30 Max. :180.00
## NA's :3
ATM2_data <- filter(ATM_data, ATM == "ATM2")
head(ATM2_data)
## # A tibble: 6 x 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM2 107
## 2 2009-05-02 ATM2 89.0
## 3 2009-05-03 ATM2 90.0
## 4 2009-05-04 ATM2 55.0
## 5 2009-05-05 ATM2 79.0
## 6 2009-05-06 ATM2 19.0
str(ATM2_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 365 obs. of 3 variables:
## $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
## $ ATM : chr "ATM2" "ATM2" "ATM2" "ATM2" ...
## $ Cash: num 107 89 90 55 79 19 2 103 107 118 ...
summary(ATM2_data)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 0.00
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 25.50
## Median :2009-10-30 Mode :character Median : 67.00
## Mean :2009-10-30 Mean : 62.58
## 3rd Qu.:2010-01-29 3rd Qu.: 93.00
## Max. :2010-04-30 Max. :147.00
## NA's :2
ATM3_data <- filter(ATM_data, ATM == "ATM3")
head(ATM3_data)
## # A tibble: 6 x 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM3 0
## 2 2009-05-02 ATM3 0
## 3 2009-05-03 ATM3 0
## 4 2009-05-04 ATM3 0
## 5 2009-05-05 ATM3 0
## 6 2009-05-06 ATM3 0
str(ATM3_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 365 obs. of 3 variables:
## $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
## $ ATM : chr "ATM3" "ATM3" "ATM3" "ATM3" ...
## $ Cash: num 0 0 0 0 0 0 0 0 0 0 ...
summary(ATM3_data)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 0.0000
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 0.0000
## Median :2009-10-30 Mode :character Median : 0.0000
## Mean :2009-10-30 Mean : 0.7206
## 3rd Qu.:2010-01-29 3rd Qu.: 0.0000
## Max. :2010-04-30 Max. :96.0000
ATM4_data <- filter(ATM_data, ATM == "ATM4")
head(ATM4_data)
## # A tibble: 6 x 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM4 777
## 2 2009-05-02 ATM4 524
## 3 2009-05-03 ATM4 793
## 4 2009-05-04 ATM4 908
## 5 2009-05-05 ATM4 52.8
## 6 2009-05-06 ATM4 52.2
str(ATM4_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 365 obs. of 3 variables:
## $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
## $ ATM : chr "ATM4" "ATM4" "ATM4" "ATM4" ...
## $ Cash: num 777 524.4 792.8 908.2 52.8 ...
summary(ATM4_data)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 1.563
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 124.334
## Median :2009-10-30 Mode :character Median : 403.839
## Mean :2009-10-30 Mean : 474.043
## 3rd Qu.:2010-01-29 3rd Qu.: 704.507
## Max. :2010-04-30 Max. :10919.762
We can see that each ATM has 365 observations, and there are some missing Cash values in the ATM1 and ATM2 data frames.
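As a quick check (a sketch; this count is an addition to the original analysis), we can see where the missing Cash values fall:
# Sketch: tally missing Cash values by ATM; rows where ATM itself is NA are
# most likely the empty May 2010 placeholder rows at the end of the file
ATM_data %>%
  filter(is.na(Cash)) %>%
  count(ATM)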
Let's convert ATM1 to a time series.
ATM1_ts <- ts(ATM1_data$Cash, start=c(2009,5,1), frequency=365)
autoplot(ATM1_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
ggtitle("ATM1 time series")
Since we noticed 3 missing values, let's impute them.
ATM1_ts<- na.interpolation(ATM1_ts)
autoplot(ATM1_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
ggtitle("ATM1 time series")
Check whether the time series needs a transformation using the Box-Cox procedure.
(lambda1 <- forecast::BoxCox.lambda(ATM1_ts))
## [1] 1
The estimated lambda of 1 indicates that no transformation is needed.
From the time plot there appears to be some trend and seasonality; the plots below explore this further.
ggseasonplot(ATM1_ts )
#ggsubseriesplot(ATM1_ts)
gglagplot(ATM1_ts)
ggAcf(ATM1_ts)
Let's identify the seasonality with a Fourier transform (periodogram).
#reference https://anomaly.io/detect-seasonality-using-fourier-transform-r/
p = periodogram(ATM1_ts)
dd = data.frame(freq=p$freq, spec=p$spec)
order = dd[order(-dd$spec),]
top2 = head(order, 2)
# display the 2 highest "power" frequencies
top2
## freq spec
## 54 0.1440000 103825.80
## 107 0.2853333 62918.41
# convert frequency to time periods
time = 1/top2$f
time
## [1] 6.944444 3.504673
The periodogram shows the "power" at each possible frequency, and we can clearly see spikes at about 0.144 and 0.285 cycles per day.
These correspond to periods of roughly 7 and 3.5 days, so the main seasonality detected is weekly, with a secondary half-week (3.5-day) component.
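Since the dominant period is 7 days, one option (a sketch only; the analysis below keeps the daily frequency of 365) is to re-index the series with a weekly frequency, which also makes the ggsubseriesplot that was commented out above readable:
# Sketch: same data, but with frequency = 7 so the weekly pattern is explicit
ATM1_weekly <- na.interpolation(ts(ATM1_data$Cash, frequency = 7))
ggsubseriesplot(ATM1_weekly)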
Let's plot the trend using a 7-day moving average.
autoplot(ATM1_ts, series="Data") +
autolayer(ma(ATM1_ts,7), series="7-MA") +
ggtitle("ATM1 7day moving average trend")
Using simple exponential smoothing to forecast and check the RMSE.
fc <- ses(ATM1_ts, h=30)
autoplot(fc) +
autolayer(fitted(fc), series="Fitted") +
ylab("Cash in hundreds of dollars") + xlab("days")
summary(fc)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = ATM1_ts, h = 30)
##
## Smoothing parameters:
## alpha = 1e-04
##
## Initial states:
## l = 84.1554
##
## sigma: 36.6885
##
## AIC AICc BIC
## 4787.255 4787.322 4798.955
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0309603 36.58785 27.26164 -175.3407 198.6339 NaN
## ACF1
## Training set 0.07062587
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2010.0110 84.15428 37.13608 131.1725 12.24614 156.0624
## 2010.0137 84.15428 37.13608 131.1725 12.24614 156.0624
## 2010.0164 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0192 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0219 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0247 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0274 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0301 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0329 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0356 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0384 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0411 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0438 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0466 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0493 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0521 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0548 84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0575 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0603 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0630 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0658 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0685 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0712 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0740 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0767 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0795 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0822 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0849 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0877 84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0904 84.15428 37.13607 131.1725 12.24613 156.0624
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from Simple exponential smoothing
## Q* = 1397.8, df = 71, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 73
The ACF plot and the residual diagnostics show that the data are clearly seasonal and the residuals are far from white noise, so these predictions are unlikely to be accurate.
Let’s try auto ETS
fc <- ets(ATM1_ts)
autoplot(fc) +
autolayer(fitted(fc), series="Fitted") +
ylab("Cash in hundreds of dollars") + xlab("days")
summary(fc)
## ETS(M,N,N)
##
## Call:
## ets(y = ATM1_ts)
##
## Smoothing parameters:
## alpha = 0.0326
##
## Initial states:
## l = 79.7942
##
## sigma: 0.4352
##
## AIC AICc BIC
## 4785.562 4785.628 4797.262
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1295602 36.75059 27.45335 -179.0177 202.7145 NaN
## ACF1
## Training set 0.04828029
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,N)
## Q* = 1392.9, df = 71, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 73
We can see it does not give us a good model.
Cross-validation of the predictions: split the data into training and test sets.
ATM1_ts.train <- window(ATM1_ts, end=c(2009,340))
ATM1_ts.test <- window(ATM1_ts, start=c(2009,341))
Check that the data have been split appropriately by producing the following plot.
autoplot(ATM1_ts) +
autolayer(ATM1_ts.train, series="Training") +
autolayer(ATM1_ts.test, series="Test")
Calculate forecasts using ETS applied to ATM1_ts.train.
fit_train <- ets(ATM1_ts.train)
Compare the accuracy of the forecasts against the actual values stored in ATM1_ts.test.
p<-predict(fit_train,29)
accuracy(p$mean,ATM1_ts.test)
## ME RMSE MAE MPE MAPE ACF1
## Test set -6.632964 31.92371 19.9583 -315.8501 329.1089 -0.1610123
## Theil's U
## Test set 0.1945403
Check the residuals.
checkresiduals(fit_train)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,N)
## Q* = 1156.3, df = 65.2, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 67.2
We can see the ETS results are quite poor.
Based on our analysis this time series shows some trend and seasonality, so we will apply two levels of differencing: one seasonal and one for the trend. Let's first take a seasonally differenced series.
ATM1_ts.train %>% diff(lag=7) %>% ggtsdisplay()
The seasonal differencing works reasonably well; however, the ACF still shows a slight pattern due to trend, so let's also take a first difference.
ATM1_ts.train %>% diff(lag=7)%>% diff() %>% ggtsdisplay()
The significant spike at lag 1 in the ACF suggests a non-seasonal MA(1) component, and the significant spike at lag 7 suggests a seasonal MA(1) component.
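An alternative worth noting (a sketch that assumes a weekly frequency; the analysis below keeps the original frequency and uses non-seasonal ARIMA orders instead) is to model the lag-7 behaviour directly as a seasonal ARIMA:
# Sketch: ARIMA(0,0,1)(0,1,1)[7], the orders suggested by the MA(1) spikes above
ATM1_w <- ts(as.numeric(ATM1_ts.train), frequency = 7)
fit_w <- Arima(ATM1_w, order = c(0,0,1), seasonal = c(0,1,1))
checkresiduals(fit_w)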
Based on this information we compare the results of auto.arima with manually selected models.
fc<-auto.arima(ATM1_ts.train, seasonal=FALSE,stepwise=FALSE, approximation=FALSE)
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,0) with non-zero mean
## Q* = 854.19, df = 61.2, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 67.2
We can see that this auto.arima result is not good: the ACF of the residuals has many lags well above the critical bounds.
Now, using the information and plots from the previous step, we test several combinations of the p, d, q values.
ATM1_ts.train%>%
Arima(order=c(7,0,7)) %>%
residuals() %>% ggtsdisplay()
fit3 <- Arima(ATM1_ts.train, order=c(7,1,7))
fit3
## Series: ATM1_ts.train
## ARIMA(7,1,7)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 ma1
## -0.9623 -0.9502 -0.9585 -0.9663 -0.9348 -0.9232 0.0453 0.1572
## s.e. 0.2216 0.2074 0.2012 0.1983 0.1934 0.1922 0.1945 0.2291
## ma2 ma3 ma4 ma5 ma6 ma7
## -0.0626 0.0740 0.1389 -0.1305 -0.0639 -0.6587
## s.e. 0.0555 0.0536 0.0612 0.0633 0.0733 0.0805
##
## sigma^2 estimated as 576.5: log likelihood=-1538.87
## AIC=3107.74 AICc=3109.25 BIC=3164.96
checkresiduals(fit3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(7,1,7)
## Q* = 55.082, df = 53.2, p-value = 0.4033
##
## Model df: 14. Total lags used: 67.2
#AICc = 3112.8
fit4 <- Arima(ATM1_ts.train, order=c(7,1,8))
fit4
## Series: ATM1_ts.train
## ARIMA(7,1,8)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7
## -0.0466 -0.0234 -0.0458 -0.0493 -0.0061 -0.0214 0.9287
## s.e. 0.0345 0.0354 0.0346 0.0349 0.0349 0.0349 0.0336
## ma1 ma2 ma3 ma4 ma5 ma6 ma7 ma8
## -0.7839 -0.2396 0.1263 0.0858 -0.2714 0.0343 -0.5637 0.6122
## s.e. 0.0626 0.0686 0.0619 0.0670 0.0632 0.0669 0.0764 0.0606
##
## sigma^2 estimated as 571.7: log likelihood=-1537.64
## AIC=3107.29 AICc=3109 BIC=3168.31
checkresiduals(fit4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(7,1,8)
## Q* = 55.203, df = 52.2, p-value = 0.3619
##
## Model df: 15. Total lags used: 67.2
##AICc = 3109.25
fit5 <- Arima(ATM1_ts.train, order=c(7,1,0))
fit5
## Series: ATM1_ts.train
## ARIMA(7,1,0)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7
## -0.7865 -0.7926 -0.7351 -0.6860 -0.6846 -0.6616 0.0936
## s.e. 0.0544 0.0592 0.0630 0.0642 0.0624 0.0587 0.0547
##
## sigma^2 estimated as 711.9: log likelihood=-1574.94
## AIC=3165.89 AICc=3166.33 BIC=3196.4
checkresiduals(fit5)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(7,1,0)
## Q* = 108.71, df = 60.2, p-value = 0.0001292
##
## Model df: 7. Total lags used: 67.2
##AICc = 3166.33
From these results we can see that ARIMA(7,1,8) has the lowest AICc, approximately normal residuals, and an ACF within the critical bounds.
Now let's compare the accuracy of the forecasts against the actual values stored in ATM1_ts.test.
p1<-predict(fit4,29)
accuracy(p1$pred,ATM1_ts.test)
## ME RMSE MAE MPE MAPE ACF1
## Test set -7.979495 15.33882 12.12426 -100.8079 105.5511 -0.4944431
## Theil's U
## Test set 0.04826178
p2<-predict(fit3,29)
accuracy(p2$pred,ATM1_ts.test)
## ME RMSE MAE MPE MAPE ACF1
## Test set -6.680468 15.04431 11.65646 -93.69367 99.44533 -0.5369736
## Theil's U
## Test set 0.03820937
p3<-predict(fit5,29)
accuracy(p3$pred,ATM1_ts.test)
## ME RMSE MAE MPE MAPE ACF1
## Test set -8.05515 20.60587 14.33351 -190.2153 196.9253 -0.1356597
## Theil's U
## Test set 0.04923441
p4<-predict(fc,29)
accuracy(p4$pred,ATM1_ts.test)
Based on the RMSE values we can see that fit3, which corresponds to ARIMA(7,1,7), performs best on the test set.
Now we have to make a decision: ARIMA(7,1,8) had a lower AICc but ARIMA(7,1,7) had a lower test RMSE. I will pick ARIMA(7,1,7) because I think testing a forecast against actual held-out data is more trustworthy than an in-sample metric like AICc.
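A more robust way to settle the AICc-versus-test-RMSE question would be rolling-origin cross-validation with tsCV(); the sketch below is an addition and is slow because the ARIMA(7,1,7) model is refit at every forecast origin:
# Sketch: rolling-origin one-step errors for the ARIMA(7,1,7) specification
f717 <- function(x, h) forecast::forecast(Arima(x, order = c(7,1,7)), h = h)
e <- tsCV(ATM1_ts, f717, h = 1)
sqrt(mean(e^2, na.rm = TRUE))   # rolling-origin RMSE
The single train/test split above is kept for the report; this is only an illustration of the idea.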
fit3 %>% forecast(h=29) %>% autoplot(include=80)
Let's plot the forecast against the test set.
fcast <- forecast::forecast(fit3, h=29)
autoplot(fcast, series="Data") +
autolayer(ATM1_ts.test, series="test_set") +
ggtitle("ATM1 train-test prediction")
fit3 <- Arima(ATM1_ts, order=c(7,1,7))
fcast1 <- forecast::forecast(forecast::Arima(ATM1_ts, model=fit3), h=30)
xlsx::write.xlsx(fcast1, file="C:/Users/Mezu/Documents/Data624/DATA624_Project1_ATM1.xlsx",
sheetName="ATM1", col.names = T, row.names = T, append = F)
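For the Excel deliverable it may also help the business to see the point forecasts against explicit May 2010 dates; a sketch (the ATM1_out data frame is an addition, not part of the original output):
# Sketch: tidy point forecasts with explicit dates before writing to Excel
ATM1_out <- data.frame(DATE = seq(as.Date("2010-05-01"), by = "day", length.out = 30),
                       ATM = "ATM1",
                       Cash = as.numeric(fcast1$mean))
head(ATM1_out)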
Now let's repeat the same analysis for ATM2.
str(ATM2_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 365 obs. of 3 variables:
## $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
## $ ATM : chr "ATM2" "ATM2" "ATM2" "ATM2" ...
## $ Cash: num 107 89 90 55 79 19 2 103 107 118 ...
summary(ATM2_data)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 0.00
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 25.50
## Median :2009-10-30 Mode :character Median : 67.00
## Mean :2009-10-30 Mean : 62.58
## 3rd Qu.:2010-01-29 3rd Qu.: 93.00
## Max. :2010-04-30 Max. :147.00
## NA's :2
Let's convert ATM2 to a time series.
ATM2_ts <- ts(ATM2_data$Cash, start=c(2009,5,1), frequency=365)
autoplot(ATM2_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
ggtitle("ATM2 time series")
Since we noticed 2 missing values, let's impute them.
ATM2_ts<- na.interpolation(ATM2_ts)
autoplot(ATM2_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
ggtitle("ATM2 time series")
Check whether the time series needs a transformation using the Box-Cox procedure.
(lambda2 <- forecast::BoxCox.lambda(ATM2_ts))
## [1] 1
The estimated lambda of 1 indicates that no transformation is needed.
From the time plot there appears to be some trend and seasonality; the plots below explore this further.
ggseasonplot(ATM2_ts )
#ggsubseriesplot(ATM2_ts)
gglagplot(ATM2_ts)
ggAcf(ATM2_ts)
Let's identify the seasonality with a Fourier transform (periodogram).
#reference https://anomaly.io/detect-seasonality-using-fourier-transform-r/
p = periodogram(ATM2_ts)
dd = data.frame(freq=p$freq, spec=p$spec)
order = dd[order(-dd$spec),]
top2 = head(order, 2)
# display the 2 highest "power" frequencies
top2
## freq spec
## 107 0.2853333 86194.79
## 108 0.2880000 77054.04
# convert frequency to time periods
time = 1/top2$f
time
## [1] 3.504673 3.472222
The periodogram shows the "power" at each possible frequency, and here the two strongest spikes are at about 0.285 and 0.288 cycles per day.
Both correspond to a period of roughly 3.5 days, which is the half-week harmonic of the weekly (7-day) cycle, so the dominant seasonality is still effectively weekly.
Let's plot the trend using a 4-day moving average.
autoplot(ATM2_ts, series="Data") +
autolayer(ma(ATM2_ts,4), series="4-MA") +
ggtitle("ATM2 4 day moving average trend")
Cross-validation of the predictions: split the data into training and test sets.
ATM2_ts.train <- window(ATM2_ts, end=c(2009,340))
ATM2_ts.test <- window(ATM2_ts, start=c(2009,341))
Check that the data have been split appropriately by producing the following plot.
autoplot(ATM2_ts) +
autolayer(ATM2_ts.train, series="Training") +
autolayer(ATM2_ts.test, series="Test")
Based on our analysis this time series shows some trend and seasonality, so we will apply two levels of differencing: one seasonal and one for the trend. Let's first take a seasonally differenced series.
ATM2_ts.train %>% diff(lag=7) %>% ggtsdisplay()
The seasonal differencing works reasonably well; however, the ACF still shows a slight pattern due to trend, so let's also take a first difference.
ATM2_ts.train %>% diff() %>% diff(lag=4) %>% diff(lag=7)%>% ggtsdisplay()
ATM2_ts.train %>% diff(lag=7) %>%diff() %>% ggtsdisplay()
The significant spike at lag 1 in the ACF suggests a non-seasonal MA(1) component, and the significant spike at lag 7 suggests a seasonal MA(1) component.
Based on this information we compare the results of auto.arima with manually selected models.
fc<-auto.arima(ATM2_ts.train, seasonal=FALSE,stepwise=FALSE, approximation=FALSE)
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,5)
## Q* = 649.75, df = 62.2, p-value < 2.2e-16
##
## Model df: 5. Total lags used: 67.2
We can see that this auto.arima result is not good: the ACF of the residuals has many lags well above the critical bounds.
Now, using the information and plots from the previous step, we test several combinations of the p, d, q values.
ATM2_ts.train%>%
Arima(order=c(7,1,7)) %>%
residuals() %>% ggtsdisplay()
fit3 <- Arima(ATM2_ts.train, order=c(7,1,7))
fit3
## Series: ATM2_ts.train
## ARIMA(7,1,7)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7
## -1.1194 -1.1093 -1.1082 -1.1191 -1.0960 -1.0868 -0.1123
## s.e. 0.0912 0.0902 0.0888 0.0907 0.0899 0.0824 0.0844
## ma1 ma2 ma3 ma4 ma5 ma6 ma7
## 0.1973 -0.1626 -0.0097 0.1120 -0.1258 -0.0931 -0.7016
## s.e. 0.0794 0.0494 0.0545 0.0507 0.0480 0.0637 0.0453
##
## sigma^2 estimated as 607.7: log likelihood=-1549.31
## AIC=3128.61 AICc=3130.12 BIC=3185.83
checkresiduals(fit3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(7,1,7)
## Q* = 71.919, df = 53.2, p-value = 0.04456
##
## Model df: 14. Total lags used: 67.2
#AICc = 3130.12
fit4 <- Arima(ATM2_ts.train, order=c(7,1,8))
fit4
## Series: ATM2_ts.train
## ARIMA(7,1,8)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7
## -0.0403 -0.0358 -0.0373 -0.0502 -0.0147 -0.0355 0.9366
## s.e. 0.0253 0.0238 0.0237 0.0237 0.0232 0.0227 0.0231
## ma1 ma2 ma3 ma4 ma5 ma6 ma7 ma8
## -0.9121 -0.2537 0.1558 0.0913 -0.2388 0.0723 -0.6168 0.7131
## s.e. 0.0559 0.0711 0.0720 0.0751 0.0614 0.0688 0.0625 0.0468
##
## sigma^2 estimated as 595.8: log likelihood=-1546.26
## AIC=3124.52 AICc=3126.23 BIC=3185.54
checkresiduals(fit4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(7,1,8)
## Q* = 65.649, df = 52.2, p-value = 0.09997
##
## Model df: 15. Total lags used: 67.2
#AICc=3126.23
fit5 <- Arima(ATM2_ts.train, order=c(7,1,10))
fit5
## Series: ATM2_ts.train
## ARIMA(7,1,10)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7
## -0.0498 -0.0374 -0.0547 -0.0631 -0.0273 -0.0475 0.9210
## s.e. 0.0279 0.0286 0.0278 0.0272 0.0279 0.0286 0.0269
## ma1 ma2 ma3 ma4 ma5 ma6 ma7 ma8
## -0.9014 -0.2130 0.1198 0.0983 -0.2601 0.0827 -0.6156 0.7105
## s.e. 0.0652 0.0797 0.0714 0.0667 0.0643 0.0682 0.0688 0.0773
## ma9 ma10
## -0.1346 0.1270
## s.e. 0.0761 0.0629
##
## sigma^2 estimated as 584.7: log likelihood=-1544.04
## AIC=3124.08 AICc=3126.24 BIC=3192.73
checkresiduals(fit5)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(7,1,10)
## Q* = 60.374, df = 50.2, p-value = 0.154
##
## Model df: 17. Total lags used: 67.2
#AICc=3126.24
From these results we can see that ARIMA(7,1,8) has the lowest AICc, approximately normal residuals, and an ACF within the critical bounds.
Now let's compare the accuracy of the forecasts against the actual values stored in ATM2_ts.test.
p1<-predict(fit4,29)
accuracy(p1$pred,ATM2_ts.test)
## ME RMSE MAE MPE MAPE ACF1
## Test set 7.596794 18.42945 15.45209 -27.83442 57.55219 -0.04026416
## Theil's U
## Test set 0.1317035
p2<-predict(fit3,29)
accuracy(p2$pred,ATM2_ts.test)
## ME RMSE MAE MPE MAPE ACF1
## Test set 6.770904 18.28133 15.62466 -72.85834 98.49209 -0.1532232
## Theil's U
## Test set 0.1374686
p3<-predict(fit5,29)
accuracy(p3$pred,ATM2_ts.test)
## ME RMSE MAE MPE MAPE ACF1
## Test set 7.545134 20.32541 17.62188 -107.412 136.4305 -0.05991392
## Theil's U
## Test set 0.1546188
Based on the RMSE values we can see that fit3, which corresponds to ARIMA(7,1,7), performs best on the test set.
Now we have to make a decision: ARIMA(7,1,8) had a lower AICc but ARIMA(7,1,7) had a lower test RMSE. I will pick ARIMA(7,1,7) because I think testing a forecast against actual held-out data is more trustworthy than an in-sample metric like AICc.
fit3 %>% forecast(h=29) %>% autoplot(include=80)
Let's plot the forecast against the test set.
fcast <- forecast::forecast(fit3, h=29)
autoplot(fcast, series="Data") +
autolayer(ATM2_ts.test, series="test_set") +
ggtitle("ATM2 train-test prediction ")
fit3 <- Arima(ATM2_ts, order=c(7,1,7))
fcast1 <- forecast::forecast(forecast::Arima(ATM2_ts, model=fit3), h=30)
xlsx::write.xlsx(fcast1, file="C:/Users/Mezu/Documents/Data624/DATA624_Project1_ATM2.xlsx",
sheetName="ATM2", col.names = T, row.names = T, append = F)
Next, let's look at ATM3.
str(ATM3_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 365 obs. of 3 variables:
## $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
## $ ATM : chr "ATM3" "ATM3" "ATM3" "ATM3" ...
## $ Cash: num 0 0 0 0 0 0 0 0 0 0 ...
summary(ATM3_data)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 0.0000
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 0.0000
## Median :2009-10-30 Mode :character Median : 0.0000
## Mean :2009-10-30 Mean : 0.7206
## 3rd Qu.:2010-01-29 3rd Qu.: 0.0000
## Max. :2010-04-30 Max. :96.0000
Let's convert ATM3 to a time series.
ATM3_ts <- ts(ATM3_data$Cash, start=c(2009,5,1), frequency=365)
autoplot(ATM3_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
ggtitle("ATM3 time series")
Based on the available data, we do not have varied enough observations to forecast: ATM3 records zero cash withdrawals for almost the entire year, with non-zero values appearing only near the end of the series. I would first ask the business whether this is a newly installed ATM and would want to wait a couple of months to get an accurate picture of its usage. Given the near-zero variance of this series compared to the others, I will not forecast it.
If the business insists on a forecast, we could infer one from the other ATMs or build a simple benchmark from the few non-zero observations, as sketched below.
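A minimal sketch of such a benchmark (an addition; it assumes the few non-zero observations at the end of the series are the only usable history):
# Sketch: mean forecast from the few non-zero ATM3 observations
ATM3_nonzero <- ATM3_data$Cash[!is.na(ATM3_data$Cash) & ATM3_data$Cash > 0]
forecast::meanf(ts(ATM3_nonzero), h = 31)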
Finally, let's analyze ATM4.
str(ATM4_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 365 obs. of 3 variables:
## $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
## $ ATM : chr "ATM4" "ATM4" "ATM4" "ATM4" ...
## $ Cash: num 777 524.4 792.8 908.2 52.8 ...
summary(ATM4_data)
## DATE ATM Cash
## Min. :2009-05-01 Length:365 Min. : 1.563
## 1st Qu.:2009-07-31 Class :character 1st Qu.: 124.334
## Median :2009-10-30 Mode :character Median : 403.839
## Mean :2009-10-30 Mean : 474.043
## 3rd Qu.:2010-01-29 3rd Qu.: 704.507
## Max. :2010-04-30 Max. :10919.762
Let's convert ATM4 to a time series.
ATM4_ts <- ts(ATM4_data$Cash, start=c(2009,5,1), frequency=365)
autoplot(ATM4_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
ggtitle("ATM4 time series")
We notice an outlier of 10919.762 on February 9, 2010; we treat it as a missing value and remove it from the dataset so it can be imputed.
I decided to remove this record because I would first need an explanation from the business about whether it was recorded in error before deciding how else to handle it.
ATM4_ts[285] <- NA
ATM4_ts<- na.interpolation(ATM4_ts)
autoplot(ATM4_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
ggtitle("ATM4 time series")
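An alternative to manually setting index 285 to NA (a sketch, not what was done above) is to let forecast::tsoutliers() flag the suspect observation and tsclean() replace it together with any missing values:
# Sketch: automatic outlier detection/replacement on the raw ATM4 series
raw4 <- ts(ATM4_data$Cash, frequency = 365)
forecast::tsoutliers(raw4)               # indices and suggested replacements
ATM4_alt <- forecast::tsclean(raw4)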
Check whether the time series needs a transformation using the Box-Cox procedure.
(lambda2 <- forecast::BoxCox.lambda(ATM4_ts))
## [1] 1
The estimated lambda of 1 indicates that no transformation is needed.
From the time plot there appears to be some trend and seasonality; the plots below explore this further.
ggseasonplot(ATM4_ts )
#ggsubseriesplot(ATM4_ts)
gglagplot(ATM4_ts)
ggAcf(ATM4_ts)
Let's identify the seasonality with a Fourier transform (periodogram).
#reference https://anomaly.io/detect-seasonality-using-fourier-transform-r/
p = periodogram(ATM4_ts)
dd = data.frame(freq=p$freq, spec=p$spec)
order = dd[order(-dd$spec),]
top2 = head(order, 2)
# display the 2 highest "power" frequencies
top2
## freq spec
## 54 0.1440000 2513218
## 161 0.4293333 1243196
# convert frequency to time periods
time = 1/top2$f
time
## [1] 6.944444 2.329193
The periodogram shows the "power" at each possible frequency, and we can clearly see spikes at about 0.144 and 0.429 cycles per day.
These correspond to periods of roughly 7 and 2.3 days, so the main seasonality detected is weekly, with a weaker secondary cycle of about 2.3 days.
Let's plot the trend using a 4-day moving average.
autoplot(ATM4_ts, series="Data") +
autolayer(ma(ATM4_ts,4), series="4-MA") +
ggtitle("ATM4 4 day moving average trend")
Cross-validation of the predictions: split the data into training and test sets.
ATM4_ts.train <- window(ATM4_ts, end=c(2009,340))
ATM4_ts.test <- window(ATM4_ts, start=c(2009,341))
Check that the data have been split appropriately by producing the following plot.
autoplot(ATM4_ts) +
autolayer(ATM4_ts.train, series="Training") +
autolayer(ATM4_ts.test, series="Test")
Based on our analysis this time series shows some trend and seasonality, so we will apply two levels of differencing: one seasonal and one for the trend. Let's first take a seasonally differenced series.
ATM4_ts.train %>% diff(lag=7) %>% ggtsdisplay()
The seasonal differencing works reasonably well; however, the ACF still shows a slight pattern due to trend, so let's also take a first difference.
ATM4_ts.train %>% diff(lag=7) %>%diff() %>% ggtsdisplay()
The significant spike at lag 1 in the ACF suggests a non-seasonal MA(1) component, and the significant spike at lag 7 suggests a seasonal MA(1) component.
Based on this information we compare the results of auto.arima with manually selected models.
fc<-auto.arima(ATM4_ts.train, seasonal=FALSE,stepwise=FALSE, approximation=FALSE)
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 126.25, df = 65.2, p-value = 8.798e-06
##
## Model df: 2. Total lags used: 67.2
We can see that this auto.arima result is not good: the ACF of the residuals has many lags well above the critical bounds.
Now, using the information and plots from the previous step, we test several combinations of the p, d, q values.
ATM4_ts.train%>%
Arima(order=c(2,1,7)) %>%
residuals() %>% ggtsdisplay()
fit3 <- Arima(ATM4_ts.train, order=c(7,1,7))
fit3
## Series: ATM4_ts.train
## ARIMA(7,1,7)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7
## -0.1591 -0.4079 -1.0015 -0.4732 -0.1316 -0.8005 0.1357
## s.e. 0.0818 0.0730 0.0728 0.0709 0.0761 0.0754 0.0615
## ma1 ma2 ma3 ma4 ma5 ma6 ma7
## -0.7585 0.2092 0.6711 -0.5746 -0.3688 0.7247 -0.9023
## s.e. 0.1134 0.1652 0.1372 0.0828 0.1613 0.1783 0.1149
##
## sigma^2 estimated as 116249: log likelihood=-2427.8
## AIC=4885.6 AICc=4887.11 BIC=4942.82
checkresiduals(fit3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(7,1,7)
## Q* = 74.895, df = 53.2, p-value = 0.0266
##
## Model df: 14. Total lags used: 67.2
#AICc = 4887.11
fit4 <- Arima(ATM4_ts.train, order=c(5,1,7))
fit4
## Series: ATM4_ts.train
## ARIMA(5,1,7)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## -0.6781 -0.2238 -0.7025 -0.5011 0.3567 -0.2320 -0.4394 0.4896
## s.e. 0.3652 0.3732 0.2229 0.3694 0.2695 0.3738 0.0521 0.1481
## ma4 ma5 ma6 ma7
## -0.2829 -0.9059 0.3073 0.0633
## s.e. 0.1411 NaN 0.3001 0.0593
##
## sigma^2 estimated as 120970: log likelihood=-2434.69
## AIC=4895.38 AICc=4896.51 BIC=4944.96
checkresiduals(fit4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,1,7)
## Q* = 90.966, df = 55.2, p-value = 0.001734
##
## Model df: 12. Total lags used: 67.2
#AICc=4896.51
fit5 <- Arima(ATM4_ts.train, order=c(2,1,7))
fit5
## Series: ATM4_ts.train
## ARIMA(2,1,7)
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4 ma5 ma6
## -0.5814 -0.8722 -0.3175 0.2897 -0.7887 -0.2148 0.0175 -0.1577
## s.e. 0.1199 0.0981 0.1270 0.0899 0.1083 0.0692 0.0582 0.0578
## ma7
## 0.1715
## s.e. 0.0541
##
## sigma^2 estimated as 123998: log likelihood=-2438.27
## AIC=4896.55 AICc=4897.23 BIC=4934.69
checkresiduals(fit5)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,7)
## Q* = 91.519, df = 58.2, p-value = 0.003465
##
## Model df: 9. Total lags used: 67.2
#AICc=4897.23
From these results we can see that ARIMA(7,1,7) has the lowest AICc, approximately normal residuals, and an ACF within the critical bounds.
Now let's compare the accuracy of the forecasts against the actual values stored in ATM4_ts.test.
p1<-predict(fit4,29)
accuracy(p1$pred,ATM4_ts.test)
## ME RMSE MAE MPE MAPE ACF1
## Test set -52.15396 299.2789 258.7339 -1196.058 1223.817 0.07336747
## Theil's U
## Test set 0.3429161
p2<-predict(fit3,29)
accuracy(p2$pred,ATM4_ts.test)
## ME RMSE MAE MPE MAPE ACF1
## Test set -53.81601 291.5918 243.9763 -1097.047 1122.035 0.02337883
## Theil's U
## Test set 0.1408487
p3<-predict(fit5,29)
accuracy(p3$pred,ATM4_ts.test)
## ME RMSE MAE MPE MAPE ACF1
## Test set -53.83352 293.353 250.0366 -1141.166 1167.065 -0.01144309
## Theil's U
## Test set 0.1288629
Based on the RMSE values we can see that fit3, which corresponds to ARIMA(7,1,7), performs best on the test set.
I will pick ARIMA(7,1,7) because, as before, testing a forecast against actual held-out data is more trustworthy than an in-sample metric like AICc, and here it also has the lowest AICc.
fit3 %>% forecast(h=29) %>% autoplot(include=80)
Let's plot the forecast against the test set.
fcast <- forecast::forecast(fit3, h=29)
autoplot(fcast, series="Data") +
autolayer(ATM4_ts.test, series="test_set") +
ggtitle("ATM4 train-test prediction ")
fit3 <- Arima(ATM4_ts, order=c(7,1,7))
fcast1 <- forecast::forecast(forecast::Arima(ATM4_ts, model=fit3), h=30)
xlsx::write.xlsx(fcast1, file="C:/Users/Mezu/Documents/Data624/DATA624_Project1_ATM4.xlsx",
sheetName="ATM4", col.names = T, row.names = T, append = F)
Part B - Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and provide a monthly forecast for 2014. The data is given in a single file. The variable 'KWH' is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.
Let's load the data into an R data frame and perform some preliminary exploration.
Power_data <- readxl::read_excel("C:/Users/Mezu/Documents/Data624/ResidentialCustomerForecastLoad-624.xlsx")
head(Power_data)
## # A tibble: 6 x 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
str(Power_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 192 obs. of 3 variables:
## $ CaseSequence: num 733 734 735 736 737 738 739 740 741 742 ...
## $ YYYY-MMM : chr "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
## $ KWH : num 6862583 5838198 5420658 5010364 4665377 ...
summary(Power_data)
## CaseSequence YYYY-MMM KWH
## Min. :733.0 Length:192 Min. : 770523
## 1st Qu.:780.8 Class :character 1st Qu.: 5429912
## Median :828.5 Mode :character Median : 6283324
## Mean :828.5 Mean : 6502475
## 3rd Qu.:876.2 3rd Qu.: 7620524
## Max. :924.0 Max. :10655730
## NA's :1
Let's convert the data to a monthly time series.
Power_data_ts <- ts(Power_data$KWH, start=c(1998,1), frequency=12)
autoplot(Power_data_ts )+xlab("months") + ylab("Power consumption in kilowatt hours") +
ggtitle("Power_data time series")
Since we noticed 1 missing value, let's impute it.
Power_data_ts<- na.interpolation(Power_data_ts)
autoplot(Power_data_ts)+xlab("months") + ylab("Power consumption in kilowatt hours") +
ggtitle("Power_data time series")
Check whether the time series needs a transformation using the Box-Cox procedure.
(lambda1 <- forecast::BoxCox.lambda(Power_data_ts))
## [1] 0.1074118
The estimated lambda of about 0.11 is close to zero, so this dataset can benefit from a transformation; I will use the log transformation.
autoplot(log(Power_data_ts))+xlab("months") + ylab("Log of power consumption in kilowatt hours") +
ggtitle("Log of Power_data time series")
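If we preferred to use the estimated lambda directly instead of the log view (a sketch; Power_bc is an addition):
# Sketch: Box-Cox with the estimated lambda (~0.11); lambda = 0 is exactly the log
Power_bc <- forecast::BoxCox(Power_data_ts, lambda1)
autoplot(Power_bc) + ggtitle("Box-Cox transformed power consumption")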
From this plot we can see there is some seasonality but no clear trend. Also there is a huge outlier made prominent by the log transformation.
ggseasonplot(Power_data_ts )
ggsubseriesplot(Power_data_ts)
gglagplot(Power_data_ts)
ggAcf(Power_data_ts)
Let's identify the seasonality with a Fourier transform (periodogram).
#reference https://anomaly.io/detect-seasonality-using-fourier-transform-r/
p = periodogram(Power_data_ts)
dd = data.frame(freq=p$freq, spec=p$spec)
order = dd[order(-dd$spec),]
top2 = head(order, 2)
# display the 2 highest "power" frequencies
top2
## freq spec
## 32 0.16666667 1.923770e+14
## 16 0.08333333 2.113811e+13
# convert frequency to time periods
time = 1/top2$f
time
## [1] 6 12
The strongest periodogram peak corresponds to a period of 6 months, with a secondary peak at 12 months; the 6-month peak is the first harmonic of the annual cycle, so the series has yearly seasonality with a strong half-year component.
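Because the 6-month peak is the first harmonic of the annual cycle, both peaks could also be captured with Fourier terms in a harmonic regression with ARMA errors (a sketch of an alternative approach, not used below; fit_fourier and K = 2 are my own choices):
# Sketch: harmonic regression with K = 2 Fourier pairs and ARMA errors
fit_fourier <- auto.arima(Power_data_ts, xreg = fourier(Power_data_ts, K = 2),
                          seasonal = FALSE)
forecast::forecast(fit_fourier, xreg = fourier(Power_data_ts, K = 2, h = 12))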
Let's plot the trend using a 6-month moving average.
autoplot(Power_data_ts, series="Data") +
autolayer(ma(Power_data_ts,6), series="6-MA") +
ggtitle("Power_data_ts 6MA moving average trend")
Using simple exponential smoothing to forecast and check the RMSE.
fc <- ses(Power_data_ts, h=30)
autoplot(fc) +
autolayer(fitted(fc), series="Fitted") +
ylab("Power in KWH") + xlab("months")
summary(fc)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = Power_data_ts, h = 30)
##
## Smoothing parameters:
## alpha = 0.0408
##
## Initial states:
## l = 6512199.706
##
## sigma: 1423653
##
## AIC AICc BIC
## 6454.223 6454.351 6463.996
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 90773.09 1416218 1178550 -6.212522 21.7282 1.691596 0.3784715
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2014 7222913 5398429 9047398 4432605 10013221
## Feb 2014 7222913 5396913 9048914 4430286 10015540
## Mar 2014 7222913 5395397 9050429 4427969 10017858
## Apr 2014 7222913 5393884 9051943 4425654 10020173
## May 2014 7222913 5392371 9053456 4423340 10022486
## Jun 2014 7222913 5390860 9054967 4421029 10024798
## Jul 2014 7222913 5389350 9056477 4418720 10027107
## Aug 2014 7222913 5387841 9057986 4416412 10029415
## Sep 2014 7222913 5386333 9059494 4414106 10031720
## Oct 2014 7222913 5384827 9061000 4411802 10034024
## Nov 2014 7222913 5383322 9062505 4409501 10036326
## Dec 2014 7222913 5381818 9064009 4407200 10038626
## Jan 2015 7222913 5380315 9065512 4404902 10040924
## Feb 2015 7222913 5378813 9067013 4402606 10043221
## Mar 2015 7222913 5377313 9068513 4400312 10045515
## Apr 2015 7222913 5375814 9070012 4398019 10047808
## May 2015 7222913 5374316 9071510 4395728 10050098
## Jun 2015 7222913 5372820 9073007 4393439 10052387
## Jul 2015 7222913 5371324 9074502 4391152 10054674
## Aug 2015 7222913 5369830 9075996 4388867 10056959
## Sep 2015 7222913 5368337 9077489 4386584 10059243
## Oct 2015 7222913 5366845 9078981 4384303 10061524
## Nov 2015 7222913 5365355 9080472 4382023 10063804
## Dec 2015 7222913 5363866 9081961 4379745 10066082
## Jan 2016 7222913 5362377 9083449 4377469 10068358
## Feb 2016 7222913 5360890 9084936 4375195 10070632
## Mar 2016 7222913 5359405 9086422 4372923 10072904
## Apr 2016 7222913 5357920 9087907 4370652 10075175
## May 2016 7222913 5356436 9089390 4368383 10077443
## Jun 2016 7222913 5354954 9090872 4366116 10079710
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from Simple exponential smoothing
## Q* = 620.2, df = 22, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 24
The ACF plot and the residual diagnostics show that the data are clearly seasonal and the residuals are far from white noise, so these predictions are unlikely to be accurate.
Let’s try auto ETS
fc <- ets(Power_data_ts)
autoplot(fc) +
autolayer(fitted(fc), series="Fitted") +
ylab("Power in KWH") + xlab("months")
summary(fc)
## ETS(M,N,M)
##
## Call:
## ets(y = Power_data_ts)
##
## Smoothing parameters:
## alpha = 0.1137
## gamma = 1e-04
##
## Initial states:
## l = 6134772.7867
## s = 0.9547 0.7533 0.8603 1.1912 1.2556 1.1992
## 0.9982 0.7684 0.8161 0.9175 1.0618 1.2237
##
## sigma: 0.1199
##
## AIC AICc BIC
## 6224.057 6226.784 6272.919
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 61009.73 835107 503972.9 -4.39013 12.04006 0.7233624
## ACF1
## Training set 0.1698584
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,M)
## Q* = 28.615, df = 10, p-value = 0.001438
##
## Model df: 14. Total lags used: 24
The ETS(M,N,M) model is a clear improvement over SES (much lower AICc and RMSE), although the Ljung-Box test still indicates some remaining autocorrelation in the residuals.
Cross-validation of the predictions: split the data into training and test sets.
Power_data_ts.train <- window(Power_data_ts, end=c(2012,12))
Power_data_ts.test <- window(Power_data_ts, start=c(2013,1))
Check that the data have been split appropriately by producing the following plot.
autoplot(Power_data_ts) +
autolayer(Power_data_ts.train, series="Training") +
autolayer(Power_data_ts.test, series="Test")
Calculate forecasts using ETS applied to Power_data_ts.train.
fit_train <- ets(Power_data_ts.train)
Compare the accuracy of the forecasts against the actual values stored in Power_data_ts.test.
p<-predict(fit_train,12)
accuracy(p$mean,Power_data_ts.test)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 398169 1057623 692843.6 4.135402 8.307641 0.08496991 0.6097567
Check the residuals.
checkresiduals(fit_train)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,M)
## Q* = 30.885, df = 10, p-value = 0.0006129
##
## Model df: 14. Total lags used: 24
We can see the ETS results are decent, though not great.
Based on our analysis this time series shows some trend and seasonality, so we will apply two levels of differencing: one seasonal and one for the trend. Let's first take a seasonally differenced series.
Power_data_ts.train %>% diff(lag=12) %>% ggtsdisplay()
The seasonal differencing works reasonably well; however, the ACF still shows a slight pattern due to trend, so let's also take a first difference.
Power_data_ts.train %>% diff(lag=12)%>% diff() %>% ggtsdisplay()
The significant spike at lag 1 in the ACF suggests a non-seasonal MA(1) component, and the significant spike at lag 12 suggests a seasonal MA(1) component.
Based on this information we compare the results of auto.arima with manually selected models.
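As a quick check on these differencing choices, forecast's unit-root helpers can be consulted (a sketch; an addition to the original analysis):
# Sketch: how many seasonal and then non-seasonal differences the tests suggest
forecast::nsdiffs(Power_data_ts.train)
forecast::ndiffs(diff(Power_data_ts.train, lag = 12))
We now fit auto.arima and the manually chosen candidates.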
fc<-auto.arima(Power_data_ts.train, seasonal=TRUE,stepwise=FALSE, approximation=FALSE)
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(2,1,1)[12] with drift
## Q* = 12.219, df = 19, p-value = 0.876
##
## Model df: 5. Total lags used: 24
summary(fc)
## Series: Power_data_ts.train
## ARIMA(1,0,0)(2,1,1)[12] with drift
##
## Coefficients:
## ar1 sar1 sar2 sma1 drift
## 0.2371 -0.6340 -0.4944 -0.2762 6795.367
## s.e. 0.0767 0.1804 0.1400 0.1995 2572.161
##
## sigma^2 estimated as 6.862e+11: log likelihood=-2532.59
## AIC=5077.18 AICc=5077.7 BIC=5095.93
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -9927.261 788292 482331.7 -5.171484 11.61867 0.6868009
## ACF1
## Training set -0.01545698
#AICc=5077.7
We can see that this auto.arima result is good: the ACF of the residuals stays well within the critical bounds.
Now, using the information and plots from the previous step, we test several combinations of the p, d, q values.
Power_data_ts.train%>%
Arima(order=c(2,0,0), seasonal = c(2,1,1)) %>%
residuals() %>% ggtsdisplay()
fit3 <- Arima(Power_data_ts.train, order=c(2,0,0), seasonal = c(2,1,1))
fit3
## Series: Power_data_ts.train
## ARIMA(2,0,0)(2,1,1)[12]
##
## Coefficients:
## ar1 ar2 sar1 sar2 sma1
## 0.2460 0.1170 -0.6588 -0.5353 -0.2192
## s.e. 0.0786 0.0797 0.1579 0.1226 0.1774
##
## sigma^2 estimated as 7.021e+11: log likelihood=-2534.64
## AIC=5081.29 AICc=5081.81 BIC=5100.03
checkresiduals(fit3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,0)(2,1,1)[12]
## Q* = 12.967, df = 19, p-value = 0.8403
##
## Model df: 5. Total lags used: 24
#AICc =5081.81
fit4 <- Arima(Power_data_ts.train, order=c(2,0,2), seasonal = c(2,1,0))
fit4
## Series: Power_data_ts.train
## ARIMA(2,0,2)(2,1,0)[12]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sar1 sar2
## -0.0259 0.6791 0.2401 -0.5650 -0.8357 -0.6334
## s.e. 0.3361 0.2108 0.3533 0.2052 0.0720 0.0774
##
## sigma^2 estimated as 6.974e+11: log likelihood=-2534.15
## AIC=5082.29 AICc=5082.99 BIC=5104.16
checkresiduals(fit4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(2,1,0)[12]
## Q* = 10.897, df = 18, p-value = 0.8987
##
## Model df: 6. Total lags used: 24
##AICc = 5082.99
fit5 <- Arima(Power_data_ts.train, order=c(2,0,0), seasonal = c(1,1,1))
fit5
## Series: Power_data_ts.train
## ARIMA(2,0,0)(1,1,1)[12]
##
## Coefficients:
## ar1 ar2 sar1 sma1
## 0.3032 0.0603 -0.1262 -0.7284
## s.e. 0.0778 0.0785 0.0977 0.0808
##
## sigma^2 estimated as 7.569e+11: log likelihood=-2539.71
## AIC=5089.43 AICc=5089.8 BIC=5105.05
checkresiduals(fit5)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,0)(1,1,1)[12]
## Q* = 13.369, df = 20, p-value = 0.861
##
## Model df: 4. Total lags used: 24
##AICc = 5089.8
fc <- Arima(Power_data_ts.train, order=c(1,0,0), seasonal = c(2,1,1))
fc
## Series: Power_data_ts.train
## ARIMA(1,0,0)(2,1,1)[12]
##
## Coefficients:
## ar1 sar1 sar2 sma1
## 0.2793 -0.6492 -0.5000 -0.2200
## s.e. 0.0764 0.1707 0.1295 0.1876
##
## sigma^2 estimated as 7.12e+11: log likelihood=-2535.71
## AIC=5081.42 AICc=5081.79 BIC=5097.04
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(2,1,1)[12]
## Q* = 12.972, df = 20, p-value = 0.8786
##
## Model df: 4. Total lags used: 24
From these results we can see that ARIMA(1,0,0)(2,1,1)[12] with drift has the lowest AICc, approximately normal residuals, and an ACF within the critical bounds.
Now let's compare the accuracy of the forecasts against the actual values stored in Power_data_ts.test.
p1<-predict(fit4,12)
accuracy(p1$pred,Power_data_ts.test)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 860659.1 1660684 1080721 10.03257 13.0932 0.02039943 0.9068145
p2<-predict(fit3,12)
accuracy(p2$pred,Power_data_ts.test)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 958430.8 1625314 1086974 11.33535 13.12316 0.05164515 0.8920487
p3<-predict(fit5,12)
accuracy(p3$pred,Power_data_ts.test)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 817125.4 1178989 838726.2 9.835089 10.11628 -0.0380971 0.654609
p4<-predict(fc,12)
accuracy(p4$pred,Power_data_ts.test)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 915864.1 1559245 1048962 10.79004 12.631 0.03206625 0.8524698
Based on the RMSE values we can see that fit5, which corresponds to ARIMA(2,0,0)(1,1,1)[12], performs best on the test set, so it is the model I will use for the 2014 forecast.
fit5 %>% forecast(h=12) %>% autoplot(include=80)
Let's plot the forecast against the test set.
fcast <- forecast::forecast(fit5, h=12)
autoplot(fcast, series="Data") +
autolayer(Power_data_ts.test, series="test_set") +
ggtitle("Power_data_ts train-test prediction")
fit5 <- Arima(Power_data_ts, order=c(2,0,0), seasonal = c(1,1,1))
fcast1 <- forecast::forecast(forecast::Arima(Power_data_ts, model=fit5), h=12)
xlsx::write.xlsx(fcast1, file="C:/Users/Mezu/Documents/Data624/DATA624_Project1_Power.xlsx",
sheetName="Power", col.names = T, row.names = T, append = F)
Part C - BONUS, optional (part or all), Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx
Part C consists of two datasets. These are simple two-column sets; however, they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (an example of what this looks like follows). Note: for multiple recordings within an hour, take the mean. Then determine whether the data are stationary and can be forecast. If so, provide a week-forward forecast and present the results via RPubs and the .rmd, with the forecast in an Excel-readable file.
Let's load the data into an R data frame and perform some preliminary exploration.
Waterflow_Pipe1_data <- readxl::read_excel("C:/Users/Mezu/Documents/Data624/Waterflow_Pipe1.xlsx")
head(Waterflow_Pipe1_data)
## # A tibble: 6 x 2
## DateTime WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 00:24:06 23.4
## 2 2015-10-23 00:40:02 28.0
## 3 2015-10-23 00:53:51 23.1
## 4 2015-10-23 00:55:40 30.0
## 5 2015-10-23 01:19:17 6.00
## 6 2015-10-23 01:23:58 15.9
str(Waterflow_Pipe1_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 2 variables:
## $ DateTime : POSIXct, format: "2015-10-23 00:24:06" "2015-10-23 00:40:02" ...
## $ WaterFlow: num 23.4 28 23.1 30 6 ...
Waterflow_Pipe2_data <- readxl::read_excel("C:/Users/Mezu/Documents/Data624/Waterflow_Pipe2.xlsx")
head(Waterflow_Pipe2_data)
## # A tibble: 6 x 2
## DateTime WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 01:00:00 18.8
## 2 2015-10-23 02:00:00 43.1
## 3 2015-10-23 03:00:00 38.0
## 4 2015-10-23 04:00:00 36.1
## 5 2015-10-23 05:00:00 31.9
## 6 2015-10-23 06:00:00 28.2
str(Waterflow_Pipe2_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 2 variables:
## $ DateTime : POSIXct, format: "2015-10-23 01:00:00" "2015-10-23 02:00:00" ...
## $ WaterFlow: num 18.8 43.1 38 36.1 31.9 ...
summary(Waterflow_Pipe2_data)
## DateTime WaterFlow
## Min. :2015-10-23 01:00:00 Min. : 1.885
## 1st Qu.:2015-11-02 10:45:00 1st Qu.:28.140
## Median :2015-11-12 20:30:00 Median :39.682
## Mean :2015-11-12 20:30:00 Mean :39.556
## 3rd Qu.:2015-11-23 06:15:00 3rd Qu.:50.782
## Max. :2015-12-03 16:00:00 Max. :78.303
summary(Waterflow_Pipe1_data)
## DateTime WaterFlow
## Min. :2015-10-23 00:24:06 Min. : 1.067
## 1st Qu.:2015-10-25 11:21:35 1st Qu.:13.683
## Median :2015-10-27 20:07:30 Median :19.880
## Mean :2015-10-27 20:49:15 Mean :19.897
## 3rd Qu.:2015-10-30 08:24:51 3rd Qu.:26.159
## Max. :2015-11-01 23:35:43 Max. :38.913
if (!require("xts")) install.packages("xts")
## Loading required package: xts
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(xts)
xts::periodicity(unique(Waterflow_Pipe1_data$`DateTime`))
## 10.0198883334796 minute periodicity from 2015-10-23 00:24:06 to 2015-11-01 23:35:43
xts::periodicity(unique(Waterflow_Pipe2_data$`DateTime`))
## Hourly periodicity from 2015-10-23 01:00:00 to 2015-12-03 16:00:00
These time series span October 23, 2015 to November 1, 2015 for Pipe 1 and to December 3, 2015 for Pipe 2, and contain water flow measurements for the two pipes. The datasets do not contain any missing values or apparent outliers.
Pipe1 <- xts::xts(Waterflow_Pipe1_data$WaterFlow, order.by=Waterflow_Pipe2_data$DateTime)
Pipe1h <- xts::period.apply(Pipe1, xts::endpoints(Pipe1, "hours"), mean)
xts::periodicity(Pipe1h)
## Hourly periodicity from 2015-10-23 01:00:00 to 2015-12-03 16:00:00
The intention here is to group Pipe 1's roughly ten-minute readings into hourly periods and apply mean() to each. Note, however, that the chunk above orders Pipe 1's values by Pipe 2's hourly timestamps rather than Pipe 1's own, which is why Pipe1h shows Pipe 2's date range.
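A sketch of that aggregation using Pipe 1's own timestamps (an addition; the rest of the section keeps the merged series as computed above):
# Sketch: index Pipe 1 by its own DateTime column, then take hourly means
Pipe1_own <- xts::xts(Waterflow_Pipe1_data$WaterFlow,
                      order.by = Waterflow_Pipe1_data$DateTime)
Pipe1h_own <- xts::period.apply(Pipe1_own, xts::endpoints(Pipe1_own, "hours"), mean)
xts::periodicity(Pipe1h_own)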
Pipe2 <- xts::xts(Waterflow_Pipe2_data$WaterFlow, order.by=Waterflow_Pipe2_data$DateTime)
Pipes <- merge(merge(Pipe1, Pipe2, join='inner'), Pipe1+Pipe2, join='inner')
head(Pipes)
## Pipe1 Pipe2 Pipe1...Pipe2
## 2015-10-23 01:00:00 23.369599 18.81079 42.18039
## 2015-10-23 02:00:00 28.002881 43.08703 71.08991
## 2015-10-23 03:00:00 23.065895 37.98770 61.05360
## 2015-10-23 04:00:00 29.972809 36.12038 66.09319
## 2015-10-23 05:00:00 5.997953 31.85126 37.84921
## 2015-10-23 06:00:00 15.935223 28.23809 44.17331
acf(Pipe1, ylab="ACF Pipe 1", main="")
pacf(Pipe1, ylab="PACF Pipe 1", main="")
acf(Pipe1h, ylab="ACF Pipe 1 Hourly", main="")
pacf(Pipe1h, ylab="PACF Pipe 1 Hourly", main="")
acf(Pipe2, ylab="ACF Pipe 2", main="")
pacf(Pipe2, ylab="PACF Pipe 2", main="")
Autoplot of the Pipe 1 hourly series.
autoplot(Pipe1h)
Autoplot of the Pipe 2 series.
autoplot(Pipe2)
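To complete the Part C task, a minimal sketch (an addition; it assumes an hourly frequency of 24 and uses the Pipe 2 series only) of checking stationarity and producing the requested one-week (168-hour) forecast:
# Sketch: stationarity check and a 168-hour-ahead forecast for Pipe 2
Pipe2_ts <- ts(as.numeric(Pipe2), frequency = 24)
forecast::ndiffs(Pipe2_ts)               # differences suggested by a unit-root test
fit_p2 <- auto.arima(Pipe2_ts)
fc_p2 <- forecast::forecast(fit_p2, h = 7 * 24)
autoplot(fc_p2)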