if (!require("xlsx")) install.packages("xlsx")
## Loading required package: xlsx
library(xlsx)
library(GGally)
## Loading required package: ggplot2
library(fpp2)
## Loading required package: forecast
## Loading required package: fma
## 
## Attaching package: 'fma'
## The following object is masked from 'package:GGally':
## 
##     pigs
## Loading required package: expsmooth
library(seasonal)
library(readxl)
library(forecast)
library("zoo")
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
## 
##     nasa
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(imputeTS)
## 
## Attaching package: 'imputeTS'
## The following object is masked from 'package:zoo':
## 
##     na.locf
library(TSA)
## 
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## The following object is masked from 'package:utils':
## 
##     tar

Part A - ATM Forecast, ATM624Data.xlsx

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast, which you will put in an Excel readable file.

Solution

Let's load the data into an R data frame and perform some preliminary exploration.

ATM_data <-  readxl::read_excel("C:/Users/Mezu/Documents/Data624/ATM624Data.xlsx")
head(ATM_data)
## # A tibble: 6 x 3
##    DATE ATM    Cash
##   <dbl> <chr> <dbl>
## 1 39934 ATM1   96.0
## 2 39934 ATM2  107  
## 3 39935 ATM1   82.0
## 4 39935 ATM2   89.0
## 5 39936 ATM1   85.0
## 6 39936 ATM2   90.0
str(ATM_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1474 obs. of  3 variables:
##  $ DATE: num  39934 39934 39935 39935 39936 ...
##  $ ATM : chr  "ATM1" "ATM2" "ATM1" "ATM2" ...
##  $ Cash: num  96 107 82 89 85 90 90 55 99 79 ...
#transform the numeric Excel date serials to Date (Excel's origin is 1899-12-30)
ATM_data$DATE<- as.Date(ATM_data$DATE,origin = "1899-12-30")

head(ATM_data)
## # A tibble: 6 x 3
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-05-01 ATM1   96.0
## 2 2009-05-01 ATM2  107  
## 3 2009-05-02 ATM1   82.0
## 4 2009-05-02 ATM2   89.0
## 5 2009-05-03 ATM1   85.0
## 6 2009-05-03 ATM2   90.0
summary(ATM_data)
##       DATE                ATM                 Cash        
##  Min.   :2009-05-01   Length:1474        Min.   :    0.0  
##  1st Qu.:2009-08-01   Class :character   1st Qu.:    0.5  
##  Median :2009-11-01   Mode  :character   Median :   73.0  
##  Mean   :2009-10-31                      Mean   :  155.6  
##  3rd Qu.:2010-02-01                      3rd Qu.:  114.0  
##  Max.   :2010-05-14                      Max.   :10919.8  
##                                          NA's   :19

Filter the data into 4 separate datasets, one for each ATM machine.

ATM1_data <- filter(ATM_data, ATM == "ATM1")
head(ATM1_data)
## # A tibble: 6 x 3
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-05-01 ATM1   96.0
## 2 2009-05-02 ATM1   82.0
## 3 2009-05-03 ATM1   85.0
## 4 2009-05-04 ATM1   90.0
## 5 2009-05-05 ATM1   99.0
## 6 2009-05-06 ATM1   88.0
str(ATM1_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    365 obs. of  3 variables:
##  $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
##  $ ATM : chr  "ATM1" "ATM1" "ATM1" "ATM1" ...
##  $ Cash: num  96 82 85 90 99 88 8 104 87 93 ...
summary(ATM1_data)
##       DATE                ATM                 Cash       
##  Min.   :2009-05-01   Length:365         Min.   :  1.00  
##  1st Qu.:2009-07-31   Class :character   1st Qu.: 73.00  
##  Median :2009-10-30   Mode  :character   Median : 91.00  
##  Mean   :2009-10-30                      Mean   : 83.89  
##  3rd Qu.:2010-01-29                      3rd Qu.:108.00  
##  Max.   :2010-04-30                      Max.   :180.00  
##                                          NA's   :3
ATM2_data <- filter(ATM_data, ATM == "ATM2")
head(ATM2_data)
## # A tibble: 6 x 3
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-05-01 ATM2  107  
## 2 2009-05-02 ATM2   89.0
## 3 2009-05-03 ATM2   90.0
## 4 2009-05-04 ATM2   55.0
## 5 2009-05-05 ATM2   79.0
## 6 2009-05-06 ATM2   19.0
str(ATM2_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    365 obs. of  3 variables:
##  $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
##  $ ATM : chr  "ATM2" "ATM2" "ATM2" "ATM2" ...
##  $ Cash: num  107 89 90 55 79 19 2 103 107 118 ...
summary(ATM2_data)
##       DATE                ATM                 Cash       
##  Min.   :2009-05-01   Length:365         Min.   :  0.00  
##  1st Qu.:2009-07-31   Class :character   1st Qu.: 25.50  
##  Median :2009-10-30   Mode  :character   Median : 67.00  
##  Mean   :2009-10-30                      Mean   : 62.58  
##  3rd Qu.:2010-01-29                      3rd Qu.: 93.00  
##  Max.   :2010-04-30                      Max.   :147.00  
##                                          NA's   :2
ATM3_data <- filter(ATM_data, ATM == "ATM3")
head(ATM3_data)
## # A tibble: 6 x 3
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-05-01 ATM3      0
## 2 2009-05-02 ATM3      0
## 3 2009-05-03 ATM3      0
## 4 2009-05-04 ATM3      0
## 5 2009-05-05 ATM3      0
## 6 2009-05-06 ATM3      0
str(ATM3_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    365 obs. of  3 variables:
##  $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
##  $ ATM : chr  "ATM3" "ATM3" "ATM3" "ATM3" ...
##  $ Cash: num  0 0 0 0 0 0 0 0 0 0 ...
summary(ATM3_data)
##       DATE                ATM                 Cash        
##  Min.   :2009-05-01   Length:365         Min.   : 0.0000  
##  1st Qu.:2009-07-31   Class :character   1st Qu.: 0.0000  
##  Median :2009-10-30   Mode  :character   Median : 0.0000  
##  Mean   :2009-10-30                      Mean   : 0.7206  
##  3rd Qu.:2010-01-29                      3rd Qu.: 0.0000  
##  Max.   :2010-04-30                      Max.   :96.0000
ATM4_data <- filter(ATM_data, ATM == "ATM4")
head(ATM4_data)
## # A tibble: 6 x 3
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-05-01 ATM4  777  
## 2 2009-05-02 ATM4  524  
## 3 2009-05-03 ATM4  793  
## 4 2009-05-04 ATM4  908  
## 5 2009-05-05 ATM4   52.8
## 6 2009-05-06 ATM4   52.2
str(ATM4_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    365 obs. of  3 variables:
##  $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
##  $ ATM : chr  "ATM4" "ATM4" "ATM4" "ATM4" ...
##  $ Cash: num  777 524.4 792.8 908.2 52.8 ...
summary(ATM4_data)
##       DATE                ATM                 Cash          
##  Min.   :2009-05-01   Length:365         Min.   :    1.563  
##  1st Qu.:2009-07-31   Class :character   1st Qu.:  124.334  
##  Median :2009-10-30   Mode  :character   Median :  403.839  
##  Mean   :2009-10-30                      Mean   :  474.043  
##  3rd Qu.:2010-01-29                      3rd Qu.:  704.507  
##  Max.   :2010-04-30                      Max.   :10919.762

We can see that each ATM has 365 daily observations. ATM1 has 3 missing Cash values and ATM2 has 2; the remaining 14 missing Cash values in the combined file belong to 14 extra rows dated in May 2010 that do not fall under any of the four ATM subsets.
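As a quick sketch (using the dplyr already loaded above), the missing values can be tallied per ATM before any imputation; the code below simply counts NAs by group and its output is not shown here.

#quick tally of missing Cash values per ATM (sketch only)
ATM_data %>%
  group_by(ATM) %>%
  summarise(n_obs = n(),
            n_missing = sum(is.na(Cash)))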

ATM 1

Let's convert the ATM1 data to a time series.

ATM1_ts <- ts(ATM1_data$Cash, start=c(2009,5,1), frequency=365)

autoplot(ATM1_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
    ggtitle("ATM1 time series")

Since we noticed 3 missing values, let's impute them.

 ATM1_ts<- na.interpolation(ATM1_ts)
autoplot(ATM1_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
    ggtitle("ATM1 time series")

Check whether the time series needs any transformation using the Box-Cox procedure.

(lambda1 <- forecast::BoxCox.lambda(ATM1_ts))
## [1] 1

The estimated lambda is 1, which indicates this series does not need any transformation.
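Nothing needs to be applied here, but for reference, had the estimated lambda been far from 1, the transformation could be applied and later inverted with the forecast package's BoxCox() and InvBoxCox(); a minimal sketch:

#only needed if BoxCox.lambda() had returned a value far from 1
ATM1_bc   <- forecast::BoxCox(ATM1_ts, lambda = lambda1)      #transform
ATM1_back <- forecast::InvBoxCox(ATM1_bc, lambda = lambda1)   #back-transform after modelling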

From the time series plot we can see some trend and seasonality; the seasonal, lag, and ACF plots below help confirm this.

ggseasonplot(ATM1_ts )

#ggsubseriesplot(ATM1_ts)
gglagplot(ATM1_ts)

ggAcf(ATM1_ts)

Let's find the seasonal period with a Fourier transform (periodogram).

#reference https://anomaly.io/detect-seasonality-using-fourier-transform-r/
p = periodogram(ATM1_ts)

dd = data.frame(freq=p$freq, spec=p$spec)
order = dd[order(-dd$spec),]
top2 = head(order, 2)
 
# display the 2 highest "power" frequencies
top2
##          freq      spec
## 54  0.1440000 103825.80
## 107 0.2853333  62918.41
# convert frequency to time periods
time = 1/top2$f
time
## [1] 6.944444 3.504673

The periodogram shows the “power” of each candidate frequency, and we can clearly see spikes at about 0.144 and 0.285 cycles per day.

These two frequencies correspond to periods of roughly 7 days and 3.5 days, so the main seasonality detected is weekly (7 days), with a secondary 3.5-day component.
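The same periodogram check is repeated for each series below, so it could be wrapped in a small helper; a sketch, reusing TSA::periodogram() as above (the function name top_periods is our own):

#helper: return the k strongest periods (in observations) from the periodogram
top_periods <- function(x, k = 2) {
  p  <- TSA::periodogram(x)
  dd <- data.frame(freq = p$freq, spec = p$spec)
  dd <- dd[order(-dd$spec), ][1:k, ]
  data.frame(freq = dd$freq, period = 1 / dd$freq, spec = dd$spec)
}
top_periods(ATM1_ts)   #should recover the ~7-day and ~3.5-day periods found above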

Let’s plot the trend using a 7 day moving average

autoplot(ATM1_ts, series="Data") +
  autolayer(ma(ATM1_ts,7), series="7-MA") +
   ggtitle("ATM1 7day moving average trend") 

Let's use simple exponential smoothing to forecast and check the RMSE.

fc <- ses(ATM1_ts, h=30)
autoplot(fc) +
  autolayer(fitted(fc), series="Fitted") +
  ylab("Cash in hundreds of dollars") + xlab("days")

summary(fc)
## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
##  ses(y = ATM1_ts, h = 30) 
## 
##   Smoothing parameters:
##     alpha = 1e-04 
## 
##   Initial states:
##     l = 84.1554 
## 
##   sigma:  36.6885
## 
##      AIC     AICc      BIC 
## 4787.255 4787.322 4798.955 
## 
## Error measures:
##                      ME     RMSE      MAE       MPE     MAPE MASE
## Training set -0.0309603 36.58785 27.26164 -175.3407 198.6339  NaN
##                    ACF1
## Training set 0.07062587
## 
## Forecasts:
##           Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2010.0110       84.15428 37.13608 131.1725 12.24614 156.0624
## 2010.0137       84.15428 37.13608 131.1725 12.24614 156.0624
## 2010.0164       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0192       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0219       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0247       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0274       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0301       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0329       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0356       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0384       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0411       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0438       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0466       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0493       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0521       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0548       84.15428 37.13607 131.1725 12.24614 156.0624
## 2010.0575       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0603       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0630       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0658       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0685       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0712       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0740       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0767       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0795       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0822       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0849       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0877       84.15428 37.13607 131.1725 12.24613 156.0624
## 2010.0904       84.15428 37.13607 131.1725 12.24613 156.0624
checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from Simple exponential smoothing
## Q* = 1397.8, df = 71, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 73

The ACF plot and the residuals show that the data are clearly seasonal and the errors are not normally distributed, so these predictions are unlikely to be accurate.

ETS Method

Let’s try auto ETS

fc <- ets(ATM1_ts)
autoplot(fc) +
  autolayer(fitted(fc), series="Fitted") +
  ylab("Cash in hundreds of dollars") + xlab("days")

summary(fc)
## ETS(M,N,N) 
## 
## Call:
##  ets(y = ATM1_ts) 
## 
##   Smoothing parameters:
##     alpha = 0.0326 
## 
##   Initial states:
##     l = 79.7942 
## 
##   sigma:  0.4352
## 
##      AIC     AICc      BIC 
## 4785.562 4785.628 4797.262 
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE     MAPE MASE
## Training set -0.1295602 36.75059 27.45335 -179.0177 202.7145  NaN
##                    ACF1
## Training set 0.04828029
checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,N)
## Q* = 1392.9, df = 71, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 73

We can see it does not give us a good model.

Cross-validation of the predictions: split the data into a training and a test set.

ATM1_ts.train <- window(ATM1_ts, end=c(2009,340))
ATM1_ts.test <- window(ATM1_ts, start=c(2009,341))

Check that your data have been split appropriately by producing the following plot.

autoplot(ATM1_ts) +
  autolayer(ATM1_ts.train, series="Training") +
  autolayer(ATM1_ts.test, series="Test")

Calculate forecasts using ETS applied to ATM1_ts.train.

fit_train <-  ets(ATM1_ts.train)

Compare the accuracy of the forecasts against the actual values stored in ATM1_ts.test.

 p<-predict(fit_train,29)
accuracy(p$mean,ATM1_ts.test)
##                 ME     RMSE     MAE       MPE     MAPE       ACF1
## Test set -6.632964 31.92371 19.9583 -315.8501 329.1089 -0.1610123
##          Theil's U
## Test set 0.1945403

check residuals

checkresiduals(fit_train)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,N)
## Q* = 1156.3, df = 65.2, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 67.2

We can see that the ETS results are quite poor.

ARIMA forecasting

Based on our analysis, this time series shows some trend and seasonality, so we will apply two levels of differencing: one for the seasonality and one for the trend. Let's first take a seasonally differenced series.

ATM1_ts.train %>% diff(lag=7) %>% ggtsdisplay()

The data now look reasonably well differenced; however, the ACF still shows a slight pattern due to trend, so let's difference again.

ATM1_ts.train %>% diff(lag=7)%>% diff() %>% ggtsdisplay()

The significant spike at lag 1 in the ACF suggests a non-seasonal MA(1) component, and the significant spike at lag 7 suggests a seasonal MA(1) component.
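The differencing chosen by eye can also be cross-checked with the unit-root helpers in the forecast package; a sketch, where the series is re-cast with frequency 7 (our assumption, based on the weekly seasonality found above) because nsdiffs() reads the seasonal period from the ts object:

#cross-check the visually chosen differencing with unit-root based tests
ATM1_weekly <- ts(as.numeric(ATM1_ts.train), frequency = 7)
forecast::nsdiffs(ATM1_weekly)                #suggested number of seasonal differences
forecast::ndiffs(diff(ATM1_weekly, lag = 7))  #ordinary differences after seasonal differencing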

Based on this information, we will compare the results of auto.arima with a manual model selection.

fc<-auto.arima(ATM1_ts.train, seasonal=FALSE,stepwise=FALSE, approximation=FALSE)
checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,0) with non-zero mean
## Q* = 854.19, df = 61.2, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 67.2

We can see that the result of this auto.arima is not good: the residual ACF has many lags well above the critical bounds.

Now, using the information and plots from the previous step, we test several combinations of the p, d, q values.

ATM1_ts.train%>%
  Arima(order=c(7,0,7)) %>%
  residuals() %>% ggtsdisplay()

fit3 <- Arima(ATM1_ts.train, order=c(7,1,7))
fit3
## Series: ATM1_ts.train 
## ARIMA(7,1,7) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5      ar6     ar7     ma1
##       -0.9623  -0.9502  -0.9585  -0.9663  -0.9348  -0.9232  0.0453  0.1572
## s.e.   0.2216   0.2074   0.2012   0.1983   0.1934   0.1922  0.1945  0.2291
##           ma2     ma3     ma4      ma5      ma6      ma7
##       -0.0626  0.0740  0.1389  -0.1305  -0.0639  -0.6587
## s.e.   0.0555  0.0536  0.0612   0.0633   0.0733   0.0805
## 
## sigma^2 estimated as 576.5:  log likelihood=-1538.87
## AIC=3107.74   AICc=3109.25   BIC=3164.96
checkresiduals(fit3) 

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(7,1,7)
## Q* = 55.082, df = 53.2, p-value = 0.4033
## 
## Model df: 14.   Total lags used: 67.2
#AICc = 3109.25

fit4 <- Arima(ATM1_ts.train, order=c(7,1,8))
fit4
## Series: ATM1_ts.train 
## ARIMA(7,1,8) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5      ar6     ar7
##       -0.0466  -0.0234  -0.0458  -0.0493  -0.0061  -0.0214  0.9287
## s.e.   0.0345   0.0354   0.0346   0.0349   0.0349   0.0349  0.0336
##           ma1      ma2     ma3     ma4      ma5     ma6      ma7     ma8
##       -0.7839  -0.2396  0.1263  0.0858  -0.2714  0.0343  -0.5637  0.6122
## s.e.   0.0626   0.0686  0.0619  0.0670   0.0632  0.0669   0.0764  0.0606
## 
## sigma^2 estimated as 571.7:  log likelihood=-1537.64
## AIC=3107.29   AICc=3109   BIC=3168.31
checkresiduals(fit4) 

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(7,1,8)
## Q* = 55.203, df = 52.2, p-value = 0.3619
## 
## Model df: 15.   Total lags used: 67.2
##AICc = 3109

fit5 <- Arima(ATM1_ts.train, order=c(7,1,0))
fit5
## Series: ATM1_ts.train 
## ARIMA(7,1,0) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5      ar6     ar7
##       -0.7865  -0.7926  -0.7351  -0.6860  -0.6846  -0.6616  0.0936
## s.e.   0.0544   0.0592   0.0630   0.0642   0.0624   0.0587  0.0547
## 
## sigma^2 estimated as 711.9:  log likelihood=-1574.94
## AIC=3165.89   AICc=3166.33   BIC=3196.4
checkresiduals(fit5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(7,1,0)
## Q* = 108.71, df = 60.2, p-value = 0.0001292
## 
## Model df: 7.   Total lags used: 67.2
##AICc = 3166.33

From these results we can see that ARIMA(7,1,8) has the lowest AICc, roughly normal residuals, and a residual ACF within the critical bounds.

Now let's compare the accuracy of the forecasts against the actual values stored in ATM1_ts.test.

 p1<-predict(fit4,29)
accuracy(p1$pred,ATM1_ts.test)
##                 ME     RMSE      MAE       MPE     MAPE       ACF1
## Test set -7.979495 15.33882 12.12426 -100.8079 105.5511 -0.4944431
##           Theil's U
## Test set 0.04826178
 p2<-predict(fit3,29)
accuracy(p2$pred,ATM1_ts.test)
##                 ME     RMSE      MAE       MPE     MAPE       ACF1
## Test set -6.680468 15.04431 11.65646 -93.69367 99.44533 -0.5369736
##           Theil's U
## Test set 0.03820937
 p3<-predict(fit5,29)
accuracy(p3$pred,ATM1_ts.test)
##                ME     RMSE      MAE       MPE     MAPE       ACF1
## Test set -8.05515 20.60587 14.33351 -190.2153 196.9253 -0.1356597
##           Theil's U
## Test set 0.04923441
 p4<-predict(fc,29)
accuracy(p4$pred,ATM1_ts.test)

Based on the RMSE values, fit3, which corresponds to ARIMA(7,1,7), performs best on the test set.

Now we have to make a decision: ARIMA(7,1,8) had a slightly lower AICc, but ARIMA(7,1,7) had a lower test RMSE. I will pick ARIMA(7,1,7) because testing a forecast against held-out data is more trustworthy than an in-sample metric like AICc.
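The AICc-versus-test-RMSE comparison above was done model by model; the same idea can be put into a small loop so that extra candidates are easy to screen. A sketch (the candidate list and object names are our own):

#screen candidate non-seasonal orders: in-sample AICc and out-of-sample RMSE
candidates <- list(c(7, 1, 7), c(7, 1, 8), c(7, 1, 0))
screen <- lapply(candidates, function(ord) {
  fit  <- forecast::Arima(ATM1_ts.train, order = ord)
  fcst <- forecast::forecast(fit, h = length(ATM1_ts.test))
  data.frame(order = paste(ord, collapse = ","),
             AICc  = fit$aicc,
             RMSE  = forecast::accuracy(fcst, ATM1_ts.test)["Test set", "RMSE"])
})
do.call(rbind, screen)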

fit3 %>% forecast(h=29) %>% autoplot(include=80)

Let's plot the forecast against the test set.

fcast <- forecast::forecast(fit3, h=29)
autoplot(fcast, series="Data") +
  autolayer(ATM1_ts.test, series="test_set") +
   ggtitle("ATM1 trainn-test prediction ") 

Predict the next 30 days and export to Excel.

fit3 <- Arima(ATM1_ts, order=c(7,1,7))
fcast1 <- forecast::forecast(forecast::Arima(ATM1_ts, model=fit3), h=30)
xlsx::write.xlsx(fcast1, file="C:/Users/Mezu/Documents/Data624/DATA624_Project1_ATM1.xlsx", 
  sheetName="ATM1", col.names = T, row.names = T, append = F)
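write.xlsx() on the forecast object labels each row with a decimal time index; an alternative sketch (the date sequence and file name are our own assumptions) flattens the forecast into a data frame with explicit May 2010 dates before writing:

#optional: attach calendar dates so the spreadsheet shows actual May 2010 dates
fcast1_df <- data.frame(DATE = seq(as.Date("2010-05-01"), by = "day", length.out = 30),
                        ATM  = "ATM1",
                        Cash = as.numeric(fcast1$mean),
                        Lo95 = as.numeric(fcast1$lower[, "95%"]),
                        Hi95 = as.numeric(fcast1$upper[, "95%"]))
#xlsx::write.xlsx(fcast1_df, file="DATA624_Project1_ATM1_tidy.xlsx", sheetName="ATM1", row.names=FALSE)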

ATM 2

str(ATM2_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    365 obs. of  3 variables:
##  $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
##  $ ATM : chr  "ATM2" "ATM2" "ATM2" "ATM2" ...
##  $ Cash: num  107 89 90 55 79 19 2 103 107 118 ...
summary(ATM2_data)
##       DATE                ATM                 Cash       
##  Min.   :2009-05-01   Length:365         Min.   :  0.00  
##  1st Qu.:2009-07-31   Class :character   1st Qu.: 25.50  
##  Median :2009-10-30   Mode  :character   Median : 67.00  
##  Mean   :2009-10-30                      Mean   : 62.58  
##  3rd Qu.:2010-01-29                      3rd Qu.: 93.00  
##  Max.   :2010-04-30                      Max.   :147.00  
##                                          NA's   :2

Let's convert the ATM2 data to a time series.

ATM2_ts <- ts(ATM2_data$Cash, start=c(2009,5,1), frequency=365)

autoplot(ATM2_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
    ggtitle("ATM2 time series")

Since we noticed 2 missing values, let's impute them.

 ATM2_ts<- na.interpolation(ATM2_ts)
autoplot(ATM2_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
    ggtitle("ATM2 time series")

Check whether the time series needs any transformation using the Box-Cox procedure.

(lambda2 <- forecast::BoxCox.lambda(ATM2_ts))
## [1] 1

The estimated lambda is 1, which indicates this series does not need any transformation.

From the time series plot we can see some trend and seasonality; the seasonal, lag, and ACF plots below help confirm this.

ggseasonplot(ATM2_ts )

#ggsubseriesplot(ATM2_ts)
gglagplot(ATM2_ts)

ggAcf(ATM2_ts)

Let's find the seasonal period with a Fourier transform (periodogram).

#reference https://anomaly.io/detect-seasonality-using-fourier-transform-r/
p = periodogram(ATM2_ts)

dd = data.frame(freq=p$freq, spec=p$spec)
order = dd[order(-dd$spec),]
top2 = head(order, 2)
 
# display the 2 highest "power" frequencies
top2
##          freq     spec
## 107 0.2853333 86194.79
## 108 0.2880000 77054.04
# convert frequency to time periods
time = 1/top2$f
time
## [1] 3.504673 3.472222

The periodogram shows the “power” of each candidate frequency; for ATM2 the two strongest spikes are at about 0.285 and 0.288 cycles per day.

Both frequencies correspond to a period of roughly 3.5 days, i.e. a twice-weekly pattern that is consistent with an underlying weekly (7-day) cycle.

Let’s plot the trend using a 4 day moving average

autoplot(ATM2_ts, series="Data") +
  autolayer(ma(ATM2_ts,4), series="4-MA") +
   ggtitle("ATM2 4 day moving average trend") 

Cross-validation of the predictions: split the data into a training and a test set.

ATM2_ts.train <- window(ATM2_ts, end=c(2009,340))
ATM2_ts.test <- window(ATM2_ts, start=c(2009,341))

Check that your data have been split appropriately by producing the following plot.

autoplot(ATM2_ts) +
  autolayer(ATM2_ts.train, series="Training") +
  autolayer(ATM2_ts.test, series="Test")

ARIMA forecasting

Based on our analysis, this time series shows some trend and seasonality, so we will apply two levels of differencing: one for the seasonality and one for the trend. Let's first take a seasonally differenced series.

ATM2_ts.train %>% diff(lag=7) %>% ggtsdisplay()

The data now look reasonably well differenced; however, the ACF still shows a slight pattern due to trend, so let's difference again.

ATM2_ts.train %>% diff() %>% diff(lag=4) %>% diff(lag=7)%>%  ggtsdisplay()

ATM2_ts.train %>% diff(lag=7) %>%diff()  %>%  ggtsdisplay()

The significant spike at lag 1 in the ACF suggests a non-seasonal MA(1) component, and the significant spike at lag 7 suggests a seasonal MA(1) component.

Based on this information, we will compare the results of auto.arima with a manual model selection.

fc<-auto.arima(ATM2_ts.train, seasonal=FALSE,stepwise=FALSE, approximation=FALSE)
checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,5)
## Q* = 649.75, df = 62.2, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 67.2

We can see that the result of this auto.arima is not good: the residual ACF has many lags well above the critical bounds.

Now, using the information and plots from the previous step, we test several combinations of the p, d, q values.

ATM2_ts.train%>%
  Arima(order=c(7,1,7)) %>%
  residuals() %>% ggtsdisplay()

fit3 <- Arima(ATM2_ts.train, order=c(7,1,7))
fit3
## Series: ATM2_ts.train 
## ARIMA(7,1,7) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5      ar6      ar7
##       -1.1194  -1.1093  -1.1082  -1.1191  -1.0960  -1.0868  -0.1123
## s.e.   0.0912   0.0902   0.0888   0.0907   0.0899   0.0824   0.0844
##          ma1      ma2      ma3     ma4      ma5      ma6      ma7
##       0.1973  -0.1626  -0.0097  0.1120  -0.1258  -0.0931  -0.7016
## s.e.  0.0794   0.0494   0.0545  0.0507   0.0480   0.0637   0.0453
## 
## sigma^2 estimated as 607.7:  log likelihood=-1549.31
## AIC=3128.61   AICc=3130.12   BIC=3185.83
checkresiduals(fit3) 

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(7,1,7)
## Q* = 71.919, df = 53.2, p-value = 0.04456
## 
## Model df: 14.   Total lags used: 67.2
#AICc = 3130.12

fit4 <- Arima(ATM2_ts.train, order=c(7,1,8))
fit4
## Series: ATM2_ts.train 
## ARIMA(7,1,8) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5      ar6     ar7
##       -0.0403  -0.0358  -0.0373  -0.0502  -0.0147  -0.0355  0.9366
## s.e.   0.0253   0.0238   0.0237   0.0237   0.0232   0.0227  0.0231
##           ma1      ma2     ma3     ma4      ma5     ma6      ma7     ma8
##       -0.9121  -0.2537  0.1558  0.0913  -0.2388  0.0723  -0.6168  0.7131
## s.e.   0.0559   0.0711  0.0720  0.0751   0.0614  0.0688   0.0625  0.0468
## 
## sigma^2 estimated as 595.8:  log likelihood=-1546.26
## AIC=3124.52   AICc=3126.23   BIC=3185.54
checkresiduals(fit4) 

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(7,1,8)
## Q* = 65.649, df = 52.2, p-value = 0.09997
## 
## Model df: 15.   Total lags used: 67.2
#AICc=3126.23

fit5 <- Arima(ATM2_ts.train, order=c(7,1,10))
fit5
## Series: ATM2_ts.train 
## ARIMA(7,1,10) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5      ar6     ar7
##       -0.0498  -0.0374  -0.0547  -0.0631  -0.0273  -0.0475  0.9210
## s.e.   0.0279   0.0286   0.0278   0.0272   0.0279   0.0286  0.0269
##           ma1      ma2     ma3     ma4      ma5     ma6      ma7     ma8
##       -0.9014  -0.2130  0.1198  0.0983  -0.2601  0.0827  -0.6156  0.7105
## s.e.   0.0652   0.0797  0.0714  0.0667   0.0643  0.0682   0.0688  0.0773
##           ma9    ma10
##       -0.1346  0.1270
## s.e.   0.0761  0.0629
## 
## sigma^2 estimated as 584.7:  log likelihood=-1544.04
## AIC=3124.08   AICc=3126.24   BIC=3192.73
checkresiduals(fit5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(7,1,10)
## Q* = 60.374, df = 50.2, p-value = 0.154
## 
## Model df: 17.   Total lags used: 67.2
#AICc=3126.24

From these results we can see that ARIMA(7,1,8) has the lowest AICc, roughly normal residuals, and a residual ACF within the critical bounds.

Now let's compare the accuracy of the forecasts against the actual values stored in ATM2_ts.test.

 p1<-predict(fit4,29)
accuracy(p1$pred,ATM2_ts.test)
##                ME     RMSE      MAE       MPE     MAPE        ACF1
## Test set 7.596794 18.42945 15.45209 -27.83442 57.55219 -0.04026416
##          Theil's U
## Test set 0.1317035
 p2<-predict(fit3,29)
accuracy(p2$pred,ATM2_ts.test)
##                ME     RMSE      MAE       MPE     MAPE       ACF1
## Test set 6.770904 18.28133 15.62466 -72.85834 98.49209 -0.1532232
##          Theil's U
## Test set 0.1374686
 p3<-predict(fit5,29)
accuracy(p3$pred,ATM2_ts.test)
##                ME     RMSE      MAE      MPE     MAPE        ACF1
## Test set 7.545134 20.32541 17.62188 -107.412 136.4305 -0.05991392
##          Theil's U
## Test set 0.1546188

Based on the RMSE values, fit3, which corresponds to ARIMA(7,1,7), performs best on the test set.

Now we have to make a decision: ARIMA(7,1,8) had a slightly lower AICc, but ARIMA(7,1,7) had a lower test RMSE. I will pick ARIMA(7,1,7) because testing a forecast against held-out data is more trustworthy than an in-sample metric like AICc.

fit3 %>% forecast(h=29) %>% autoplot(include=80)

Let's plot the forecast against the test set.

fcast <- forecast::forecast(fit3, h=29)
autoplot(fcast, series="Data") +
  autolayer(ATM2_ts.test, series="test_set") +
   ggtitle("ATM2 train-test prediction ") 

Predict the next 30 days and export to Excel.

fit3 <- Arima(ATM2_ts, order=c(7,1,7))
fcast1 <- forecast::forecast(forecast::Arima(ATM2_ts, model=fit3), h=30)
xlsx::write.xlsx(fcast1, file="C:/Users/Mezu/Documents/Data624/DATA624_Project1_ATM2.xlsx", 
  sheetName="ATM2", col.names = T, row.names = T, append = F)

ATM 3

str(ATM3_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    365 obs. of  3 variables:
##  $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
##  $ ATM : chr  "ATM3" "ATM3" "ATM3" "ATM3" ...
##  $ Cash: num  0 0 0 0 0 0 0 0 0 0 ...
summary(ATM3_data)
##       DATE                ATM                 Cash        
##  Min.   :2009-05-01   Length:365         Min.   : 0.0000  
##  1st Qu.:2009-07-31   Class :character   1st Qu.: 0.0000  
##  Median :2009-10-30   Mode  :character   Median : 0.0000  
##  Mean   :2009-10-30                      Mean   : 0.7206  
##  3rd Qu.:2010-01-29                      3rd Qu.: 0.0000  
##  Max.   :2010-04-30                      Max.   :96.0000

Let's convert the ATM3 data to a time series.

ATM3_ts <- ts(ATM3_data$Cash, start=c(2009,5,1), frequency=365)

autoplot(ATM3_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
    ggtitle("ATM3 time series")

Based on the available data, ATM3 does not have enough varied history to forecast: it sits at zero for almost the entire year, with only a few non-zero values at the very end. I would first need to ask the business whether this is a newly installed ATM at that location, and I would wait a couple of months to get an accurate picture of its withdrawals. Given how little information this series carries compared to the others, I will not forecast it.

If the business insists on a forecast, we could infer one from the forecasts of the other ATMs.
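Another very rough option, shown here only as a sketch and not used for the submission, is to lean on the handful of non-zero days ATM3 itself has and issue a flat mean forecast from them:

#very rough fallback: flat mean forecast from ATM3's few non-zero days
ATM3_recent <- ATM3_data$Cash[ATM3_data$Cash > 0]
forecast::meanf(ts(ATM3_recent), h = 30)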

ATM 4

str(ATM4_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    365 obs. of  3 variables:
##  $ DATE: Date, format: "2009-05-01" "2009-05-02" ...
##  $ ATM : chr  "ATM4" "ATM4" "ATM4" "ATM4" ...
##  $ Cash: num  777 524.4 792.8 908.2 52.8 ...
summary(ATM4_data)
##       DATE                ATM                 Cash          
##  Min.   :2009-05-01   Length:365         Min.   :    1.563  
##  1st Qu.:2009-07-31   Class :character   1st Qu.:  124.334  
##  Median :2009-10-30   Mode  :character   Median :  403.839  
##  Mean   :2009-10-30                      Mean   :  474.043  
##  3rd Qu.:2010-01-29                      3rd Qu.:  704.507  
##  Max.   :2010-04-30                      Max.   :10919.762

Let's convert the ATM4 data to a time series.

ATM4_ts <- ts(ATM4_data$Cash, start=c(2009,5,1), frequency=365)

autoplot(ATM4_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
    ggtitle("ATM4 time series")

We notice an outlier of 10919.762 on February 9, 2010; we will treat it as a missing value and remove it from the dataset so it can be imputed.

I decided to remove this record because I would need an explanation from the business, and confirmation of whether it was recorded in error, before deciding what else to do with it.

ATM4_ts[285] <- NA
 ATM4_ts<- na.interpolation(ATM4_ts)
autoplot(ATM4_ts)+xlab("days") + ylab("Cash in hundreds of dollars") +
    ggtitle("ATM4 time series")

Check whether the time series needs any transformation using the Box-Cox procedure.

(lambda2 <- forecast::BoxCox.lambda(ATM4_ts))
## [1] 1

The estimated lambda is 1, which indicates this series does not need any transformation.

From the time series plot we can see some trend and seasonality; the seasonal, lag, and ACF plots below help confirm this.

ggseasonplot(ATM4_ts )

#ggsubseriesplot(ATM4_ts)
gglagplot(ATM4_ts)

ggAcf(ATM4_ts)

Let's find the seasonal period with a Fourier transform (periodogram).

#reference https://anomaly.io/detect-seasonality-using-fourier-transform-r/
p = periodogram(ATM4_ts)

dd = data.frame(freq=p$freq, spec=p$spec)
order = dd[order(-dd$spec),]
top2 = head(order, 2)
 
# display the 2 highest "power" frequencies
top2
##          freq    spec
## 54  0.1440000 2513218
## 161 0.4293333 1243196
# convert frequency to time periods
time = 1/top2$f
time
## [1] 6.944444 2.329193

The periodogram shows the “power” of each candidate frequency, with clear spikes at about 0.144 and 0.429 cycles per day.

These correspond to a main seasonality of roughly 7 days and a secondary seasonality of about 2.3 days.

Let’s plot the trend using a 4 day moving average

autoplot(ATM4_ts, series="Data") +
  autolayer(ma(ATM4_ts,4), series="4-MA") +
   ggtitle("ATM4 4 day moving average trend") 

Cross-validation of the predictions: split the data into a training and a test set.

ATM4_ts.train <- window(ATM4_ts, end=c(2009,340))
ATM4_ts.test <- window(ATM4_ts, start=c(2009,341))

Check that your data have been split appropriately by producing the following plot.

autoplot(ATM4_ts) +
  autolayer(ATM4_ts.train, series="Training") +
  autolayer(ATM4_ts.test, series="Test")

ARIMA forecasting

Based on our analysis, this time series shows some trend and seasonality, so we will apply two levels of differencing: one for the seasonality and one for the trend. Let's first take a seasonally differenced series.

ATM4_ts.train %>% diff(lag=7) %>% ggtsdisplay()

The data now look reasonably well differenced; however, the ACF still shows a slight pattern due to trend, so let's difference again.

ATM4_ts.train %>% diff(lag=7) %>%diff()  %>%  ggtsdisplay()

The significant spike at lag 1 in the ACF suggests a non-seasonal MA(1) component, and the significant spike at lag 7 suggests a seasonal MA(1) component.

Based on this information, we will compare the results of auto.arima with a manual model selection.

fc<-auto.arima(ATM4_ts.train, seasonal=FALSE,stepwise=FALSE, approximation=FALSE)
checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 126.25, df = 65.2, p-value = 8.798e-06
## 
## Model df: 2.   Total lags used: 67.2

We can see that the result of this auto.arima is not good: the residual ACF has many lags well above the critical bounds.

Now, using the information and plots from the previous step, we test several combinations of the p, d, q values.

ATM4_ts.train%>%
  Arima(order=c(2,1,7)) %>%
  residuals() %>% ggtsdisplay()

fit3 <- Arima(ATM4_ts.train, order=c(7,1,7))
fit3
## Series: ATM4_ts.train 
## ARIMA(7,1,7) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5      ar6     ar7
##       -0.1591  -0.4079  -1.0015  -0.4732  -0.1316  -0.8005  0.1357
## s.e.   0.0818   0.0730   0.0728   0.0709   0.0761   0.0754  0.0615
##           ma1     ma2     ma3      ma4      ma5     ma6      ma7
##       -0.7585  0.2092  0.6711  -0.5746  -0.3688  0.7247  -0.9023
## s.e.   0.1134  0.1652  0.1372   0.0828   0.1613  0.1783   0.1149
## 
## sigma^2 estimated as 116249:  log likelihood=-2427.8
## AIC=4885.6   AICc=4887.11   BIC=4942.82
checkresiduals(fit3) 

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(7,1,7)
## Q* = 74.895, df = 53.2, p-value = 0.0266
## 
## Model df: 14.   Total lags used: 67.2
#AICc = 4887.11 

fit4 <- Arima(ATM4_ts.train, order=c(5,1,7))
fit4
## Series: ATM4_ts.train 
## ARIMA(5,1,7) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4     ar5      ma1      ma2     ma3
##       -0.6781  -0.2238  -0.7025  -0.5011  0.3567  -0.2320  -0.4394  0.4896
## s.e.   0.3652   0.3732   0.2229   0.3694  0.2695   0.3738   0.0521  0.1481
##           ma4      ma5     ma6     ma7
##       -0.2829  -0.9059  0.3073  0.0633
## s.e.   0.1411      NaN  0.3001  0.0593
## 
## sigma^2 estimated as 120970:  log likelihood=-2434.69
## AIC=4895.38   AICc=4896.51   BIC=4944.96
checkresiduals(fit4) 

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,1,7)
## Q* = 90.966, df = 55.2, p-value = 0.001734
## 
## Model df: 12.   Total lags used: 67.2
#AICc=4896.51

fit5 <- Arima(ATM4_ts.train, order=c(2,1,7))
fit5
## Series: ATM4_ts.train 
## ARIMA(2,1,7) 
## 
## Coefficients:
##           ar1      ar2      ma1     ma2      ma3      ma4     ma5      ma6
##       -0.5814  -0.8722  -0.3175  0.2897  -0.7887  -0.2148  0.0175  -0.1577
## s.e.   0.1199   0.0981   0.1270  0.0899   0.1083   0.0692  0.0582   0.0578
##          ma7
##       0.1715
## s.e.  0.0541
## 
## sigma^2 estimated as 123998:  log likelihood=-2438.27
## AIC=4896.55   AICc=4897.23   BIC=4934.69
checkresiduals(fit5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,7)
## Q* = 91.519, df = 58.2, p-value = 0.003465
## 
## Model df: 9.   Total lags used: 67.2
#AICc=4897.23 

From these results we can see that ARIMA(7,1,7) has the lowest AICc, roughly normal residuals, and a residual ACF within the critical bounds.

Now let's compare the accuracy of the forecasts against the actual values stored in ATM4_ts.test.

 p1<-predict(fit4,29)
accuracy(p1$pred,ATM4_ts.test)
##                 ME     RMSE      MAE       MPE     MAPE       ACF1
## Test set -52.15396 299.2789 258.7339 -1196.058 1223.817 0.07336747
##          Theil's U
## Test set 0.3429161
 p2<-predict(fit3,29)
accuracy(p2$pred,ATM4_ts.test)
##                 ME     RMSE      MAE       MPE     MAPE       ACF1
## Test set -53.81601 291.5918 243.9763 -1097.047 1122.035 0.02337883
##          Theil's U
## Test set 0.1408487
 p3<-predict(fit5,29)
accuracy(p3$pred,ATM4_ts.test)
##                 ME    RMSE      MAE       MPE     MAPE        ACF1
## Test set -53.83352 293.353 250.0366 -1141.166 1167.065 -0.01144309
##          Theil's U
## Test set 0.1288629

Based on the RMSE values, fit3, which corresponds to ARIMA(7,1,7), performs best on the test set.

Here ARIMA(7,1,7) also had the lowest AICc, and since testing a forecast against held-out data is more trustworthy than an in-sample metric like AICc, ARIMA(7,1,7) is the clear choice.

fit3 %>% forecast(h=29) %>% autoplot(include=80)

Let's plot the forecast against the test set.

fcast <- forecast::forecast(fit3, h=29)
autoplot(fcast, series="Data") +
  autolayer(ATM4_ts.test, series="test_set") +
   ggtitle("ATM4 train-test prediction ") 

Predict the next 30 days and export to Excel.

fit3 <- Arima(ATM4_ts, order=c(7,1,7))
fcast1 <- forecast::forecast(forecast::Arima(ATM4_ts, model=fit3), h=30)
xlsx::write.xlsx(fcast1, file="C:/Users/Mezu/Documents/Data624/DATA624_Project1_ATM4.xlsx", 
  sheetName="ATM4", col.names = T, row.names = T, append = F)

Part B - Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.

Let's load the data into an R data frame and perform some preliminary exploration.

Power_data <-  readxl::read_excel("C:/Users/Mezu/Documents/Data624/ResidentialCustomerForecastLoad-624.xlsx")
head(Power_data)
## # A tibble: 6 x 3
##   CaseSequence `YYYY-MMM`     KWH
##          <dbl> <chr>        <dbl>
## 1          733 1998-Jan   6862583
## 2          734 1998-Feb   5838198
## 3          735 1998-Mar   5420658
## 4          736 1998-Apr   5010364
## 5          737 1998-May   4665377
## 6          738 1998-Jun   6467147
str(Power_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    192 obs. of  3 variables:
##  $ CaseSequence: num  733 734 735 736 737 738 739 740 741 742 ...
##  $ YYYY-MMM    : chr  "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
##  $ KWH         : num  6862583 5838198 5420658 5010364 4665377 ...
summary(Power_data)
##   CaseSequence     YYYY-MMM              KWH          
##  Min.   :733.0   Length:192         Min.   :  770523  
##  1st Qu.:780.8   Class :character   1st Qu.: 5429912  
##  Median :828.5   Mode  :character   Median : 6283324  
##  Mean   :828.5                      Mean   : 6502475  
##  3rd Qu.:876.2                      3rd Qu.: 7620524  
##  Max.   :924.0                      Max.   :10655730  
##                                     NA's   :1

Let's convert the power data to a time series.

Power_data_ts <- ts(Power_data$KWH, start=c(1998,1), frequency=12)

autoplot(Power_data_ts )+xlab("months") + ylab("Power consumption in kilowatt hours") +
    ggtitle("Power_data time series")

Since we noticed 1 missing value, let's impute it.

 Power_data_ts<- na.interpolation(Power_data_ts)
autoplot(Power_data_ts)+xlab("months") + ylab("Power consumption in kilowatt hours") +
    ggtitle("Power_data time series")

Check whether the time series needs any transformation using the Box-Cox procedure.

(lambda1 <- forecast::BoxCox.lambda(Power_data_ts))
## [1] 0.1074118

The estimated lambda of about 0.11 suggests this dataset does need a transformation; I will use the log transformation for inspection.

autoplot(log(Power_data_ts))+xlab("months") + ylab("Power consumption in kilowatt hours") +
    ggtitle("log of Power_data time series")

From this plot we can see there is some seasonality but no clear trend. Also there is a huge outlier made prominent by the log transformation.
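The log transformation is only plotted here for inspection; if we wanted to carry a transformation through the modelling, ets() and Arima() both accept a lambda argument and back-transform the forecasts automatically. A minimal sketch, not used for the models below:

#example only: fit on the Box-Cox transformed scale, forecasts return in KWH
fit_bc <- forecast::ets(Power_data_ts, lambda = lambda1)
fc_bc  <- forecast::forecast(fit_bc, h = 12)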

ggseasonplot(Power_data_ts )

ggsubseriesplot(Power_data_ts)

gglagplot(Power_data_ts)

ggAcf(Power_data_ts)

Let's find the seasonal period with a Fourier transform (periodogram).

#reference https://anomaly.io/detect-seasonality-using-fourier-transform-r/
p = periodogram(Power_data_ts)

dd = data.frame(freq=p$freq, spec=p$spec)
order = dd[order(-dd$spec),]
top2 = head(order, 2)
 
# display the 2 highest "power" frequencies
top2
##          freq         spec
## 32 0.16666667 1.923770e+14
## 16 0.08333333 2.113811e+13
# convert frequency to time periods
time = 1/top2$f
time
## [1]  6 12

The strongest component detected has a period of 6 months, with a secondary component at 12 months; together these are consistent with an annual seasonal pattern (the 6-month component is its first harmonic).

Let's plot the trend using a 6-month moving average.

autoplot(Power_data_ts, series="Data") +
  autolayer(ma(Power_data_ts,6), series="6-MA") +
   ggtitle("Power_data_ts 6MA moving average trend") 

Let's use simple exponential smoothing to forecast and check the RMSE.

fc <- ses(Power_data_ts, h=30)
autoplot(fc) +
  autolayer(fitted(fc), series="Fitted") +
  ylab("Power in KWH") + xlab("months")

summary(fc)
## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
##  ses(y = Power_data_ts, h = 30) 
## 
##   Smoothing parameters:
##     alpha = 0.0408 
## 
##   Initial states:
##     l = 6512199.706 
## 
##   sigma:  1423653
## 
##      AIC     AICc      BIC 
## 6454.223 6454.351 6463.996 
## 
## Error measures:
##                    ME    RMSE     MAE       MPE    MAPE     MASE      ACF1
## Training set 90773.09 1416218 1178550 -6.212522 21.7282 1.691596 0.3784715
## 
## Forecasts:
##          Point Forecast   Lo 80   Hi 80   Lo 95    Hi 95
## Jan 2014        7222913 5398429 9047398 4432605 10013221
## Feb 2014        7222913 5396913 9048914 4430286 10015540
## Mar 2014        7222913 5395397 9050429 4427969 10017858
## Apr 2014        7222913 5393884 9051943 4425654 10020173
## May 2014        7222913 5392371 9053456 4423340 10022486
## Jun 2014        7222913 5390860 9054967 4421029 10024798
## Jul 2014        7222913 5389350 9056477 4418720 10027107
## Aug 2014        7222913 5387841 9057986 4416412 10029415
## Sep 2014        7222913 5386333 9059494 4414106 10031720
## Oct 2014        7222913 5384827 9061000 4411802 10034024
## Nov 2014        7222913 5383322 9062505 4409501 10036326
## Dec 2014        7222913 5381818 9064009 4407200 10038626
## Jan 2015        7222913 5380315 9065512 4404902 10040924
## Feb 2015        7222913 5378813 9067013 4402606 10043221
## Mar 2015        7222913 5377313 9068513 4400312 10045515
## Apr 2015        7222913 5375814 9070012 4398019 10047808
## May 2015        7222913 5374316 9071510 4395728 10050098
## Jun 2015        7222913 5372820 9073007 4393439 10052387
## Jul 2015        7222913 5371324 9074502 4391152 10054674
## Aug 2015        7222913 5369830 9075996 4388867 10056959
## Sep 2015        7222913 5368337 9077489 4386584 10059243
## Oct 2015        7222913 5366845 9078981 4384303 10061524
## Nov 2015        7222913 5365355 9080472 4382023 10063804
## Dec 2015        7222913 5363866 9081961 4379745 10066082
## Jan 2016        7222913 5362377 9083449 4377469 10068358
## Feb 2016        7222913 5360890 9084936 4375195 10070632
## Mar 2016        7222913 5359405 9086422 4372923 10072904
## Apr 2016        7222913 5357920 9087907 4370652 10075175
## May 2016        7222913 5356436 9089390 4368383 10077443
## Jun 2016        7222913 5354954 9090872 4366116 10079710
checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from Simple exponential smoothing
## Q* = 620.2, df = 22, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 24

The ACF plot and the residuals show that the data are clearly seasonal and the errors are not normally distributed, so these predictions might not be accurate.

ETS Method

Let’s try auto ETS

fc <- ets(Power_data_ts)
autoplot(fc) +
  autolayer(fitted(fc), series="Fitted") +
  ylab("Power in KWH") + xlab("monthss")

summary(fc)
## ETS(M,N,M) 
## 
## Call:
##  ets(y = Power_data_ts) 
## 
##   Smoothing parameters:
##     alpha = 0.1137 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 6134772.7867 
##     s = 0.9547 0.7533 0.8603 1.1912 1.2556 1.1992
##            0.9982 0.7684 0.8161 0.9175 1.0618 1.2237
## 
##   sigma:  0.1199
## 
##      AIC     AICc      BIC 
## 6224.057 6226.784 6272.919 
## 
## Training set error measures:
##                    ME   RMSE      MAE      MPE     MAPE      MASE
## Training set 61009.73 835107 503972.9 -4.39013 12.04006 0.7233624
##                   ACF1
## Training set 0.1698584
checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,M)
## Q* = 28.615, df = 10, p-value = 0.001438
## 
## Model df: 14.   Total lags used: 24

We can see this gives us a much better model than SES, although the Ljung-Box test still indicates some residual autocorrelation.

Cross-validation of the predictions: split the data into a training and a test set.

Power_data_ts.train <- window(Power_data_ts, end=c(2012,12))
Power_data_ts.test <- window(Power_data_ts, start=c(2013,1))

Check that your data have been split appropriately by producing the following plot.

autoplot(Power_data_ts) +
  autolayer(Power_data_ts.train, series="Training") +
  autolayer(Power_data_ts.test, series="Test")

Calculate forecasts using ETS applied to Power_data_ts.train.

fit_train <-  ets(Power_data_ts.train)

Compare the accuracy of the forecasts against the actual values stored in Power_data_ts.test.

 p<-predict(fit_train,12)
accuracy(p$mean,Power_data_ts.test)
##              ME    RMSE      MAE      MPE     MAPE       ACF1 Theil's U
## Test set 398169 1057623 692843.6 4.135402 8.307641 0.08496991 0.6097567

check residuals

checkresiduals(fit_train)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,M)
## Q* = 30.885, df = 10, p-value = 0.0006129
## 
## Model df: 14.   Total lags used: 24

We can see the ETS results are not great, but they are decent.

ARIMA forecasting

Based on our analysis, this time series shows some trend and seasonality, so we will apply two levels of differencing: one for the seasonality and one for the trend. Let's first take a seasonally differenced series.

Power_data_ts.train %>% diff(lag=12) %>% ggtsdisplay()

The data now look reasonably well differenced; however, the ACF still shows a slight pattern due to trend, so let's difference again.

Power_data_ts.train %>% diff(lag=12)%>% diff() %>% ggtsdisplay()

The significant spike at lag 1 in the ACF suggests a non-seasonal MA(1) component, and the significant spike at lag 12 suggests a seasonal MA(1) component.

Based on this information, we will compare the results of auto.arima with a manual model selection.

fc<-auto.arima(Power_data_ts.train, seasonal=TRUE,stepwise=FALSE, approximation=FALSE)
checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(2,1,1)[12] with drift
## Q* = 12.219, df = 19, p-value = 0.876
## 
## Model df: 5.   Total lags used: 24
summary(fc)
## Series: Power_data_ts.train 
## ARIMA(1,0,0)(2,1,1)[12] with drift 
## 
## Coefficients:
##          ar1     sar1     sar2     sma1     drift
##       0.2371  -0.6340  -0.4944  -0.2762  6795.367
## s.e.  0.0767   0.1804   0.1400   0.1995  2572.161
## 
## sigma^2 estimated as 6.862e+11:  log likelihood=-2532.59
## AIC=5077.18   AICc=5077.7   BIC=5095.93
## 
## Training set error measures:
##                     ME   RMSE      MAE       MPE     MAPE      MASE
## Training set -9927.261 788292 482331.7 -5.171484 11.61867 0.6868009
##                     ACF1
## Training set -0.01545698
#AICc=5077.7

We can see that the result of this auto.arima is good: the residual ACF stays well within the critical bounds.

Now, using the information and plots from the previous step, we test several combinations of the p, d, q values.

Power_data_ts.train%>%
  Arima(order=c(2,0,0), seasonal = c(2,1,1)) %>%
  residuals() %>% ggtsdisplay()

fit3 <- Arima(Power_data_ts.train, order=c(2,0,0), seasonal = c(2,1,1))
fit3
## Series: Power_data_ts.train 
## ARIMA(2,0,0)(2,1,1)[12] 
## 
## Coefficients:
##          ar1     ar2     sar1     sar2     sma1
##       0.2460  0.1170  -0.6588  -0.5353  -0.2192
## s.e.  0.0786  0.0797   0.1579   0.1226   0.1774
## 
## sigma^2 estimated as 7.021e+11:  log likelihood=-2534.64
## AIC=5081.29   AICc=5081.81   BIC=5100.03
checkresiduals(fit3) 

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0)(2,1,1)[12]
## Q* = 12.967, df = 19, p-value = 0.8403
## 
## Model df: 5.   Total lags used: 24
#AICc =5081.81

fit4 <- Arima(Power_data_ts.train, order=c(2,0,2), seasonal = c(2,1,0))
fit4
## Series: Power_data_ts.train 
## ARIMA(2,0,2)(2,1,0)[12] 
## 
## Coefficients:
##           ar1     ar2     ma1      ma2     sar1     sar2
##       -0.0259  0.6791  0.2401  -0.5650  -0.8357  -0.6334
## s.e.   0.3361  0.2108  0.3533   0.2052   0.0720   0.0774
## 
## sigma^2 estimated as 6.974e+11:  log likelihood=-2534.15
## AIC=5082.29   AICc=5082.99   BIC=5104.16
checkresiduals(fit4) 

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,2)(2,1,0)[12]
## Q* = 10.897, df = 18, p-value = 0.8987
## 
## Model df: 6.   Total lags used: 24
##AICc = 5082.99 

fit5 <- Arima(Power_data_ts.train, order=c(2,0,0), seasonal = c(1,1,1))
fit5
## Series: Power_data_ts.train 
## ARIMA(2,0,0)(1,1,1)[12] 
## 
## Coefficients:
##          ar1     ar2     sar1     sma1
##       0.3032  0.0603  -0.1262  -0.7284
## s.e.  0.0778  0.0785   0.0977   0.0808
## 
## sigma^2 estimated as 7.569e+11:  log likelihood=-2539.71
## AIC=5089.43   AICc=5089.8   BIC=5105.05
checkresiduals(fit5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0)(1,1,1)[12]
## Q* = 13.369, df = 20, p-value = 0.861
## 
## Model df: 4.   Total lags used: 24
##AICc = 5089.8

fc <- Arima(Power_data_ts.train, order=c(1,0,0), seasonal = c(2,1,1))
fc
## Series: Power_data_ts.train 
## ARIMA(1,0,0)(2,1,1)[12] 
## 
## Coefficients:
##          ar1     sar1     sar2     sma1
##       0.2793  -0.6492  -0.5000  -0.2200
## s.e.  0.0764   0.1707   0.1295   0.1876
## 
## sigma^2 estimated as 7.12e+11:  log likelihood=-2535.71
## AIC=5081.42   AICc=5081.79   BIC=5097.04
checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(2,1,1)[12]
## Q* = 12.972, df = 20, p-value = 0.8786
## 
## Model df: 4.   Total lags used: 24

From these results we can see that the auto.arima model, ARIMA(1,0,0)(2,1,1)[12] with drift, has the lowest AICc, roughly normal residuals, and a residual ACF within the critical bounds.

Now let's compare the accuracy of the forecasts against the actual values stored in Power_data_ts.test.

 p1<-predict(fit4,12)
accuracy(p1$pred,Power_data_ts.test)
##                ME    RMSE     MAE      MPE    MAPE       ACF1 Theil's U
## Test set 860659.1 1660684 1080721 10.03257 13.0932 0.02039943 0.9068145
 p2<-predict(fit3,12)
accuracy(p2$pred,Power_data_ts.test)
##                ME    RMSE     MAE      MPE     MAPE       ACF1 Theil's U
## Test set 958430.8 1625314 1086974 11.33535 13.12316 0.05164515 0.8920487
 p3<-predict(fit5,12)
accuracy(p3$pred,Power_data_ts.test)
##                ME    RMSE      MAE      MPE     MAPE       ACF1 Theil's U
## Test set 817125.4 1178989 838726.2 9.835089 10.11628 -0.0380971  0.654609
 p4<-predict(fc,12)
accuracy(p4$pred,Power_data_ts.test)
##                ME    RMSE     MAE      MPE   MAPE       ACF1 Theil's U
## Test set 915864.1 1559245 1048962 10.79004 12.631 0.03206625 0.8524698

Based on the test-set RMSE values, fit5, which corresponds to ARIMA(2,0,0)(1,1,1)[12], is best, even though it did not have the lowest AICc.
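The single train/test split gives one estimate of out-of-sample error; forecast::tsCV() generalises this to rolling-origin evaluation and could be used to double-check the choice of fit5. A sketch (the wrapper name is our own, and refitting at every origin is slow but feasible on 192 monthly points):

#rolling-origin cross-validation of the ARIMA(2,0,0)(1,1,1)[12] specification
f_arima <- function(y, h) {
  forecast::forecast(forecast::Arima(y, order = c(2, 0, 0), seasonal = c(1, 1, 1)), h = h)
}
e <- forecast::tsCV(Power_data_ts, f_arima, h = 1)
sqrt(mean(e^2, na.rm = TRUE))   #one-step-ahead CV RMSE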

fit5 %>% forecast(h=12) %>% autoplot(include=80)

Let's plot the forecast against the test set.

fcast <- forecast::forecast(fit5, h=12)
autoplot(fcast, series="Data") +
  autolayer(Power_data_ts.test, series="test_set") +
   ggtitle(",Power_data_ts train-test prediction ") 

Predict the next 12 months and export to Excel.

fit5 <- Arima(Power_data_ts,  order=c(2,0,0), seasonal = c(1,1,1))
fcast1 <- forecast::forecast(forecast::Arima(Power_data_ts, model=fit5), h=12)
xlsx::write.xlsx(fcast1, file="C:/Users/Mezu/Documents/Data624/DATA624_Project1_Power.xlsx", 
  sheetName="Power", col.names = T, row.names = T, append = F)

Part C - BONUS, optional (part or all), Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx

Part C consists of two data sets. These are simple 2 columns sets, however they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (example of what this looks like, follows). Note for multiple recordings within an hour, take the mean. Then to determine if the data is stationary and can it be forecast. If so, provide a week forward forecast and present results via Rpubs and .rmd and the forecast in an Excel readable file.

Let's load the data into an R data frame and perform some preliminary exploration.

Waterflow_Pipe1_data <-  readxl::read_excel("C:/Users/Mezu/Documents/Data624/Waterflow_Pipe1.xlsx")
head(Waterflow_Pipe1_data)
## # A tibble: 6 x 2
##   DateTime            WaterFlow
##   <dttm>                  <dbl>
## 1 2015-10-23 00:24:06     23.4 
## 2 2015-10-23 00:40:02     28.0 
## 3 2015-10-23 00:53:51     23.1 
## 4 2015-10-23 00:55:40     30.0 
## 5 2015-10-23 01:19:17      6.00
## 6 2015-10-23 01:23:58     15.9
str(Waterflow_Pipe1_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1000 obs. of  2 variables:
##  $ DateTime : POSIXct, format: "2015-10-23 00:24:06" "2015-10-23 00:40:02" ...
##  $ WaterFlow: num  23.4 28 23.1 30 6 ...
Waterflow_Pipe2_data <-  readxl::read_excel("C:/Users/Mezu/Documents/Data624/Waterflow_Pipe2.xlsx")
head(Waterflow_Pipe2_data)
## # A tibble: 6 x 2
##   DateTime            WaterFlow
##   <dttm>                  <dbl>
## 1 2015-10-23 01:00:00      18.8
## 2 2015-10-23 02:00:00      43.1
## 3 2015-10-23 03:00:00      38.0
## 4 2015-10-23 04:00:00      36.1
## 5 2015-10-23 05:00:00      31.9
## 6 2015-10-23 06:00:00      28.2
str(Waterflow_Pipe2_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1000 obs. of  2 variables:
##  $ DateTime : POSIXct, format: "2015-10-23 01:00:00" "2015-10-23 02:00:00" ...
##  $ WaterFlow: num  18.8 43.1 38 36.1 31.9 ...
summary(Waterflow_Pipe2_data)
##     DateTime                     WaterFlow     
##  Min.   :2015-10-23 01:00:00   Min.   : 1.885  
##  1st Qu.:2015-11-02 10:45:00   1st Qu.:28.140  
##  Median :2015-11-12 20:30:00   Median :39.682  
##  Mean   :2015-11-12 20:30:00   Mean   :39.556  
##  3rd Qu.:2015-11-23 06:15:00   3rd Qu.:50.782  
##  Max.   :2015-12-03 16:00:00   Max.   :78.303
summary(Waterflow_Pipe1_data)
##     DateTime                     WaterFlow     
##  Min.   :2015-10-23 00:24:06   Min.   : 1.067  
##  1st Qu.:2015-10-25 11:21:35   1st Qu.:13.683  
##  Median :2015-10-27 20:07:30   Median :19.880  
##  Mean   :2015-10-27 20:49:15   Mean   :19.897  
##  3rd Qu.:2015-10-30 08:24:51   3rd Qu.:26.159  
##  Max.   :2015-11-01 23:35:43   Max.   :38.913
if (!require("xts")) install.packages("xts")
## Loading required package: xts
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(xts)
xts::periodicity(unique(Waterflow_Pipe1_data$`DateTime`))
## 10.0198883334796 minute periodicity from 2015-10-23 00:24:06 to 2015-11-01 23:35:43
xts::periodicity(unique(Waterflow_Pipe2_data$`DateTime`))
## Hourly periodicity from 2015-10-23 01:00:00 to 2015-12-03 16:00:00

These are time series spanning October 23 to November 1, 2015 for Pipe 1 and October 23 to December 3, 2015 for Pipe 2. They contain water flow measurements for the two pipes and have no missing values or obvious outliers.

Pipe1 <- xts::xts(Waterflow_Pipe1_data$WaterFlow, order.by=Waterflow_Pipe1_data$DateTime)
Pipe1h <- xts::period.apply(Pipe1, xts::endpoints(Pipe1, "hours"), mean)
xts::periodicity(Pipe1h)
## Hourly periodicity from 2015-10-23 01:00:00 to 2015-12-03 16:00:00

The Pipe 1 data, which are recorded roughly every ten minutes, are aggregated to hourly values by applying mean() to each hourly period.
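As a cross-check, the same hourly aggregation can be written with dplyr plus lubridate (an assumption: lubridate is installed, as it is not loaded elsewhere in this report); this sketch groups Pipe 1's readings by the hour they fall in and takes the mean, as the brief requests:

#alternative sketch of the hourly aggregation with dplyr + lubridate
library(lubridate)
Pipe1_hourly <- Waterflow_Pipe1_data %>%
  mutate(Hour = floor_date(DateTime, unit = "hour")) %>%
  group_by(Hour) %>%
  summarise(WaterFlow = mean(WaterFlow))
head(Pipe1_hourly)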

Pipe2 <- xts::xts(Waterflow_Pipe2_data$WaterFlow, order.by=Waterflow_Pipe2_data$DateTime)
Pipes <- merge(merge(Pipe1, Pipe2, join='inner'), Pipe1+Pipe2, join='inner')
head(Pipes)
##                         Pipe1    Pipe2 Pipe1...Pipe2
## 2015-10-23 01:00:00 23.369599 18.81079      42.18039
## 2015-10-23 02:00:00 28.002881 43.08703      71.08991
## 2015-10-23 03:00:00 23.065895 37.98770      61.05360
## 2015-10-23 04:00:00 29.972809 36.12038      66.09319
## 2015-10-23 05:00:00  5.997953 31.85126      37.84921
## 2015-10-23 06:00:00 15.935223 28.23809      44.17331
acf(Pipe1, ylab="ACF Pipe 1", main="") 

pacf(Pipe1, ylab="PACF Pipe 1", main="")

acf(Pipe1h, ylab="ACF Pipe 1 Hourly", main="") 

pacf(Pipe1h, ylab="PACF Pipe 1 Hourly", main="")

acf(Pipe2, ylab="ACF Pipe 2", main="") 

pacf(Pipe2, ylab="PACF Pipe 2", main="")

Autoplot Pipe1

autoplot(Pipe1h)

Autoplot Pipe2

autoplot(Pipe2)
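The brief also asks whether the hourly data are stationary and, if so, for a week-ahead forecast in an Excel readable file. A sketch of how that could be finished for the hourly Pipe 2 series (the same steps would apply to the aggregated Pipe 1 series); the daily period of 24, the use of auto.arima, and the output file name are our own assumptions, and no results are reported here:

#sketch of the remaining Part C steps: stationarity check and one-week forecast
Pipe2_ts <- ts(as.numeric(Pipe2), frequency = 24)   #hourly data, daily period assumed
forecast::ndiffs(Pipe2_ts)                          #0 suggests the level is already stationary
fit_p2 <- forecast::auto.arima(Pipe2_ts)
fc_p2  <- forecast::forecast(fit_p2, h = 7 * 24)    #168 hours = one week ahead
#xlsx::write.xlsx(as.data.frame(fc_p2), file="DATA624_Project1_Pipes.xlsx", sheetName="Pipe2")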