library("readxl")
library("dplyr")
library("tidyr")
library("plotly")
library("forecast")
library("fpp2")
library("lubridate")
library("knitr")
Here we load the data and clean it up. This includes removing NA’s, putting correct data formats etc.
atmdata <- read_xlsx("data/ATM624Data.xlsx")
print(str(atmdata))
## Classes 'tbl_df', 'tbl' and 'data.frame': 1474 obs. of 3 variables:
## $ DATE: num 39934 39934 39935 39935 39936 ...
## $ ATM : chr "ATM1" "ATM2" "ATM1" "ATM2" ...
## $ Cash: num 96 107 82 89 85 90 90 55 99 79 ...
## NULL
atmdata$DATE = as.Date(atmdata$DATE, origin = "1899-12-30")
atmdata = atmdata[!is.na(atmdata$ATM),]
kable(head(atmdata))
DATE | ATM | Cash |
---|---|---|
2009-05-01 | ATM1 | 96 |
2009-05-01 | ATM2 | 107 |
2009-05-02 | ATM1 | 82 |
2009-05-02 | ATM2 | 89 |
2009-05-03 | ATM1 | 85 |
2009-05-03 | ATM2 | 90 |
kable(summary(atmdata))
DATE | ATM | Cash | |
---|---|---|---|
Min. :2009-05-01 | Length:1460 | Min. : 0.0 | |
1st Qu.:2009-07-31 | Class :character | 1st Qu.: 0.5 | |
Median :2009-10-30 | Mode :character | Median : 73.0 | |
Mean :2009-10-30 | NA | Mean : 155.6 | |
3rd Qu.:2010-01-29 | NA | 3rd Qu.: 114.0 | |
Max. :2010-04-30 | NA | Max. :10919.8 | |
NA | NA | NA’s :5 |
We can compare the data using histogram. This will show us what time of data we are dealing with.
ggplot(atmdata) + geom_histogram(aes(x = Cash)) +
facet_wrap(~ ATM, scales = "free") +
labs(title = "Histogram of Cash by ATM")
From the above histogram it is clear we have some issues with ATM 3 and 4. ATM 1 and 2 seem okay.
atm1 = atmdata %>% filter(ATM=="ATM1") %>% replace_na(list(Cash=0)) %>% select(-DATE,-ATM) %>% ts(start=c(2009,as.numeric(format(as.Date("2009-05-01"), "%j"))), frequency = 365)
kable(head(atm1))
Cash |
---|
96 |
82 |
85 |
90 |
99 |
88 |
atm2 = atmdata %>% filter(ATM=="ATM2") %>% replace_na(list(Cash=0)) %>% select(-DATE,-ATM) %>% ts(start=c(2009,as.numeric(format(as.Date("2009-05-01"), "%j"))), frequency = 365)
kable(head(atm2))
Cash |
---|
107 |
89 |
90 |
55 |
79 |
19 |
atm3 = atmdata %>% filter(ATM=="ATM3",Cash>0) %>% replace_na(list(Cash=0)) %>%select(-DATE,-ATM) %>% ts(start=c(2010,as.numeric(format(as.Date("2010-04-28"), "%j"))),end=c(2010,as.numeric(format(as.Date("2010-04-30"), "%j"))) ,frequency = 365)
kable(head(atm3))
Cash |
---|
96 |
82 |
85 |
atm4 = atmdata %>% filter(ATM=="ATM4") %>% replace_na(list(Cash=0)) %>%select(-DATE,-ATM) %>% ts(start=c(2009,as.numeric(format(as.Date("2009-05-01"), "%j"))), frequency = 365)
kable(head(atm4))
Cash |
---|
776.99342 |
524.41796 |
792.81136 |
908.23846 |
52.83210 |
52.20845 |
autoplot(atm1) +
labs(title = "Cash withdrawl ATM 1",
subtitle = "5/1/2009 - 4/30/2010",
x = "Date",y = "Cash")
print(atmdata)
## # A tibble: 1,460 x 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-01 ATM2 107
## 3 2009-05-02 ATM1 82
## 4 2009-05-02 ATM2 89
## 5 2009-05-03 ATM1 85
## 6 2009-05-03 ATM2 90
## 7 2009-05-04 ATM1 90
## 8 2009-05-04 ATM2 55
## 9 2009-05-05 ATM1 99
## 10 2009-05-05 ATM2 79
## # ... with 1,450 more rows
autoplot(atm2) +
labs(title = "Cash withdrawl ATM 2", subtitle = "5/1/2009 - 4/30/2010",
x = "Date",y = "Cash")
autoplot(atm3) +
labs(title = "Cash withdrawl ATM3",
subtitle = "5/1/2009 - 4/30/2010",
x = "Date",y = "Cash")
autoplot(atm4) +
labs(title = "Cash withdrawl ATM4",
subtitle = "5/1/2009 - 4/30/2010",
x = "Date",y = "Cash")
Comparing all 4 charts we can visually identify the underlying data. ATM 1 & 2 seems to have similar pattern. ATM 3 has some issues. Looks like there could be problems with the data like missing values etc. ATM 4 seems okay except for the spike. This datapoint has to be further investigated.
ATM1_Train = atmdata %>% filter(ATM=="ATM1")%>%replace_na(list(Cash=0)) %>% select(-DATE,-ATM) %>% ts(start=c(2009,as.numeric(format(as.Date("2009-05-01"), "%j"))),end=c(2010,as.numeric(format(as.Date("2010-03-30"), "%j"))), frequency = 365)
ATM1_Train
## Time Series:
## Start = c(2009, 121)
## End = c(2010, 89)
## Frequency = 365
## [1] 96 82 85 90 99 88 8 104 87 93 86 111 75 6 102 73 92 82
## [19] 86 73 20 100 93 90 94 98 73 10 97 102 85 85 108 94 14 3
## [37] 96 109 96 145 81 16 142 0 120 106 0 108 21 140 110 115 0 108
## [55] 66 13 99 105 104 98 110 79 16 110 96 114 126 126 73 4 19 114
## [73] 98 97 114 78 19 102 94 108 91 86 78 16 114 115 108 102 129 79
## [91] 13 103 90 68 85 99 86 13 116 105 123 114 127 111 34 151 110 115
## [109] 112 132 94 24 122 104 128 120 174 96 13 121 133 118 91 120 88 19
## [127] 150 144 121 105 133 109 18 1 105 112 82 111 79 13 112 99 140 110
## [145] 180 73 7 106 103 93 96 117 80 14 120 91 96 74 108 73 13 93
## [163] 94 76 111 88 76 9 87 105 78 67 90 68 9 78 74 74 60 75
## [181] 61 9 90 86 86 79 90 80 21 93 104 109 88 96 70 15 73 94
## [199] 108 73 87 75 10 92 87 74 73 93 66 18 99 94 136 6 140 73
## [217] 9 140 103 110 90 135 67 12 109 84 92 84 118 68 14 90 92 93
## [235] 85 93 70 13 90 91 102 97 42 2 14 132 108 120 120 90 48 15
## [253] 86 109 115 123 123 60 22 81 98 94 76 96 72 12 91 74 104 74
## [271] 91 66 28 152 97 100 85 123 46 38 138 74 85 78 123 50 28 105
## [289] 109 26 54 105 179 125 84 99 110 80 1 111 115 108 100 152 90 4
## [307] 128 87 84 69 112 62 4 94 102 98 88 123 55 4 92 88 92 92
## [325] 117 73 3 85 88 76 93 116 68 19
ATM1_Test = atmdata %>% filter(ATM=="ATM1")%>%replace_na(list(Cash=0)) %>%select(-DATE,-ATM) %>% ts(start=c(2010,as.numeric(format(as.Date("2010-03-31"), "%j"))),end=c(2010,as.numeric(format(as.Date("2010-04-30"), "%j"))), frequency = 365)
ATM1_Test
## Time Series:
## Start = c(2010, 90)
## End = c(2010, 120)
## Frequency = 365
## [1] 96 82 85 90 99 88 8 104 87 93 86 111 75 6 102 73 92 82 86
## [20] 73 20 100 93 90 94 98 73 10 97 102 85
ggtsdisplay(atm1)
Residuals are not normally distributed. The model did not pass the Ljung-Box test. We have to take a look at the intervals. It may not require a lambda value. The model chosen by the function is ETS(A,N,N)
Model1_ATM1 = ets(atm1,lambda=BoxCox.lambda(atm1))
checkresiduals(Model1_ATM1)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 1181.4, df = 71, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 73
The AIC is 4800.063, AICc is 4800.129 and BIC is 4811.763. The RMSE is 37.23543
summary(Model1_ATM1)
## ETS(A,N,N)
##
## Call:
## ets(y = atm1, lambda = BoxCox.lambda(atm1))
##
## Box-Cox transformation: lambda= 1
##
## Smoothing parameters:
## alpha = 1e-04
##
## Initial states:
## l = 82.1827
##
## sigma: 37.3379
##
## AIC AICc BIC
## 4800.063 4800.129 4811.763
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.003393192 37.23543 27.94454 -Inf Inf NaN 0.02153731
The ETS model fails to capture data trend
autoplot(forecast(Model1_ATM1, 31))
Model 2 is a non-seasonal ARIMA(2,0,2) with 2 lag differences no ordinary AR lags and 2 Ma lags. The model fails the Ljung-Box test, and the ACF test indicates no white noise and residual are not normally distributed around zero.
Model2_ATM1 =auto.arima(atm1,lambda=BoxCox.lambda(atm1))
checkresiduals(Model2_ATM1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2) with non-zero mean
## Q* = 949.44, df = 68, p-value < 2.2e-16
##
## Model df: 5. Total lags used: 73
From the summary we see AIC is 3665.1, AICc is 3665.34 and BIC is 3688.5. The Root Mean Squared Error (RMSE) is 36.0517
summary(Model2_ATM1)
## Series: atm1
## ARIMA(2,0,2) with non-zero mean
## Box Cox transformation: lambda= 1
##
## Coefficients:
## ar1 ar2 ma1 ma2 mean
## -0.2414 0.3195 0.2672 -0.5364 82.1961
## s.e. 0.1282 0.1107 0.1105 0.0987 1.4986
##
## sigma^2 estimated as 1318: log likelihood=-1826.55
## AIC=3665.1 AICc=3665.34 BIC=3688.5
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01600632 36.0517 27.43585 -Inf Inf NaN 0.01751342
autoplot(forecast(Model2_ATM1, 31))
Seen how the two previous models failed we group the series based on weeks. This allowed us to apply a lambda value and convert to white noise.
ATM1_Weekly = ts(atm1, frequency = 7)
ggtsdisplay(ATM1_Weekly)
The model given is ARIMA(0,0,0)(2,0,0)[7] with non-zero mean with 2 seasonal diferencing. The Ljung-Box test fails and the residuals are not normally distributed, ACF showed improvement.
Model3_ATM1_Weekly = auto.arima(ATM1_Weekly,lambda=BoxCox.lambda(ATM1_Weekly))
checkresiduals(Model3_ATM1_Weekly)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,0)(0,1,1)[7]
## Q* = 28.916, df = 13, p-value = 0.006729
##
## Model df: 1. Total lags used: 14
As you see in the summary the AIC is 3665.1, AICc is 3665.34 and BIC is 3688.5. The RMSE is 36.0517
summary(Model3_ATM1_Weekly)
## Series: ATM1_Weekly
## ARIMA(0,0,0)(0,1,1)[7]
## Box Cox transformation: lambda= 0.5621795
##
## Coefficients:
## sma1
## -0.6947
## s.e. 0.0398
##
## sigma^2 estimated as 21.79: log likelihood=-1061.35
## AIC=2126.7 AICc=2126.73 BIC=2134.46
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1.885474 26.69451 17.00408 -Inf Inf 0.8841626 0.06009574
autoplot(forecast(Model3_ATM1_Weekly, 31))
ATM1_Forecast = forecast(Model3_ATM1_Weekly, 31, level = 95)
F_ATM1 =data_frame(DATE = rep(max(atmdata$DATE) + 1:31),
ATM = rep("ATM1"),
Cash = as.numeric( ATM1_Forecast$mean))
ARIMA(3,1,0)(2,0,0)[7] was applied 3 ordinary lags, 1 differenced and 2 seasonal lags. The model fails Ljung-Box test and residuals do not appear typically distributed.
ATM2_Weekly = ts(atm2, frequency = 7)
ggtsdisplay(ATM2_Weekly)
Model1_ATM2_Weekly =auto.arima(ATM2_Weekly,lambda=BoxCox.lambda(ATM2_Weekly))
checkresiduals(Model1_ATM2_Weekly)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 10.046, df = 9, p-value = 0.3468
##
## Model df: 5. Total lags used: 14
Summary reveals the AIC is 2329.29, AICc is 2329.53 and BIC is 2352.57. The RMSE is 24.36601
summary(Model1_ATM2_Weekly)
## Series: ATM2_Weekly
## ARIMA(2,0,2)(0,1,1)[7]
## Box Cox transformation: lambda= 0.6498018
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4131 -0.8930 0.4475 0.7740 -0.7113
## s.e. 0.0605 0.0532 0.0895 0.0683 0.0419
##
## sigma^2 estimated as 38.01: log likelihood=-1158.64
## AIC=2329.29 AICc=2329.53 BIC=2352.57
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.5811661 24.36601 16.99358 -Inf Inf 0.8308795 -0.01297589
autoplot(forecast(Model1_ATM2_Weekly, 31, level = 95))
ATM2_Forecast = forecast(Model1_ATM2_Weekly, 31, level = 95)
F_ATM2 =data_frame(DATE = rep(max(atmdata$DATE) + 1:31),
ATM = rep("ATM2"),
Cash = as.numeric( ATM2_Forecast$mean) )
Model 3 was filtered with zero due to erroneous forecasting values.
ggtsdisplay(atm3)
ATM3_Forecast_fit = meanf(atm3, 31, level = 95)
autoplot(ATM3_Forecast_fit)
checkresiduals(ATM3_Forecast_fit)
##
## Ljung-Box test
##
## data: Residuals from Mean
## Q* = NA, df = 3, p-value = NA
##
## Model df: 1. Total lags used: 4
summary(ATM3_Forecast_fit)
##
## Forecast method: Mean
##
## Model Information:
## $mu
## [1] 87.66667
##
## $mu.se
## [1] 4.255715
##
## $sd
## [1] 7.371115
##
## $bootstrap
## [1] FALSE
##
## $call
## meanf(y = atm3, h = 31, level = 95)
##
## attr(,"class")
## [1] "meanf"
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -4.737024e-15 6.01849 5.555556 -0.4557562 6.242793 NaN -0.295501
##
## Forecasts:
## Point Forecast Lo 95 Hi 95
## 2010.3288 87.66667 51.04494 124.2884
## 2010.3315 87.66667 51.04494 124.2884
## 2010.3342 87.66667 51.04494 124.2884
## 2010.3370 87.66667 51.04494 124.2884
## 2010.3397 87.66667 51.04494 124.2884
## 2010.3425 87.66667 51.04494 124.2884
## 2010.3452 87.66667 51.04494 124.2884
## 2010.3479 87.66667 51.04494 124.2884
## 2010.3507 87.66667 51.04494 124.2884
## 2010.3534 87.66667 51.04494 124.2884
## 2010.3562 87.66667 51.04494 124.2884
## 2010.3589 87.66667 51.04494 124.2884
## 2010.3616 87.66667 51.04494 124.2884
## 2010.3644 87.66667 51.04494 124.2884
## 2010.3671 87.66667 51.04494 124.2884
## 2010.3699 87.66667 51.04494 124.2884
## 2010.3726 87.66667 51.04494 124.2884
## 2010.3753 87.66667 51.04494 124.2884
## 2010.3781 87.66667 51.04494 124.2884
## 2010.3808 87.66667 51.04494 124.2884
## 2010.3836 87.66667 51.04494 124.2884
## 2010.3863 87.66667 51.04494 124.2884
## 2010.3890 87.66667 51.04494 124.2884
## 2010.3918 87.66667 51.04494 124.2884
## 2010.3945 87.66667 51.04494 124.2884
## 2010.3973 87.66667 51.04494 124.2884
## 2010.4000 87.66667 51.04494 124.2884
## 2010.4027 87.66667 51.04494 124.2884
## 2010.4055 87.66667 51.04494 124.2884
## 2010.4082 87.66667 51.04494 124.2884
## 2010.4110 87.66667 51.04494 124.2884
F_ATM3 =data_frame(DATE = rep(max(atmdata$DATE) + 1:31),
ATM = rep("atm3"),
Cash = as.numeric( ATM3_Forecast_fit$mean))
The model was ARIMA(0,0,0)(1,0,0)[7] with non-zero mean based on RMSE. Will use the second model.
ATM4_Weekly = ts(atm4, frequency = 7)
ggtsdisplay(ATM4_Weekly)
Model1_ATM4_Weekly =auto.arima(ATM4_Weekly,lambda=BoxCox.lambda(ATM4_Weekly))
checkresiduals(Model1_ATM4_Weekly)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,0)(2,0,0)[7] with non-zero mean
## Q* = 17.625, df = 11, p-value = 0.09069
##
## Model df: 3. Total lags used: 14
Summary reveals the AIC is 979.18, AICc is 979.29 and BIC is 94.78. The RMSE is 678.9958
summary(Model1_ATM4_Weekly)
## Series: ATM4_Weekly
## ARIMA(0,0,0)(2,0,0)[7] with non-zero mean
## Box Cox transformation: lambda= -0.07372558
##
## Coefficients:
## sar1 sar2 mean
## 0.2487 0.1947 4.4864
## s.e. 0.0521 0.0525 0.0841
##
## sigma^2 estimated as 0.8418: log likelihood=-485.59
## AIC=979.18 AICc=979.29 BIC=994.78
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 209.7822 678.9958 317.1943 -239.7643 304.041 0.7892482
## ACF1
## Training set -0.005100994
autoplot(forecast(Model1_ATM4_Weekly, 31, level = 95))
ATM4_Forecast = forecast(Model1_ATM4_Weekly, 31, level = 95)
Model 2 is an ETS(M,N,A) multiplicative errors and there is no trend and additive seasonality. The model fails Ljung-Box test, but appear to have residual around 0 somewhat normal.
Model2_ATM4_Weekly = ets(ATM4_Weekly)
checkresiduals(Model2_ATM4_Weekly)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,A)
## Q* = 8.1066, df = 5, p-value = 0.1505
##
## Model df: 9. Total lags used: 14
Summary reveals the AIC is 6690.624, AICc is 6691.246 and BIC is 6729.623. The RMSE is 645.1182
summary(Model2_ATM4_Weekly)
## ETS(M,N,A)
##
## Call:
## ets(y = ATM4_Weekly)
##
## Smoothing parameters:
## alpha = 0.0013
## gamma = 0.003
##
## Initial states:
## l = 372.4108
## s = -146.6545 -57.0431 217.4408 -6.2437 37.4918 5.355
## -50.3463
##
## sigma: 1.2874
##
## AIC AICc BIC
## 6690.624 6691.246 6729.623
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 77.03608 645.1182 312.3148 -510.4324 551.5781 0.7771069
## ACF1
## Training set -0.01055406
autoplot(forecast(Model2_ATM4_Weekly, 31, level = 95))
ATM4_Forecast_ets = forecast(Model2_ATM4_Weekly, 31, level = 95)
F_ATM4 =data_frame(DATE = rep(max(atmdata$DATE) + 1:31),
ATM = rep("ATM4"),
Cash = as.numeric( ATM4_Forecast_ets$mean))
ATM_Fs = rbind(F_ATM1,F_ATM2,F_ATM3,F_ATM4)
write.csv(ATM_Fs,"Data/Forecast_ATM.csv")
The NA values were replaced with the median value. Here ARIMA and ETS with BoxCox of 0.082 was used
powerdata <- read_xlsx("data/ResidentialCustomerForecastLoad-624.xlsx")
sum(is.na(powerdata))
## [1] 1
powerdata$KWH[is.na(powerdata$KWH)]= median(powerdata$KWH,na.rm = TRUE)
Power_ts_All =ts(powerdata[,"KWH"],start = c(1998,1),frequency = 12)
Split dataset into training and test set.
Power_ts_Train = ts(powerdata[,"KWH"],start = c(1998,1),end=c(2012,12),frequency = 12)
Power_ts_Test = ts(powerdata[,"KWH"],start = c(2013,1),end=c(2013,12),frequency = 12)
autoplot(Power_ts_All) + labs(title = "Monthly Power Usage Jan 1998 Dec 2013")
We can see seasonality form the plot. However in 2010 there is a dip. Apart from that dip the trend was consistent
ggseasonplot(Power_ts_All)
ggtsdisplay - Plots a time series along with its acf and either its pacf, lagged scatterplot or spectrum.
ggtsdisplay(Power_ts_Train)
ARIMA(1,0,0)(2,0,0)[12] ordinary lag of 1 and 2 seasonal lags was used
Power_Model1 =auto.arima(Power_ts_Train,lambda=BoxCox.lambda(Power_ts_Train))
The residual appears normal residuals mostly around 0, and Ljung-Box test had a p-value = 0.7106 which indicated white noise.
checkresiduals(Power_Model1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(2,0,0)[12] with non-zero mean
## Q* = 15.607, df = 20, p-value = 0.7407
##
## Model df: 4. Total lags used: 24
Summary reveals the AIC is 436.79, AICc is 437.13 and BIC is 452.75. The RMSE is 1088497
summary(Power_Model1)
## Series: Power_ts_Train
## ARIMA(0,0,1)(2,0,0)[12] with non-zero mean
## Box Cox transformation: lambda= 0.0827662
##
## Coefficients:
## ma1 sar1 sar2 mean
## 0.1972 0.2706 0.2718 32.0780
## s.e. 0.0745 0.0702 0.0729 0.1376
##
## sigma^2 estimated as 0.6283: log likelihood=-213.39
## AIC=436.79 AICc=437.13 BIC=452.75
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 139232.9 1088497 725959 -4.279079 15.03363 1.028716 0.1586741
autoplot(forecast(Power_Model1, 12, level = 95))+autolayer(Power_ts_Test,series = "Test Data")
Power_Model1_Forecast = forecast(Power_Model1, 12, level = 95)
Here we look for lower RMSE. In the test set we see a RMSE of 1126090
accuracy(Power_Model1_Forecast,Power_ts_Test)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 139232.9 1088497 725959.0 -4.279079 15.03363 1.028716 0.1586741
## Test set -668953.2 1126090 995823.3 -14.025170 18.05111 1.411126 0.3563829
## Theil's U
## Training set NA
## Test set 1.061721
Model 2 is based on ets the model returned is ETS(A,N,A) with Additive errors no trend and additive seasonality. The Ljung-Box test has a p-value = 0.1249, ACF and residuals seem normally distributed.
Power_Model2 =ets(Power_ts_Train,lambda=BoxCox.lambda(Power_ts_Train))
checkresiduals(Power_Model2)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 14.774, df = 10, p-value = 0.1405
##
## Model df: 14. Total lags used: 24
Summary shows the AIC is 824.2822, AICc is 827.2091 and BIC is 872.1766. The RMSE is 876009.5
summary(Power_Model2)
## ETS(A,N,A)
##
## Call:
## ets(y = Power_ts_Train, lambda = BoxCox.lambda(Power_ts_Train))
##
## Box-Cox transformation: lambda= 0.0828
##
## Smoothing parameters:
## alpha = 0.0356
## gamma = 1e-04
##
## Initial states:
## l = 31.8939
## s = -0.1916 -0.9788 -0.3502 0.6621 0.9525 0.2372
## 0.0144 -0.7993 -0.6584 -0.155 0.4117 0.8553
##
## sigma: 0.7049
##
## AIC AICc BIC
## 824.2822 827.2091 872.1766
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 188310.8 876009.5 572516.3 -2.112802 12.3411 0.8112813 0.2874506
Compare training and test model data
autoplot(forecast(Power_Model2, 12, level = 95))+autolayer(Power_ts_Test,series = "Test Data")
Power_Model2_Forecast = forecast(Power_Model2, 12, level = 95)
RMSE for the test data is 1115408.3
accuracy(Power_Model2_Forecast,Power_ts_Test)
## ME RMSE MAE MPE MAPE MASE
## Training set 188310.8 876009.5 572516.3 -2.112802 12.3411 0.8112813
## Test set -618978.6 1115408.3 944441.8 -12.100485 15.9204 1.3383164
## ACF1 Theil's U
## Training set 0.2874506 NA
## Test set 0.2732840 0.8969385
The first model has RMSE lower score and has better prediction. We are looking for lower scores. I will pick the first model.
powerdf = data.frame(Date = paste0(month.abb,"-2014"),
KWH = as.numeric(Power_Model1_Forecast$mean) )
write.csv(powerdf,"/Data/Power_Forecast2014.csv")
w1 = read_excel("Data/Waterflow_Pipe1.xlsx",col_types =c("date", "numeric"))
w2 = read_excel("Data/Waterflow_Pipe2.xlsx",col_types =c("date", "numeric"))
colnames(w1)= c("date_time","WaterFlow")
colnames(w2)= c("Date_Time","WaterFlow")
Waterdf= w1 %>% mutate(Date_Time = lubridate::round_date(date_time,"hour") ) %>% select(Date_Time,WaterFlow) %>% bind_rows(w2) %>% group_by(Date_Time) %>% summarize(WaterFlowF = mean(WaterFlow, na.rm = T))
Create time series with a frequency of 24 to simulate hourly other forms frequencies and times were tested. We will split the data into training and test sets. We will examine the complete time series data and proceed to view the ts display.
Water_ts = ts(Waterdf$WaterFlowF,frequency = 24)
autoplot(Water_ts)
The ts ACF plot reveals that correlations are not random and that the time series is not white noise and can be used to forecast future values.
ggtsdisplay(Water_ts)
The model we ran was ARIMA(0,1,1). The Arima model has been differenced once, and one past error has been used. We passed lambda -0.82 simulating an inverse. The residuals are close to zero with few outliers.
Water_Model1 =auto.arima(Water_ts,lambda=BoxCox.lambda(Water_ts),stepwise = FALSE)
checkresiduals(Water_Model1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)
## Q* = 41.388, df = 47, p-value = 0.7034
##
## Model df: 1. Total lags used: 48
Summary shows the AIC is -3270.77, AICc is -3270.76 and BIC is -3260.96. The RMSE is 16.73475
summary(Water_Model1)
## Series: Water_ts
## ARIMA(0,1,1)
## Box Cox transformation: lambda= -0.8243695
##
## Coefficients:
## ma1
## -0.9869
## s.e. 0.0055
##
## sigma^2 estimated as 0.002209: log likelihood=1637.39
## AIC=-3270.77 AICc=-3270.76 BIC=-3260.96
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 7.689601 16.73475 12.72097 -1.78783 42.43292 0.847649 0.05335534
Water_Model1_Forecast = forecast(Water_Model1, 168, level=95)
autoplot(forecast(Water_Model1, 168, level = 95))
Model two consist of an ets model. The Ljung-Box test identifies white noise residuals close to zero and some random autocorrelations. The model selected was ETS(A,N,N) with additive errors no seasonality and no trend. AICc 798.2221 is and RMSE is 16.76823.
Water_Model2 = ets(Water_ts,lambda=BoxCox.lambda(Water_ts))
checkresiduals(Water_Model2)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 41.298, df = 46, p-value = 0.6692
##
## Model df: 2. Total lags used: 48
summary(Water_Model2)
## ETS(A,N,N)
##
## Call:
## ets(y = Water_ts, lambda = BoxCox.lambda(Water_ts))
##
## Box-Cox transformation: lambda= -0.8244
##
## Smoothing parameters:
## alpha = 0.0119
##
## Initial states:
## l = 1.1247
##
## sigma: 0.047
##
## AIC AICc BIC
## 798.1980 798.2221 812.9243
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 7.738182 16.76823 12.74726 -1.625442 42.41892 0.849401 0.05613023
autoplot(forecast(Water_Model2, 168, level = 95))
Water_Model2_Forecast = forecast(Water_Model2, 168, level=95)
The first model has better RMSE and AICc and is a better choice.
Water_csv= data_frame(Date_Time = max(Waterdf$Date_Time) + lubridate::hours(1:168),
WaterFlowF = as.numeric( Water_Model1_Forecast$mean) )
write.csv(Water_csv,"Data/Water_ForecastWeek.csv")