DATA 624 Project 1

library("readxl")
library("dplyr")
library("tidyr")
library("plotly")
library("forecast")
library("fpp2")
library("lubridate")
library("knitr")

Part A – ATM Forecast

Here we load the data and clean it up. This includes removing NA’s, putting correct data formats etc.

atmdata <- read_xlsx("data/ATM624Data.xlsx")
print(str(atmdata))

## Classes 'tbl_df', 'tbl' and 'data.frame':    1474 obs. of  3 variables:
##  $ DATE: num  39934 39934 39935 39935 39936 ...
##  $ ATM : chr  "ATM1" "ATM2" "ATM1" "ATM2" ...
##  $ Cash: num  96 107 82 89 85 90 90 55 99 79 ...
## NULL

atmdata$DATE = as.Date(atmdata$DATE, origin = "1899-12-30")
atmdata = atmdata[!is.na(atmdata$ATM),]

kable(head(atmdata))

DATE	ATM	Cash
2009-05-01	ATM1	96
2009-05-01	ATM2	107
2009-05-02	ATM1	82
2009-05-02	ATM2	89
2009-05-03	ATM1	85
2009-05-03	ATM2	90

kable(summary(atmdata))

DATE	ATM	Cash
Min. :2009-05-01	Length:1460	Min. : 0.0
1st Qu.:2009-07-31	Class :character	1st Qu.: 0.5
Median :2009-10-30	Mode :character	Median : 73.0
Mean :2009-10-30	NA	Mean : 155.6
3rd Qu.:2010-01-29	NA	3rd Qu.: 114.0
Max. :2010-04-30	NA	Max. :10919.8
NA	NA	NA’s :5

We can compare the data using histogram. This will show us what time of data we are dealing with.

ggplot(atmdata) + geom_histogram(aes(x = Cash)) + 
    facet_wrap(~ ATM, scales = "free") + 
    labs(title = "Histogram of Cash by ATM")

From the above histogram it is clear we have some issues with ATM 3 and 4. ATM 1 and 2 seem okay.

atm1 = atmdata %>% filter(ATM=="ATM1") %>% replace_na(list(Cash=0)) %>% select(-DATE,-ATM) %>%  ts(start=c(2009,as.numeric(format(as.Date("2009-05-01"), "%j"))), frequency = 365)
kable(head(atm1))

Cash
96
82
85
90
99
88

atm2 = atmdata %>% filter(ATM=="ATM2") %>% replace_na(list(Cash=0)) %>% select(-DATE,-ATM) %>%  ts(start=c(2009,as.numeric(format(as.Date("2009-05-01"), "%j"))), frequency = 365)
kable(head(atm2))

Cash
107
89
90
55
79
19

atm3 = atmdata %>% filter(ATM=="ATM3",Cash>0) %>% replace_na(list(Cash=0)) %>%select(-DATE,-ATM) %>%   ts(start=c(2010,as.numeric(format(as.Date("2010-04-28"), "%j"))),end=c(2010,as.numeric(format(as.Date("2010-04-30"), "%j"))) ,frequency = 365)
kable(head(atm3))

Cash
96
82
85

atm4 = atmdata %>% filter(ATM=="ATM4") %>% replace_na(list(Cash=0)) %>%select(-DATE,-ATM) %>%  ts(start=c(2009,as.numeric(format(as.Date("2009-05-01"), "%j"))), frequency = 365)
kable(head(atm4))

Cash
776.99342
524.41796
792.81136
908.23846
52.83210
52.20845

autoplot(atm1) +
  labs(title = "Cash withdrawl ATM 1",
       subtitle = "5/1/2009 - 4/30/2010",
       x = "Date",y = "Cash")

print(atmdata)

## # A tibble: 1,460 x 3
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-05-01 ATM1     96
##  2 2009-05-01 ATM2    107
##  3 2009-05-02 ATM1     82
##  4 2009-05-02 ATM2     89
##  5 2009-05-03 ATM1     85
##  6 2009-05-03 ATM2     90
##  7 2009-05-04 ATM1     90
##  8 2009-05-04 ATM2     55
##  9 2009-05-05 ATM1     99
## 10 2009-05-05 ATM2     79
## # ... with 1,450 more rows

autoplot(atm2) +
  labs(title = "Cash withdrawl ATM 2", subtitle = "5/1/2009 - 4/30/2010",
       x = "Date",y = "Cash")

autoplot(atm3) +
  labs(title = "Cash withdrawl ATM3",
       subtitle = "5/1/2009 - 4/30/2010",
       x = "Date",y = "Cash")

autoplot(atm4) +
  labs(title = "Cash withdrawl ATM4",
       subtitle = "5/1/2009 - 4/30/2010",
       x = "Date",y = "Cash")

Comparing all 4 charts we can visually identify the underlying data. ATM 1 & 2 seems to have similar pattern. ATM 3 has some issues. Looks like there could be problems with the data like missing values etc. ATM 4 seems okay except for the spike. This datapoint has to be further investigated.

ATM 1

ATM1_Train = atmdata %>% filter(ATM=="ATM1")%>%replace_na(list(Cash=0)) %>% select(-DATE,-ATM) %>%  ts(start=c(2009,as.numeric(format(as.Date("2009-05-01"), "%j"))),end=c(2010,as.numeric(format(as.Date("2010-03-30"), "%j"))), frequency = 365)

ATM1_Train

## Time Series:
## Start = c(2009, 121) 
## End = c(2010, 89) 
## Frequency = 365 
##   [1]  96  82  85  90  99  88   8 104  87  93  86 111  75   6 102  73  92  82
##  [19]  86  73  20 100  93  90  94  98  73  10  97 102  85  85 108  94  14   3
##  [37]  96 109  96 145  81  16 142   0 120 106   0 108  21 140 110 115   0 108
##  [55]  66  13  99 105 104  98 110  79  16 110  96 114 126 126  73   4  19 114
##  [73]  98  97 114  78  19 102  94 108  91  86  78  16 114 115 108 102 129  79
##  [91]  13 103  90  68  85  99  86  13 116 105 123 114 127 111  34 151 110 115
## [109] 112 132  94  24 122 104 128 120 174  96  13 121 133 118  91 120  88  19
## [127] 150 144 121 105 133 109  18   1 105 112  82 111  79  13 112  99 140 110
## [145] 180  73   7 106 103  93  96 117  80  14 120  91  96  74 108  73  13  93
## [163]  94  76 111  88  76   9  87 105  78  67  90  68   9  78  74  74  60  75
## [181]  61   9  90  86  86  79  90  80  21  93 104 109  88  96  70  15  73  94
## [199] 108  73  87  75  10  92  87  74  73  93  66  18  99  94 136   6 140  73
## [217]   9 140 103 110  90 135  67  12 109  84  92  84 118  68  14  90  92  93
## [235]  85  93  70  13  90  91 102  97  42   2  14 132 108 120 120  90  48  15
## [253]  86 109 115 123 123  60  22  81  98  94  76  96  72  12  91  74 104  74
## [271]  91  66  28 152  97 100  85 123  46  38 138  74  85  78 123  50  28 105
## [289] 109  26  54 105 179 125  84  99 110  80   1 111 115 108 100 152  90   4
## [307] 128  87  84  69 112  62   4  94 102  98  88 123  55   4  92  88  92  92
## [325] 117  73   3  85  88  76  93 116  68  19

ATM1_Test = atmdata %>% filter(ATM=="ATM1")%>%replace_na(list(Cash=0)) %>%select(-DATE,-ATM) %>%  ts(start=c(2010,as.numeric(format(as.Date("2010-03-31"), "%j"))),end=c(2010,as.numeric(format(as.Date("2010-04-30"), "%j"))), frequency = 365)
ATM1_Test

## Time Series:
## Start = c(2010, 90) 
## End = c(2010, 120) 
## Frequency = 365 
##  [1]  96  82  85  90  99  88   8 104  87  93  86 111  75   6 102  73  92  82  86
## [20]  73  20 100  93  90  94  98  73  10  97 102  85

ggtsdisplay(atm1)

Residuals are not normally distributed. The model did not pass the Ljung-Box test. We have to take a look at the intervals. It may not require a lambda value. The model chosen by the function is ETS(A,N,N)

Model1_ATM1 = ets(atm1,lambda=BoxCox.lambda(atm1))
checkresiduals(Model1_ATM1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 1181.4, df = 71, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 73

The AIC is 4800.063, AICc is 4800.129 and BIC is 4811.763. The RMSE is 37.23543

summary(Model1_ATM1)

## ETS(A,N,N) 
## 
## Call:
##  ets(y = atm1, lambda = BoxCox.lambda(atm1)) 
## 
##   Box-Cox transformation: lambda= 1 
## 
##   Smoothing parameters:
##     alpha = 1e-04 
## 
##   Initial states:
##     l = 82.1827 
## 
##   sigma:  37.3379
## 
##      AIC     AICc      BIC 
## 4800.063 4800.129 4811.763 
## 
## Training set error measures:
##                        ME     RMSE      MAE  MPE MAPE MASE       ACF1
## Training set -0.003393192 37.23543 27.94454 -Inf  Inf  NaN 0.02153731

The ETS model fails to capture data trend

autoplot(forecast(Model1_ATM1, 31))

Model 2 is a non-seasonal ARIMA(2,0,2) with 2 lag differences no ordinary AR lags and 2 Ma lags. The model fails the Ljung-Box test, and the ACF test indicates no white noise and residual are not normally distributed around zero.

Model2_ATM1 =auto.arima(atm1,lambda=BoxCox.lambda(atm1))
checkresiduals(Model2_ATM1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,2) with non-zero mean
## Q* = 949.44, df = 68, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 73

From the summary we see AIC is 3665.1, AICc is 3665.34 and BIC is 3688.5. The Root Mean Squared Error (RMSE) is 36.0517

summary(Model2_ATM1)

## Series: atm1 
## ARIMA(2,0,2) with non-zero mean 
## Box Cox transformation: lambda= 1 
## 
## Coefficients:
##           ar1     ar2     ma1      ma2     mean
##       -0.2414  0.3195  0.2672  -0.5364  82.1961
## s.e.   0.1282  0.1107  0.1105   0.0987   1.4986
## 
## sigma^2 estimated as 1318:  log likelihood=-1826.55
## AIC=3665.1   AICc=3665.34   BIC=3688.5
## 
## Training set error measures:
##                      ME    RMSE      MAE  MPE MAPE MASE       ACF1
## Training set 0.01600632 36.0517 27.43585 -Inf  Inf  NaN 0.01751342

autoplot(forecast(Model2_ATM1, 31))

Seen how the two previous models failed we group the series based on weeks. This allowed us to apply a lambda value and convert to white noise.

ATM1_Weekly = ts(atm1, frequency = 7)

ggtsdisplay(ATM1_Weekly)

The model given is ARIMA(0,0,0)(2,0,0)[7] with non-zero mean with 2 seasonal diferencing. The Ljung-Box test fails and the residuals are not normally distributed, ACF showed improvement.

Model3_ATM1_Weekly = auto.arima(ATM1_Weekly,lambda=BoxCox.lambda(ATM1_Weekly))

checkresiduals(Model3_ATM1_Weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,0)(0,1,1)[7]
## Q* = 28.916, df = 13, p-value = 0.006729
## 
## Model df: 1.   Total lags used: 14

As you see in the summary the AIC is 3665.1, AICc is 3665.34 and BIC is 3688.5. The RMSE is 36.0517

summary(Model3_ATM1_Weekly)

## Series: ATM1_Weekly 
## ARIMA(0,0,0)(0,1,1)[7] 
## Box Cox transformation: lambda= 0.5621795 
## 
## Coefficients:
##          sma1
##       -0.6947
## s.e.   0.0398
## 
## sigma^2 estimated as 21.79:  log likelihood=-1061.35
## AIC=2126.7   AICc=2126.73   BIC=2134.46
## 
## Training set error measures:
##                    ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set 1.885474 26.69451 17.00408 -Inf  Inf 0.8841626 0.06009574

autoplot(forecast(Model3_ATM1_Weekly, 31))

ATM1_Forecast = forecast(Model3_ATM1_Weekly, 31, level = 95)

F_ATM1 =data_frame(DATE = rep(max(atmdata$DATE) + 1:31),
           ATM = rep("ATM1"),
           Cash = as.numeric( ATM1_Forecast$mean))

ATM 2

ARIMA(3,1,0)(2,0,0)[7] was applied 3 ordinary lags, 1 differenced and 2 seasonal lags. The model fails Ljung-Box test and residuals do not appear typically distributed.

ATM2_Weekly = ts(atm2, frequency = 7)

ggtsdisplay(ATM2_Weekly)

Model1_ATM2_Weekly =auto.arima(ATM2_Weekly,lambda=BoxCox.lambda(ATM2_Weekly))
checkresiduals(Model1_ATM2_Weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 10.046, df = 9, p-value = 0.3468
## 
## Model df: 5.   Total lags used: 14

Summary reveals the AIC is 2329.29, AICc is 2329.53 and BIC is 2352.57. The RMSE is 24.36601

summary(Model1_ATM2_Weekly)

## Series: ATM2_Weekly 
## ARIMA(2,0,2)(0,1,1)[7] 
## Box Cox transformation: lambda= 0.6498018 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2     sma1
##       -0.4131  -0.8930  0.4475  0.7740  -0.7113
## s.e.   0.0605   0.0532  0.0895  0.0683   0.0419
## 
## sigma^2 estimated as 38.01:  log likelihood=-1158.64
## AIC=2329.29   AICc=2329.53   BIC=2352.57
## 
## Training set error measures:
##                     ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.5811661 24.36601 16.99358 -Inf  Inf 0.8308795 -0.01297589

autoplot(forecast(Model1_ATM2_Weekly, 31, level = 95))

ATM2_Forecast = forecast(Model1_ATM2_Weekly, 31, level = 95)

F_ATM2 =data_frame(DATE = rep(max(atmdata$DATE) + 1:31),
           ATM = rep("ATM2"),
           Cash = as.numeric( ATM2_Forecast$mean) )

ATM 3

Model 3 was filtered with zero due to erroneous forecasting values.

ggtsdisplay(atm3)

ATM3_Forecast_fit = meanf(atm3, 31, level = 95)
autoplot(ATM3_Forecast_fit)

checkresiduals(ATM3_Forecast_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from Mean
## Q* = NA, df = 3, p-value = NA
## 
## Model df: 1.   Total lags used: 4

summary(ATM3_Forecast_fit)

## 
## Forecast method: Mean
## 
## Model Information:
## $mu
## [1] 87.66667
## 
## $mu.se
## [1] 4.255715
## 
## $sd
## [1] 7.371115
## 
## $bootstrap
## [1] FALSE
## 
## $call
## meanf(y = atm3, h = 31, level = 95)
## 
## attr(,"class")
## [1] "meanf"
## 
## Error measures:
##                         ME    RMSE      MAE        MPE     MAPE MASE      ACF1
## Training set -4.737024e-15 6.01849 5.555556 -0.4557562 6.242793  NaN -0.295501
## 
## Forecasts:
##           Point Forecast    Lo 95    Hi 95
## 2010.3288       87.66667 51.04494 124.2884
## 2010.3315       87.66667 51.04494 124.2884
## 2010.3342       87.66667 51.04494 124.2884
## 2010.3370       87.66667 51.04494 124.2884
## 2010.3397       87.66667 51.04494 124.2884
## 2010.3425       87.66667 51.04494 124.2884
## 2010.3452       87.66667 51.04494 124.2884
## 2010.3479       87.66667 51.04494 124.2884
## 2010.3507       87.66667 51.04494 124.2884
## 2010.3534       87.66667 51.04494 124.2884
## 2010.3562       87.66667 51.04494 124.2884
## 2010.3589       87.66667 51.04494 124.2884
## 2010.3616       87.66667 51.04494 124.2884
## 2010.3644       87.66667 51.04494 124.2884
## 2010.3671       87.66667 51.04494 124.2884
## 2010.3699       87.66667 51.04494 124.2884
## 2010.3726       87.66667 51.04494 124.2884
## 2010.3753       87.66667 51.04494 124.2884
## 2010.3781       87.66667 51.04494 124.2884
## 2010.3808       87.66667 51.04494 124.2884
## 2010.3836       87.66667 51.04494 124.2884
## 2010.3863       87.66667 51.04494 124.2884
## 2010.3890       87.66667 51.04494 124.2884
## 2010.3918       87.66667 51.04494 124.2884
## 2010.3945       87.66667 51.04494 124.2884
## 2010.3973       87.66667 51.04494 124.2884
## 2010.4000       87.66667 51.04494 124.2884
## 2010.4027       87.66667 51.04494 124.2884
## 2010.4055       87.66667 51.04494 124.2884
## 2010.4082       87.66667 51.04494 124.2884
## 2010.4110       87.66667 51.04494 124.2884

F_ATM3 =data_frame(DATE = rep(max(atmdata$DATE) + 1:31),
           ATM = rep("atm3"),
           Cash = as.numeric( ATM3_Forecast_fit$mean))

ATM 4

The model was ARIMA(0,0,0)(1,0,0)[7] with non-zero mean based on RMSE. Will use the second model.

ATM4_Weekly = ts(atm4, frequency = 7)
ggtsdisplay(ATM4_Weekly)

Model1_ATM4_Weekly =auto.arima(ATM4_Weekly,lambda=BoxCox.lambda(ATM4_Weekly))

checkresiduals(Model1_ATM4_Weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,0)(2,0,0)[7] with non-zero mean
## Q* = 17.625, df = 11, p-value = 0.09069
## 
## Model df: 3.   Total lags used: 14

Summary reveals the AIC is 979.18, AICc is 979.29 and BIC is 94.78. The RMSE is 678.9958

summary(Model1_ATM4_Weekly)

## Series: ATM4_Weekly 
## ARIMA(0,0,0)(2,0,0)[7] with non-zero mean 
## Box Cox transformation: lambda= -0.07372558 
## 
## Coefficients:
##         sar1    sar2    mean
##       0.2487  0.1947  4.4864
## s.e.  0.0521  0.0525  0.0841
## 
## sigma^2 estimated as 0.8418:  log likelihood=-485.59
## AIC=979.18   AICc=979.29   BIC=994.78
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE    MAPE      MASE
## Training set 209.7822 678.9958 317.1943 -239.7643 304.041 0.7892482
##                      ACF1
## Training set -0.005100994

autoplot(forecast(Model1_ATM4_Weekly, 31, level = 95))

ATM4_Forecast = forecast(Model1_ATM4_Weekly, 31, level = 95)

Model 2 is an ETS(M,N,A) multiplicative errors and there is no trend and additive seasonality. The model fails Ljung-Box test, but appear to have residual around 0 somewhat normal.

Model2_ATM4_Weekly = ets(ATM4_Weekly)

checkresiduals(Model2_ATM4_Weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,A)
## Q* = 8.1066, df = 5, p-value = 0.1505
## 
## Model df: 9.   Total lags used: 14

Summary reveals the AIC is 6690.624, AICc is 6691.246 and BIC is 6729.623. The RMSE is 645.1182

summary(Model2_ATM4_Weekly)

## ETS(M,N,A) 
## 
## Call:
##  ets(y = ATM4_Weekly) 
## 
##   Smoothing parameters:
##     alpha = 0.0013 
##     gamma = 0.003 
## 
##   Initial states:
##     l = 372.4108 
##     s = -146.6545 -57.0431 217.4408 -6.2437 37.4918 5.355
##            -50.3463
## 
##   sigma:  1.2874
## 
##      AIC     AICc      BIC 
## 6690.624 6691.246 6729.623 
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 77.03608 645.1182 312.3148 -510.4324 551.5781 0.7771069
##                     ACF1
## Training set -0.01055406

autoplot(forecast(Model2_ATM4_Weekly, 31, level = 95))

ATM4_Forecast_ets = forecast(Model2_ATM4_Weekly, 31, level = 95)

F_ATM4 =data_frame(DATE = rep(max(atmdata$DATE) + 1:31),
           ATM = rep("ATM4"),
           Cash = as.numeric( ATM4_Forecast_ets$mean))

ATM_Fs = rbind(F_ATM1,F_ATM2,F_ATM3,F_ATM4)
write.csv(ATM_Fs,"Data/Forecast_ATM.csv")

PART B - Customer Forecast

The NA values were replaced with the median value. Here ARIMA and ETS with BoxCox of 0.082 was used

powerdata <- read_xlsx("data/ResidentialCustomerForecastLoad-624.xlsx")
sum(is.na(powerdata))

## [1] 1

powerdata$KWH[is.na(powerdata$KWH)]= median(powerdata$KWH,na.rm = TRUE)
Power_ts_All =ts(powerdata[,"KWH"],start = c(1998,1),frequency = 12)

Split dataset into training and test set.

Power_ts_Train = ts(powerdata[,"KWH"],start = c(1998,1),end=c(2012,12),frequency = 12)
Power_ts_Test = ts(powerdata[,"KWH"],start = c(2013,1),end=c(2013,12),frequency = 12)
autoplot(Power_ts_All) + labs(title = "Monthly Power Usage Jan 1998 Dec 2013")

We can see seasonality form the plot. However in 2010 there is a dip. Apart from that dip the trend was consistent

ggseasonplot(Power_ts_All)

ggtsdisplay - Plots a time series along with its acf and either its pacf, lagged scatterplot or spectrum.

ggtsdisplay(Power_ts_Train)

ARIMA(1,0,0)(2,0,0)[12] ordinary lag of 1 and 2 seasonal lags was used

Power_Model1 =auto.arima(Power_ts_Train,lambda=BoxCox.lambda(Power_ts_Train))

The residual appears normal residuals mostly around 0, and Ljung-Box test had a p-value = 0.7106 which indicated white noise.

checkresiduals(Power_Model1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(2,0,0)[12] with non-zero mean
## Q* = 15.607, df = 20, p-value = 0.7407
## 
## Model df: 4.   Total lags used: 24

Summary reveals the AIC is 436.79, AICc is 437.13 and BIC is 452.75. The RMSE is 1088497

summary(Power_Model1)

## Series: Power_ts_Train 
## ARIMA(0,0,1)(2,0,0)[12] with non-zero mean 
## Box Cox transformation: lambda= 0.0827662 
## 
## Coefficients:
##          ma1    sar1    sar2     mean
##       0.1972  0.2706  0.2718  32.0780
## s.e.  0.0745  0.0702  0.0729   0.1376
## 
## sigma^2 estimated as 0.6283:  log likelihood=-213.39
## AIC=436.79   AICc=437.13   BIC=452.75
## 
## Training set error measures:
##                    ME    RMSE    MAE       MPE     MAPE     MASE      ACF1
## Training set 139232.9 1088497 725959 -4.279079 15.03363 1.028716 0.1586741

autoplot(forecast(Power_Model1, 12, level = 95))+autolayer(Power_ts_Test,series = "Test Data")

Power_Model1_Forecast = forecast(Power_Model1, 12, level = 95)

Here we look for lower RMSE. In the test set we see a RMSE of 1126090

accuracy(Power_Model1_Forecast,Power_ts_Test)

##                     ME    RMSE      MAE        MPE     MAPE     MASE      ACF1
## Training set  139232.9 1088497 725959.0  -4.279079 15.03363 1.028716 0.1586741
## Test set     -668953.2 1126090 995823.3 -14.025170 18.05111 1.411126 0.3563829
##              Theil's U
## Training set        NA
## Test set      1.061721

Model 2 is based on ets the model returned is ETS(A,N,A) with Additive errors no trend and additive seasonality. The Ljung-Box test has a p-value = 0.1249, ACF and residuals seem normally distributed.

Power_Model2 =ets(Power_ts_Train,lambda=BoxCox.lambda(Power_ts_Train))
checkresiduals(Power_Model2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,A)
## Q* = 14.774, df = 10, p-value = 0.1405
## 
## Model df: 14.   Total lags used: 24

Summary shows the AIC is 824.2822, AICc is 827.2091 and BIC is 872.1766. The RMSE is 876009.5

summary(Power_Model2)

## ETS(A,N,A) 
## 
## Call:
##  ets(y = Power_ts_Train, lambda = BoxCox.lambda(Power_ts_Train)) 
## 
##   Box-Cox transformation: lambda= 0.0828 
## 
##   Smoothing parameters:
##     alpha = 0.0356 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 31.8939 
##     s = -0.1916 -0.9788 -0.3502 0.6621 0.9525 0.2372
##            0.0144 -0.7993 -0.6584 -0.155 0.4117 0.8553
## 
##   sigma:  0.7049
## 
##      AIC     AICc      BIC 
## 824.2822 827.2091 872.1766 
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE    MAPE      MASE      ACF1
## Training set 188310.8 876009.5 572516.3 -2.112802 12.3411 0.8112813 0.2874506

Compare training and test model data

autoplot(forecast(Power_Model2, 12, level = 95))+autolayer(Power_ts_Test,series = "Test Data")

Power_Model2_Forecast = forecast(Power_Model2, 12, level = 95)

RMSE for the test data is 1115408.3

accuracy(Power_Model2_Forecast,Power_ts_Test)

##                     ME      RMSE      MAE        MPE    MAPE      MASE
## Training set  188310.8  876009.5 572516.3  -2.112802 12.3411 0.8112813
## Test set     -618978.6 1115408.3 944441.8 -12.100485 15.9204 1.3383164
##                   ACF1 Theil's U
## Training set 0.2874506        NA
## Test set     0.2732840 0.8969385

The first model has RMSE lower score and has better prediction. We are looking for lower scores. I will pick the first model.

powerdf = data.frame(Date = paste0(month.abb,"-2014"),
           KWH = as.numeric(Power_Model1_Forecast$mean) )
write.csv(powerdf,"/Data/Power_Forecast2014.csv")

PART C - Waterflow

w1 = read_excel("Data/Waterflow_Pipe1.xlsx",col_types =c("date", "numeric"))
w2 = read_excel("Data/Waterflow_Pipe2.xlsx",col_types =c("date", "numeric"))
colnames(w1)= c("date_time","WaterFlow")
colnames(w2)= c("Date_Time","WaterFlow")

Waterdf= w1 %>% mutate(Date_Time = lubridate::round_date(date_time,"hour") ) %>% select(Date_Time,WaterFlow) %>% bind_rows(w2) %>% group_by(Date_Time) %>% summarize(WaterFlowF = mean(WaterFlow, na.rm = T))

Create time series with a frequency of 24 to simulate hourly other forms frequencies and times were tested. We will split the data into training and test sets. We will examine the complete time series data and proceed to view the ts display.

Water_ts = ts(Waterdf$WaterFlowF,frequency = 24)

autoplot(Water_ts)

The ts ACF plot reveals that correlations are not random and that the time series is not white noise and can be used to forecast future values.

ggtsdisplay(Water_ts)

The model we ran was ARIMA(0,1,1). The Arima model has been differenced once, and one past error has been used. We passed lambda -0.82 simulating an inverse. The residuals are close to zero with few outliers.

Water_Model1 =auto.arima(Water_ts,lambda=BoxCox.lambda(Water_ts),stepwise = FALSE)

checkresiduals(Water_Model1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 41.388, df = 47, p-value = 0.7034
## 
## Model df: 1.   Total lags used: 48

Summary shows the AIC is -3270.77, AICc is -3270.76 and BIC is -3260.96. The RMSE is 16.73475

summary(Water_Model1)

## Series: Water_ts 
## ARIMA(0,1,1) 
## Box Cox transformation: lambda= -0.8243695 
## 
## Coefficients:
##           ma1
##       -0.9869
## s.e.   0.0055
## 
## sigma^2 estimated as 0.002209:  log likelihood=1637.39
## AIC=-3270.77   AICc=-3270.76   BIC=-3260.96
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE     MASE       ACF1
## Training set 7.689601 16.73475 12.72097 -1.78783 42.43292 0.847649 0.05335534

Water_Model1_Forecast = forecast(Water_Model1, 168, level=95)

autoplot(forecast(Water_Model1, 168, level = 95))

Model two consist of an ets model. The Ljung-Box test identifies white noise residuals close to zero and some random autocorrelations. The model selected was ETS(A,N,N) with additive errors no seasonality and no trend. AICc 798.2221 is and RMSE is 16.76823.

Water_Model2 = ets(Water_ts,lambda=BoxCox.lambda(Water_ts))
checkresiduals(Water_Model2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 41.298, df = 46, p-value = 0.6692
## 
## Model df: 2.   Total lags used: 48

summary(Water_Model2)

## ETS(A,N,N) 
## 
## Call:
##  ets(y = Water_ts, lambda = BoxCox.lambda(Water_ts)) 
## 
##   Box-Cox transformation: lambda= -0.8244 
## 
##   Smoothing parameters:
##     alpha = 0.0119 
## 
##   Initial states:
##     l = 1.1247 
## 
##   sigma:  0.047
## 
##      AIC     AICc      BIC 
## 798.1980 798.2221 812.9243 
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE     MASE       ACF1
## Training set 7.738182 16.76823 12.74726 -1.625442 42.41892 0.849401 0.05613023

autoplot(forecast(Water_Model2, 168, level = 95))

Water_Model2_Forecast = forecast(Water_Model2, 168, level=95)

The first model has better RMSE and AICc and is a better choice.

Water_csv= data_frame(Date_Time = max(Waterdf$Date_Time) + lubridate::hours(1:168),
           WaterFlowF = as.numeric( Water_Model1_Forecast$mean) )

write.csv(Water_csv,"Data/Water_ForecastWeek.csv")