In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.
The data file provided will be saved to local directory and from there, will be read into R
View: data structures, descriptive statistics, complete and incomplete cases, visualize through box plots, add columns
Prepare data by imputing missing values, treating outliers, removing cmplete null rows
Create and plot the time series for 4 ATM machines, having weekly frequency
For each ATM machine, build 3 models using Holt-Winter, ETS, and Arima for ATM1, ATM2, ATM4. For ATM3 with only 3 withdrawals, use mean for forecasting
Evaluate models by looking at summaries and accuracy measures such as AICc, RMSE, p-values
Put in comments to better document snippet of codes
# Load from working directory
atm624data <- readxl::read_xlsx("ATM624Data.xlsx")
# View data structure
str(atm624data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1474 obs. of 3 variables:
## $ DATE: num 39934 39934 39935 39935 39936 ...
## $ ATM : chr "ATM1" "ATM2" "ATM1" "ATM2" ...
## $ Cash: num 96 107 82 89 85 90 90 55 99 79 ...
# Add columns: date readability and aggregation
atm624data$Date2 = as.Date(atm624data$DATE, origin = "1899-12-30")
atm624data$Day = weekdays(atm624data$Date2)
# View sample observations
head(atm624data)
## # A tibble: 6 x 5
## DATE ATM Cash Date2 Day
## <dbl> <chr> <dbl> <date> <chr>
## 1 39934 ATM1 96 2009-05-01 Friday
## 2 39934 ATM2 107 2009-05-01 Friday
## 3 39935 ATM1 82 2009-05-02 Saturday
## 4 39935 ATM2 89 2009-05-02 Saturday
## 5 39936 ATM1 85 2009-05-03 Sunday
## 6 39936 ATM2 90 2009-05-03 Sunday
# View incomplete cases
atm.inc <- atm624data[!complete.cases(atm624data),] %>%
group_by(ATM, Day)
atm.inc
## # A tibble: 19 x 5
## # Groups: ATM, Day [12]
## DATE ATM Cash Date2 Day
## <dbl> <chr> <dbl> <date> <chr>
## 1 39977 ATM1 NA 2009-06-13 Saturday
## 2 39980 ATM1 NA 2009-06-16 Tuesday
## 3 39982 ATM2 NA 2009-06-18 Thursday
## 4 39986 ATM1 NA 2009-06-22 Monday
## 5 39988 ATM2 NA 2009-06-24 Wednesday
## 6 40299 <NA> NA 2010-05-01 Saturday
## 7 40300 <NA> NA 2010-05-02 Sunday
## 8 40301 <NA> NA 2010-05-03 Monday
## 9 40302 <NA> NA 2010-05-04 Tuesday
## 10 40303 <NA> NA 2010-05-05 Wednesday
## 11 40304 <NA> NA 2010-05-06 Thursday
## 12 40305 <NA> NA 2010-05-07 Friday
## 13 40306 <NA> NA 2010-05-08 Saturday
## 14 40307 <NA> NA 2010-05-09 Sunday
## 15 40308 <NA> NA 2010-05-10 Monday
## 16 40309 <NA> NA 2010-05-11 Tuesday
## 17 40310 <NA> NA 2010-05-12 Wednesday
## 18 40311 <NA> NA 2010-05-13 Thursday
## 19 40312 <NA> NA 2010-05-14 Friday
# Visualize to see outlier, if any
atm <- ggplot(atm624data, aes(x=ATM, y=Cash, group=ATM)) +
geom_boxplot(aes(fill=ATM))
atm
## Warning: Removed 19 rows containing non-finite values (stat_boxplot).
# Group statistics per ATM and day
# To be used for imputing missing Cash value and for replacing outlier
atm.comp <- filter(atm624data, Cash > 0) %>%
select(ATM, Day, Cash) %>%
group_by(ATM, Day) %>%
summarize(count = n()
,min = min(Cash)
,max = max(Cash)
,mean = mean(Cash)
,median = median(Cash)
)
atm.comp
## # A tibble: 24 x 7
## # Groups: ATM [4]
## ATM Day count min max mean median
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 Friday 53 1 152 98.6 98
## 2 ATM1 Monday 51 6 126 86 85
## 3 ATM1 Saturday 51 69 144 96.6 96
## 4 ATM1 Sunday 52 26 152 103. 108
## 5 ATM1 Thursday 52 4 125 31.7 16
## 6 ATM1 Tuesday 51 1 180 89.6 99
## 7 ATM1 Wednesday 52 2 179 82.2 78.5
## 8 ATM2 Friday 53 2 147 92.0 104
## 9 ATM2 Monday 52 2 132 58.7 63.5
## 10 ATM2 Saturday 52 15 147 76.0 72.5
## # ... with 14 more rows
# replace outlier with median
row.max <- which.max(atm624data$Cash)
atm624data[row.max,]
## # A tibble: 1 x 5
## DATE ATM Cash Date2 Day
## <dbl> <chr> <dbl> <date> <chr>
## 1 40218 ATM4 10920. 2010-02-09 Tuesday
atm624data[row.max,"Cash"] <-
as.numeric(subset(atm.comp, ATM == 'ATM4' & Day == 'Tuesday',"median"))
atm624data[row.max,]
## # A tibble: 1 x 5
## DATE ATM Cash Date2 Day
## <dbl> <chr> <dbl> <date> <chr>
## 1 40218 ATM4 307. 2010-02-09 Tuesday
# impute missing Cash value with mean
atmfill <- atm624data
row.fillCash = c()
for (row in 1:nrow(atmfill)) {
if(!is.na(atmfill[row,2]) & is.na(atmfill[row,3]))
{
row.fillCash <- c(row.fillCash, row)
atmfill[row,3] <-
as.numeric(subset(atm.comp, ATM == as.character(atmfill[row,2]) &
Day == as.character(atmfill[row,5]),"mean"))
}
}
atm624data[row.fillCash,]
## # A tibble: 5 x 5
## DATE ATM Cash Date2 Day
## <dbl> <chr> <dbl> <date> <chr>
## 1 39977 ATM1 NA 2009-06-13 Saturday
## 2 39980 ATM1 NA 2009-06-16 Tuesday
## 3 39982 ATM2 NA 2009-06-18 Thursday
## 4 39986 ATM1 NA 2009-06-22 Monday
## 5 39988 ATM2 NA 2009-06-24 Wednesday
atmfill[row.fillCash,]
## # A tibble: 5 x 5
## DATE ATM Cash Date2 Day
## <dbl> <chr> <dbl> <date> <chr>
## 1 39977 ATM1 96.6 2009-06-13 Saturday
## 2 39980 ATM1 89.6 2009-06-16 Tuesday
## 3 39982 ATM2 25.5 2009-06-18 Thursday
## 4 39986 ATM1 86 2009-06-22 Monday
## 5 39988 ATM2 43.7 2009-06-24 Wednesday
# remove the rest of missing values (no ATM, no Cash)
atm.df <- atmfill[complete.cases(atmfill),]
#### Create and plot of time-series
# create time series, weekly, starts friday
atm1 <- ts(subset(atm.df, ATM=='ATM1', select=c(Date2, Cash))["Cash"],
frequency = 7, start=c(1,6))
atm2 <- ts(subset(atm.df, ATM=='ATM2', select=c(Date2, Cash))["Cash"],
frequency = 7, start=c(1,6))
atm3 <- ts(subset(atm.df, ATM=='ATM3', select=c(Date2, Cash))["Cash"],
frequency = 7, start=c(1,6))
atm4 <- ts(subset(atm.df, ATM=='ATM4', select=c(Date2, Cash))["Cash"],
frequency = 7, start(1,6))
# time series and correlation plots
ggtsdisplay(atm1)
ggtsdisplay(atm2)
ggtsdisplay(atm3)
ggtsdisplay(atm4)
ATM 1
# Additivie Holt-Winter
fit.hw.atm1 <- hw(atm1, seasonal = "additive")
checkresiduals(fit.hw.atm1)
##
## Ljung-Box test
##
## data: Residuals from Holt-Winters' additive method
## Q* = 28.687, df = 3, p-value = 2.605e-06
##
## Model df: 11. Total lags used: 14
summary(fit.hw.atm1)
##
## Forecast method: Holt-Winters' additive method
##
## Model Information:
## Holt-Winters' additive method
##
## Call:
## hw(y = atm1, seasonal = "additive")
##
## Smoothing parameters:
## alpha = 0.0181
## beta = 1e-04
## gamma = 0.3295
##
## Initial states:
## l = 79.1281
## b = -0.0069
## s = -56.2972 -1.2651 9.0566 2.7969 20.2304 14.0349
## 11.4436
##
## sigma: 24.1802
##
## AIC AICc BIC
## 4491.733 4492.619 4538.531
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.142441 23.81304 15.1104 -107.0129 121.9384 0.8529868 0.1303259
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 53.85714 86.839366 55.85122 117.82752 39.44708 134.23165
## 54.00000 100.532427 69.53915 131.52571 53.13229 147.93256
## 54.14286 72.934232 41.93576 103.93270 25.52616 120.34230
## 54.28571 5.456512 -25.54721 36.46023 -41.95958 52.87261
## 54.42857 99.914315 68.90529 130.92334 52.49011 147.33852
## 54.57143 79.204415 48.19003 110.21880 31.77201 126.63682
## 54.71429 85.359624 54.33982 116.37943 37.91893 132.80032
## 54.85714 86.753591 53.90975 119.59743 36.52327 136.98391
## 55.00000 100.446652 67.59758 133.29572 50.20833 150.68497
## 55.14286 72.848457 39.99410 105.70281 22.60205 123.09486
## 55.28571 5.370737 -27.48896 38.23044 -44.88384 55.62531
## 55.42857 99.828540 66.96344 132.69364 49.56571 150.09137
## 55.57143 79.118641 46.24809 111.98920 28.84747 129.38982
## 55.71429 85.273849 52.39778 118.14992 34.99424 135.55346
# ETS model
fit.ets.atm1 <- ets(atm1, model = "ZZZ")
checkresiduals(fit.ets.atm1)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 28.478, df = 5, p-value = 2.935e-05
##
## Model df: 9. Total lags used: 14
summary(fit.ets.atm1)
## ETS(A,N,A)
##
## Call:
## ets(y = atm1, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.0157
## gamma = 0.3294
##
## Initial states:
## l = 78.2327
## s = -54.1883 -1.6153 13.056 4.5792 16.9074 13.5857
## 7.6753
##
## sigma: 24.0883
##
## AIC AICc BIC
## 4487.011 4487.633 4526.010
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.09806938 23.78951 15.09229 -106.8251 121.6681 0.8519644
## ACF1
## Training set 0.1301807
# Arima model
fit.arima.atm1 <- auto.arima(atm1,seasonal = TRUE,
stepwise = FALSE, approximation = FALSE)
checkresiduals(fit.arima.atm1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(1,1,1)[7]
## Q* = 16.241, df = 11, p-value = 0.1324
##
## Model df: 3. Total lags used: 14
summary(fit.arima.atm1)
## Series: atm1
## ARIMA(0,0,1)(1,1,1)[7]
##
## Coefficients:
## ma1 sar1 sma1
## 0.1726 0.1719 -0.7492
## s.e. 0.0550 0.0793 0.0555
##
## sigma^2 estimated as 558.9: log likelihood=-1640.86
## AIC=3289.71 AICc=3289.83 BIC=3305.24
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.06598054 23.31467 14.61401 -102.9177 117.827 0.8249653
## ACF1
## Training set -0.007945761
ATM1 will be forecasted using ARIMA(0,0,1)(1,1,1)[7]
# Forecast ATM1
fc.arima.atm1 <- forecast(fit.arima.atm1, h=31)
autoplot(fc.arima.atm1)
ATM 2
# Additivie Holt-Winter
fit.hw.atm2 <- hw(atm2, seasonal = "additive")
checkresiduals(fit.hw.atm2)
##
## Ljung-Box test
##
## data: Residuals from Holt-Winters' additive method
## Q* = 35.274, df = 3, p-value = 1.066e-07
##
## Model df: 11. Total lags used: 14
summary(fit.hw.atm2)
##
## Forecast method: Holt-Winters' additive method
##
## Model Information:
## Holt-Winters' additive method
##
## Call:
## hw(y = atm2, seasonal = "additive")
##
## Smoothing parameters:
## alpha = 1e-04
## beta = 1e-04
## gamma = 0.3659
##
## Initial states:
## l = 73.8921
## b = -0.0432
## s = -37.3512 -19.4984 14.0102 -3.4782 5.3113 14.1138
## 26.8924
##
## sigma: 25.2108
##
## AIC AICc BIC
## 4522.203 4523.089 4569.002
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.169966 24.82803 17.47542 -Inf Inf 0.8546081 0.01799577
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 53.85714 67.528233 35.21926 99.83721 18.11592 116.94054
## 54.00000 73.377219 41.06825 105.68619 23.96491 122.78953
## 54.14286 10.238845 -22.07013 42.54782 -39.17347 59.65116
## 54.28571 1.279520 -31.02946 33.58850 -48.13280 50.69184
## 54.42857 100.830699 68.52172 133.13968 51.41838 150.24302
## 54.57143 91.621462 59.31247 123.93045 42.20913 141.03379
## 54.71429 68.292314 35.98332 100.60131 18.87997 117.70466
## 54.85714 67.269474 32.85643 101.68252 14.63926 119.89968
## 55.00000 73.118459 38.70540 107.53152 20.48823 125.74869
## 55.14286 9.980085 -24.43299 44.39316 -42.65017 62.61034
## 55.28571 1.020760 -33.39233 35.43385 -51.60952 53.65104
## 55.42857 100.571939 66.15883 134.98505 47.94163 153.20225
## 55.57143 91.362702 56.94956 125.77584 38.73235 143.99305
## 55.71429 68.033555 33.62039 102.44672 15.40316 120.66395
# ETS model
fit.ets.atm2 <- ets(atm2, model = "ZZZ")
checkresiduals(fit.ets.atm2)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 35.986, df = 5, p-value = 9.561e-07
##
## Model df: 9. Total lags used: 14
summary(fit.ets.atm2)
## ETS(A,N,A)
##
## Call:
## ets(y = atm2, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 1e-04
## gamma = 0.3602
##
## Initial states:
## l = 74.9242
## s = -64.8508 -32.3927 21.7717 5.3454 15.9989 31.6313
## 22.4962
##
## sigma: 24.9972
##
## AIC AICc BIC
## 4514.048 4514.669 4553.047
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.8039085 24.68711 17.36852 -Inf Inf 0.8493804 0.01219414
# Arima model
fit.arima.atm2 <- auto.arima(atm2,seasonal = TRUE,
stepwise = FALSE, approximation = FALSE)
checkresiduals(fit.arima.atm2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 9.3093, df = 9, p-value = 0.4092
##
## Model df: 5. Total lags used: 14
summary(fit.arima.atm2)
## Series: atm2
## ARIMA(2,0,2)(0,1,1)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4269 -0.9138 0.4658 0.8038 -0.7498
## s.e. 0.0547 0.0411 0.0851 0.0562 0.0387
##
## sigma^2 estimated as 587.5: log likelihood=-1649.09
## AIC=3310.19 AICc=3310.43 BIC=3333.47
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.8591747 23.83632 16.79494 -Inf Inf 0.8213304 -0.004223441
ATM2 will be forecasted using ARIMA(2,0,2)(0,1,1)[7]
# Forecast ATM2
fc.arima.atm2 <- forecast(fit.arima.atm2, h=31)
autoplot(fc.arima.atm2)
ATM 3
ATM3 will be forecasted using meanf()
# Mean forecast
fc.mean.atm3 <- meanf(atm3[atm3>0], h=31)
ATM 4
# Additivie Holt-Winter
fit.hw.atm4 <- hw(atm4, seasonal = "additive", lambda = "auto")
checkresiduals(fit.hw.atm4)
##
## Ljung-Box test
##
## data: Residuals from Holt-Winters' additive method
## Q* = 17.38, df = 3, p-value = 0.0005903
##
## Model df: 11. Total lags used: 14
summary(fit.hw.atm4)
##
## Forecast method: Holt-Winters' additive method
##
## Model Information:
## Holt-Winters' additive method
##
## Call:
## hw(y = atm4, seasonal = "additive", lambda = "auto")
##
## Box-Cox transformation: lambda= 0.4509
##
## Smoothing parameters:
## alpha = 0.0329
## beta = 1e-04
## gamma = 0.1105
##
## Initial states:
## l = 34.2377
## b = -0.01
## s = -12.6127 0.3154 -0.3825 2.7855 3.2318 2.5498
## 4.1127
##
## sigma: 13.1723
##
## AIC AICc BIC
## 4048.318 4049.205 4095.117
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 66.04301 341.6294 262.0047 -362.5675 406.2026 0.7563291 0.08584822
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 53.14286 276.79407 35.694429 788.0975 0.96393269 1178.8704
## 53.28571 339.15571 57.598769 896.9278 5.68062007 1314.0798
## 53.42857 346.83797 60.435108 910.3912 6.48142971 1330.9269
## 53.57143 65.90551 -1.032496 361.1661 -36.29007305 628.2402
## 53.71429 394.92911 79.447507 991.8765 12.76224350 1431.4635
## 53.85714 250.27789 27.157120 742.2527 0.12471531 1122.6845
## 54.00000 451.03769 103.374600 1084.6247 22.29683646 1545.0440
## 54.14286 274.43787 33.338346 792.8433 0.57114138 1190.7123
## 54.28571 336.52088 54.521208 902.0335 4.55751524 1326.6686
## 54.42857 344.17047 57.271768 915.5501 5.26877273 1343.6250
## 54.57143 64.83624 -1.408175 364.2806 -39.67633304 636.6717
## 54.71429 392.06415 75.759539 997.3072 10.97378378 1444.7227
## 54.85714 248.04868 25.126584 746.8953 0.02326509 1134.3104
## 55.00000 447.95558 99.100223 1090.3531 19.84071833 1558.9158
# ETS model
fit.ets.atm4 <- ets(atm4, model = "ZZZ", lambda = "auto", biasadj = T,
damped = T)
checkresiduals(fit.ets.atm4)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,Ad,A)
## Q* = 21.645, df = 3, p-value = 7.732e-05
##
## Model df: 12. Total lags used: 15
summary(fit.ets.atm4)
## ETS(A,Ad,A)
##
## Call:
## ets(y = atm4, model = "ZZZ", damped = T, lambda = "auto", biasadj = T)
##
## Box-Cox transformation: lambda= 0.4509
##
## Smoothing parameters:
## alpha = 2e-04
## beta = 1e-04
## gamma = 0.1104
## phi = 0.8615
##
## Initial states:
## l = 33.1961
## b = -0.8431
## s = -13.3301 1.727 1.5304 3.9611 1.8923 0.6925
## 3.5267
##
## sigma: 13.1105
##
## AIC AICc BIC
## 4045.852 4046.889 4096.550
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -7.944094 333.5195 263.3104 -486.8635 516.1564 0.7600984 0.1024821
# Arima model
fit.arima.atm4 <- auto.arima(atm4,seasonal = TRUE,
stepwise = FALSE, approximation = FALSE)
checkresiduals(fit.arima.atm4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(2,0,0)[7] with non-zero mean
## Q* = 15.363, df = 10, p-value = 0.1194
##
## Model df: 4. Total lags used: 14
summary(fit.arima.atm4)
## Series: atm4
## ARIMA(1,0,0)(2,0,0)[7] with non-zero mean
##
## Coefficients:
## ar1 sar1 sar2 mean
## 0.0825 0.1477 0.1138 443.7139
## s.e. 0.0523 0.0521 0.0525 26.1872
##
## sigma^2 estimated as 118476: log likelihood=-2648.14
## AIC=5306.29 AICc=5306.46 BIC=5325.79
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.2025295 342.3122 282.3934 -524.425 556.6231 0.8151851
## ACF1
## Training set -0.0006743136
ATM4 will be forecasted using Additive trend Additive seasonal componet ETS(A,Ad,A)
# Forecast ATM4
fc.ets.atm4 <- forecast(fit.ets.atm4, h=31)
autoplot(fc.ets.atm4)
# Write to csv, save to working directory
Date <- as.character(seq(as.Date('2010-05-01'), as.Date('2010-05-31'), 1))
ATM1 <- fc.arima.atm1$mean %>% as.vector()
ATM2 <- fc.arima.atm2$mean %>% as.vector()
ATM3 <- fc.mean.atm3$mean %>% as.vector()
ATM4 <- fc.ets.atm4$mean %>% as.vector()
partA <- cbind(Date, ATM1, ATM2, ATM3, ATM4)
write.csv(partA, "PartA_ATM.csv")
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.
The data file provided will be saved to local directory and from there, will be read into R
View: data structures, descriptive statistics, visualize, add column
Prepare data by imputing missing values, treating outliers
Create and plot the time series
Split data between train and test sets
Build 5 seasonal models
Evaluate models by performance of in the hold-out set, looking at summaries and accuracy measures such as AICc, RMSE, p-values
Put in comments to better document snippet of codes
# Load from working directory
residential624data <- readxl::read_xlsx("ResidentialCustomerForecastLoad-624.xlsx")
# data structure, summary and plots
str(residential624data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 192 obs. of 3 variables:
## $ CaseSequence: num 733 734 735 736 737 738 739 740 741 742 ...
## $ YYYY-MMM : chr "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
## $ KWH : num 6862583 5838198 5420658 5010364 4665377 ...
summary(residential624data)
## CaseSequence YYYY-MMM KWH
## Min. :733.0 Length:192 Min. : 770523
## 1st Qu.:780.8 Class :character 1st Qu.: 5429912
## Median :828.5 Mode :character Median : 6283324
## Mean :828.5 Mean : 6502475
## 3rd Qu.:876.2 3rd Qu.: 7620524
## Max. :924.0 Max. :10655730
## NA's :1
hist(residential624data$KWH)
boxplot(residential624data$KWH)
# Add column for aggregation
residential624data$Yr = as.integer(substr(residential624data$`YYYY-MMM`,1,4))
# Yearly median, mean
power.comp <- residential624data[complete.cases(residential624data),] %>%
select(Yr, KWH) %>%
group_by(Yr) %>%
summarize(mean = mean(KWH), median = median(KWH))
power.comp
## # A tibble: 16 x 3
## Yr mean median
## <int> <dbl> <dbl>
## 1 1998 6204659. 6091909
## 2 1999 5845579. 5429608.
## 3 2000 6264024. 6048208.
## 4 2001 6182502. 6031252
## 5 2002 6325899. 5857698.
## 6 2003 6248668. 6086098.
## 7 2004 6181055. 6474100
## 8 2005 6388314. 6372943
## 9 2006 6320780. 6070580.
## 10 2007 6284244. 6236756
## 11 2008 6357963. 6442746
## 12 2009 6581986. 6390616.
## 13 2010 6322466. 6424438.
## 14 2011 7687937 8155460.
## 15 2012 7213140. 7084421
## 16 2013 7618335. 7801212.
# Missing values
residential624data[!complete.cases(residential624data),]
## # A tibble: 1 x 4
## CaseSequence `YYYY-MMM` KWH Yr
## <dbl> <chr> <dbl> <int>
## 1 861 2008-Sep NA 2008
power.df <- residential624data
power.df[!complete.cases(power.df),]["KWH"] <-
subset(power.comp, Yr == 2008,"mean")
power.df[!complete.cases(power.df),]
## # A tibble: 0 x 4
## # ... with 4 variables: CaseSequence <dbl>, `YYYY-MMM` <chr>, KWH <dbl>,
## # Yr <int>
# Outlier
row.min <- which.min(power.df$KWH)
power.df[row.min,]
## # A tibble: 1 x 4
## CaseSequence `YYYY-MMM` KWH Yr
## <dbl> <chr> <dbl> <int>
## 1 883 2010-Jul 770523 2010
power.df[row.min,"KWH"] <-
subset(power.comp, Yr == 2010,"mean")
# Create time series and plot
power.ts <- ts(power.df$KWH, start=c(1998, 1), frequency = 12)
ggtsdisplay(power.ts)
# Split between training and test sets
power.tr <- window(power.ts,end=c(2010,12))
power.te <- window(power.ts, start=c(2011,1))
length(power.te)
## [1] 36
# Seasonal Naive
fit.snaive <- snaive(power.tr,h=36)
accuracy(fit.snaive, power.te)
## ME RMSE MAE MPE MAPE MASE
## Training set 48372.4 739983.7 585495.6 0.1148917 9.251209 1.000000
## Test set 721343.1 1397695.5 1088763.2 8.3270908 13.666394 1.859558
## ACF1 Theil's U
## Training set 0.2584869 NA
## Test set 0.3746623 0.8092061
checkresiduals(fit.snaive)
##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 70.338, df = 24, p-value = 1.943e-06
##
## Model df: 0. Total lags used: 24
autoplot(power.tr) +
autolayer(power.te, series="Observed", PI=FALSE) +
autolayer(fit.snaive, series="Fit1 - Seasonal naïve", PI=FALSE) +
xlab("Month") + ylab("KWH") +
ggtitle("Forecasts for Monthly Power Usage") +
guides(colour=guide_legend(title="Forecast"))
## Warning: Ignoring unknown parameters: PI
# STL Decomposition
fit.stl <- stlf(power.tr, h=36)
accuracy(fit.stl, power.te)
## ME RMSE MAE MPE MAPE MASE
## Training set 28369.99 499461.9 387844.5 -0.1977429 6.162275 0.662421
## Test set 891941.81 1223492.7 924795.4 10.7061970 11.284982 1.579509
## ACF1 Theil's U
## Training set 0.2154471 NA
## Test set 0.1605826 0.7389296
checkresiduals(fit.stl)
## Warning in checkresiduals(fit.stl): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 40.588, df = 22, p-value = 0.009223
##
## Model df: 2. Total lags used: 24
autoplot(power.tr) +
autolayer(power.te, series="Observed", PI=FALSE) +
autolayer(fit.stl, series="Fit2 - STL Decomposition", PI=FALSE) +
xlab("Month") + ylab("KWH") +
ggtitle("Forecasts for Monthly Power Usage") +
guides(colour=guide_legend(title="Forecast"))
## Warning: Ignoring unknown parameters: PI
# Holt Winter Multiplicative
fit.hw <- hw(power.tr, seasonal = "multiplicative", h=36)
accuracy(fit.hw, power.te)
## ME RMSE MAE MPE MAPE MASE
## Training set -28102.15 559662.2 428378.7 -1.193585 6.832018 0.7316515
## Test set 871233.99 1217342.6 945227.6 10.507829 11.730288 1.6144060
## ACF1 Theil's U
## Training set 0.3512054 NA
## Test set 0.1436944 0.7442634
checkresiduals(fit.hw)
##
## Ljung-Box test
##
## data: Residuals from Holt-Winters' multiplicative method
## Q* = 62.978, df = 8, p-value = 1.21e-10
##
## Model df: 16. Total lags used: 24
autoplot(power.tr) +
autolayer(power.te, series="Observed", PI=FALSE) +
autolayer(fit.hw, series="Fit3 - HW Multiplicative", PI=FALSE) +
xlab("Month") + ylab("KWH") +
ggtitle("Forecasts for Monthly Power Usage") +
guides(colour=guide_legend(title="Forecast"))
## Warning: Ignoring unknown parameters: PI
# ETS
f_ets <- ets(power.tr, model = "ZZZ")
fit.ets <- forecast(f_ets, h=36)
summary(fit.ets)
##
## Forecast method: ETS(M,N,M)
##
## Model Information:
## ETS(M,N,M)
##
## Call:
## ets(y = power.tr, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.0451
## gamma = 3e-04
##
## Initial states:
## l = 6190007.5655
## s = 0.9257 0.7429 0.8968 1.1915 1.2672 1.181
## 0.9938 0.7678 0.8172 0.9285 1.0609 1.2267
##
## sigma: 0.0913
##
## AIC AICc BIC
## 4933.857 4937.286 4979.605
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 49446.84 560599.1 420240.8 0.1141215 6.615923 0.7177523 0.3341793
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2011 8056453 7113827 8999079 6614831 9498075
## Feb 2011 6967106 6151102 7783109 5719136 8215075
## Mar 2011 6097699 5382794 6812605 5004346 7191053
## Apr 2011 5367092 4737203 5996981 4403760 6330424
## May 2011 5042351 4449972 5634729 4136386 5948316
## Jun 2011 6526694 5759156 7294233 5352846 7700543
## Jul 2011 7755992 6842965 8669019 6359638 9152346
## Aug 2011 8322330 7341645 9303016 6822501 9822160
## Sep 2011 7825090 6902068 8748112 6413450 9236730
## Oct 2011 5889876 5194426 6585326 4826277 6953474
## Nov 2011 4879385 4302670 5456099 3997376 5761393
## Dec 2011 6079610 5360316 6798905 4979545 7179676
## Jan 2012 8056454 7102309 9010599 6597215 9515692
## Feb 2012 6967107 6141152 7793061 5703918 8230295
## Mar 2012 6097700 5374093 6821307 4991039 7204361
## Apr 2012 5367093 4729553 6004632 4392060 6342126
## May 2012 5042351 4442792 5641911 4125404 5959298
## Jun 2012 6526695 5749871 7303519 5338646 7714744
## Jul 2012 7755993 6831942 8680043 6342779 9169206
## Aug 2012 8322331 7329828 9314834 6804429 9840233
## Sep 2012 7825091 6890969 8759213 6396474 9253707
## Oct 2012 5889876 5186080 6593673 4813512 6966240
## Nov 2012 4879385 4295763 5463008 3986812 5771959
## Dec 2012 6079611 5351718 6807504 4966394 7192827
## Jan 2013 8056455 7090926 9021983 6579806 9533103
## Feb 2013 6967107 6131317 7802897 5688877 8245337
## Mar 2013 6097701 5365495 6829906 4977888 7217513
## Apr 2013 5367093 4721992 6012195 4380496 6353691
## May 2013 5042352 4435695 5649008 4114551 5970153
## Jun 2013 6526696 5740694 7312697 5324610 7728781
## Jul 2013 7755993 6821047 8690940 6326116 9185871
## Aug 2013 8322332 7318149 9326515 6786567 9858097
## Sep 2013 7825092 6879998 8770186 6379695 9270488
## Oct 2013 5889877 5177830 6601924 4800895 6978859
## Nov 2013 4879386 4288935 5469837 3976369 5782402
## Dec 2013 6079612 5343218 6816005 4953395 7205828
accuracy(fit.ets, power.te)
## ME RMSE MAE MPE MAPE MASE
## Training set 49446.84 560599.1 420240.8 0.1141215 6.615923 0.7177523
## Test set 938996.94 1261414.9 994298.8 11.4824315 12.399093 1.6982173
## ACF1 Theil's U
## Training set 0.3341793 NA
## Test set 0.1300949 0.7729428
checkresiduals(fit.ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,M)
## Q* = 64.968, df = 10, p-value = 4.111e-10
##
## Model df: 14. Total lags used: 24
autoplot(power.tr) +
autolayer(power.te, series="Observed", PI=FALSE) +
autolayer(fit.ets, series="Fit4 - ETS", PI=FALSE) +
xlab("Month") + ylab("KWH") +
ggtitle("Forecasts for Monthly Power Usage") +
guides(colour=guide_legend(title="Forecast"))
## Warning: Ignoring unknown parameters: PI
# Arima
f_arima <- auto.arima(power.tr,seasonal = TRUE,
stepwise = FALSE, approximation = FALSE,
biasadj = T, lambda = "auto")
fit.arima <- forecast(f_arima, h=36)
summary(fit.arima)
##
## Forecast method: ARIMA(0,0,3)(2,1,0)[12] with drift
##
## Model Information:
## Series: power.tr
## ARIMA(0,0,3)(2,1,0)[12] with drift
## Box Cox transformation: lambda= 0.8030663
##
## Coefficients:
## ma1 ma2 ma3 sar1 sar2 drift
## 0.2341 -0.0010 0.2332 -0.7979 -0.4812 195.8108
## s.e. 0.0808 0.0954 0.0838 0.0830 0.0812 115.8525
##
## sigma^2 estimated as 6.17e+08: log likelihood=-1663.92
## AIC=3341.84 AICc=3342.67 BIC=3362.63
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -10453.18 513746.5 392693.6 -0.6532787 6.303264 0.6707029
## ACF1
## Training set -0.006890228
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2011 8443534 7710496 9182369 7329622 9580200
## Feb 2011 7594533 6857810 8338178 6475810 8739261
## Mar 2011 6259378 5550816 6975729 5184781 7363238
## Apr 2011 5612708 4902305 6332319 4536354 6722417
## May 2011 4993497 4299776 5697108 3943460 6079366
## Jun 2011 6461827 5730858 7201245 5353136 7601117
## Jul 2011 7519277 6765599 8280659 6374997 8691451
## Aug 2011 8239441 7471755 9014414 7073262 9432002
## Sep 2011 7158577 6412333 7912760 6025935 8319963
## Oct 2011 5521585 4813538 6238932 4448943 6627917
## Nov 2011 4967152 4274177 5670048 3918296 6051957
## Dec 2011 6709216 5972675 7454014 5591782 7856554
## Jan 2012 8690138 7900226 9487613 7489924 9917081
## Feb 2012 7668651 6897649 8447844 6498025 8868194
## Mar 2012 6443177 5698832 7196628 5314332 7604182
## Apr 2012 5743220 5015348 6480896 4640324 6880705
## May 2012 5215894 4502143 5940043 4135265 6333215
## Jun 2012 6337146 5594597 7088928 5211166 7495694
## Jul 2012 7351457 6586273 8125074 6189978 8542679
## Aug 2012 8318201 7533717 9110508 7126537 9537450
## Sep 2012 7695723 6923441 8476209 6523133 8897238
## Oct 2012 5772884 5044250 6511292 4668789 6911460
## Nov 2012 5208646 4495098 5932602 4128336 6325679
## Dec 2012 6835317 6081306 7598144 5691346 8010383
## Jan 2013 9077215 8236379 9927017 7799603 10384609
## Feb 2013 8115112 7290871 8949043 6863539 9398784
## Mar 2013 6937215 6138741 7746349 5726009 8183745
## Apr 2013 5834559 5061679 6619398 4663734 7044916
## May 2013 5116656 4364245 5882024 3978149 6298011
## Jun 2013 6668975 5874774 7474197 5464607 7909768
## Jul 2013 7025957 6223302 7839288 5808319 8278878
## Aug 2013 8227257 7398533 9065679 6968789 9517766
## Sep 2013 7707150 6889301 8535090 6465698 8981946
## Oct 2013 5859461 5085909 6644941 4687578 7070773
## Nov 2013 5049304 4298934 5812737 3914026 6227781
## Dec 2013 6585065 5792905 7388332 5383904 7822936
accuracy(fit.arima, power.te)
## ME RMSE MAE MPE MAPE MASE
## Training set -10453.18 513746.5 392693.6 -0.6532787 6.303264 0.6707029
## Test set 757440.15 1148002.8 858536.5 8.9624441 10.496008 1.4663415
## ACF1 Theil's U
## Training set -0.006890228 NA
## Test set 0.299686846 0.6839646
checkresiduals(fit.arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(2,1,0)[12] with drift
## Q* = 14.975, df = 18, p-value = 0.6637
##
## Model df: 6. Total lags used: 24
autoplot(power.tr) +
autolayer(power.te, series="Observed", PI=FALSE) +
autolayer(fit.arima, series="Fit5 -Arima", PI=FALSE) +
xlab("Month") + ylab("KWH") +
ggtitle("Forecasts for Monthly Power Usage") +
guides(colour=guide_legend(title="Forecast"))
## Warning: Ignoring unknown parameters: PI
Power usage will be forecasted using ARIMA(0,0,3)(2,1,0)[12] with drift
# Forecast
f_arima <- auto.arima(power.ts,seasonal = TRUE,
stepwise = FALSE, approximation = FALSE,
biasadj = T, lambda = "auto")
fit.arima <- forecast(f_arima, h=12)
summary(fit.arima)
##
## Forecast method: ARIMA(0,0,3)(2,1,0)[12] with drift
##
## Model Information:
## Series: power.ts
## ARIMA(0,0,3)(2,1,0)[12] with drift
## Box Cox transformation: lambda= -0.2130625
##
## Coefficients:
## ma1 ma2 ma3 sar1 sar2 drift
## 0.2697 0.0864 0.1905 -0.7592 -0.4070 0e+00
## s.e. 0.0760 0.0858 0.0684 0.0726 0.0751 1e-04
##
## sigma^2 estimated as 1.211e-05: log likelihood=773.41
## AIC=-1532.82 AICc=-1532.17 BIC=-1510.47
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 20381.36 639044.5 491740.9 -0.273508 7.495678 0.771515 0.1110666
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2014 10322477 8943066 11810540 8330666 12748948
## Feb 2014 8896976 7707405 10188718 7177882 11001358
## Mar 2014 7015008 6118595 7985270 5716251 8590685
## Apr 2014 5967983 5219191 6779447 4881661 7283674
## May 2014 5930080 5187009 6735258 4851984 7235469
## Jun 2014 8311690 7196216 9527671 6699270 10292129
## Jul 2014 9641850 8308341 11099559 7717541 12021043
## Aug 2014 10191652 8766192 11751531 8135988 12739675
## Sep 2014 8546561 7393066 9804637 6879708 10596366
## Oct 2014 5852555 5121160 6644901 4791252 7136921
## Nov 2014 6143594 5368204 6984302 5019033 7507217
## Dec 2014 8244330 7139721 9448288 6647469 10204962
checkresiduals(fit.arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(2,1,0)[12] with drift
## Q* = 19.184, df = 18, p-value = 0.3806
##
## Model df: 6. Total lags used: 24
autoplot(fit.arima)
# Write to csv, save to working directory
Date <- seq(as.Date('2014-01-01'), as.Date('2014-12-31'), by = 'month')
KWH <- fit.arima$mean %>% as.vector()
partB <- cbind(Date, KWH)
write.csv(partB, "PartB_Power.csv")
Part C consists of two data sets. These are simple 2 columns sets, however they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (example of what this looks like, follows). Note for multiple recordings within an hour, take the mean. Then to determine if the data is stationary and can it be forecast. If so, provide a week forward forecast and present results via Rpubs and .rmd and the forecast in an Excel readable file.
The data file provided will be saved to local directory and from there, will be read into R
The 2 files will be merged, group by Date-Hour, summarized by mean of the waterflow
Time series will be created with frequency of 24 (hourly)
Check the need for differencing, do differencing, check for test statistic to verify data is stationary
Fit arima model and forecast for 1 week. The resulting point forecast is the same for all future Date-Hour. In the absence of trend, seasonality, the best forecast seems like the mean.
# Load from working directory
wp1data <- suppressWarnings(readxl::read_xlsx("Waterflow_Pipe1.xlsx"
,col_types =c("date", "numeric")))
wp2data <- suppressWarnings(readxl::read_xlsx("Waterflow_Pipe2.xlsx"
,col_types =c("date", "numeric")))
wpdata <- rbind(wp1data, wp2data)
names(wpdata)[1] <- "DateTime"
str(wpdata)
## Classes 'tbl_df', 'tbl' and 'data.frame': 2000 obs. of 2 variables:
## $ DateTime : POSIXct, format: "2015-10-23 00:24:06" "2015-10-23 00:40:02" ...
## $ WaterFlow: num 23.4 28 23.1 30 6 ...
# Group by Date-Hour
wp.df <- wpdata %>%
mutate(Ddate = as.Date(DateTime),
Dhour=format(DateTime, '%H'))
# Summarize by mean waterflow
wpgroup.df <- wp.df %>%
select(Ddate, Dhour, WaterFlow) %>%
group_by(Ddate, Dhour) %>%
summarize(MeanFlowmean = mean(WaterFlow))
head(wpgroup.df)
## # A tibble: 6 x 3
## # Groups: Ddate [1]
## Ddate Dhour MeanFlowmean
## <date> <chr> <dbl>
## 1 2015-10-23 00 26.1
## 2 2015-10-23 01 18.8
## 3 2015-10-23 02 24.5
## 4 2015-10-23 03 25.6
## 5 2015-10-23 04 22.4
## 6 2015-10-23 05 23.6
tail(wpgroup.df)
## # A tibble: 6 x 3
## # Groups: Ddate [1]
## Ddate Dhour MeanFlowmean
## <date> <chr> <dbl>
## 1 2015-12-03 11 73.0
## 2 2015-12-03 12 31.5
## 3 2015-12-03 13 66.8
## 4 2015-12-03 14 42.9
## 5 2015-12-03 15 33.4
## 6 2015-12-03 16 66.7
# create time series, hourly
wp.ts <- ts(wpgroup.df$MeanFlowmean, frequency = 24)
# plot
ggtsdisplay(wp.ts)
# check need for differencing
wp.ts %>% ur.kpss %>% summary
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 7 lags.
##
## Value of test-statistic is: 4.0615
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ndiffs(wp.ts)
## [1] 1
# differentiate
wp.ts.diff <- diff(wp.ts)
# check test statistic for stationarity
wp.ts.diff %>% ur.kpss %>% summary
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 7 lags.
##
## Value of test-statistic is: 0.0075
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ggtsdisplay(wp.ts.diff)
# fit arima model
fit.arima.wp <- auto.arima(wp.ts,seasonal = FALSE,
stepwise = FALSE, approximation = FALSE)
checkresiduals(fit.arima.wp)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)
## Q* = 71.248, df = 47, p-value = 0.01277
##
## Model df: 1. Total lags used: 48
summary(fit.arima.wp)
## Series: wp.ts
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.9629
## s.e. 0.0083
##
## sigma^2 estimated as 212.9: log likelihood=-4100.12
## AIC=8204.24 AICc=8204.25 BIC=8214.05
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.531199 14.57593 11.14363 -26.38446 47.91859 0.7370315
## ACF1
## Training set -0.02311872
# forecast for 1 week (the first 7 forecast would be the missing months of the last day in the series), 1 week = 175
# Forecast ATM4
fc.wp <- forecast(fit.arima.wp, h=175)
autoplot(fc.wp)
# Write to csv, save to working directory
Date <- as.character(seq(as.Date('2015-12-04'), as.Date('2015-12-10'), 1))
Hour <- rep(seq(0,23,1),7)
Waterflow <- fc.wp$mean[8:175] %>% as.vector()
partC <- cbind(Date, Hour, Waterflow)
write.csv(partC, "PartC_WaterFlow.csv")