# function selects chosen atm
select_atm = function(n){
atm = paste0('ATM',n)
data = atm_series%>%
filter(ATM == atm)
data = data%>%
replace_na(list(Cash = mean(data$Cash,na.rm=TRUE)))
return(data)
}
# plots time series for chosen ATM
plot_atm = function(n){
data = select_atm(n)
data%>%
autoplot(Cash)+labs(title = paste('Withdrawals from',paste0('ATM',n)))
}
# plots forecasts based on model
forecast_model = function(data, model,type){
model%>%
select('ATM',type)%>%
forecast(h=30)%>%
autoplot(data)+labs(title = paste(data$ATM[1],'Withdrawal Forecast'))
}
atm_data = read_excel('ATM624Data.xlsx')
atm_data%>%head()
## # A tibble: 6 x 3
## DATE ATM Cash
## <dbl> <chr> <dbl>
## 1 39934 ATM1 96
## 2 39934 ATM2 107
## 3 39935 ATM1 82
## 4 39935 ATM2 89
## 5 39936 ATM1 85
## 6 39936 ATM2 90
# transform excel date values to dates
atm_data['DATE'] = as.Date(atm_data$DATE, origin = "1899-12-30")
# turn dataset into time series
atm_series = atm_data%>%
as_tsibble(index = DATE, key = ATM)
atm_series%>%head()
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-02 ATM1 82
## 3 2009-05-03 ATM1 85
## 4 2009-05-04 ATM1 90
## 5 2009-05-05 ATM1 99
## 6 2009-05-06 ATM1 88
ATM withdrawals seem to follow a flat(not increasing nor decreasing) trend with seasonal variation
atm1 = select_atm(1)
plot_atm(1)
Using classical decomposition, we can see that there is no clear trend. However, there is a seasonal component
atm1%>%
model(
classical_decomposition(Cash)
)%>%
components()%>%
autoplot()
## Warning: Removed 3 row(s) containing missing values (geom_path).
Chose various models to fit the data. Based on RMSE, the arima model performed the best for ATM1
atm1.models = atm1%>%
model(
'mean' = MEAN(Cash),
'snaive' = SNAIVE(Cash),
'ets' = ETS(Cash~error("A") + trend("N") + season("A")),
'arima' = ARIMA(Cash~pdq(1,0,1))
)
atm1.models%>%
accuracy()
## # A tibble: 4 x 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 mean Training 4.29e-15 36.5 27.1 -175. 198. 1.52 1.31 0.0567
## 2 ATM1 snaive Training -3.63e- 2 27.9 17.8 -96.6 116. 1 1 0.151
## 3 ATM1 ets Training -4.45e- 2 23.8 15.1 -106. 121. 0.846 0.854 0.125
## 4 ATM1 arima Training -7.58e- 2 23.4 14.7 -102. 117. 0.827 0.839 -0.000861
Residuals of the arima model are normally distributed and uncorrelated
atm1.models%>%
select('arima')%>%
gg_tsresiduals()
Using arima model to forecast may 2010
forecast_model(atm1,atm1.models,'arima')
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(type)` instead of `type` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
Withdrawals from ATM2 are similar to those seen in ATM1. There seems to be a seasonal component and no clear trend
atm2 = select_atm(2)
plot_atm(2)
Based on the decomposition, we see that there is a slight downtrend.
atm2%>%
model(
classical_decomposition(Cash)
)%>%
components()%>%
autoplot()
## Warning: Removed 3 row(s) containing missing values (geom_path).
Fitting various models and calculating the RMSE, we can see that arima model slightly out performs the ETS model
atm2.models = atm2%>%
model(
'mean' = MEAN(Cash),
'snaive' = SNAIVE(Cash),
'ets' = ETS(Cash~error("A") + trend("A") + season("A")),
'arima' = ARIMA(Cash~pdq(1,0,1))
)
atm2.models%>%
accuracy()
## # A tibble: 4 x 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM2 mean Training -4.73e-17 38.7 33.1 -Inf Inf 1.59 1.29 0.0214
## 2 ATM2 snaive Training 2.23e- 2 30.1 20.8 -Inf Inf 1 1 0.0459
## 3 ATM2 ets Training 1.48e- 1 25.1 17.8 -Inf Inf 0.856 0.834 0.0200
## 4 ATM2 arima Training -3.88e- 1 24.8 17.4 -Inf Inf 0.838 0.826 -0.0205
Residuals for the arima model are normal and uncorrelated
atm2.models%>%
select('arima')%>%
gg_tsresiduals()
Using Arima model, forecast ATM2 withdrawals
forecast_model(atm2,atm2.models,'arima')
ATM3 only has withdrawals data for a couple of days in the dataset.There is no seasonal component
atm3 = select_atm(3)
plot_atm(3)
comparing various models, we can see that the naive method is nearly as good as ETS and ARIMA while being far less complex
atm3.models = atm3%>%
model(
'mean' = MEAN(Cash),
'naive' = NAIVE(Cash),
'ets' = ETS(Cash~error("A") + trend("A") + season("N")),
'arima' = ARIMA(Cash~pdq(1,1,0))
)
atm3.models%>%
accuracy()
## # A tibble: 4 x 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM3 mean Training -5.63e-17 7.93 1.43 -Inf Inf 1.95 0.986 0.640
## 2 ATM3 naive Training 2.34e- 1 5.09 0.310 28.8 40.2 0.423 0.632 -0.149
## 3 ATM3 ets Training 2.65e- 1 5.02 0.267 NaN Inf 0.363 0.625 -0.00266
## 4 ATM3 arima Training 2.66e- 1 5.03 0.266 33.7 33.7 0.362 0.625 -0.00175
Since the dataset has so few entries, I believe using the naive method is superior as there is little historical data to go off of.
forecast_model(atm3,atm3.models,'naive')
ATM4 follows a similar pattern to ATM 1 and ATM2. There seems to be a seasonal component with no general trend. There also seems to be a couple of days in particular with abnormally high amounts of cash withdrawal
atm4 = select_atm(4)
plot_atm(4)
After replacing outliers with the mean, we are able to see the fluctuations more clearly.
m = mean(atm4$Cash)
s = sd(atm4$Cash)
low = m-3*s
high = m+3*s
atm4 = atm4%>%
filter(Cash>=low & Cash<= high)%>%
fill_gaps()%>%
replace_na(list(Cash = m))
atm4%>%
autoplot(Cash)+labs(title = 'Withdrawals from ATM4')
There is a seasonal component and a flat trend
atm4%>%
model(
classical_decomposition(Cash)
)%>%
components()%>%
autoplot()
## Warning: Removed 3 row(s) containing missing values (geom_path).
All of the models have high RMSE values. Our ets model performs the best with lowest RMSE
atm4.models = atm4%>%
model(
'mean' = MEAN(Cash),
'snaive' = SNAIVE(Cash),
'ets' = ETS(Cash~error("A") + trend("N") + season("A")),
'arima' = ARIMA(Cash~pdq(1,0,1))
)
atm4.models%>%
accuracy()
## # A tibble: 4 x 11
## ATM .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM4 mean Training -2.83e-15 350. 292. -574. 607. 0.842 0.775 0.0746
## 2 ATM4 snaive Training -3.70e+ 0 452. 346. -392. 447. 1 1 0.0595
## 3 ATM4 ets Training -8.36e+ 0 329. 265. -523. 553. 0.764 0.727 0.100
## 4 ATM4 arima Training -1.70e- 1 342. 282. -523. 555. 0.815 0.756 0.00192
The residuals are near normal and uncorrelated
atm4.models%>%
select('ets')%>%
gg_tsresiduals()
Forecasting ATM4 using ETS model
forecast_model(atm4,atm4.models,'ets')
power_usage = read_excel('ResidentialCustomerForecastLoad-624.xlsx')
power_usage%>%head()
## # A tibble: 6 x 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
power_usage['yearmonth'] = yearmonth(power_usage$`YYYY-MMM`)
# get mean, sd and create low/high based on 3 standard deviations
KWH_mean = mean(power_usage$KWH, na.rm=TRUE)
KWH_sd = sd(power_usage$KWH, na.rm=TRUE)
KWH_low = KWH_mean-KWH_sd*3
KWH_high = KWH_mean+KWH_sd*3
# replace NA with mean
power_series = power_usage%>%
select('yearmonth','KWH')%>%
as_tsibble(index = 'yearmonth')
power_series%>%head()
## # A tsibble: 6 x 2 [1M]
## yearmonth KWH
## <mth> <dbl>
## 1 1998 Jan 6862583
## 2 1998 Feb 5838198
## 3 1998 Mar 5420658
## 4 1998 Apr 5010364
## 5 1998 May 4665377
## 6 1998 Jun 6467147
Data looks to be seasonal with an flat trend. There seems to be an outlier near 2010
power_series%>%
autoplot(KWH) + labs(title = 'KWH')
remove outliers from dataset and fill gaps with mean
power_usage_ts = power_series%>%
filter(KWH>= KWH_low & KWH<=KWH_high)%>%
fill_gaps()%>%
replace_na(list(KWH = KWH_mean))
power_usage_ts%>%
autoplot(KWH) + labs(title = 'KWH')
There is a seasonal component along with trend. There is a slight uptrend
power_usage_ts%>%
model(
classical_decomposition(KWH)
)%>%
components()%>%
autoplot()
## Warning: Removed 6 row(s) containing missing values (geom_path).
Based on RMSE, the arima model fits this dataset the best
power.models = power_usage_ts%>%
model(
'mean' = MEAN(KWH),
'snaive' = SNAIVE(KWH),
'ets' = ETS(KWH~error("A") + trend("A") + season("A")),
'arima' = ARIMA(KWH~pdq(2,1,2) + PDQ(0,1,2))
)
power.models%>%
accuracy()
## # A tibble: 4 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 mean Training 9.70e-11 1378993. 1157503. -4.42 18.3 1.83 1.63 0.482
## 2 snaive Training 9.42e+ 4 847752. 633765. 0.651 9.41 1 1 0.304
## 3 ets Training 1.53e+ 4 636898. 468894. -0.626 7.09 0.740 0.751 0.280
## 4 arima Training 4.34e+ 4 604400. 444327. 0.0682 6.70 0.701 0.713 -0.0129
Model residuals are normal and uncorrelated
power.models%>%
select('arima')%>%
gg_tsresiduals()
Forecast KWH usage using arima
power.models%>%
select('arima')%>%
forecast(h=30)%>%
autoplot(power_usage_ts) + labs(title ='KWH Forecast')
get_lambda = function(data, feature){
data%>%
features(data[feature],features = guerrero)%>%
pull(lambda_guerrero)%>%
return()
}
water_pipe1 = read_excel('Waterflow_Pipe1.xlsx')
water_pipe2 = read_excel('Waterflow_Pipe2.xlsx')
water_pipe1%>%head()
## # A tibble: 6 x 2
## `Date Time` WaterFlow
## <dbl> <dbl>
## 1 42300. 23.4
## 2 42300. 28.0
## 3 42300. 23.1
## 4 42300. 30.0
## 5 42300. 6.00
## 6 42300. 15.9
water_pipe2%>%head()
## # A tibble: 6 x 2
## `Date Time` WaterFlow
## <dbl> <dbl>
## 1 42300. 18.8
## 2 42300. 43.1
## 3 42300. 38.0
## 4 42300. 36.1
## 5 42300. 31.9
## 6 42300. 28.2
water_pipe = rbind(water_pipe1,water_pipe2)
mean_flow = mean(water_pipe$WaterFlow,na.rm =TRUE)
water_series = water_pipe%>%
mutate(days = as.Date(water_pipe$`Date Time`,origin = "1899-12-30"),
hour = round((water_pipe$`Date Time`- round(water_pipe$`Date Time`))*24))%>%
mutate(day_hour = days+hours(hour))%>%
select('day_hour','WaterFlow')%>%
group_by(day_hour)%>%
summarize(avg_flow = mean(WaterFlow))%>%
as_tsibble(index = day_hour)%>%
fill_gaps()%>%
replace_na(list(avg_flow = mean_flow))
# Average flow increased substantially in November
water_series%>%
autoplot(avg_flow) +labs(title = 'Average Water Flow')
water_series%>%
model(
classical_decomposition(avg_flow)
)%>%
components()%>%
autoplot()
## Warning: Removed 12 row(s) containing missing values (geom_path).
After differencing the data fits better between the confidence levels
water_series%>%
ACF(avg_flow)%>%
autoplot() + labs(title = 'ACF Flow')
water_series%>%
ACF(difference(avg_flow))%>%
autoplot() + labs(title = 'ACF Difference in Flow')
Based on the RMSE, The ARIMA model hasthe least amount of error.
flow.models = water_series%>%
model(
'mean' = MEAN(avg_flow),
'naive' = NAIVE(avg_flow),
'snaive' = SNAIVE(avg_flow),
'ets' = ETS(avg_flow~error("A") + trend("N") + season("A")),
'arima' = ARIMA(avg_flow~pdq(2,1,2) + PDQ(0,1,1))
)
flow.models%>%
accuracy()
## # A tibble: 5 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 mean Training 1.37e-15 15.4 12.7 -29.5 52.4 0.856 0.796 0.172
## 2 naive Training 5.76e- 2 19.8 15.3 -27.0 60.3 1.03 1.02 -0.500
## 3 snaive Training 4.74e- 1 19.4 14.9 -24.7 57.6 1 1 0.0215
## 4 ets Training 4.66e- 1 14.1 10.9 -24.9 46.5 0.731 0.728 0.0123
## 5 arima Training -1.88e- 1 14.1 10.8 -26.8 46.7 0.723 0.727 -0.000156
flow.models%>%
select('arima')%>%
gg_tsresiduals()
flow.models%>%
select('arima')%>%
forecast(h=24*7)%>%
autoplot(water_series)+labs(title = 'Average Water Flow Forecast')