ATM Forecast

# function selects chosen atm
select_atm = function(n){
  atm = paste0('ATM',n)
  data = atm_series%>%
    filter(ATM == atm)
  data = data%>%
    replace_na(list(Cash = mean(data$Cash,na.rm=TRUE)))
  return(data)
}

# plots time series for chosen ATM
plot_atm = function(n){
  data = select_atm(n)
  data%>%
    autoplot(Cash)+labs(title = paste('Withdrawals from',paste0('ATM',n)))
}



# plots forecasts based on model
forecast_model = function(data, model,type){
  model%>%
  select('ATM',type)%>%
  forecast(h=30)%>%
  autoplot(data)+labs(title = paste(data$ATM[1],'Withdrawal Forecast'))
}

atm_data = read_excel('ATM624Data.xlsx')
atm_data%>%head()

## # A tibble: 6 x 3
##    DATE ATM    Cash
##   <dbl> <chr> <dbl>
## 1 39934 ATM1     96
## 2 39934 ATM2    107
## 3 39935 ATM1     82
## 4 39935 ATM2     89
## 5 39936 ATM1     85
## 6 39936 ATM2     90

cleaning data

# transform excel date values to dates
atm_data['DATE'] = as.Date(atm_data$DATE, origin = "1899-12-30")

# turn dataset into time series
atm_series = atm_data%>%
  as_tsibble(index = DATE, key = ATM)

atm_series%>%head()

## # A tsibble: 6 x 3 [1D]
## # Key:       ATM [1]
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-05-01 ATM1     96
## 2 2009-05-02 ATM1     82
## 3 2009-05-03 ATM1     85
## 4 2009-05-04 ATM1     90
## 5 2009-05-05 ATM1     99
## 6 2009-05-06 ATM1     88

ATM1

ATM withdrawals seem to follow a flat(not increasing nor decreasing) trend with seasonal variation

atm1 = select_atm(1)
plot_atm(1)

Using classical decomposition, we can see that there is no clear trend. However, there is a seasonal component

atm1%>%
  model(
    classical_decomposition(Cash)
    )%>%
  components()%>%
  autoplot()

## Warning: Removed 3 row(s) containing missing values (geom_path).

Chose various models to fit the data. Based on RMSE, the arima model performed the best for ATM1

atm1.models = atm1%>%
  model(
    'mean' = MEAN(Cash),
    'snaive' = SNAIVE(Cash),
    'ets' = ETS(Cash~error("A") + trend("N") + season("A")),
    'arima' = ARIMA(Cash~pdq(1,0,1))
  )

atm1.models%>%
  accuracy()

## # A tibble: 4 x 11
##   ATM   .model .type           ME  RMSE   MAE    MPE  MAPE  MASE RMSSE      ACF1
##   <chr> <chr>  <chr>        <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>     <dbl>
## 1 ATM1  mean   Training  4.29e-15  36.5  27.1 -175.   198. 1.52  1.31   0.0567  
## 2 ATM1  snaive Training -3.63e- 2  27.9  17.8  -96.6  116. 1     1      0.151   
## 3 ATM1  ets    Training -4.45e- 2  23.8  15.1 -106.   121. 0.846 0.854  0.125   
## 4 ATM1  arima  Training -7.58e- 2  23.4  14.7 -102.   117. 0.827 0.839 -0.000861

Residuals of the arima model are normally distributed and uncorrelated

atm1.models%>%
  select('arima')%>%
  gg_tsresiduals()

Using arima model to forecast may 2010

forecast_model(atm1,atm1.models,'arima')

## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(type)` instead of `type` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

ATM2

Withdrawals from ATM2 are similar to those seen in ATM1. There seems to be a seasonal component and no clear trend

atm2 = select_atm(2)
plot_atm(2)

Based on the decomposition, we see that there is a slight downtrend.

atm2%>%
  model(
    classical_decomposition(Cash)
    )%>%
  components()%>%
  autoplot()

## Warning: Removed 3 row(s) containing missing values (geom_path).

Fitting various models and calculating the RMSE, we can see that arima model slightly out performs the ETS model

atm2.models = atm2%>%
  model(
    'mean' = MEAN(Cash),
    'snaive' = SNAIVE(Cash),
    'ets' = ETS(Cash~error("A") + trend("A") + season("A")),
    'arima' = ARIMA(Cash~pdq(1,0,1))
  )

atm2.models%>%
  accuracy()

## # A tibble: 4 x 11
##   ATM   .model .type           ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr> <chr>  <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 ATM2  mean   Training -4.73e-17  38.7  33.1  -Inf   Inf 1.59  1.29   0.0214
## 2 ATM2  snaive Training  2.23e- 2  30.1  20.8  -Inf   Inf 1     1      0.0459
## 3 ATM2  ets    Training  1.48e- 1  25.1  17.8  -Inf   Inf 0.856 0.834  0.0200
## 4 ATM2  arima  Training -3.88e- 1  24.8  17.4  -Inf   Inf 0.838 0.826 -0.0205

Residuals for the arima model are normal and uncorrelated

atm2.models%>%
  select('arima')%>%
  gg_tsresiduals()

Using Arima model, forecast ATM2 withdrawals

forecast_model(atm2,atm2.models,'arima')

ATM3

ATM3 only has withdrawals data for a couple of days in the dataset.There is no seasonal component

atm3 = select_atm(3)
plot_atm(3)

comparing various models, we can see that the naive method is nearly as good as ETS and ARIMA while being far less complex

atm3.models = atm3%>%
  model(
    'mean' = MEAN(Cash),
    'naive' = NAIVE(Cash),
    'ets' = ETS(Cash~error("A") + trend("A") + season("N")),
    'arima' = ARIMA(Cash~pdq(1,1,0))
  )

atm3.models%>%
  accuracy()

## # A tibble: 4 x 11
##   ATM   .model .type           ME  RMSE   MAE    MPE  MAPE  MASE RMSSE     ACF1
##   <chr> <chr>  <chr>        <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 ATM3  mean   Training -5.63e-17  7.93 1.43  -Inf   Inf   1.95  0.986  0.640  
## 2 ATM3  naive  Training  2.34e- 1  5.09 0.310   28.8  40.2 0.423 0.632 -0.149  
## 3 ATM3  ets    Training  2.65e- 1  5.02 0.267  NaN   Inf   0.363 0.625 -0.00266
## 4 ATM3  arima  Training  2.66e- 1  5.03 0.266   33.7  33.7 0.362 0.625 -0.00175

Since the dataset has so few entries, I believe using the naive method is superior as there is little historical data to go off of.

forecast_model(atm3,atm3.models,'naive')

ATM4

ATM4 follows a similar pattern to ATM 1 and ATM2. There seems to be a seasonal component with no general trend. There also seems to be a couple of days in particular with abnormally high amounts of cash withdrawal

atm4 = select_atm(4)
plot_atm(4)

After replacing outliers with the mean, we are able to see the fluctuations more clearly.

m = mean(atm4$Cash)
s = sd(atm4$Cash)
low = m-3*s
high = m+3*s

atm4 = atm4%>%
  filter(Cash>=low & Cash<= high)%>%
  fill_gaps()%>%
  replace_na(list(Cash = m))

atm4%>%
  autoplot(Cash)+labs(title = 'Withdrawals from ATM4')

There is a seasonal component and a flat trend

atm4%>%
  model(
    classical_decomposition(Cash)
    )%>%
  components()%>%
  autoplot()

## Warning: Removed 3 row(s) containing missing values (geom_path).

All of the models have high RMSE values. Our ets model performs the best with lowest RMSE

atm4.models = atm4%>%
  model(
    'mean' = MEAN(Cash),
    'snaive' = SNAIVE(Cash),
    'ets' = ETS(Cash~error("A") + trend("N") + season("A")),
    'arima' = ARIMA(Cash~pdq(1,0,1))
  )

atm4.models%>%
  accuracy()

## # A tibble: 4 x 11
##   ATM   .model .type           ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr> <chr>  <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 ATM4  mean   Training -2.83e-15  350.  292. -574.  607. 0.842 0.775 0.0746 
## 2 ATM4  snaive Training -3.70e+ 0  452.  346. -392.  447. 1     1     0.0595 
## 3 ATM4  ets    Training -8.36e+ 0  329.  265. -523.  553. 0.764 0.727 0.100  
## 4 ATM4  arima  Training -1.70e- 1  342.  282. -523.  555. 0.815 0.756 0.00192

The residuals are near normal and uncorrelated

atm4.models%>%
  select('ets')%>%
  gg_tsresiduals()

Forecasting ATM4 using ETS model

forecast_model(atm4,atm4.models,'ets')

Forecasting Power Usage

power_usage = read_excel('ResidentialCustomerForecastLoad-624.xlsx')
power_usage%>%head()

## # A tibble: 6 x 3
##   CaseSequence `YYYY-MMM`     KWH
##          <dbl> <chr>        <dbl>
## 1          733 1998-Jan   6862583
## 2          734 1998-Feb   5838198
## 3          735 1998-Mar   5420658
## 4          736 1998-Apr   5010364
## 5          737 1998-May   4665377
## 6          738 1998-Jun   6467147

Cleaning data

power_usage['yearmonth'] = yearmonth(power_usage$`YYYY-MMM`)

# get mean, sd and create low/high based on 3 standard deviations
KWH_mean = mean(power_usage$KWH, na.rm=TRUE)
KWH_sd = sd(power_usage$KWH, na.rm=TRUE)
KWH_low = KWH_mean-KWH_sd*3
KWH_high = KWH_mean+KWH_sd*3

# replace NA with mean
power_series = power_usage%>%
  select('yearmonth','KWH')%>%
  as_tsibble(index = 'yearmonth')

power_series%>%head()

## # A tsibble: 6 x 2 [1M]
##   yearmonth     KWH
##       <mth>   <dbl>
## 1  1998 Jan 6862583
## 2  1998 Feb 5838198
## 3  1998 Mar 5420658
## 4  1998 Apr 5010364
## 5  1998 May 4665377
## 6  1998 Jun 6467147

Data looks to be seasonal with an flat trend. There seems to be an outlier near 2010

power_series%>%
  autoplot(KWH) + labs(title = 'KWH')

remove outliers from dataset and fill gaps with mean

power_usage_ts = power_series%>%
  filter(KWH>= KWH_low & KWH<=KWH_high)%>%
  fill_gaps()%>%
  replace_na(list(KWH = KWH_mean))

power_usage_ts%>%
  autoplot(KWH) + labs(title = 'KWH')

Decomposition

There is a seasonal component along with trend. There is a slight uptrend

power_usage_ts%>%
  model(
    classical_decomposition(KWH)
    )%>%
  components()%>%
  autoplot()

## Warning: Removed 6 row(s) containing missing values (geom_path).

Modeling

Based on RMSE, the arima model fits this dataset the best

power.models = power_usage_ts%>%
  model(
    'mean' = MEAN(KWH),
    'snaive' = SNAIVE(KWH),
    'ets' = ETS(KWH~error("A") + trend("A") + season("A")),
    'arima' = ARIMA(KWH~pdq(2,1,2) + PDQ(0,1,2))
  )

power.models%>%
  accuracy()

## # A tibble: 4 x 10
##   .model .type          ME     RMSE      MAE     MPE  MAPE  MASE RMSSE    ACF1
##   <chr>  <chr>       <dbl>    <dbl>    <dbl>   <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 mean   Training 9.70e-11 1378993. 1157503. -4.42   18.3  1.83  1.63   0.482 
## 2 snaive Training 9.42e+ 4  847752.  633765.  0.651   9.41 1     1      0.304 
## 3 ets    Training 1.53e+ 4  636898.  468894. -0.626   7.09 0.740 0.751  0.280 
## 4 arima  Training 4.34e+ 4  604400.  444327.  0.0682  6.70 0.701 0.713 -0.0129

Model residuals are normal and uncorrelated

power.models%>%
  select('arima')%>%
  gg_tsresiduals()

Forecast

Forecast KWH usage using arima

power.models%>%
  select('arima')%>%
  forecast(h=30)%>%
  autoplot(power_usage_ts) + labs(title ='KWH Forecast')

Waterflow Pipe (Bonus)

get_lambda = function(data, feature){
  data%>%
    features(data[feature],features = guerrero)%>%
    pull(lambda_guerrero)%>%
    return()
}

water_pipe1 = read_excel('Waterflow_Pipe1.xlsx')
water_pipe2 = read_excel('Waterflow_Pipe2.xlsx')
water_pipe1%>%head()

## # A tibble: 6 x 2
##   `Date Time` WaterFlow
##         <dbl>     <dbl>
## 1      42300.     23.4 
## 2      42300.     28.0 
## 3      42300.     23.1 
## 4      42300.     30.0 
## 5      42300.      6.00
## 6      42300.     15.9

water_pipe2%>%head()

## # A tibble: 6 x 2
##   `Date Time` WaterFlow
##         <dbl>     <dbl>
## 1      42300.      18.8
## 2      42300.      43.1
## 3      42300.      38.0
## 4      42300.      36.1
## 5      42300.      31.9
## 6      42300.      28.2

cleaning data

water_pipe = rbind(water_pipe1,water_pipe2)

mean_flow = mean(water_pipe$WaterFlow,na.rm =TRUE)

water_series = water_pipe%>%
  mutate(days = as.Date(water_pipe$`Date Time`,origin = "1899-12-30"),
         hour = round((water_pipe$`Date Time`- round(water_pipe$`Date Time`))*24))%>%
  mutate(day_hour = days+hours(hour))%>%
  select('day_hour','WaterFlow')%>%
  group_by(day_hour)%>%
  summarize(avg_flow = mean(WaterFlow))%>%
  as_tsibble(index = day_hour)%>%
  fill_gaps()%>%
  replace_na(list(avg_flow = mean_flow))


# Average flow increased substantially in November
water_series%>%
  autoplot(avg_flow) +labs(title = 'Average Water Flow')

Exploratory Analysis

water_series%>%
  model(
    classical_decomposition(avg_flow)
    )%>%
  components()%>%
  autoplot()

## Warning: Removed 12 row(s) containing missing values (geom_path).

After differencing the data fits better between the confidence levels

water_series%>%
  ACF(avg_flow)%>%
  autoplot() + labs(title = 'ACF Flow')

water_series%>%
  ACF(difference(avg_flow))%>%
  autoplot() + labs(title = 'ACF Difference in Flow')

Modeling

Based on the RMSE, The ARIMA model hasthe least amount of error.

flow.models = water_series%>%
  model(
    'mean' = MEAN(avg_flow),
    'naive' = NAIVE(avg_flow),
    'snaive' = SNAIVE(avg_flow),
    'ets' = ETS(avg_flow~error("A") + trend("N") + season("A")),
    'arima' = ARIMA(avg_flow~pdq(2,1,2) + PDQ(0,1,1))
  )

flow.models%>%
  accuracy()

## # A tibble: 5 x 10
##   .model .type           ME  RMSE   MAE   MPE  MAPE  MASE RMSSE      ACF1
##   <chr>  <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>     <dbl>
## 1 mean   Training  1.37e-15  15.4  12.7 -29.5  52.4 0.856 0.796  0.172   
## 2 naive  Training  5.76e- 2  19.8  15.3 -27.0  60.3 1.03  1.02  -0.500   
## 3 snaive Training  4.74e- 1  19.4  14.9 -24.7  57.6 1     1      0.0215  
## 4 ets    Training  4.66e- 1  14.1  10.9 -24.9  46.5 0.731 0.728  0.0123  
## 5 arima  Training -1.88e- 1  14.1  10.8 -26.8  46.7 0.723 0.727 -0.000156

flow.models%>%
  select('arima')%>%
  gg_tsresiduals()

Forecast

flow.models%>%
  select('arima')%>%
  forecast(h=24*7)%>%
  autoplot(water_series)+labs(title = 'Average Water Flow Forecast')

data624_project_1

Richard

4/2/2022

ATM Forecast

cleaning data

ATM1

ATM2

ATM3

ATM4

Forecasting Power Usage

Cleaning data

Decomposition

Modeling

Forecast

Waterflow Pipe (Bonus)

cleaning data

Exploratory Analysis

Modeling

Forecast