install.packages("readxl")
install.packages("httr")
install.packages("devtools")
install.packages("RCurl")
install.packages("plyr")
install.packages("dplyr")
install.packages("tidyverse")
install.packages("DescTools")
install.packages("ggpubr")
install.packages("openintro")
install.packages("readr")
install.packages("rvest")
install.packages("fpp3")
install.packages("ggplot2")
install.packages("tsibble")
install.packages("feasts")
install.packages("latex2exp")
install.packages("seasonal")
install.packages("seasonalview")
install.packages("fable")
install.packages("rio")
install.packages("urca")
install.packages("writexl")
library(readxl)
library(httr)
library(devtools)
## Loading required package: usethis
library(RCurl)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ purrr::compact() masks plyr::compact()
## ✖ tidyr::complete() masks RCurl::complete()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::desc() masks plyr::desc()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DescTools)
library(ggpubr)
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:plyr':
##
## mutate
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
##
## Attaching package: 'openintro'
##
## The following object is masked from 'package:DescTools':
##
## cards
library(readr)
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tsibble 1.1.5 ✔ feasts 0.4.1
## ✔ tsibbledata 0.4.1 ✔ fable 0.4.0
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ purrr::compact() masks plyr::compact()
## ✖ tidyr::complete() masks RCurl::complete()
## ✖ dplyr::count() masks plyr::count()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::desc() masks plyr::desc()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ fabletools::MAE() masks DescTools::MAE()
## ✖ fabletools::MAPE() masks DescTools::MAPE()
## ✖ fabletools::MSE() masks DescTools::MSE()
## ✖ ggpubr::mutate() masks dplyr::mutate(), plyr::mutate()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ fabletools::RMSE() masks DescTools::RMSE()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ✖ tsibble::union() masks base::union()
library(ggplot2)
library(tsibble)
library(feasts)
library(openxlsx)
library(latex2exp)
library(seasonal)
##
## Attaching package: 'seasonal'
##
## The following object is masked from 'package:tibble':
##
## view
library(seasonalview)
##
## Attaching package: 'seasonalview'
##
## The following object is masked from 'package:seasonal':
##
## view
##
## The following object is masked from 'package:tibble':
##
## view
library(fable)
library(rio)
library(urca)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following object is masked from 'package:tsibble':
##
## index
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(lubridate)
library(writexl)
In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to give this a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file; please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast, which you will put in an Excel-readable file.
The general workflow was to decompose the data to examine trend and seasonality. For predictions, an exponential smoothing (ETS) model was then applied, with the specific form of the model determined statistically by minimizing the AIC (Akaike’s Information Criterion). The residuals of the fitted model were checked with the Ljung-Box statistic, which tests whether a time series’ residuals are white noise. An ideally fitted model has residuals that are normally distributed and behave like white noise. Since the null hypothesis is that the residuals are uncorrelated white noise, the higher the p-value the more the residuals resemble white noise; typically p > 0.05 is used as the threshold. An auto-selected ARIMA model was also checked. One of the models was selected by looking at the fit and the p-value; if neither model fit the data well, other methods were used. Linear models were not used on this data because the trends were not distinct enough; in other words, the trends were wavy and curved.
Auto-selected models were used to avoid guesswork.
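Schematically, the per-series workflow looks like the following (a minimal sketch; ts_data and y are placeholder names rather than objects from this report, and the lag value is arbitrary):
#hypothetical tsibble ts_data with a measured column y
ts_data |> model(stl = STL(y)) |> components() |> autoplot() #inspect trend and seasonality
fits <- ts_data |> model(ets = ETS(y), arima = ARIMA(y)) #forms auto-selected by minimizing information criteria (AICc by default)
glance(fits) #compare information criteria across models
augment(fits) |> features(.innov, ljung_box, lag = 14) #white-noise check on residuals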
First, loading in the ATM data. The data was put on GitHub for code reproducibility.
#url of the data
url <- "https://github.com/division-zero/Data-624-project-1/raw/refs/heads/main/ATM624Data.xlsx"
#specify a temp file to download
temp <- tempfile(fileext = ".xlsx")
#download the file
GET(url, write_disk(temp))
## Response [https://raw.githubusercontent.com/division-zero/Data-624-project-1/refs/heads/main/ATM624Data.xlsx]
## Date: 2024-10-30 05:14
## Status: 200
## Content-Type: application/octet-stream
## Size: 40.8 kB
## <ON DISK> C:\Users\keith\AppData\Local\Temp\Rtmpuw0RkS\file134c0106d2cda.xlsx
# Load the file into a dataframe
df <- read_excel(temp)
# temp file removed
unlink(temp)
dfATM <- as.data.frame(df)
#converting 100s of dollars into dollars
dfATM <- dfATM %>%
mutate(Cash = Cash * 100)
#visualize the data
head(dfATM)
## DATE ATM Cash
## 1 39934 ATM1 9600
## 2 39934 ATM2 10700
## 3 39935 ATM1 8200
## 4 39935 ATM2 8900
## 5 39936 ATM1 8500
## 6 39936 ATM2 9000
#all the ATM data
dfATM |> ggplot( aes(x = DATE, y = Cash, color = ATM)) +
geom_line()
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
#visualize for each ATM
dfATM |> filter(ATM == 'ATM1') |> ggplot( aes(x = DATE, y = Cash)) +
geom_line() +
labs(title = "ATM1")
dfATM |> filter(ATM == 'ATM2') |> ggplot( aes(x = DATE, y = Cash)) +
geom_line() +
labs(title = "ATM2")
dfATM |> filter(ATM == 'ATM3') |> ggplot( aes(x = DATE, y = Cash)) +
geom_line() +
labs(title = "ATM3")
dfATM |> filter(ATM == 'ATM4') |> ggplot( aes(x = DATE, y = Cash)) +
geom_line( ) +
labs(title = "ATM4")
Visually, I wanted to look at the cash withdrawals from each ATM separately.
In order to use the fable package, which will determine our models for forecasting, the data was converted into a time series tibble (tsibble).
#fix the date column
#convert to time series tibble
dfATM_ts <- dfATM |>
as_tsibble(key = ATM, index = DATE)
#convert the converted dates back into dates
#dates ended up being converted into a number, so they were converted back
dfATM_ts <- dfATM_ts |>
mutate(DATE = as.Date(DATE, origin = "1899-12-30"))
#with time series tibble autoplot can be used as well as other fable packages
head(dfATM_ts)
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 9600
## 2 2009-05-02 ATM1 8200
## 3 2009-05-03 ATM1 8500
## 4 2009-05-04 ATM1 9000
## 5 2009-05-05 ATM1 9900
## 6 2009-05-06 ATM1 8800
dfATM_ts |> autoplot(Cash) +
labs(title = "Cash vs Date", x = "Date", y = "Cash")
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
The data is now in the correct data type to make model estimations.
Model generation was failing due to missing values in the data, so the missing values were imputed using linear interpolation between the closest non-NA points.
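As a tiny illustration of what the interpolation does, zoo::na.approx() replaces an interior NA with the straight-line value between its neighbors:
#linear interpolation fills the NA halfway between 100 and 300
zoo::na.approx(c(100, NA, 300))
## [1] 100 200 300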
First, I focused on ATM1.
#separate the ATM1 data from the time series tibble.
dfATM1 <- dfATM_ts |> filter(ATM == "ATM1")
#visualize the data
dfATM1 |> autoplot(Cash)
#perform a linear interpolation imputation between nearest points otherwise model will not work
dfATM1 <- dfATM1 |>
mutate(Cash = zoo::na.approx(Cash, na.rm = FALSE))
#check the decomposition for seasonality
dcmp <- dfATM1 |>
model(stl = STL(Cash))
components(dcmp) |> autoplot()
The data does show seasonal patterns.
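The visual impression of weekly seasonality can also be quantified (a sketch; feat_stl comes from the feasts package and reports trend and seasonal strength on a 0 to 1 scale):
#seasonal strength near 1 indicates strong weekly seasonality
dfATM1 |> features(Cash, feat_stl)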
Attempting an ETS model for ATM1.
#fit is the ETS model for ATM1
fit <- dfATM1 |>
model(ETS(Cash))
report(fit)
## Series: Cash
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.0203332
## gamma = 0.3305402
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 8108.703 -5353.917 -228.6555 1201.759 245.6049 1905.499 1364.864 864.8459
##
## sigma^2: 5812790
##
## AIC AICc BIC
## 7849.433 7850.055 7888.432
components(fit) |>
autoplot() +
labs(title = "ETS components")
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
#using the model to forecast.
fit |>
forecast(h = 31) |>
autoplot(dfATM1)+
labs(title="Cash",
y="Cash")
augment(fit) |>
features(.innov, ljung_box, lag=31, dof=10)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM1 ETS(Cash) 43.9 0.00242
glance(fit)
## # A tibble: 1 × 10
## ATM .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 ETS(Cash) 5812790. -3915. 7849. 7850. 7888. 5669461. 5702741. 1512.
fit |> gg_tsresiduals(lag=31)
With a p-value < 0.05, the residuals are not uncorrelated, at least at a lag of one month. The ACF plot shows some indication of autocorrelation, with several lags passing the significance threshold.
Checking the statistics of the ETS fit
augment(fit) |>
features(.innov, ljung_box, lag=90, dof=10)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM1 ETS(Cash) 102. 0.0519
glance(fit)
## # A tibble: 1 × 10
## ATM .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 ETS(Cash) 5812790. -3915. 7849. 7850. 7888. 5669461. 5702741. 1512.
fit |> gg_tsresiduals(lag=90)
Checking a larger lag of three months, the Ljung-Box p-value increases to about 0.05, which may indicate the model works better for longer predictions. However, since it is borderline, it would not be considered a good model.
Checking the ARIMA model fit
#autogenerate a best fit model by reducing the AIC.
autofit <- dfATM1 |> model(auto = ARIMA(Cash, stepwise = FALSE, approx = FALSE))
#viewing basic info about the model
head(autofit)
## # A mable: 1 x 2
## # Key: ATM [1]
## ATM auto
## <chr> <model>
## 1 ATM1 <ARIMA(0,0,1)(1,1,1)[7]>
glance(autofit)
## # A tibble: 1 × 9
## ATM .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ATM1 auto 5550941. -3288. 6585. 6585. 6600. <cpl [7]> <cpl [8]>
report(autofit)
## Series: Cash
## Model: ARIMA(0,0,1)(1,1,1)[7]
##
## Coefficients:
## ma1 sar1 sma1
## 0.1968 0.1848 -0.7537
## s.e. 0.0546 0.0793 0.0552
##
## sigma^2 estimated as 5550941: log likelihood=-3288.28
## AIC=6584.56 AICc=6584.67 BIC=6600.08
Checking the residuals of the ARIMA model.
#checking the residuals of a lag of 3 months
augment(autofit) |>
filter(.model == "auto") |>
features(.innov, ljung_box, lag=90, dof=3)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM1 auto 84.0 0.571
autofit |> select(auto) |> gg_tsresiduals(lag=90)
#checking the residuals of a lag of 1 month
augment(autofit) |>
filter(.model == "auto") |>
features(.innov, ljung_box, lag=31, dof=3)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM1 auto 32.5 0.256
autofit |> select(auto) |> gg_tsresiduals(lag=31)
#forecasting 1 month using the ARIMA model
autofit |>
forecast(h = 31) |>
autoplot(dfATM1)+
labs(title="Cash",
y="Cash")
#storing the predicted values of the model
prediction <- autofit |>
forecast(h = 31)
#placing the predicted values into a dataframe for creating a file.
df_prediction <- as.data.frame(prediction)
# View the forecasted dataframe
head(df_prediction)
## ATM .model DATE Cash .mean
## 1 ATM1 auto 2010-05-01 N(8704.058, 5550941) 8704.0575
## 2 ATM1 auto 2010-05-02 N(10420.22, 5765965) 10420.2209
## 3 ATM1 auto 2010-05-03 N(7351.214, 5765965) 7351.2141
## 4 ATM1 auto 2010-05-04 N(918.1776, 5765965) 918.1776
## 5 ATM1 auto 2010-05-05 N(9956.588, 5765965) 9956.5881
## 6 ATM1 auto 2010-05-06 N(8048.391, 5765965) 8048.3908
#writing the predicted values to an Excel file.
write_xlsx(df_prediction, "ATM1forecast.xlsx")
ARIMA and ETS visually appear to give similar predictions, but ARIMA has a significantly larger p-value, indicating its residuals are uncorrelated and that it is a better fit to the data. The ARIMA model was therefore used to forecast ATM1.
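A more systematic comparison (a sketch, not run in the original analysis) would hold out the final month and score both candidates on out-of-sample accuracy:
#fit both models on all but the last 31 days, then score on the held-out data
train <- dfATM1 |> filter(DATE <= max(DATE) - 31)
both <- train |> model(ets = ETS(Cash), arima = ARIMA(Cash))
both |> forecast(h = 31) |> accuracy(dfATM1)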
Moving on to ATM2
#selecting ATM2 from the time series tibble.
dfATM2 <- dfATM_ts |> filter(ATM == "ATM2")
#viewing the data
dfATM2 |> autoplot(Cash)
#perform a linear imputation so that a model can be generated
dfATM2 <- dfATM2 |>
mutate(Cash = zoo::na.approx(Cash, na.rm = FALSE))
#view the components of the data
dcmp <- dfATM2 |>
model(stl = STL(Cash))
components(dcmp) |> autoplot()
There is seasonality in the data.
Attempting an ETS fit first.
fit2 <- dfATM2 |>
model(ETS(Cash))
report(fit2)
## Series: Cash
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.0001000086
## gamma = 0.3601849
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 7338.583 -3673.336 -1942.586 1483.159 -349.1514 473.7369 1400.743 2607.435
##
## sigma^2: 6489600
##
## AIC AICc BIC
## 7889.634 7890.256 7928.633
components(fit2) |>
autoplot() +
labs(title = "ETS components")
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
#forecasting with the ETS model
fit2 |>
forecast(h = 31) |>
autoplot(dfATM2)+
labs(title="ATM2 ETS",
y="Cash")
#checking lag of 1 month and a lag of 3 months.
augment(fit2) |>
features(.innov, ljung_box, lag=31, dof=4)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM2 ETS(Cash) 67.0 0.0000301
augment(fit2) |>
features(.innov, ljung_box, lag=90, dof=4)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM2 ETS(Cash) 125. 0.00415
glance(fit2)
## # A tibble: 1 × 10
## ATM .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM2 ETS(Cash) 6489600. -3935. 7890. 7890. 7929. 6329583. 6345911. 1790.
The p-values are very low, indicating that the residuals are still correlated under this model.
Attempting an ARIMA model for ATM2.
autofit2 <- dfATM2 |> model(auto = ARIMA(Cash, stepwise = FALSE, approx = FALSE))
head(autofit2)
## # A mable: 1 x 2
## # Key: ATM [1]
## ATM auto
## <chr> <model>
## 1 ATM2 <ARIMA(2,0,2)(0,1,1)[7]>
head(autofit2 |> select(auto ) |> pivot_longer(everything(), names_to = "Model name", values_to = "Orders"))
## # A mable: 1 x 2
## # Key: Model name [1]
## `Model name` Orders
## <chr> <model>
## 1 auto <ARIMA(2,0,2)(0,1,1)[7]>
glance(autofit2)
## # A tibble: 1 × 9
## ATM .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ATM2 auto 6025157. -3302. 6617. 6617. 6640. <cpl [2]> <cpl [9]>
augment(autofit2) |>
features(.innov, ljung_box, lag=30, dof=4)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM2 auto 39.6 0.0428
autofit2 |> select(auto) |> gg_tsresiduals(lag=100)
augment(autofit2) |>
features(.innov, ljung_box, lag=90, dof=4)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM2 auto 97.4 0.189
autofit2 |>
forecast(h = 31) |>
autoplot(dfATM2)+
labs(title="ATM2 ARIMA",
y="Cash")
#storing the predicted values of the model
prediction2 <- autofit2 |>
forecast(h = 31)
#placing the predicted values into a dataframe for creating a file.
df_prediction2 <- as.data.frame(prediction2)
write_xlsx(df_prediction2, "ATM2forecast.xlsx")
Preference went to the ARIMA model because it follows the steadily decreasing trend better; the p-values for the residual correlation are also higher for the ARIMA model. The forecast is stored in an Excel file.
Moving on to ATM3.
dfATM3 <- dfATM_ts |> filter(ATM == "ATM3")
dfATM3 |> autoplot(Cash)
#perform a linear imputation
dfATM3 <- dfATM3 |>
mutate(Cash = zoo::na.approx(Cash, na.rm = FALSE))
#view the decomposed data
dcmp <- dfATM3 |>
model(stl = STL(Cash))
components(dcmp) |> autoplot()
#ETS model for ATM3
fit3 <- dfATM3 |>
model(ETS(Cash))
report(fit3)
## Series: Cash
## Model: ETS(A,N,N)
## Smoothing parameters:
## alpha = 0.8583809
##
## Initial states:
## l[0]
## -0.1250323
##
## sigma^2: 254127.4
##
## AIC AICc BIC
## 6700.098 6700.164 6711.797
components(fit3) |>
autoplot() +
labs(title = "ETS components")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
fit3 |>
forecast(h = 31) |>
autoplot(dfATM3)+
labs(title="Cash",
y="Cash")
augment(fit3) |>
features(.innov, ljung_box, lag=31, dof=4)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM3 ETS(Cash) 0.306 1
augment(fit3) |>
features(.innov, ljung_box, lag=90, dof=4)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM3 ETS(Cash) 0.313 1
glance(fit3)
## # A tibble: 1 × 10
## ATM .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM3 ETS(Cash) 254127. -3347. 6700. 6700. 6712. 252735. 442995. 27.2
fit3 |> gg_tsresiduals(lag=100)
The ETS model looks very similar to a naive model. This fits the data well because the ATM appears to have been inoperative and then put into service. There is seasonality at the end of the data that can be captured with an SNAIVE model.
ATM3 SNAIVE
atm3_sn <- dfATM3 |> model(SNAIVE(Cash ~ lag("3 days"))) #forecast based on the past 3 days due to that being the only time it was used
atm3_predict <- atm3_sn |>
forecast(h = "1 months") #make the forecast model go out to 1 month.
dfATM3 |> autoplot(Cash) +
atm3_predict |> autolayer() + #graph the forecast
labs(y = "Cash", title = "ATM3 SNAIVE")
#storing the predicted values of the model
prediction3 <- atm3_sn |>
forecast(h = 31)
#placing the predicted values into a dataframe for creating a file.
df_prediction3 <- as.data.frame(prediction3)
write_xlsx(df_prediction3, "ATM3forecast.xlsx")
The SNAIVE model assumes that the ATM will continue to be used the way it has been since it came into service.
ARIMA attempt for ATM3
autofit3 <- dfATM3 |> model(auto = ARIMA(Cash, stepwise = FALSE, approx = FALSE))
head(autofit3)
## # A mable: 1 x 2
## # Key: ATM [1]
## ATM auto
## <chr> <model>
## 1 ATM3 <ARIMA(0,0,2)>
glance(autofit3)
## # A tibble: 1 × 9
## ATM .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ATM3 auto 254016. -2790. 5585. 5585. 5597. <cpl [0]> <cpl [2]>
# checking the Ljung-box statistics of the fit at 30 days and 3 days
augment(autofit3) |>
features(.innov, ljung_box, lag=30, dof=4)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM3 auto 0.132 1
autofit3 |> select(auto) |> gg_tsresiduals(lag=100)
augment(autofit3) |>
features(.innov, ljung_box, lag=3, dof=4)
## Warning in pchisq(STATISTIC, lag - fitdf): NaNs produced
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM3 auto 0.132 NaN
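The NaN appears because the Ljung-Box p-value comes from a chi-squared distribution with lag - dof degrees of freedom, which is negative when lag = 3 and dof = 4. A hand computation (a sketch of my understanding of what features() does internally) makes this explicit:
#Ljung-Box by hand: Q = n(n+2) * sum over k of r_k^2/(n-k), compared against chisq(lag - dof)
resid3 <- augment(autofit3) |> pull(.innov)
n <- sum(!is.na(resid3))
h <- 30
r <- acf(resid3, lag.max = h, plot = FALSE, na.action = na.pass)$acf[-1]
Q <- n * (n + 2) * sum(r^2 / (n - seq_len(h)))
pchisq(Q, df = h - 4, lower.tail = FALSE) #at lag = 3 the df would be 3 - 4 = -1, hence the NaN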
#forecast of the ARIMA model
autofit3 |>
forecast(h = 31) |>
autoplot(dfATM3)+
labs(title="ATM3 ARIMA",
y="Cash")
The ARIMA model predicts that the withdrawals will remain flat, as when the ATM was not in use. From a visual standpoint, if the ATM continues to be used it should maintain its cash numbers, and there is nothing to indicate that the cash will drop back to zero. The SNAIVE model was therefore stored as the prediction; the ETS model also favors the most recent behavior in the data.
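One hedged alternative, not part of the original analysis, would be to treat only the active period as representative and forecast its average, since only a handful of nonzero observations exist:
#forecast a flat mean using only the days after the ATM came into service
start_active <- dfATM3 |> filter(Cash > 0) |> pull(DATE) |> min()
dfATM3 |> filter(DATE >= start_active) |> model(MEAN(Cash)) |> forecast(h = 31)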
ATM4
#selecting the ATM4 data
dfATM4 <- dfATM_ts |> filter(ATM == "ATM4")
#plotting the data
dfATM4 |> autoplot(Cash)
#perform a linear imputation so that the models function
dfATM4 <- dfATM4 |>
mutate(Cash = zoo::na.approx(Cash, na.rm = FALSE))
#decomposing atm4
dcmp <- dfATM4 |>
model(stl = STL(Cash))
components(dcmp) |> autoplot()
The data appears to be fairly seasonal, with one outlier spike.
ATM4 ETS model
#ETS model of ATM4
fit4 <- dfATM4 |>
model(ETS(Cash))
report(fit4)
## Series: Cash
## Model: ETS(M,N,A)
## Smoothing parameters:
## alpha = 0.0001003321
## gamma = 0.000100001
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 46188.24 -25543.78 -5974.68 17518.75 839.131 6073.636 1836.276 5250.664
##
## sigma^2: 1.3511
##
## AIC AICc BIC
## 10080.25 10080.87 10119.25
#visualizing the model
components(fit4) |>
autoplot() +
labs(title = "ETS components")
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
# predict 31 days with the ETS model
fit4 |>
forecast(h = 31) |>
autoplot(dfATM4)+
labs(title="ATM4 ETS",
y="Cash")
#check the statistics 31 days
augment(fit4) |>
features(.innov, ljung_box, lag=31, dof=4)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM4 ETS(Cash) 38.4 0.0715
glance(fit4)
## # A tibble: 1 × 10
## ATM .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM4 ETS(Cash) 1.35 -5030. 10080. 10081. 10119. 4033760455. 4.04e9 0.660
#check the statistics of 90 days
augment(fit4) |>
features(.innov, ljung_box, lag=90, dof=4)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM4 ETS(Cash) 78.1 0.717
glance(fit4)
## # A tibble: 1 × 10
## ATM .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM4 ETS(Cash) 1.35 -5030. 10080. 10081. 10119. 4033760455. 4.04e9 0.660
#visualize the residuals of 100 days
fit4 |> gg_tsresiduals(lag=100)
#storing the predicted values of the model
prediction4 <- fit4 |>
forecast(h = 31)
#placing the predicted values into a dataframe for creating a file.
df_prediction4 <- as.data.frame(prediction4)
write_xlsx(df_prediction4, "ATM4forecast.xlsx")
The p-values are relatively high, indicating a good fit for the ETS model.
ATM4 ARIMA
#autogenerate ARIMA model
autofit4 <- dfATM4 |> model(auto = ARIMA(Cash, stepwise = FALSE, approx = FALSE))
head(autofit4)
## # A mable: 1 x 2
## # Key: ATM [1]
## ATM auto
## <chr> <model>
## 1 ATM4 <ARIMA(0,0,0) w/ mean>
glance(autofit4)
## # A tibble: 1 × 9
## ATM .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ATM4 auto 4237176650. -4563. 9130. 9130. 9138. <cpl [0]> <cpl [0]>
# check p value for lag of 1 month
augment(autofit4) |>
features(.innov, ljung_box, lag=30, dof=4)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM4 auto 13.8 0.975
autofit4 |> select(auto) |> gg_tsresiduals(lag=100)
#check p value for lag of 3 months
augment(autofit4) |>
features(.innov, ljung_box, lag=90, dof=4)
## # A tibble: 1 × 4
## ATM .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 ATM4 auto 41.4 1.00
#forecast using ARIMA model
autofit4 |>
forecast(h = 31) |>
autoplot(dfATM4)+
labs(title="ATM4 ARIMA",
y="Cash")
The ARIMA model for ATM4 resembles a naive prediction and fails to capture any seasonality, which is why the ETS model is favored; both have similar p-values. The ETS model's prediction is stored in an Excel file.
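Since the assignment asks for an Excel-readable deliverable, the four forecasts could also be combined into a single workbook; writexl::write_xlsx() accepts a named list of data frames and writes one sheet per element (the sheet names here are my choice):
#one workbook with a sheet per ATM
write_xlsx(list(ATM1 = df_prediction, ATM2 = df_prediction2,
ATM3 = df_prediction3, ATM4 = df_prediction4),
"ATMforecasts.xlsx")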
Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and provide a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.
Reading in the data for Residential power
#url of the data
url1 <- "https://github.com/division-zero/Data-624-project-1/raw/refs/heads/main/ResidentialCustomerForecastLoad-624.xlsx"
#specify a temp file to download
temp1 <- tempfile(fileext = ".xlsx")
#download the file
GET(url1, write_disk(temp1))
## Response [https://raw.githubusercontent.com/division-zero/Data-624-project-1/refs/heads/main/ResidentialCustomerForecastLoad-624.xlsx]
## Date: 2024-10-30 05:16
## Status: 200
## Content-Type: application/octet-stream
## Size: 14.1 kB
## <ON DISK> C:\Users\keith\AppData\Local\Temp\Rtmpuw0RkS\file134c0524e5600.xlsx
# Load the file into a dataframe
df1 <- read_excel(temp1)
# temp file removed
unlink(temp1)
#visualize the data
dfres <- as.data.frame(df1)
head(dfres)
## CaseSequence YYYY-MMM KWH
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
dfres <- setNames(dfres, c("Case", "Date", "KWH"))
#all the res data
dfres |> ggplot( aes(x = Date, y = KWH)) +
geom_line()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
#fix the date column
dfres <- dfres |>
mutate(Date = as.character(Date))
dfres <- dfres |> mutate(Date = parse_date(Date, "%Y-%b"))
#convert to time series tibble
dfres_ts <- dfres |>
as_tsibble( index = Date)
#convert the converted dates back into dates
dfres_ts <- dfres_ts |>
mutate(Date = as.Date(Date))
#with time series tibble autoplot can be used as well as other fable packages
head(dfres_ts)
## # A tsibble: 6 x 3 [1D]
## Case Date KWH
## <dbl> <date> <dbl>
## 1 733 1998-01-01 6862583
## 2 734 1998-02-01 5838198
## 3 735 1998-03-01 5420658
## 4 736 1998-04-01 5010364
## 5 737 1998-05-01 4665377
## 6 738 1998-06-01 6467147
dfres_ts |> autoplot(KWH) +
labs(title = "KWH vs Date", x = "Date", y = "KWH")
Cleaning the data and visualizing.
#imputation and filling in missing dates.
dfres_ts <- dfres_ts |>
fill_gaps()
#linear imputation used
dfres_ts <- dfres_ts |>
mutate(KWH = zoo::na.approx(KWH, na.rm = FALSE))
dcmp <- dfres_ts |>
model(stl = STL(KWH))
components(dcmp) |> autoplot()
At first glance the data appears seasonal; however, the crests and troughs are not evenly spaced, which causes issues with several models.
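The uneven spacing can be inspected directly by overlaying the years (a sketch; gg_season is from the feasts package):
#plot each year of KWH on a common axis to compare the seasonal shape across years
dfres_ts |> gg_season(KWH, period = "year")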
An ETS model was attempted first.
#applying an ETS model on the whole dataset
pwrfit <- dfres_ts |>
model(ETS(KWH))
report(pwrfit)
## Series: KWH
## Model: ETS(A,Ad,N)
## Smoothing parameters:
## alpha = 0.9998921
## beta = 0.9998911
## phi = 0.9691968
##
## Initial states:
## l[0] b[0]
## 6917010 -33044.95
##
## sigma^2: 157446555
##
## AIC AICc BIC
## 160139.8 160139.8 160179.8
#
components(pwrfit) |>
autoplot() +
labs(title = "ETS components")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
pwrfit |>
forecast(h = 365) |>
autoplot(dfres_ts)+
labs(title="KWH",
y="KWH")
augment(pwrfit) |>
features(.innov, ljung_box, lag=365, dof=4)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ETS(KWH) 1292. 0
glance(pwrfit)
## # A tibble: 1 × 9
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS(KWH) 157446555. -80064. 160140. 160140. 160180. 157311152. 1.03e9 2795.
The ETS model is not a reliable predictor of the data: its forecast goes outside the bounds of the original data set. This data set causes several models to produce odd predictions.
ARIMA attempt
#ARIMA model on whole data set
autopwrfit <- dfres_ts |> model(auto = ARIMA(KWH, stepwise = TRUE, approx = FALSE))
head(autopwrfit)
## # A mable: 1 x 1
## auto
## <model>
## 1 <ARIMA(1,1,0)>
glance(autopwrfit)
## # A tibble: 1 × 8
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 auto 157231150. -63104. 126212. 126212. 126225. <cpl [1]> <cpl [0]>
report(autopwrfit)
## Series: KWH
## Model: ARIMA(1,1,0)
##
## Coefficients:
## ar1
## 0.9693
## s.e. 0.0032
##
## sigma^2 estimated as 157231150: log likelihood=-63104.08
## AIC=126212.2 AICc=126212.2 BIC=126225.5
#check statistics
augment(autopwrfit) |>
features(.innov, ljung_box, lag=365, dof=10)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 auto 1307. 0
autopwrfit |> select(auto) |> gg_tsresiduals(lag=365)
#plot prediction for next year
autopwrfit |>
forecast(h = 365) |>
autoplot(dfres_ts)+
labs(title="KWH",
y="KWH")
Again, the forecast diverges from the original data set.
Regression model
linear_pwr <- dfres_ts |> select(Date, KWH) |>
model(TSLM(KWH ~ trend() + season()))
fc_pwr <- forecast(linear_pwr, h = "1 years")
fc_pwr |>
autoplot(dfres_ts) +
labs(
title = "Forecast KWH using regression",
y = "KWH"
)
linear_pwr |> gg_tsresiduals()
augment(linear_pwr) |>
features(.innov, ljung_box, lag=365, dof=10)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 TSLM(KWH ~ trend() + season()) 386020. 0
A linear regression follows the slight upward trend of the data, but it fails to capture its sinusoidal, wave-like pattern.
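One hedged refinement, not attempted here, would be to add annual Fourier terms to the regression; fourier() is a TSLM special from the fable package, and K = 2 harmonics is an arbitrary choice:
#regression with a trend plus two annual Fourier harmonics for the wave-like pattern
fourier_pwr <- dfres_ts |>
model(TSLM(KWH ~ trend() + fourier(period = "year", K = 2)))
forecast(fourier_pwr, h = "1 years") |>
autoplot(dfres_ts) +
labs(title = "Forecast KWH with Fourier terms", y = "KWH")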
Trying a different approach with ARIMA: modeling only the most recent years’ data. Data from 2011 onward was chosen since it is just past the lowest point, which may be an outlier.
autopwrfit2 <- dfres_ts |> filter(year(Date)>=2011) |> model(auto = ARIMA(KWH, stepwise = TRUE, approx = FALSE))
head(autopwrfit2)
## # A mable: 1 x 1
## auto
## <model>
## 1 <ARIMA(2,0,0) w/ mean>
glance(autopwrfit2)
## # A tibble: 1 × 8
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 auto 163281291. -11593. 23193. 23193. 23213. <cpl [2]> <cpl [0]>
report(autopwrfit2)
## Series: KWH
## Model: ARIMA(2,0,0) w/ mean
##
## Coefficients:
## ar1 ar2 constant
## 1.9757 -0.9773 11706.4134
## s.e. 0.0066 0.0067 394.9599
##
## sigma^2 estimated as 163281291: log likelihood=-11592.54
## AIC=23193.08 AICc=23193.12 BIC=23212.97
augment(autopwrfit2) |>
features(.innov, ljung_box, lag=365, dof=10)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 auto 420. 0.0101
autopwrfit2 |> select(auto) |> gg_tsresiduals(lag=365)
autopwrfit2 |>
forecast(h = 365) |>
autoplot(dfres_ts)+
labs(title="KWH ARIMA from 2011",
y="KWH")
Reducing the training data to 2011 onward definitely helps the ARIMA model make a better prediction. The residuals are still correlated, but visually the forecast matches the overall data better.
Using a seasonal naive (SNAIVE) model with a lag of one year.
pwr_model <- dfres_ts |> model(SNAIVE(KWH ~ lag("1 years"))) #forecast based on the past 1 year
## Warning: Non-integer lag orders for random walk models are not supported.
## Rounding to the nearest integer.
#predict the next year from the previous year
pwr_predict <- pwr_model |>
forecast(h = "1 years") #make the forecast model go out to 1 year.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `SNAIVE(KWH ~ lag("1 years")) = (function (object, ...) ...`.
## Caused by warning:
## ! Non-integer lag orders for random walk models are not supported. Rounding to the nearest integer.
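These warnings stem from the tsibble carrying a daily [1D] index even though the observations are monthly. A hedged fix, not applied in this report, would be to re-index with tsibble::yearmonth(), which yields a monthly [1M] series where a one-year seasonal lag is exactly 12:
#re-index the series as monthly so seasonal lags are integers
dfres_monthly <- dfres |>
mutate(Month = yearmonth(Date)) |>
as_tsibble(index = Month) |>
fill_gaps() |>
mutate(KWH = zoo::na.approx(KWH, na.rm = FALSE))
dfres_monthly |> model(SNAIVE(KWH ~ lag("1 year"))) |> forecast(h = "1 year")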
dfres_ts |> autoplot(KWH) +
pwr_predict |> autolayer() + #graph the forecast
labs(y = "KWH", title = "KWH SNAIVE")
pwr_model |> gg_tsresiduals()
## Warning: Removed 365 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 365 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 365 rows containing non-finite outside the scale range
## (`stat_bin()`).
prediction5 <- pwr_model |>
forecast(h = 365)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `SNAIVE(KWH ~ lag("1 years")) = (function (object, ...) ...`.
## Caused by warning:
## ! Non-integer lag orders for random walk models are not supported. Rounding to the nearest integer.
#placing the predicted values into a dataframe for creating a file.
df_prediction5 <- as.data.frame(prediction5)
write_xlsx(df_prediction5, "respwrforecast.xlsx")
From a visual perspective, copying the previous year's data forward as the prediction for the next year (the SNAIVE model) seems to work best. The prediction behaves more the way one would expect from the data, although the residuals are still highly correlated. It is still not a good model of the data, but it is believed to be the least poor predictor tried here.