R Markdown

# STEP 0: Load Required Packages
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
library(ggplot2)
library(scales)  # for formatting axis labels
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
# STEP 1: Read the collections CSV and put it in chronological order
data <- read_csv("C:/Users/accou/Downloads/WWF Analysis/WWFCollectionsAD.csv")
## Rows: 66 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): AmountCollected
## date (1): DatePaid
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Coerce the payment date to Date class, then order rows oldest-first so the
# observations line up with the time index built in the next step.
data <- mutate(data, DatePaid = as.Date(DatePaid))
data <- arrange(data, DatePaid)

# STEP 2: Build a monthly time series anchored at the earliest payment date
start_year <- year(min(data[["DatePaid"]]))
start_month <- month(min(data[["DatePaid"]]))
# frequency = 12 declares one observation per month, starting at
# (start_year, start_month).
ts_data <- ts(
  data = data[["AmountCollected"]],
  start = c(start_year, start_month),
  frequency = 12
)

# STEP 3: Check Stationarity
# Augmented Dickey-Fuller test on the monthly series. As the recorded output
# states, the alternative hypothesis is "stationary", so a large p-value means
# the test found no evidence that the series is stationary.
adf.test(ts_data)
## Warning in adf.test(ts_data): p-value greater than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_data
## Dickey-Fuller = 1.269, Lag order = 4, p-value = 0.99
## alternative hypothesis: stationary
# If p-value > 0.05, differencing is needed
# In the recorded run the printed p-value is 0.99 (and, per the warning, the
# true value is larger still), so the series is treated as non-stationary.

# STEP 4: Fit ARIMA Model Automatically
# auto.arima() chooses the model orders itself; nothing is fixed by hand here.
model <- auto.arima(ts_data)
# Print the selected specification, coefficients and training-set error
# measures. In the recorded run below the choice is
# ARIMA(1,1,0)(1,0,0)[12] with drift: one regular difference, no seasonal
# difference, and a seasonal period of 12 matching the series frequency.
summary(model)
## Series: ts_data 
## ARIMA(1,1,0)(1,0,0)[12] with drift 
## 
## Coefficients:
##           ar1    sar1     drift
##       -0.6027  0.5264  5494.490
## s.e.   0.0976  0.1349  2161.523
## 
## sigma^2 = 244747730:  log likelihood = -720.63
## AIC=1449.26   AICc=1449.93   BIC=1457.96
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -149.643 15162.93 10282.28 -0.7234085 4.202458 0.2511172
##                    ACF1
## Training set 0.05659578
# STEP 5: Forecast Future Values
# Number of monthly periods to forecast. Defined once so the forecast and the
# date column built in Step 6 always have the same length.
forecast_horizon <- 60
# Request the 80% and 95% intervals explicitly so the interval columns used
# below do not depend on the package's default `level`.
forecasted <- forecast(model, h = forecast_horizon, level = c(80, 95))
plot(forecasted)

# STEP 6: Combine Forecast with Dates and Confidence Intervals
last_date <- max(data$DatePaid)
# Build the future dates with lubridate's `%m+%`, which rolls an impossible
# day back to the last day of the month (e.g. the 30th becomes Feb 28).
# seq(..., by = "month") instead counts the overflow forward, so a series
# ending on the 30th would get "March 2" where February belongs.
future_dates <- last_date %m+% months(seq_len(forecast_horizon))

# One row per forecast month: point forecast plus both interval bounds.
# Interval columns are selected by name rather than by position.
forecast_df <- data.frame(
  date = future_dates,
  forecast = as.numeric(forecasted$mean),
  lower80 = as.numeric(forecasted$lower[, "80%"]),
  upper80 = as.numeric(forecasted$upper[, "80%"]),
  lower95 = as.numeric(forecasted$lower[, "95%"]),
  upper95 = as.numeric(forecasted$upper[, "95%"])
)

# STEP 7: Visualize Actual vs Forecast with Confidence Ribbon
# Observed history, reduced to the shared (date, value, type) layout.
actual_df <- mutate(
  select(data, date = DatePaid, value = AmountCollected),
  type = "Actual"
)

# Forecast rows in the same layout; the interval columns are carried along.
forecast_plot_df <- mutate(
  rename(forecast_df, value = forecast),
  type = "Forecast"
)

# Stack history and forecast into one long table.
combined_df <- bind_rows(actual_df, forecast_plot_df)

# Plot the observed series (blue), the forecast (orange) and the 95% interval
# as a translucent ribbon. Line thickness uses `linewidth`: the `size`
# aesthetic for lines is deprecated since ggplot2 3.4.0 and raised a
# deprecation warning on every fresh run.
ggplot() +
  geom_line(
    data = actual_df,
    aes(x = date, y = value),
    color = "#1f77b4",
    linewidth = 1.2
  ) +
  geom_line(
    data = forecast_plot_df,
    aes(x = date, y = value),
    color = "#ff7f0e",
    linewidth = 1.2
  ) +
  geom_ribbon(
    data = forecast_df,
    aes(x = date, ymin = lower95, ymax = upper95),
    fill = "#ff7f0e",
    alpha = 0.2
  ) +
  # Whole-number axis labels with thousands separators.
  scale_y_continuous(labels = comma_format(accuracy = 1)) +
  labs(
    title = "ARIMA Forecast of WWF Collections with 95% Confidence Interval",
    x = "Date",
    y = "Amount Collected (ZAR)"
  ) +
  theme_minimal()

# STEP 8: Export Forecast to CSV
# Writes the forecast table (dates, point forecast, 80% and 95% bounds) to a
# relative path, i.e. into whatever the working directory is at run time.
write_csv(forecast_df, "arima_forecast_output.csv")

# STEP 9: Preview the final rows of the forecast
# tail() is called without `n`, so it prints the last 6 rows (not 10), as the
# recorded output below shows.

tail(forecast_df)
##          date forecast  lower80  upper80  lower95 upper95
## 55 2030-03-30 791437.7 635091.8 947783.5 552327.4 1030548
## 56 2030-04-30 794516.3 636131.6 952900.9 552287.8 1036745
## 57 2030-05-30 801696.3 641294.5 962098.1 556382.9 1047010
## 58 2030-06-30 805679.9 643288.5 968071.3 557323.7 1054036
## 59 2030-07-30 814452.1 650093.7 978810.5 563087.6 1065817
## 60 2030-08-30 819950.5 653649.3 986251.8 565614.7 1074286