Part I: Research Question

A1. What is the outcome of a 120-day revenue forecast with an ARIMA model?
A2. The goal is to determine whether revenue can be predicted with reasonable accuracy using an ARIMA model, so that leadership can build a strategy around future revenue opportunities.

Part II: Method Justification

B. Stationarity: The ARIMA model assumes the time series is stationary, meaning its mean, variance, and autocorrelation are constant over time. If the series is not stationary, differencing is applied until stationarity is achieved.

Invertibility: Invertibility is another assumption; it implies that the model's error terms can be expressed as a linear combination of current and past forecast errors.

Autocorrelation: Autocorrelated observations violate a fundamental assumption of many statistical analyses, namely that the data points are statistically independent; ARIMA addresses this by modeling the dependence between observations directly.
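As an illustrative aside (not part of the original analysis), invertibility can be checked numerically: an MA model is invertible when the roots of its MA polynomial lie outside the unit circle. The theta value below is a hypothetical MA(1) coefficient.

# Hypothetical MA(1) coefficient; an MA(1) is invertible when |theta| < 1,
# i.e., when the root of 1 + theta*z lies outside the unit circle
theta <- 0.5
ma_roots <- polyroot(c(1, theta))
all(Mod(ma_roots) > 1)  # TRUE indicates an invertible model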
library(tidyverse)
library(ggplot2)
library(tseries)
library(seasonal)
library(forecast)
library(rmarkdown)
library(knitr)

# Load and inspect the data
data <- read.csv('/Users/kimshellenberger/Desktop/medical_time_series .csv')
head(data)
##   Day    Revenue
## 1   1  0.0000000
## 2   2 -0.2923555
## 3   3 -0.3277718
## 4   4 -0.3399871
## 5   5 -0.1248875
## 6   6 -0.4915896
str(data)
## 'data.frame':    731 obs. of  2 variables:
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Revenue: num  0 -0.292 -0.328 -0.34 -0.125 ...
Part III: Data Preparation

# C1. Provide a line graph visualizing the realization of the time series.
ggplot(data, aes(x = Day, y = Revenue)) +
  geom_line() +
  labs(title = "Time Series Visualization", x = "Day", y = "Revenue")
# C2. Describe the time step formatting of the realization, including any gaps
# in measurement and the length of the sequence.
# The time step is one day; the 731 daily observations span roughly two years.
time_step <- 1  # expected interval between consecutive Day values
# Check for any gaps in measurement
print(paste("Any gaps in measurement:", any(diff(data$Day) != time_step)))
# Length of the sequence
length_sequence <- nrow(data)
# C3. Evaluate the stationarity of the time series.
adf.test(data$Revenue)
##
##  Augmented Dickey-Fuller Test
##
## data:  data$Revenue
## Dickey-Fuller = -2.5906, Lag order = 9, p-value = 0.3283
## alternative hypothesis: stationary
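As a quick cross-check (an addition, not in the original analysis), forecast::ndiffs() estimates how many differences are needed to reach stationarity; a value of 1 would agree with the first-order differencing applied below.

# Estimate the number of differences required for stationarity
ndiffs(data$Revenue)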
# The p-value (0.3283) is above 0.05, so we fail to reject the null hypothesis
# of non-stationarity. Take the first-order difference to remove the trend.
data$diff_revenue <- c(NA, diff(data$Revenue))
# Re-evaluate stationarity on the differenced series
adf_result <- adf.test(data$diff_revenue[-1], alternative = "stationary")
## Warning in adf.test(data$diff_revenue[-1], alternative = "stationary"): p-value
## smaller than printed p-value
print(adf_result)
##
##  Augmented Dickey-Fuller Test
##
## data:  data$diff_revenue[-1]
## Dickey-Fuller = -7.2794, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
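The KPSS test offers a complementary check, since its null hypothesis is stationarity (the reverse of the ADF test). This is a suggested addition, not part of the original analysis.

# KPSS test on the differenced series: a large p-value here
# (failing to reject stationarity) would agree with the ADF result above
kpss.test(diff(data$Revenue))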
# The p-value is below 0.05, indicating the differenced series is stationary.
# C4. Explain the steps used to prepare the data for analysis, including the
# training and test set split.
train_data <- data[1:700, ]
test_data <- data[-(1:700), ]
summary(test_data)
##       Day           Revenue       diff_revenue
##  Min.   :701.0   Min.   :15.25   Min.   :-1.17889
##  1st Qu.:708.5   1st Qu.:16.03   1st Qu.:-0.27805
##  Median :716.0   Median :18.51   Median : 0.09275
##  Mean   :716.0   Mean   :17.92   Mean   :-0.02011
##  3rd Qu.:723.5   3rd Qu.:19.03   3rd Qu.: 0.30914
##  Max.   :731.0   Max.   :20.31   Max.   : 1.14450
count(test_data)
##    n
## 1 31
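A quick sanity check, added here for illustration, confirms the actual split proportions.

# Verify the split: 700 of 731 observations (~96%) train, 31 (~4%) test
nrow(train_data) / nrow(data)
nrow(test_data) / nrow(data)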
# I checked for nulls and NAs in the original data prior to splitting it into
# training and test sets; see the two attached files.
# The final 31 days (about 4% of the 731 observations) are held out as the
# test set, leaving 700 days (about 96%) for training.
# Write train data to CSV
write.csv(train_data, file = "/Users/kimshellenberger/Desktop/train_data.csv", row.names = FALSE)
# Write test data to CSV
write.csv(test_data, file = "/Users/kimshellenberger/Desktop/test_data.csv", row.names = FALSE)

Part IV: Model Identification and Analysis

# D1. Report the annotated findings with visualizations of the data analysis,
# including the following elements:
#   • the decomposed time series
#   • the presence or lack of a seasonal component
#   • trends
# Note: frequency = 2 treats every two observations as one cycle; for daily
# data a weekly (7) or annual (365) frequency would be more conventional.
decomposed_ts <- decompose(ts(data$Revenue, frequency = 2), "multiplicative")
# Plot the decomposed components
plot(decomposed_ts)
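Since frequency = 2 implies a two-observation cycle, a weekly cycle is often a more natural choice for daily data. Below is a minimal alternative sketch using STL, assuming a 7-day period (an assumption for illustration, not established by this data).

# Alternative decomposition assuming a weekly (7-day) cycle
weekly_ts <- ts(data$Revenue, frequency = 7)
stl_fit <- stl(weekly_ts, s.window = "periodic")
plot(stl_fit)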
#   • the autocorrelation function
acf(data$Revenue)
# Plot the autocorrelation function with a shorter maximum lag
acf(data$Revenue, main = "Autocorrelation Function", lag.max = 9)
#   • the spectral density
spec.pgram(data$Revenue)
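The forecast package can also estimate the dominant period directly from the spectral density; this small sketch is an addition, not part of the original analysis.

# Estimate the dominant seasonal period from the periodogram;
# a result of 1 indicates no meaningful seasonality was detected
findfrequency(data$Revenue)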
# D2. Identify an autoregressive integrated moving average (ARIMA) model that
# accounts for the observed trend and seasonality of the time series data.
arima_model <- auto.arima(train_data$Revenue)
summary(arima_model)
## Series: train_data$Revenue
## ARIMA(1,1,0)
##
## Coefficients:
##          ar1
##       0.4109
## s.e.  0.0345
##
## sigma^2 = 0.1938:  log likelihood = -417.86
## AIC=839.72   AICc=839.74   BIC=848.82
##
## Training set error measures:
##                      ME      RMSE       MAE        MPE     MAPE      MASE
## Training set  0.0144574 0.4395562 0.3543905 -0.2323458 7.729072 0.9246115
##                      ACF1
## Training set -0.001796746
accuracy(arima_model)
##                      ME      RMSE       MAE        MPE     MAPE      MASE
## Training set  0.0144574 0.4395562 0.3543905 -0.2323458 7.729072 0.9246115
##                      ACF1
## Training set -0.001796746
plot(arima_model$residuals)
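A common follow-up diagnostic, suggested here as an addition, is forecast::checkresiduals(), which plots the residual ACF and histogram and runs a Ljung-Box test for remaining autocorrelation.

# Residual diagnostics: time plot, ACF, histogram, and Ljung-Box test;
# a large Ljung-Box p-value suggests the residuals resemble white noise
checkresiduals(arima_model)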
# D3. Perform a forecast using the derived ARIMA model identified in part D2.
# A horizon of 60 days covers the 31-day test set plus additional future days.
forecast_result <- forecast(arima_model, h = 60)
plot(forecast_result)
# D4. Provide the output and calculations of the analysis performed.
forecast_result
##     Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## 701       16.95330 16.389180 17.51742 16.090553 17.81605
## 702       17.06028 16.084724 18.03583 15.568297 18.55226
## 703       17.10423 15.782926 18.42554 15.083467 19.12500
## 704       17.12230 15.506348 18.73824 14.650918 19.59367
## 705       17.12972 15.257048 19.00239 14.265717 19.99372
## 706       17.13277 15.031566 19.23397 13.919258 20.34627
## 707       17.13402 14.825691 19.44235 13.603737 20.66430
## 708       17.13453 14.635765 19.63330 13.312997 20.95607
## 709       17.13474 14.458892 19.81060 13.042382 21.22711
## 710       17.13483 14.292845 19.97682 12.788388 21.48128
## 711       17.13487 14.135910 20.13382 12.548358 21.72138
## 712       17.13488 13.986762 20.28300 12.320249 21.94952
## 713       17.13489 13.844357 20.42542 12.102455 22.16732
## 714       17.13489 13.707859 20.56192 11.893698 22.37608
## 715       17.13489 13.576592 20.69319 11.692941 22.57684
## 716       17.13489 13.449997 20.81979 11.499331 22.77045
## 717       17.13489 13.327608 20.94218 11.312154 22.95763
## 718       17.13489 13.209034 21.06075 11.130810 23.13898
## 719       17.13489 13.093937 21.17585 10.954784 23.31500
## 720       17.13489 12.982028 21.28776 10.783635 23.48615
## 721       17.13489 12.873058 21.39673 10.616979 23.65281
## 722       17.13489 12.766805 21.50298 10.454478 23.81531
## 723       17.13489 12.663075 21.60671 10.295838 23.97395
## 724       17.13489 12.561698 21.70809 10.140796 24.12899
## 725       17.13489 12.462520 21.80726  9.989116 24.28067
## 726       17.13489 12.365404 21.90438  9.840590 24.42920
## 727       17.13489 12.270227 21.99956  9.695028 24.57476
## 728       17.13489 12.176876 22.09291  9.552260 24.71752
## 729       17.13489 12.085250 22.18453  9.412131 24.85765
## 730       17.13489 11.995258 22.27453  9.274500 24.99529
## 731       17.13489 11.906814 22.36297  9.139237 25.13055
## 732       17.13489 11.819843 22.44994  9.006225 25.26356
## 733       17.13489 11.734271 22.53551  8.875355 25.39443
## 734       17.13489 11.650034 22.61975  8.746526 25.52326
## 735       17.13489 11.567072 22.70271  8.619646 25.65014
## 736       17.13489 11.485328 22.78446  8.494629 25.77516
## 737       17.13489 11.404750 22.86504  8.371395 25.89839
## 738       17.13489 11.325289 22.94450  8.249871 26.01991
## 739       17.13489 11.246900 23.02288  8.129986 26.13980
## 740       17.13489 11.169542 23.10024  8.011676 26.25811
## 741       17.13489 11.093174 23.17661  7.894881 26.37490
## 742       17.13489 11.017759 23.25203  7.779544 26.49024
## 743       17.13489 10.943263 23.32652  7.665612 26.60417
## 744       17.13489 10.869652 23.40013  7.553035 26.71675
## 745       17.13489 10.796897 23.47289  7.441765 26.82802
## 746       17.13489 10.724967 23.54482  7.331757 26.93803
## 747       17.13489 10.653835 23.61595  7.222971 27.04681
## 748       17.13489 10.583476 23.68631  7.115366 27.15442
## 749       17.13489 10.513864 23.75592  7.008904 27.26088
## 750       17.13489 10.444977 23.82481  6.903550 27.36624
## 751       17.13489 10.376792 23.89299  6.799269 27.47052
## 752       17.13489 10.309288 23.96050  6.696031 27.57375
## 753       17.13489 10.242444 24.02734  6.593803 27.67598
## 754       17.13489 10.176243 24.09354  6.492557 27.77723
## 755       17.13489 10.110666 24.15912  6.392266 27.87752
## 756       17.13489 10.045696 24.22409  6.292902 27.97688
## 757       17.13489  9.981315 24.28847  6.194440 28.07534
## 758       17.13489  9.917509 24.35228  6.096857 28.17293
## 759       17.13489  9.854262 24.41552  6.000129 28.26966
## 760       17.13489  9.791560 24.47823  5.904234 28.36555
plot(forecast_result$residuals)
Acf(forecast_result$residuals)
pacf(forecast_result$residuals)
qqnorm(forecast_result$residuals)
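To complement the Q-Q plot, a reference line and a formal normality test can be added; this is a suggested sketch, not part of the original analysis (qqline() should run in the same chunk as qqnorm() so it draws on the same plot).

qqline(forecast_result$residuals)  # reference line for the Q-Q plot above
# Shapiro-Wilk test: a large p-value is consistent with normally
# distributed residuals
shapiro.test(as.numeric(forecast_result$residuals))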
# Ensure the test data and forecasted values have the same length
n <- min(nrow(test_data), length(forecast_result$mean))
test_data <- test_data[1:n, ]
forecast_values <- forecast_result$mean[1:n]
# Remove NA values from both, using the same mask so they stay aligned
valid <- !is.na(test_data$Revenue)
test_data <- test_data[valid, ]
forecast_values <- forecast_values[valid]
# Check spectral density
spec.pgram(data$Revenue)
spec.pgram(train_data$Revenue)
# spec.pgram() expects a numeric series, so pass the point forecasts
# rather than the forecast object itself
spec.pgram(forecast_result$mean)
Part V: Data Summary and Implications

# E. Summarize your findings and assumptions by doing the following:
# E1. Discuss the results of your data analysis, including the following points:
#   • the selection of an ARIMA model
#     The model was selected with auto.arima(), and ARIMA(1,1,0) makes sense
#     given the lack of seasonality in the series.
#   • the model evaluation procedure and error metric
#     RMSE was used as the evaluation metric; a lower RMSE means the forecast
#     fits the test data more closely.
rmse <- sqrt(mean((test_data$Revenue - forecast_values)^2))
rmse
## [1] 1.778914
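For additional context (an addition to the original analysis), other error metrics can be computed the same way; lower values are better for all of them.

# Mean absolute error and mean absolute percentage error on the test set
mae <- mean(abs(test_data$Revenue - forecast_values))
mape <- mean(abs((test_data$Revenue - forecast_values) / test_data$Revenue)) * 100
mae
mape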
# E2. Provide an annotated visualization of the forecast of the final model
# compared to the test set.
plot(forecast_result, main = "Forecast with 95% Confidence Interval")
lines(train_data$Day, train_data$Revenue, col = "blue")   # Train data
lines(test_data$Day, test_data$Revenue, col = "red")      # Test data
lines(forecast_result$mean, col = "black")                # Forecast
lines(forecast_result$lower[, 2], col = "green", lty = 2) # 95% interval lower bound
lines(forecast_result$upper[, 2], col = "green", lty = 2) # 95% interval upper bound
legend("topleft",
       legend = c("Train Data", "Test Data", "Forecast", "95% Confidence Interval"),
       col = c("blue", "red", "black", "green"),
       lty = c(1, 1, 1, 2))
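As a final sanity check, suggested here as an addition to the original analysis, the empirical coverage of the 95% prediction interval over the test window can be computed; a value near 0.95 indicates well-calibrated intervals.

# Fraction of test observations that fall inside the 95% prediction interval
n_test <- nrow(test_data)
covered <- test_data$Revenue >= forecast_result$lower[1:n_test, 2] &
           test_data$Revenue <= forecast_result$upper[1:n_test, 2]
mean(covered)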