Predictive Analysis Assignment 1

library(forecast)

## Warning: package 'forecast' was built under R version 4.3.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(ggplot2)
library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(zoo)

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(fpp2)

## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──

## ✔ fma       2.5     ✔ expsmooth 2.3

##

# data loading
file_path <- "/Users/timyang/Downloads/Illinois_Dry_Natural_Gas_Production.csv"

natural_gas_data <- read.csv(file_path, skip = 4)

# Convert the Month column to date format
natural_gas_data$Month <- zoo::as.yearmon(natural_gas_data$Month, format = "%b %Y")

# Create a time series object
# Extract year and month from the first data point
start_year <- as.numeric(format(natural_gas_data$Month[1], "%Y"))
start_month <- as.numeric(format(natural_gas_data$Month[1], "%m"))
ts_data <- ts(natural_gas_data$Illinois.Dry.Natural.Gas.Production.Million.Cubic.Feet, frequency = 12, start = c(start_year, start_month))

# Use only the last 4 years of data for model building
end_year <- start_year + floor(length(ts_data) / 12) - 1
train_data <- window(ts_data, start = c(end_year - 3, 1), end = c(end_year, 12))

# Forecast horizon (12 months for the 5th year)
h <- 12

# Fit Naive model
naive_forecast <- naive(train_data, h = h)
# Fit Drift model
drift_forecast <- rwf(train_data, h = h, drift = TRUE)

# Fit SNAIVE model
snaive_forecast <- snaive(train_data, h = h)

# Fit ETS model
ets_model <- ets(train_data)
ets_forecast <- forecast(ets_model, h = h)

# Plot the forecasts
autoplot(train_data) +
  autolayer(naive_forecast, series = "Naive", PI = FALSE) +
  autolayer(drift_forecast, series = "Drift", PI = FALSE) +
  autolayer(snaive_forecast, series = "SNAIVE", PI = FALSE) +
  autolayer(ets_forecast, series = "ETS", PI = FALSE) +
  ggtitle("Forecasts from Different Models") +
  xlab("Year") + ylab("Production (Million Cubic Feet)") +
  guides(colour = guide_legend(title = "Forecast"))

# Calculate performance measures for each model
naive_accuracy <- accuracy(naive_forecast)
drift_accuracy <- accuracy(drift_forecast)
snaive_accuracy <- accuracy(snaive_forecast)
ets_accuracy <- accuracy(ets_forecast)

# Print accuracy measures
print(naive_accuracy)

##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -3.148936 25.58091 13.40426 -31.04317 39.26186 0.5772167
##                    ACF1
## Training set 0.03643154

print(drift_accuracy)

##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -5.744814e-15 25.38636 14.04527 -27.49236 38.93524 0.6048202
##                    ACF1
## Training set 0.03643154

print(snaive_accuracy)

##                     ME     RMSE      MAE       MPE     MAPE MASE      ACF1
## Training set -10.22222 28.56571 23.22222 -37.20881 48.93813    1 0.3915384

print(ets_accuracy)

##                     ME     RMSE      MAE       MPE     MAPE     MASE       ACF1
## Training set -2.968299 25.32566 13.24059 -30.32318 38.51753 0.570169 0.02342824

The best model is identified based on the lowest RMSE, but MAE and MAPE are also considered. the lowest RMSE value is ETS model which preforming the best, those values for each model are all super close to each other, which is a little bit hard to statistical tell which one is the best.

Predictive Analysis Assignment 1

Mingdong Yang

2024-07-10