rm(list = ls())
library(reader)
## Loading required package: NCmisc
##
## Attaching package: 'reader'
## The following objects are masked from 'package:NCmisc':
##
## cat.path, get.ext, rmv.ext
library(fpp2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.4 ──
## ✓ ggplot2 3.3.5 ✓ fma 2.4
## ✓ forecast 8.16 ✓ expsmooth 2.3
##
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
setwd("~/Desktop/Brandeis University/Graduate/Spring 2022/Forecasting/R Files")
passenger.db <- read_csv("AmtrakPassengers.csv")
## Rows: 156 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Month
## dbl (1): Ridership
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
passenger.ts <- ts(passenger.db$Ridership,
start = c(1991,1), end = c(2004,3), frequency = 12)
plot(passenger.ts, xlab = "Months", ylab = "Ridership",
main = "Amtrak Passengers Data",
ylim = c(1300,2300), bty = "l")
nValid <- 36
nTrain <- length(passenger.ts) - nValid
train.ts <- window(passenger.ts, start = c(1991,1), end = c(1991, nTrain))
valid.ts <- window(passenger.ts, start = c(1991, nTrain+1), end = c(1991, nTrain + nValid))
passenger.lm <- tslm(train.ts ~ trend + I(trend^2))
passenger.lms <- tslm(train.ts ~ trend + I(trend^2) + season)
# CREATE FORECAST using forecast() function
passenger.lm.pred <- forecast(passenger.lm, h = nValid, level = 0)
passenger.lms.pred <- forecast(passenger.lms, h = nValid, level = 0)
passenger.na.pred <- naive(train.ts, h = nValid)
hgA1 <- hist(passenger.lms.pred$residuals, ylab = "Frequency", xlab = "Forecast Error", bty = "l",
main = "Distribution of residuals (Train set, Seasonal model)")
hgB1 <-hist(passenger.lm.pred$residuals, ylab = "Frequency", xlab = "Forecast Error", bty = "l",
main = "Distribution of residuals with Seasonality (Train set, Quadratic model)")
hgC1 <-hist(passenger.na.pred$residuals, ylab = "Frequency", xlab = "Forecast Error", bty = "l",
main = "Distribution of residuals with Seasonality (Train set, Naive model)")
c1 <- rgb(173,216,230,max = 255, alpha = 80, names = "lt.blue")
c2 <- rgb(255,192,203, max = 255, alpha = 80, names = "lt.pink")
c3 <- rgb(255,255,224, max = 255, alpha = 80, names = "lt.yellow")
valid.err <- valid.ts - passenger.lm.pred$mean
hgB2 <- hist(valid.err, ylab = "Frequency", xlab = "Forecast Error", bty = "l",
main = "Distribution of residuals from forecast (Validation set, Quadratic Model)")
valid.err <- valid.ts - passenger.lms.pred$mean
hgA2 <- hist(valid.err, ylab = "Frequency", xlab = "Forecast Error", bty = "l",
main = "Distribution of residuals from forecast (Validation set, Seasonal Model)")
valid.err <- valid.ts - passenger.na.pred$mean
hgC2 <- hist(valid.err, ylab = "Frequency", xlab = "Forecast Error", bty = "l",
main = "Distribution of residuals from forecast (Validation set, Naive Model)")
library(forecast)
accuracy(passenger.lm.pred)
## ME RMSE MAE MPE MAPE MASE
## Training set -1.294397e-14 146.9583 120.3771 -0.734747 7.01433 1.459675
## ACF1
## Training set 0.3439566
accuracy(passenger.lms.pred)
## ME RMSE MAE MPE MAPE MASE
## Training set 1.854553e-15 66.72437 51.93312 -0.1523749 3.014258 0.629733
## ACF1
## Training set 0.6039636
accuracy(passenger.na.pred)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 2.45082 168.1426 125.3033 -0.3459458 7.271436 1.519408 -0.2472638
In this homework assignment, I compare two time series models. The quadratic model (lm) does not take seasonality into account, while the seasonal model (lms) does. From the accuracy measures table above, we see that the seasonal model outperforms the quadratic model.From Swanson (2015:3), we know that a mean absolute percentage error of less than 5% is considered as an indication that the forecast is acceptably accurate. Judging from the performance of the two models I tested here, we see that the seasonal model model attains MAPE of 3.014%, and should thus be considered relatively accurate, while the quadratic model attains MAPE of 7.014%, and should therefore not be considered acceptably accurate. In addition, the ME, RMSE, MAE, MPE, and MASE are all lower in the seasonal model compared to the quadratic model. This indicates an overall higher accuracy of the the seasonal model compared with the quadratic model. The naive model underperforms compared to the two models, with significantly higher mean error, and higher RMSE.
In addition, considering the histogram plots plotted earlier, we see that the residuals take more Gaussian shape in the seasonal model compared with the quadratic model. Comparing the AFC plots of all models, we see that both the quadratic and the naive models miss an autocorrelation pattern in the residuals which is accounted for in the quadratic seasonal model. The AFC plot of the quadratic seasonal model shows variation over time, but it is less fixed in a patten as seen in the two other models.
plot(hgA1, col = c1, , main = "Train set Combined Histogram (Seasonal model in blue)")
plot(hgB1, col = c2, add = TRUE)
plot(hgA2, col = c1, , main = "Validation Set Combined Histogram (Seasonal model in blue)")
plot(hgB2, col = c2, add = TRUE)
library(ggthemes)
checkresiduals(passenger.lm.pred)
##
## Ljung-Box test
##
## data: Residuals from Linear regression model
## Q* = 291.97, df = 21, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 24
checkresiduals(passenger.lms.pred)
##
## Ljung-Box test
##
## data: Residuals from Linear regression model
## Q* = 151.44, df = 10, p-value < 2.2e-16
##
## Model df: 14. Total lags used: 24
checkresiduals(passenger.na.pred)
##
## Ljung-Box test
##
## data: Residuals from Naive method
## Q* = 373.95, df = 24, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 24
AutoCorrelation <- acf(passenger.lm.pred, plot = FALSE)
plot(AutoCorrelation)
AutoCorrelation <- acf(passenger.lms.pred, plot = FALSE)
plot(AutoCorrelation)
AutoCorrelation <- acf(passenger.na.pred, plot = FALSE)
plot(AutoCorrelation)
The overall accuracy of the seasonal model is best represented through a graph visualization.
plot(passenger.lm.pred, ylim = c(1300, 2600), ylab = "Ridership", xlab = "Time", bty = "l",
xlim = c(1991, 2006.25), main = "Quadratic Model", flty = 2)
lines(passenger.lm$fitted.values, lwd = 2, col = "purple")
plot(passenger.lms.pred, ylim = c(1300, 2600), ylab = "Ridership", xlab = "Time", bty = "l",
xlim = c(1991, 2006.25), main = "Quadratic Seasonal Model", flty = 2)
lines(passenger.lms$fitted.values, lwd = 2, col = "blue")
lines(valid.ts)
plot(passenger.na.pred, ylim = c(1300, 2600), ylab = "Ridership", xlab = "Time", bty = "l",
xlim = c(1991, 2006.25), main = "Naive Model", flty = 2)
The quadratic model is in purple, while the quadratic seasonal model is in blue. The better performance of the seasonal model is clearly evident. The dotted line is the prediction of the seasonal model. To summarize, this excercize has shown that a seasonal quadratic model does a much better job forecasting riderships compared with a simpler quadratic regression model or naive model.