Importing Data

library(fpp2)
## Warning: package 'fpp2' was built under R version 4.0.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## -- Attaching packages ------------------------ fpp2 2.4 --
## v ggplot2   3.3.2     v fma       2.4  
## v forecast  8.13      v expsmooth 2.3
## Warning: package 'fma' was built under R version 4.0.3
## Warning: package 'expsmooth' was built under R version 4.0.3
## 
library(forecast)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(timeDate)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
## Warning: package 'psych' was built under R version 4.0.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(urca)

Since we have only focused on univariate forecasts so far, for now I am going to work with a single aggregate series, averaging visitors across all restaurants for each date.

#visit.data<-read.csv("D:/predictive analytics/midterm/air_visit_data.csv")
visit.data<-read.csv("C:/Users/jbiasi/OneDrive - CoStar Realty Information, Inc/Documents/air_visit_data.csv")


visit.data <- visit.data %>%
  mutate(visit_date = ymd(visit_date)) # parse visit_date as a proper Date
visit.data.group <- visit.data %>% group_by(visit_date) %>% summarize(visitors = mean(visitors))
## `summarise()` ungrouping output (override with `.groups` argument)
visit.data.ts<-ts(visit.data.group$visitors, frequency=365.25, start=c(2016,1)) # daily series; start = (year, period)

Plotting the time series.

autoplot(visit.data.ts)

The data is clearly seasonal on a weekly basis, so we take a 7-day moving average to see the trend.

visit.data.ts.ma<-ma(visit.data.ts, order=7)
autoplot(visit.data.ts.ma)
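
To make the trend-versus-seasonality point in one picture, we can overlay the smoothed series on the raw data. This is a small addition using the objects created above; autoplot() and autolayer() come from the forecast/ggplot2 stack already loaded.

autoplot(visit.data.ts, series = "Daily mean visitors") +
  autolayer(visit.data.ts.ma, series = "7-day moving average") +
  labs(y = "Visitors", title = "Mean visitors with 7-day moving average")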

I also want to plot the daily totals, which show a break in the data about a third of the way through the series; even though the mean series is relatively smooth, this could justify a piecewise or spline trend.

visit.data.group.sum <- visit.data %>% group_by(visit_date) %>% summarize(visitors = sum(visitors))
## `summarise()` ungrouping output (override with `.groups` argument)
visit.data.ts.sum<-ts(visit.data.group.sum$visitors, frequency=365.25, start=c(2016,1))
autoplot(visit.data.ts.sum)

Some Summary Plots of the Raw Data

describe(visit.data)
##               vars      n   mean     sd median trimmed    mad min  max range
## air_store_id*    1 252108 413.70 239.70    407  413.21 309.86   1  829   828
## visit_date*      2 252108 286.53 123.15    297  294.11 137.88   1  478   477
## visitors         3 252108  20.97  16.76     17   18.82  13.34   1  877   876
##                skew kurtosis   se
## air_store_id*  0.02    -1.21 0.48
## visit_date*   -0.41    -0.69 0.25
## visitors       3.31    74.26 0.03
visit.data %>%
  ggplot(aes(visitors)) +
  geom_vline(xintercept = mean(visit.data$visitors), color = "red") + # mean visitor count
  geom_histogram(fill = "blue", bins = 100)

visit.data %>%
  ggplot(aes(visitors)) +
  geom_vline(xintercept = mean(visit.data$visitors), color = "red") + # mean visitor count
  geom_histogram(fill = "blue", bins = 30) +
  scale_x_log10()

Some Summary Plots of the Grouped Data

describe(visit.data.group)
##             vars   n   mean     sd median trimmed    mad   min    max  range
## visit_date*    1 478 239.50 138.13  239.5  239.50 177.17  1.00 478.00 477.00
## visitors       2 478  21.15   3.95   20.6   20.95   4.56 13.13  31.27  18.13
##             skew kurtosis   se
## visit_date* 0.00    -1.21 6.32
## visitors    0.34    -0.93 0.18
visit.data.group %>%
  ggplot(aes(log(visitors))) +
  geom_histogram(fill = "blue", bins = 30)

visit.data.group %>%
  mutate(wday = wday(visit_date, label = TRUE)) %>%
  group_by(wday) %>%
  summarise(visits = mean(visitors)) %>%
  ggplot(aes(wday, visits, fill = wday)) +
  geom_col(colour = "dodgerblue") +
  labs(x = "Day of the week", y = "Mean visitor count",
       title = "Mean Air Visitor Count by Day of Week")
## `summarise()` ungrouping output (override with `.groups` argument)

There is a clear upward trend in the first quarter, followed by a downward trend through the next three quarters. Now to split training and test, with the training set covering the first 399 days and the test set the remaining 79.

rest.train <- ts(visit.data.ts[1:399], frequency = 7)   # first 399 days
rest.test <- ts(visit.data.ts[400:478], frequency = 7)  # remaining 79 days
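
As a quick sanity check on the split (my addition), the two pieces should add back up to the full 478-day series:

# confirm the split: 399 training days + 79 test days = 478 total
length(rest.train)
length(rest.test)
length(rest.train) + length(rest.test)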

The data is clearly seasonal on a weekly basis, so we account for that. Applying a seasonal difference to the Box-Cox-transformed series, after checking how much regular differencing ndiffs() suggests, gets us close enough to white noise.

ggtsdisplay(rest.train)

ndiffs(rest.train)
## [1] 1
ggtsdisplay(diff(rest.train))

rest.train.bc<-BoxCox(rest.train, lambda=BoxCox.lambda(rest.train))


diff(rest.train.bc, 7) %>% ur.kpss() %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0213 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
ggtsdisplay(diff(rest.train.bc, 7))
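
To put a number on "close enough to white noise", a Ljung-Box test on the seasonally differenced series complements the KPSS test above. This is my addition, with lag = 14 chosen to match the 14 lags that checkresiduals() uses later.

# Ljung-Box test on the seasonally differenced Box-Cox series;
# a small p-value indicates autocorrelation is still present
Box.test(diff(rest.train.bc, 7), lag = 14, type = "Ljung-Box")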

rest.test.bc<-BoxCox(rest.test, lambda=BoxCox.lambda(rest.train))
rest.test.bc.diff<-diff(rest.test.bc, 7)
rest.train.bc.diff<-diff(rest.train.bc, 7)
describe(rest.train.bc.diff)
##    vars   n  mean   sd median trimmed  mad   min  max range  skew kurtosis   se
## X1    1 392 -0.01 0.91    0.1    0.04 0.58 -3.99 2.87  6.85 -0.92     3.89 0.05

Trying a nonlinear trend to account for the big jump around day 182 (about t = 27 on the weekly time index of the differenced series).

h<-78
t <- time(rest.train.bc.diff)
t[370]
## [1] 54.71429
t.break1<-27
tb1 <- ts(pmax(0, t - t.break1), start = 1)

fit.pw <- tslm(rest.train.bc.diff ~ t + tb1)
t.new <- t[length(t)] + seq(h)
tb1.new <- tb1[length(tb1)] + seq(h)

newdata <- cbind(t=t.new, tb1=tb1.new) %>%
  as.data.frame()
fcasts.pw <- forecast(fit.pw, newdata = newdata)

autoplot(rest.train.bc.diff)+
  autolayer(fitted(fit.pw))+
  autolayer(fcasts.pw)

fit.spline <- tslm(rest.train.bc.diff ~ t + I(t^2) + I(tb1^3))
fcasts.spl <- forecast(fit.spline, newdata = newdata)

autoplot(rest.train.bc.diff)+
  autolayer(fitted(fit.spline))+
  autolayer(fcasts.spl)
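
Before scoring the two regression fits, it is worth a quick residual check as well, mirroring the diagnostics used for ETS below (my addition; checkresiduals() also accepts tslm objects).

# residual diagnostics for the piecewise linear trend fit
checkresiduals(fit.pw)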

None of these look especially accurate, but we might as well score them.

Piecewise

pw.accuracy <- accuracy(fcasts.pw,rest.test.bc.diff[1:78])
pw.accuracy
##                        ME      RMSE       MAE      MPE     MAPE      MASE
## Training set 1.876403e-17 0.9066755 0.6170048 111.8268 112.6504 0.9044698
## Test set     5.846271e-02 0.6889969 0.5247208 102.9080 102.9080 0.7691903
##                   ACF1
## Training set 0.4181748
## Test set            NA

Cubic spline

cu.accuracy <- accuracy(fcasts.spl,rest.test.bc.diff[1:78])
cu.accuracy
##                        ME      RMSE       MAE        MPE      MAPE      MASE
## Training set 1.004574e-17 0.9063183 0.6188331   74.89719  121.7767 0.9071499
## Test set     3.969112e+00 5.1970726 3.9691116 -273.41610 1714.9596 5.8183365
##                  ACF1
## Training set 0.417864
## Test set           NA

Fit a simple ETS forecast

fit.ets <- ets(rest.train.bc.diff)
autoplot(fit.ets)

fc.ets <- forecast(fit.ets, h=78)
autoplot(fc.ets)

checkresiduals(fc.ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 151, df = 12, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 14

Test for model accuracy

ets.accuracy <- accuracy(fc.ets,rest.test.bc.diff[1:78])
ets.accuracy
##                       ME      RMSE       MAE       MPE     MAPE      MASE
## Training set 0.001038186 0.9176661 0.6422419 114.60021 212.7298 0.9414650
## Test set     0.115214402 0.6932997 0.5324364  93.27108 103.9638 0.7805006
##                   ACF1
## Training set 0.2535666
## Test set            NA

ARIMA

(rest.arima<-auto.arima(rest.train.bc.diff, stepwise=FALSE, approximation=FALSE))
## Series: rest.train.bc.diff 
## ARIMA(0,0,1)(0,0,1)[7] with zero mean 
## 
## Coefficients:
##          ma1     sma1
##       0.5516  -0.9192
## s.e.  0.0359   0.0377
## 
## sigma^2 estimated as 0.4379:  log likelihood=-400.08
## AIC=806.16   AICc=806.23   BIC=818.08
rest.arima.fcst<-forecast(rest.arima, 78)

autoplot(rest.arima.fcst)

accuracy(rest.arima.fcst,rest.test.bc.diff[1:78])
##                       ME      RMSE       MAE       MPE     MAPE      MASE
## Training set -0.02320338 0.6600837 0.4775531 281.62716 417.2258 0.7000470
## Test set      0.05716394 0.6853906 0.5167277  98.54291 102.5316 0.7574731
##                    ACF1
## Training set 0.09839145
## Test set             NA
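
All four models so far were fit on the same differenced Box-Cox scale, so their test-set errors are directly comparable. A small sketch (my addition; arima.accuracy is a new helper object, computed before rest.arima is refit on the original scale below) collects them into one table:

# side-by-side test-set errors for the differenced-scale models
arima.accuracy <- accuracy(rest.arima.fcst, rest.test.bc.diff[1:78])
rbind(
  Piecewise = pw.accuracy["Test set", c("RMSE", "MAE", "MASE")],
  Spline    = cu.accuracy["Test set", c("RMSE", "MAE", "MASE")],
  ETS       = ets.accuracy["Test set", c("RMSE", "MAE", "MASE")],
  ARIMA     = arima.accuracy["Test set", c("RMSE", "MAE", "MASE")]
)
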
(rest.arima<-auto.arima(rest.train, lambda=BoxCox.lambda(rest.train), d=1, stepwise=FALSE, approximation=FALSE))
## Series: rest.train 
## ARIMA(3,1,0)(2,1,0)[7] 
## Box Cox transformation: lambda= 0.6376314 
## 
## Coefficients:
##           ar1      ar2      ar3     sar1     sar2
##       -0.2504  -0.3887  -0.2789  -0.5464  -0.3325
## s.e.   0.0494   0.0463   0.0488   0.0489   0.0482
## 
## sigma^2 estimated as 0.5787:  log likelihood=-447.21
## AIC=906.41   AICc=906.63   BIC=930.22
rest.arima.fcst<-forecast(rest.arima, 78)
autoplot(rest.arima.fcst)
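
The section stops before scoring this last model, so as a final step (my addition): because this ARIMA was fit with the Box-Cox lambda passed to auto.arima(), forecast() back-transforms to the original scale, and the forecasts can be compared directly to the raw test set, using the same [1:78] pattern as above since h = 78.

# test-set accuracy for the seasonal ARIMA on the original scale
accuracy(rest.arima.fcst, rest.test[1:78])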