Importing Data
library(fpp2)
library(forecast)
library(lubridate)
library(timeDate)
library(dplyr)
library(psych)
library(urca)
Since we have only focused on single-series forecasts so far, for now I am just going to aggregate across all restaurants into one daily series (mean visitors per date).
#visit.data<-read.csv("D:/predictive analytics/midterm/air_visit_data.csv")
visit.data<-read.csv("C:/Users/jbiasi/OneDrive - CoStar Realty Information, Inc/Documents/air_visit_data.csv")
visit.data <- visit.data %>%
  mutate(visit_date = ymd(visit_date))  # parse the visit_date strings into Date objects
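As a quick sanity check on the import and the date parsing (a minimal sketch):
# Peek at the parsed columns: air_store_id, visit_date (now a Date), visitors.
str(visit.data)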
visit.data.group <- visit.data %>% group_by(visit_date) %>% summarize(visitors = mean(visitors))
visit.data.ts <- ts(visit.data.group$visitors, frequency = 365.25,
                    start = c(2016, 13))  # ts() expects c(year, period); day-of-year 13 = 2016-01-13
Plotting the time series.
autoplot(visit.data.ts)
The data is clearly seasonal on a weekly basis, so we take a 7-day moving average to see the trend.
visit.data.ts.ma<-ma(visit.data.ts, order=7)
autoplot(visit.data.ts.ma)
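To see trend versus weekly cycle in one view, the moving average can be overlaid on the raw series (a quick sketch using autolayer from the forecast package):
# Overlay the 7-day moving average on the daily mean series.
autoplot(visit.data.ts, series = "Daily mean visitors") +
  autolayer(visit.data.ts.ma, series = "7-day moving average")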
I also want to plot the daily totals, which show a break about a third of the way through the series; even though the averaged series is relatively smooth, this break may justify a spline.
visit.data.group.sum <- visit.data %>% group_by(visit_date) %>% summarize(visitors = sum(visitors))
visit.data.ts.sum <- ts(visit.data.group.sum$visitors, frequency = 365.25,
                        start = c(2016, 13))  # same start convention as above
autoplot(visit.data.ts.sum)
Some Summary Plots of Raw Data
describe(visit.data)
## vars n mean sd median trimmed mad min max range
## air_store_id* 1 252108 413.70 239.70 407 413.21 309.86 1 829 828
## visit_date* 2 252108 286.53 123.15 297 294.11 137.88 1 478 477
## visitors 3 252108 20.97 16.76 17 18.82 13.34 1 877 876
## skew kurtosis se
## air_store_id* 0.02 -1.21 0.48
## visit_date* -0.41 -0.69 0.25
## visitors 3.31 74.26 0.03
visit.data %>%
  ggplot(aes(visitors)) +
  geom_vline(xintercept = mean(visit.data$visitors), color = "red") +  # overall mean (~20.97)
  geom_histogram(fill = "blue", bins = 100)
visit.data %>%
  ggplot(aes(visitors)) +
  geom_vline(xintercept = mean(visit.data$visitors), color = "red") +  # overall mean (~20.97)
  geom_histogram(fill = "blue", bins = 30) +
  scale_x_log10()
Some Summary Plots of Grouped Data
describe(visit.data.group)
## vars n mean sd median trimmed mad min max range
## visit_date* 1 478 239.50 138.13 239.5 239.50 177.17 1.00 478.00 477.00
## visitors 2 478 21.15 3.95 20.6 20.95 4.56 13.13 31.27 18.13
## skew kurtosis se
## visit_date* 0.00 -1.21 6.32
## visitors 0.34 -0.93 0.18
visit.data.group %>%
ggplot(aes(log(visitors))) +
geom_histogram(fill = "blue", bins = 30)
visit.data.group %>%
  mutate(wday = wday(visit_date, label = TRUE)) %>%
  group_by(wday) %>%
  summarise(visits = mean(visitors)) %>%
  ggplot(aes(wday, visits, fill = wday)) +
  geom_col(colour = "dodgerblue") +
  labs(x = "Day of the week", y = "Mean visitor count",
       title = "Mean Air Visitor Count by Day of Week")  # labs() takes title, not main
There is a clear upward trend in the first quarter, followed by a downward trend through the next three quarters. Now to split into training and test sets, holding out the last 79 days.
rest.train <- ts(visit.data.ts[1:399], frequency = 7)   # days 1-399
rest.test <- ts(visit.data.ts[400:478], frequency = 7)  # days 400-478
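A quick check on the split (a sketch; note the test window holds 79 days, one more than the 78-step horizon used below):
# Confirm the split covers the full series: 399 training + 79 test = 478 days.
length(rest.train) + length(rest.test) == length(visit.data.ts)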
The data is clearly “seasonal” on a weekly basis, so we account for that. ndiffs() suggests one difference; a seasonal (lag-7) difference of the Box-Cox-transformed series gets us close enough to white noise.
ggtsdisplay(rest.train)
ndiffs(rest.train)
## [1] 1
ggtsdisplay(diff(rest.train))
rest.train.bc<-BoxCox(rest.train, lambda=BoxCox.lambda(rest.train))
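It can help to store the estimated transformation parameter rather than recompute it each time (the ARIMA fit at the end reports lambda = 0.6376); a small sketch:
# Store the Box-Cox parameter so the exact same transform is reused on the test set.
lambda <- BoxCox.lambda(rest.train)
lambda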
diff(rest.train.bc, 7) %>% ur.kpss() %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.0213
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
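The test statistic (0.0213) is far below even the 10% critical value (0.347), so we fail to reject the KPSS null of stationarity: the seasonally differenced Box-Cox series can be treated as stationary.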
ggtsdisplay(diff(rest.train.bc, 7))
rest.test.bc<-BoxCox(rest.test, lambda=BoxCox.lambda(rest.train))
rest.test.bc.diff<-diff(rest.test.bc, 7)
rest.train.bc.diff<-diff(rest.train.bc, 7)
describe(rest.train.bc.diff)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 392 -0.01 0.91 0.1 0.04 0.58 -3.99 2.87 6.85 -0.92 3.89 0.05
Trying a piecewise (nonlinear) trend to account for the big jump around t = 182.
h<-78
t <- time(rest.train.bc.diff)
t[370]
## [1] 54.71429
t.break1 <- 27  # break at week 27 of the weekly-frequency series (~observation 189)
tb1 <- ts(pmax(0, t - t.break1), start = 1)  # hinge term for the piecewise trend
fit.pw <- tslm(rest.train.bc.diff ~ t + tb1)
t.new <- t[length(t)] + seq(h)
tb1.new <- tb1[length(tb1)] + seq(h)
newdata <- cbind(t=t.new, tb1=tb1.new) %>%
as.data.frame()
fcasts.pw <- forecast(fit.pw, newdata = newdata)
autoplot(rest.train.bc.diff)+
autolayer(fitted(fit.pw))+
autolayer(fcasts.pw)
fit.spline <- tslm(rest.train.bc.diff ~ t + I(t^2) + I(tb1^3))
fcasts.spl <- forecast(fit.spline, newdata = newdata)
autoplot(rest.train.bc.diff)+
autolayer(fitted(fit.spline))+
autolayer(fcasts.spl)
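The residuals of the two trend regressions can be checked the same way as the other models; a minimal sketch (checkresiduals() applies a Breusch-Godfrey test to regression fits):
# Residual diagnostics for the piecewise and spline trend fits.
checkresiduals(fit.pw)
checkresiduals(fit.spline)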
None of these are accurate, but we might as well test them.
Piecewise
pw.accuracy <- accuracy(fcasts.pw,rest.test.bc.diff[1:78])
pw.accuracy
## ME RMSE MAE MPE MAPE MASE
## Training set 1.876403e-17 0.9066755 0.6170048 111.8268 112.6504 0.9044698
## Test set 5.846271e-02 0.6889969 0.5247208 102.9080 102.9080 0.7691903
## ACF1
## Training set 0.4181748
## Test set NA
Cubic spline
cu.accuracy <- accuracy(fcasts.spl,rest.test.bc.diff[1:78])
cu.accuracy
## ME RMSE MAE MPE MAPE MASE
## Training set 1.004574e-17 0.9063183 0.6188331 74.89719 121.7767 0.9071499
## Test set 3.969112e+00 5.1970726 3.9691116 -273.41610 1714.9596 5.8183365
## ACF1
## Training set 0.417864
## Test set NA
Fit a simple ETS forecast
fit.ets <- ets(rest.train.bc.diff)
autoplot(fit.ets)
fc.ets <- forecast(fit.ets, h=78)
autoplot(fc.ets)
checkresiduals(fc.ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 151, df = 12, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 14
The near-zero Ljung-Box p-value shows the ETS(A,N,N) residuals are still strongly autocorrelated, so the model is missing structure. Test for model accuracy anyway.
ets.accuracy <- accuracy(fc.ets,rest.test.bc.diff[1:78])
ets.accuracy
## ME RMSE MAE MPE MAPE MASE
## Training set 0.001038186 0.9176661 0.6422419 114.60021 212.7298 0.9414650
## Test set 0.115214402 0.6932997 0.5324364 93.27108 103.9638 0.7805006
## ACF1
## Training set 0.2535666
## Test set NA
ARIMA
(rest.arima<-auto.arima(rest.train.bc.diff, stepwise=FALSE, approximation=FALSE))
## Series: rest.train.bc.diff
## ARIMA(0,0,1)(0,0,1)[7] with zero mean
##
## Coefficients:
## ma1 sma1
## 0.5516 -0.9192
## s.e. 0.0359 0.0377
##
## sigma^2 estimated as 0.4379: log likelihood=-400.08
## AIC=806.16 AICc=806.23 BIC=818.08
rest.arima.fcst<-forecast(rest.arima, 78)
autoplot(rest.arima.fcst)
accuracy(rest.arima.fcst,rest.test.bc.diff[1:78])
## ME RMSE MAE MPE MAPE MASE
## Training set -0.02320338 0.6600837 0.4775531 281.62716 417.2258 0.7000470
## Test set 0.05716394 0.6853906 0.5167277 98.54291 102.5316 0.7574731
## ACF1
## Training set 0.09839145
## Test set NA
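Before refitting, it helps to line up the test-set errors of the four candidates; a sketch assuming the accuracy objects above (the ARIMA accuracy was printed but not stored, so it is recomputed here):
# Collect test-set RMSE and MAE for the models fit on the differenced scale.
arima.accuracy <- accuracy(rest.arima.fcst, rest.test.bc.diff[1:78])
rbind(
  Piecewise = pw.accuracy["Test set", c("RMSE", "MAE")],
  Spline    = cu.accuracy["Test set", c("RMSE", "MAE")],
  ETS       = ets.accuracy["Test set", c("RMSE", "MAE")],
  ARIMA     = arima.accuracy["Test set", c("RMSE", "MAE")]
)
On the differenced scale, the ARIMA (test RMSE ~0.685) and piecewise (~0.689) fits edge out ETS (~0.693), while the cubic spline diverges badly (~5.20).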
(rest.arima<-auto.arima(rest.train, lambda=BoxCox.lambda(rest.train), d=1, stepwise=FALSE, approximation=FALSE))
## Series: rest.train
## ARIMA(3,1,0)(2,1,0)[7]
## Box Cox transformation: lambda= 0.6376314
##
## Coefficients:
## ar1 ar2 ar3 sar1 sar2
## -0.2504 -0.3887 -0.2789 -0.5464 -0.3325
## s.e. 0.0494 0.0463 0.0488 0.0489 0.0482
##
## sigma^2 estimated as 0.5787: log likelihood=-447.21
## AIC=906.41 AICc=906.63 BIC=930.22
rest.arima.fcst<-forecast(rest.arima, 78)
autoplot(rest.arima.fcst)
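This refit forecasts on the original visitor scale (forecast() back-transforms automatically when the model stores a lambda), so it can be scored directly against the untransformed hold-out; a minimal sketch:
# Score the back-transformed forecasts against the first 78 days of the test set.
accuracy(rest.arima.fcst, as.numeric(rest.test)[1:78])
checkresiduals(rest.arima)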