library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following object is masked from 'package:purrr':
## 
##     transpose

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(reshape)

## 
## Attaching package: 'reshape'

## The following object is masked from 'package:lubridate':
## 
##     stamp

## The following object is masked from 'package:data.table':
## 
##     melt

## The following object is masked from 'package:dplyr':
## 
##     rename

## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths

library(broom)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:reshape':
## 
##     rename

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(tidyr)
library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(ggplot2)
library(fpp3)

## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──

## ✓ tsibble     1.1.1     ✓ feasts      0.2.2
## ✓ tsibbledata 0.4.0     ✓ fable       0.3.1

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## x data.table::between()   masks dplyr::between()
## x lubridate::date()       masks base::date()
## x reshape::expand()       masks tidyr::expand()
## x plotly::filter()        masks dplyr::filter(), stats::filter()
## x data.table::first()     masks dplyr::first()
## x fabletools::forecast()  masks forecast::forecast()
## x lubridate::hour()       masks data.table::hour()
## x tsibble::intersect()    masks base::intersect()
## x tsibble::interval()     masks lubridate::interval()
## x lubridate::isoweek()    masks data.table::isoweek()
## x tsibble::key()          masks data.table::key()
## x dplyr::lag()            masks stats::lag()
## x data.table::last()      masks dplyr::last()
## x caret::lift()           masks purrr::lift()
## x fabletools::MAE()       masks caret::MAE()
## x lubridate::mday()       masks data.table::mday()
## x lubridate::minute()     masks data.table::minute()
## x lubridate::month()      masks data.table::month()
## x lubridate::quarter()    masks data.table::quarter()
## x plotly::rename()        masks reshape::rename(), dplyr::rename()
## x fabletools::RMSE()      masks caret::RMSE()
## x lubridate::second()     masks data.table::second()
## x tsibble::setdiff()      masks base::setdiff()
## x reshape::stamp()        masks lubridate::stamp()
## x data.table::transpose() masks purrr::transpose()
## x tsibble::union()        masks base::union()
## x lubridate::wday()       masks data.table::wday()
## x lubridate::week()       masks data.table::week()
## x lubridate::yday()       masks data.table::yday()
## x lubridate::year()       masks data.table::year()

library(fma)
library(readxl)
library(tsibble)
library(feasts)
library(seasonal)

## 
## Attaching package: 'seasonal'

## The following object is masked from 'package:tibble':
## 
##     view

library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:seasonal':
## 
##     outlier

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(MASS)

## 
## Attaching package: 'MASS'

## The following objects are masked from 'package:fma':
## 
##     cement, housing, petrol

## The following object is masked from 'package:plotly':
## 
##     select

## The following object is masked from 'package:dplyr':
## 
##     select

library(neuralnet)

## 
## Attaching package: 'neuralnet'

## The following object is masked from 'package:dplyr':
## 
##     compute

library(dplyr)
library(vars)

## Loading required package: strucchange

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following object is masked from 'package:tsibble':
## 
##     index

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## 
## Attaching package: 'strucchange'

## The following object is masked from 'package:stringr':
## 
##     boundary

## Loading required package: urca

## Loading required package: lmtest

## 
## Attaching package: 'vars'

## The following object is masked from 'package:fable':
## 
##     VAR

library(expsmooth)

###Import data###

air_visit_data <- read.csv("~/Desktop/recruit-restaurant-visitor-forecasting/air_visit_data.csv")
air_reserve <- read.csv("~/Desktop/recruit-restaurant-visitor-forecasting/air_reserve.csv")
air_store_info <- read.csv("~/Desktop/recruit-restaurant-visitor-forecasting/air_store_info.csv")
hpg_reserve <- read.csv("~/Desktop/recruit-restaurant-visitor-forecasting/hpg_reserve.csv") 
hpg_store_info <- read.csv("~/Desktop/recruit-restaurant-visitor-forecasting/hpg_store_info.csv") 
store_id_relation <- read.csv("~/Desktop/recruit-restaurant-visitor-forecasting/store_id_relation.csv")
date_info <- read.csv("~/Desktop/recruit-restaurant-visitor-forecasting/date_info.csv")
sample_submission <- read.csv("~/Desktop/recruit-restaurant-visitor-forecasting/sample_submission.csv")

summary(air_visit_data)

##  air_store_id        visit_date           visitors     
##  Length:252108      Length:252108      Min.   :  1.00  
##  Class :character   Class :character   1st Qu.:  9.00  
##  Mode  :character   Mode  :character   Median : 17.00  
##                                        Mean   : 20.97  
##                                        3rd Qu.: 29.00  
##                                        Max.   :877.00

summary(air_reserve)

##  air_store_id       visit_datetime     reserve_datetime   reserve_visitors 
##  Length:92378       Length:92378       Length:92378       Min.   :  1.000  
##  Class :character   Class :character   Class :character   1st Qu.:  2.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :  3.000  
##                                                           Mean   :  4.482  
##                                                           3rd Qu.:  5.000  
##                                                           Max.   :100.000

summary(air_store_info)

##  air_store_id       air_genre_name     air_area_name         latitude    
##  Length:829         Length:829         Length:829         Min.   :33.21  
##  Class :character   Class :character   Class :character   1st Qu.:34.70  
##  Mode  :character   Mode  :character   Mode  :character   Median :35.66  
##                                                           Mean   :35.65  
##                                                           3rd Qu.:35.69  
##                                                           Max.   :44.02  
##    longitude    
##  Min.   :130.2  
##  1st Qu.:135.3  
##  Median :139.7  
##  Mean   :137.4  
##  3rd Qu.:139.8  
##  Max.   :144.3

summary(hpg_reserve)

##  hpg_store_id       visit_datetime     reserve_datetime   reserve_visitors 
##  Length:2000320     Length:2000320     Length:2000320     Min.   :  1.000  
##  Class :character   Class :character   Class :character   1st Qu.:  2.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :  3.000  
##                                                           Mean   :  5.074  
##                                                           3rd Qu.:  6.000  
##                                                           Max.   :100.000

summary(hpg_store_info)

##  hpg_store_id       hpg_genre_name     hpg_area_name         latitude    
##  Length:4690        Length:4690        Length:4690        Min.   :33.31  
##  Class :character   Class :character   Class :character   1st Qu.:34.69  
##  Mode  :character   Mode  :character   Mode  :character   Median :35.66  
##                                                           Mean   :35.81  
##                                                           3rd Qu.:35.70  
##                                                           Max.   :43.77  
##    longitude    
##  Min.   :130.3  
##  1st Qu.:135.5  
##  Median :139.5  
##  Mean   :137.7  
##  3rd Qu.:139.7  
##  Max.   :143.7

summary(store_id_relation)

##  air_store_id       hpg_store_id      
##  Length:150         Length:150        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

summary(date_info)

##  calendar_date      day_of_week         holiday_flg    
##  Length:517         Length:517         Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :0.0000  
##                                        Mean   :0.0677  
##                                        3rd Qu.:0.0000  
##                                        Max.   :1.0000

summary(sample_submission)

##       id               visitors
##  Length:32019       Min.   :0  
##  Class :character   1st Qu.:0  
##  Mode  :character   Median :0  
##                     Mean   :0  
##                     3rd Qu.:0  
##                     Max.   :0

###Processing the dataframe above###

After looking at the sample_submission dataset, I found that both the id and the date exist in the id column, so I separate id into air_store_id and visit_date

sample_submission <- sample_submission %>% 
  mutate(air_store_id = str_sub(id,1,20), 
         visit_date = ymd(str_sub(id,22,31))) %>% dplyr::select(-id)

Reform air_visit_date into ymd date format and log1p transformed visitors. For real-valued input, log1p is accurate also for x so small that 1 + x == 1 in floating-point accuracy.

air_visit_data <- air_visit_data %>%
  mutate(visit_date = ymd(visit_date),
         visitors = log1p(visitors))

Adding visit date & reserve date to hpg_reserve & air_reserve datasets

air_reserve <- air_reserve %>% 
  mutate(visit_datetime = ymd_hms(visit_datetime),
         reserve_datetime = ymd_hms(reserve_datetime),
         visit_date = date(visit_datetime), 
         reserve_date = date(reserve_datetime))
hpg_reserve <- hpg_reserve %>% 
  mutate(visit_datetime = ymd_hms(visit_datetime),
         reserve_datetime = ymd_hms(reserve_datetime),
         visit_date = date(visit_datetime), 
         reserve_date = date(reserve_datetime))

Derive prefecture from air_area_name column in air_store_info dataset

air_store_info <- air_store_info %>% 
  separate(air_area_name, c("prefecture"), sep = " ", remove = FALSE)

## Warning: Expected 1 pieces. Additional pieces discarded in 829 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

Encoding day of week Mon-Sun as 1-7 and adding flags for weekend or holiday in date_info dataset. 0: represent weekday. 1:represent weekend or holiday.

date_info <- date_info %>% 
  dplyr::select(-day_of_week) %>% 
  mutate(calendar_date = ymd(calendar_date),
         dow = wday(calendar_date, label=TRUE),
         dow = fct_relevel(dow, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
         dow = as.numeric(dow),
         weekend_or_holiday_flg = ifelse(dow > 5 | holiday_flg == 1, 1, 0))

Row binding train data (air_visit_data,1/1/2016-4/22/2017) and test data (sample_submission,4/23/2017-5/31/2017)

data <- bind_rows(air_visit_data, sample_submission) %>%
  mutate(dow = wday(visit_date, label=TRUE),
         dow = fct_relevel(dow, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
         dow = as.numeric(dow),
         month = month(visit_date),
         year = year(visit_date))

Joining air_store_info and date_info (4/23/2017-5/31/2017) with data

data <- data %>%
  left_join(air_store_info, by = "air_store_id") %>%
  left_join(date_info, by = c("visit_date" = "calendar_date", "dow"))

###Variables Relationship Exploring & Visualization###

#Sum log1p(visitors) over time for 50 stores.

p1 <- air_visit_data %>%
  filter(air_store_id %in% unique(.$air_store_id)[sample(1:829, 50)]) %>%
  group_by(visit_date) %>%
  summarise(all_visitors = sum(visitors)) %>%
  ggplot(aes(visit_date,all_visitors)) +
  geom_line(col = "black") +
  labs(x = "Date", y = "log1p(visitors)")
p1

A prominent weekly pattern

#log1p(visitors) when grouped by day-of-week, genre, and holiday&weekends. # Distribution of log1p(visitors) by day-of-week

p2 <- air_visit_data %>%
  mutate(day_of_week = wday(visit_date, label = TRUE)) %>%
  ggplot(aes(x = reorder(day_of_week, visitors, FUN = median), y = visitors, 
             fill = day_of_week)) +
  geom_boxplot() +
  labs(x = "Day of week", y = "log1p(visitors)") +
  theme(legend.position = "none")
ggplotly(p2)

Fridays and Sundays are more popular

Distribution of log1p(visitors) by genre

p3 <- air_visit_data %>%
  left_join(air_store_info, by = "air_store_id") %>%
  ggplot(aes(x = reorder(air_genre_name, visitors, FUN = median), y = visitors,
             fill = air_genre_name)) +
  geom_boxplot() +
  labs(x = "Genre", y = "log1p(visitors)") +
  theme(legend.position = "none") +
  coord_flip()
ggplotly(p3)

Asian most popular, Bar least popular

#Relation between actual visitors log1p(visitors) visited and log1p(reserve_visitors) and reserve visitors log1p(reserve_visitors) in Air system # Scatterplot of air visit vs air reserve

p4 <- air_reserve %>%
  group_by(air_store_id, visit_date) %>%
  summarise(reserve_visitors = log1p(sum(reserve_visitors))) %>%
  inner_join(air_visit_data, by = c("air_store_id", "visit_date")) %>%
  ggplot(aes(x = visitors, y = reserve_visitors)) +
  geom_point(color = "black", alpha = 0.3, size = 2) +
  geom_smooth(method = "loess", color = "blue") +
  labs(x = "log1p(visitors)", y = "log1p(air_reserve_visitors)")

## `summarise()` has grouped output by 'air_store_id'. You can override using the
## `.groups` argument.

ggplotly(p4)

## `geom_smooth()` using formula 'y ~ x'

###Preparation for Building Models###

We have to forecast the visitors from date “23-04-2017” to “31-05-2017” (that is, for 39 consecutive days). A forecast window (FW) is defined as time frame for which predictions are made and feature derivation window (FDW) is defined as time frame that is prior in time to the FW and is used for deriving features from the target variable or any of its covariates. The data used for modelling is prepared by row binding the dataframes formed by sliding the FW. We generate data by using fixed size FW of 39 days silding by 7 days and variable size FW of 7, 14, 21, 28 and 35 days just before “23-04-2017”.

# Function to get chunk
get_chunk <- function(chunk_start_date, ndays){
  chunk_end_date <- chunk_start_date + ndays
  chunk <- data %>% 
    filter(visit_date < chunk_end_date & visit_date >= chunk_start_date) %>%
    mutate(diff_from_chunk_start_date = time_length(visit_date - chunk_start_date, unit = "day"))
}

# Function to get visitors stats based on prior duration of chunk grouped by air_store_id 
get_store_visitor_stats <- function(chunk, chunk_start_date, ndays){
  prior_start_date <- ymd(chunk_start_date - ndays)
  temp <- data %>% 
    filter(visit_date < chunk_start_date & visit_date >= prior_start_date) %>%
    group_by(air_store_id) %>%
    summarise(mean = mean(visitors), 
              median = median(visitors), 
              max = max(visitors),
              min = min(visitors),
              sd = sd(visitors),
              skewness = e1071::skewness(visitors),
              count = n()) %>%
    plyr::rename(c(mean = paste0("store_mean_", ndays,"_days_prior"),
                   median = paste0("store_median_", ndays,"_days_prior"),
                   max = paste0("store_max_", ndays,"_days_prior"),
                   min = paste0("store_min_", ndays,"_days_prior"), 
                   sd = paste0("store_sd_", ndays,"_days_prior"),
                   skewness = paste0("store_skewness_", ndays,"_days_prior"),
                   count = paste0("store_count_", ndays,"_days_prior")))
  
  chunk = chunk %>% left_join(temp, by = "air_store_id")
  chunk[is.na(chunk)] <- 0
  chunk
}

# Function to get chunk and features based on days prior to chunk interval
get_chunk_and_features <- function(chunk_start_date, ndays){
  chunk <- get_chunk(chunk_start_date, ndays)
  
  chunk <- get_store_visitor_stats(chunk,chunk_start_date,500) 
  chunk <- get_store_visitor_stats(chunk,chunk_start_date,56)
  chunk <- get_store_visitor_stats(chunk,chunk_start_date,28)
  chunk <- get_store_visitor_stats(chunk,chunk_start_date,14)
  chunk
}

Form train data

train <- data.frame()
start_date = ymd("2017-03-12")
for(i in 0:57){
  train_sub <- get_chunk_and_features(start_date - (i*7), 39)
  train <- train %>% bind_rows(train_sub)
}

for(i in 1:5){
  train_sub <- get_chunk_and_features(start_date + (i*7), 42 - (i*7))
  train <- train %>% bind_rows(train_sub)
}

Form test data

test <- get_chunk_and_features(start_date + 42, 39)

Reforming character type variables to factor and then to numeric form

comb <- train %>% bind_rows(test) %>%
  mutate(air_genre_name = as.numeric(as.factor(air_genre_name)),
         air_area_name = as.numeric(as.factor(air_area_name)),
         prefecture = as.numeric(as.factor(prefecture)))

train <- comb[1:nrow(train), ]
test <- comb[-(1:nrow(train)), ]
rm(comb)

data_ts <- ts(data$visitors,start=c(2016,1),end=c(2017,5), frequency=12)
data_ts %>% autoplot() + labs(title="Numbers of log1p(Visitors) in data ") + 
  ylab("log1p(Visitors)") +
  guides(colour=guide_legend(title="index"))

train_ts <- ts(train$visitors,start=c(2016,1),end=c(2017,4), frequency=12)
train_ts %>% autoplot() + labs(title="Numbers of log1p visitors in train data ") + 
  ylab("log1p visitors") + xlab("Date")

train_ts%>%as_tsibble() %>% 
  gg_tsdisplay(plot_type='partial')

## Plot variable not specified, automatically selected `y = value`

acf(train_ts)

pacf(train_ts)

###Building Models###

###Arima Model###(Half success)

f <- train_ts %>% as_tsibble() %>% 
  model(arima210 = ARIMA(visitors ~ pdq(2,1,0)),
        arima111 = ARIMA(visitors ~ pdq(1,1,1)),
        arima011 = ARIMA(visitors ~ pdq(0,1,1)),
        arima012 = ARIMA(visitors ~ pdq(0,1,2)),
        arima013 = ARIMA(visitors ~ pdq(0,1,3)),
        arima000 = ARIMA(visitors ~ pdq(0,0,0)),
        auto = ARIMA(visitors, stepwise = FALSE, approx = FALSE), #By setting stepwise=FALSE and approximation=FALSE, we are making R work extra hard to find a good model. This takes much longer, but with only one series to model, the extra time taken is not a problem.
        stepwise = ARIMA(visitors),
        search = ARIMA(visitors, stepwise=FALSE))

f %>% pivot_longer(everything(), names_to = "Model name",
                   values_to = "Orders")

## # A mable: 9 x 2
## # Key:     Model name [9]
##   `Model name`                             Orders
##   <chr>                                   <model>
## 1 arima210              <ARIMA(2,1,0)(0,1,0)[12]>
## 2 arima111              <ARIMA(1,1,1)(0,1,0)[12]>
## 3 arima011              <ARIMA(0,1,1)(0,1,0)[12]>
## 4 arima012              <ARIMA(0,1,2)(0,1,0)[12]>
## 5 arima013              <ARIMA(0,1,3)(0,1,0)[12]>
## 6 arima000     <ARIMA(0,0,0)(0,1,0)[12] w/ drift>
## 7 auto         <ARIMA(3,0,1)(0,1,0)[12] w/ drift>
## 8 stepwise     <ARIMA(1,0,1)(0,1,0)[12] w/ drift>
## 9 search       <ARIMA(2,0,3)(0,1,0)[12] w/ drift>

glance(f) %>% arrange(AICc) %>% dplyr::select(.model:BIC)

## # A tibble: 9 × 6
##   .model   sigma2 log_lik   AIC  AICc   BIC
##   <chr>     <dbl>   <dbl> <dbl> <dbl> <dbl>
## 1 auto       390.  -1002. 2016. 2016. 2037.
## 2 search     389.  -1001. 2016. 2017. 2040.
## 3 arima111   416.  -1008. 2021. 2021. 2032.
## 4 stepwise   405.  -1007. 2021. 2022. 2035.
## 5 arima013   431.  -1009. 2027. 2027. 2040.
## 6 arima012   438.  -1012. 2029. 2029. 2039.
## 7 arima011   440.  -1013. 2029. 2029. 2036.
## 8 arima210   446.  -1013. 2033. 2033. 2043.
## 9 arima000   732.  -1075. 2154. 2154. 2161.

#auto arima performs best, smallest AICc.

fit_arima <- train_ts %>% as_tsibble() %>% 
  model(auto = ARIMA(visitors, stepwise = FALSE, approx = FALSE))
report(fit_arima)

## Series: visitors 
## Model: ARIMA(3,0,1)(0,1,0)[12] w/ drift 
## 
## Coefficients:
##           ar1     ar2     ar3     ma1  constant
##       -0.4095  0.6526  0.2237  0.9784    9.7176
## s.e.   0.0668  0.0568  0.0660  0.0195    2.5230
## 
## sigma^2 estimated as 390.5:  log likelihood=-1001.97
## AIC=2015.95   AICc=2016.33   BIC=2036.52

checkresiduals(train_ts)

## Warning in modeldf.default(object): Could not find appropriate degrees of
## freedom for this model.

fc_arima <- forecast(fit_arima, h=39) %>%
  filter(.model=='auto') %>%
  autoplot(visitors) +  
  labs(title = "ARIMA(3,0,1)(0,1,0) Prediction",
       x = "Date", y = "Log1p(Visitors)")
fc_arima

###Neural Nets Model###(Half success)

fitnn <- train_ts %>% as_tsibble() %>% 
  model(NNETAR(sqrt(visitors)))
report(fitnn)

## Series: visitors 
## Model: NNAR(1,1,2)[12] 
## Transformation: sqrt(visitors) 
## 
## Average of 20 networks, each of which is
## a 2-2-1 network with 9 weights
## options were - linear output units 
## 
## sigma^2 estimated as 0.4707

fc_nn <- forecast(fitnn, h=39) %>%
  autoplot(visitors)+
  labs(title = "NNAR(1,1,2) Prediction",
       y = "log1p visitors")
fc_nn

###TSLM Model###(Success)

lm_model <- lm(visitors ~ month + dow + weekend_or_holiday_flg, data=train)
lm_model

## 
## Call:
## lm(formula = visitors ~ month + dow + weekend_or_holiday_flg, 
##     data = train)
## 
## Coefficients:
##            (Intercept)                   month                     dow  
##              2.5657309              -0.0003705               0.0519277  
## weekend_or_holiday_flg  
##              0.0852223

summary(lm_model)

## 
## Call:
## lm(formula = visitors ~ month + dow + weekend_or_holiday_flg, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3209 -0.5202  0.0915  0.5961  4.0572 
## 
## Coefficients:
##                          Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)             2.5657309  0.0021380 1200.071   <2e-16 ***
## month                  -0.0003705  0.0001931   -1.918   0.0551 .  
## dow                     0.0519277  0.0004951  104.886   <2e-16 ***
## weekend_or_holiday_flg  0.0852223  0.0021069   40.449   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7993 on 1322333 degrees of freedom
## Multiple R-squared:  0.02685,    Adjusted R-squared:  0.02685 
## F-statistic: 1.216e+04 on 3 and 1322333 DF,  p-value: < 2.2e-16

accuracy(lm_model)

##                        ME      RMSE       MAE       MPE     MAPE      MASE
## Training set 3.501226e-16 0.7993134 0.6444566 -13.84612 31.76349 0.9849236

tts <- ts(train)
tslm_model <- tslm(visitors ~ month + dow + weekend_or_holiday_flg, data=tts)
tslm_model

## 
## Call:
## tslm(formula = visitors ~ month + dow + weekend_or_holiday_flg, 
##     data = tts)
## 
## Coefficients:
##            (Intercept)                   month                     dow  
##              2.5657309              -0.0003705               0.0519277  
## weekend_or_holiday_flg  
##              0.0852223

summary(tslm_model)

## 
## Call:
## tslm(formula = visitors ~ month + dow + weekend_or_holiday_flg, 
##     data = tts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3209 -0.5202  0.0915  0.5961  4.0572 
## 
## Coefficients:
##                          Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)             2.5657309  0.0021380 1200.071   <2e-16 ***
## month                  -0.0003705  0.0001931   -1.918   0.0551 .  
## dow                     0.0519277  0.0004951  104.886   <2e-16 ***
## weekend_or_holiday_flg  0.0852223  0.0021069   40.449   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7993 on 1322333 degrees of freedom
## Multiple R-squared:  0.02685,    Adjusted R-squared:  0.02685 
## F-statistic: 1.216e+04 on 3 and 1322333 DF,  p-value: < 2.2e-16

accuracy(tslm_model)

##                        ME      RMSE       MAE       MPE     MAPE     MASE
## Training set 3.501226e-16 0.7993134 0.6444566 -13.84612 31.76349 1.082882
##                  ACF1
## Training set 0.507867

fit_tslm <- tslm(train_ts ~ trend + season)
accuracy(fit_tslm)

##                         ME      RMSE       MAE        MPE     MAPE MASE
## Training set -3.122502e-17 0.1472828 0.1002915 -0.3533385 4.098759 0.25
##                   ACF1
## Training set -0.799346

autoplot(forecast(fit_tslm, h=5))  + labs(y = "log1p(visitors)")

fit_trends1 <- tslm(log(train_ts) ~ trend + season)
accuracy(fit_trends1)

##                        ME       RMSE        MAE        MPE     MAPE MASE
## Training set 1.040834e-17 0.05910358 0.03952886 -0.4537455 4.571334 0.25
##                    ACF1
## Training set -0.7929404

autoplot(forecast(fit_trends1, h=5)) + labs(y = "exponential visitors")

#The fitted exponential trend and forecasting, best.

ETS MODEL FOR FURTHER EXPLORING () ################################################################################ETS fit_ets <- ets(train$visitors) fit_ets plot(fit_ets) accuracy(fit_ets)

fit_ets %>% gg_tsresiduals(lag_max = 16)

augment(fit_ets) %>% features(.innov, ljung_box, lag = 16, dof = 6)

forecast(fit_ets, h=5) %>% #filter(.model==‘auto’) %>% autoplot(visitors)+ labs(title = “ETS(A,N,N) Prediction”, y = “log1p visitors”) fc_ets <- forecast(fit_ets,h = 39) autoplot(fc_ets, train)+ labs(title = “ETS(A,N,N) Prediction: Cement production in US”, y = “Index”) ##############################################################################

The Last Dance: Final

Mengjie Pan

2022-05-07

Distribution of log1p(visitors) by genre

Form train data

Form test data

Reforming character type variables to factor and then to numeric form