Load Data

library(readr)
library(stringr)
air_visit <- read_csv(str_c("/Users/dorothymensah/Desktop/air_visit_data.csv"))
## Parsed with column specification:
## cols(
##   air_store_id = col_character(),
##   visit_date = col_date(format = ""),
##   visitors = col_double()
## )
air_reserve <- read_csv(str_c("/Users/dorothymensah/Desktop/air_reserve.csv"))
## Parsed with column specification:
## cols(
##   air_store_id = col_character(),
##   visit_datetime = col_datetime(format = ""),
##   reserve_datetime = col_datetime(format = ""),
##   reserve_visitors = col_double()
## )
hpg_reserve <- read_csv(str_c("/Users/dorothymensah/Desktop/hpg_reserve.csv"))
## Parsed with column specification:
## cols(
##   hpg_store_id = col_character(),
##   visit_datetime = col_datetime(format = ""),
##   reserve_datetime = col_datetime(format = ""),
##   reserve_visitors = col_double()
## )
air_store <- read_csv(str_c("/Users/dorothymensah/Desktop/air_store_info.csv"))
## Parsed with column specification:
## cols(
##   air_store_id = col_character(),
##   air_genre_name = col_character(),
##   air_area_name = col_character(),
##   latitude = col_double(),
##   longitude = col_double()
## )
hpg_store <- read_csv(str_c("/Users/dorothymensah/Desktop/hpg_store_info.csv"))
## Parsed with column specification:
## cols(
##   hpg_store_id = col_character(),
##   hpg_genre_name = col_character(),
##   hpg_area_name = col_character(),
##   latitude = col_double(),
##   longitude = col_double()
## )
holidays <- read_csv(str_c("/Users/dorothymensah/Desktop/date_info.csv"))
## Parsed with column specification:
## cols(
##   calendar_date = col_date(format = ""),
##   day_of_week = col_character(),
##   holiday_flg = col_double()
## )
store_id <- read_csv(str_c("/Users/dorothymensah/Desktop/store_id_relation.csv"))
## Parsed with column specification:
## cols(
##   air_store_id = col_character(),
##   hpg_store_id = col_character()
## )
sample <- read_csv(str_c("/Users/dorothymensah/Desktop/sample_submission.csv"))
## Parsed with column specification:
## cols(
##   id = col_character(),
##   visitors = col_double()
## )

Data Structure and Content

AIR VISITS
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
summary(air_visit)
##  air_store_id         visit_date            visitors     
##  Length:252108      Min.   :2016-01-01   Min.   :  1.00  
##  Class :character   1st Qu.:2016-07-23   1st Qu.:  9.00  
##  Mode  :character   Median :2016-10-23   Median : 17.00  
##                     Mean   :2016-10-12   Mean   : 20.97  
##                     3rd Qu.:2017-01-24   3rd Qu.: 29.00  
##                     Max.   :2017-04-22   Max.   :877.00
##Using glimpse from the dplyr package. Works like str but shows a bit more about the dataset 
glimpse(air_visit)
## Rows: 252,108
## Columns: 3
## $ air_store_id <chr> "air_ba937bf13d40fb24", "air_ba937bf13d40fb24", "air_ba9…
## $ visit_date   <date> 2016-01-13, 2016-01-14, 2016-01-15, 2016-01-16, 2016-01…
## $ visitors     <dbl> 25, 32, 29, 22, 6, 9, 31, 21, 18, 26, 21, 11, 24, 21, 26…
AIR RESERVE
summary(air_reserve)
##  air_store_id       visit_datetime                reserve_datetime             
##  Length:92378       Min.   :2016-01-01 19:00:00   Min.   :2016-01-01 01:00:00  
##  Class :character   1st Qu.:2016-11-15 19:00:00   1st Qu.:2016-11-07 17:00:00  
##  Mode  :character   Median :2017-01-05 18:00:00   Median :2016-12-27 22:00:00  
##                     Mean   :2016-12-05 08:18:58   Mean   :2016-11-27 01:13:07  
##                     3rd Qu.:2017-03-03 19:00:00   3rd Qu.:2017-02-26 18:00:00  
##                     Max.   :2017-05-31 21:00:00   Max.   :2017-04-22 23:00:00  
##  reserve_visitors 
##  Min.   :  1.000  
##  1st Qu.:  2.000  
##  Median :  3.000  
##  Mean   :  4.482  
##  3rd Qu.:  5.000  
##  Max.   :100.000
glimpse(air_reserve)
## Rows: 92,378
## Columns: 4
## $ air_store_id     <chr> "air_877f79706adbfb06", "air_db4b38ebe7a7ceff", "air…
## $ visit_datetime   <dttm> 2016-01-01 19:00:00, 2016-01-01 19:00:00, 2016-01-0…
## $ reserve_datetime <dttm> 2016-01-01 16:00:00, 2016-01-01 19:00:00, 2016-01-0…
## $ reserve_visitors <dbl> 1, 3, 6, 2, 5, 2, 4, 2, 2, 2, 3, 3, 2, 6, 7, 41, 13,…
HPG RESERVE
summary(hpg_reserve)
##  hpg_store_id       visit_datetime                reserve_datetime             
##  Length:2000320     Min.   :2016-01-01 11:00:00   Min.   :2016-01-01 00:00:00  
##  Class :character   1st Qu.:2016-06-26 19:00:00   1st Qu.:2016-06-21 12:00:00  
##  Mode  :character   Median :2016-11-19 20:00:00   Median :2016-11-10 20:00:00  
##                     Mean   :2016-10-15 06:55:20   Mean   :2016-10-07 19:57:59  
##                     3rd Qu.:2017-02-03 19:00:00   3rd Qu.:2017-01-27 13:00:00  
##                     Max.   :2017-05-31 23:00:00   Max.   :2017-04-22 23:00:00  
##  reserve_visitors 
##  Min.   :  1.000  
##  1st Qu.:  2.000  
##  Median :  3.000  
##  Mean   :  5.074  
##  3rd Qu.:  6.000  
##  Max.   :100.000
glimpse(hpg_reserve)
## Rows: 2,000,320
## Columns: 4
## $ hpg_store_id     <chr> "hpg_c63f6f42e088e50f", "hpg_dac72789163a3f47", "hpg…
## $ visit_datetime   <dttm> 2016-01-01 11:00:00, 2016-01-01 13:00:00, 2016-01-0…
## $ reserve_datetime <dttm> 2016-01-01 09:00:00, 2016-01-01 06:00:00, 2016-01-0…
## $ reserve_visitors <dbl> 1, 3, 2, 5, 13, 2, 2, 2, 2, 6, 2, 2, 2, 2, 5, 4, 2, …
AIR STORE 
summary(air_store)
##  air_store_id       air_genre_name     air_area_name         latitude    
##  Length:829         Length:829         Length:829         Min.   :33.21  
##  Class :character   Class :character   Class :character   1st Qu.:34.70  
##  Mode  :character   Mode  :character   Mode  :character   Median :35.66  
##                                                           Mean   :35.65  
##                                                           3rd Qu.:35.69  
##                                                           Max.   :44.02  
##    longitude    
##  Min.   :130.2  
##  1st Qu.:135.3  
##  Median :139.7  
##  Mean   :137.4  
##  3rd Qu.:139.8  
##  Max.   :144.3
glimpse(air_store)
## Rows: 829
## Columns: 5
## $ air_store_id   <chr> "air_0f0cdeee6c9bf3d7", "air_7cc17a324ae5c7dc", "air_f…
## $ air_genre_name <chr> "Italian/French", "Italian/French", "Italian/French", …
## $ air_area_name  <chr> "Hyōgo-ken Kōbe-shi Kumoidōri", "Hyōgo-ken Kōbe-shi Ku…
## $ latitude       <dbl> 34.69512, 34.69512, 34.69512, 34.69512, 35.65807, 35.6…
## $ longitude      <dbl> 135.1979, 135.1979, 135.1979, 135.1979, 139.7516, 139.…
HPG STORE 
summary(hpg_store)
##  hpg_store_id       hpg_genre_name     hpg_area_name         latitude    
##  Length:4690        Length:4690        Length:4690        Min.   :33.31  
##  Class :character   Class :character   Class :character   1st Qu.:34.69  
##  Mode  :character   Mode  :character   Mode  :character   Median :35.66  
##                                                           Mean   :35.81  
##                                                           3rd Qu.:35.70  
##                                                           Max.   :43.77  
##    longitude    
##  Min.   :130.3  
##  1st Qu.:135.5  
##  Median :139.5  
##  Mean   :137.7  
##  3rd Qu.:139.7  
##  Max.   :143.7
glimpse(hpg_store)
## Rows: 4,690
## Columns: 5
## $ hpg_store_id   <chr> "hpg_6622b62385aec8bf", "hpg_e9e068dd49c5fa00", "hpg_2…
## $ hpg_genre_name <chr> "Japanese style", "Japanese style", "Japanese style", …
## $ hpg_area_name  <chr> "Tōkyō-to Setagaya-ku Taishidō", "Tōkyō-to Setagaya-ku…
## $ latitude       <dbl> 35.64367, 35.64367, 35.64367, 35.64367, 35.64367, 35.6…
## $ longitude      <dbl> 139.6682, 139.6682, 139.6682, 139.6682, 139.6682, 139.…
HOLIDAYS
summary(holidays)
##  calendar_date        day_of_week         holiday_flg    
##  Min.   :2016-01-01   Length:517         Min.   :0.0000  
##  1st Qu.:2016-05-09   Class :character   1st Qu.:0.0000  
##  Median :2016-09-15   Mode  :character   Median :0.0000  
##  Mean   :2016-09-15                      Mean   :0.0677  
##  3rd Qu.:2017-01-22                      3rd Qu.:0.0000  
##  Max.   :2017-05-31                      Max.   :1.0000
glimpse(holidays)
## Rows: 517
## Columns: 3
## $ calendar_date <date> 2016-01-01, 2016-01-02, 2016-01-03, 2016-01-04, 2016-0…
## $ day_of_week   <chr> "Friday", "Saturday", "Sunday", "Monday", "Tuesday", "W…
## $ holiday_flg   <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
Store Id's
summary(store_id)
##  air_store_id       hpg_store_id      
##  Length:150         Length:150        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character
glimpse(store_id)
## Rows: 150
## Columns: 2
## $ air_store_id <chr> "air_63b13c56b7201bd9", "air_a24bf50c3e90d583", "air_c7f…
## $ hpg_store_id <chr> "hpg_4bc649e72e2a239a", "hpg_c34b496d0305a809", "hpg_cd8…
Sample Data
summary(sample)
##       id               visitors
##  Length:32019       Min.   :0  
##  Class :character   1st Qu.:0  
##  Mode  :character   Median :0  
##                     Mean   :0  
##                     3rd Qu.:0  
##                     Max.   :0
glimpse(sample)
## Rows: 32,019
## Columns: 2
## $ id       <chr> "air_00a91d42b08b08d9_2017-04-23", "air_00a91d42b08b08d9_201…
## $ visitors <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Check for missing data

sum(is.na(air_visit))
## [1] 0
sum(is.na(air_reserve))
## [1] 0
sum(is.na(hpg_reserve))
## [1] 0
sum(is.na(air_store))
## [1] 0
sum(is.na(hpg_store))
## [1] 0
sum(is.na(holidays))
## [1] 0
sum(is.na(store_id))
## [1] 0
sum(is.na(sample))
## [1] 0

Creating dates? <- formating the dates presented

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(timeDate)

air_visit <- air_visit %>%
  mutate(visit_date = ymd(visit_date))
air_visit.group <- air_visit %>% group_by(visit_date) %>% summarize(visitors = sum(visitors))
## `summarise()` ungrouping output (override with `.groups` argument)
air_reserve <- air_reserve %>%
  mutate(visit_datetime = ymd_hms(visit_datetime),
         reserve_datetime = ymd_hms(reserve_datetime))

hpg_reserve <- hpg_reserve %>%
  mutate(visit_datetime = ymd_hms(visit_datetime),
         reserve_datetime = ymd_hms(reserve_datetime))

air_store <- air_store %>%
  mutate(air_genre_name = as.factor(air_genre_name),
         air_area_name = as.factor(air_area_name))

hpg_store <- hpg_store %>%
  mutate(hpg_genre_name = as.factor(hpg_genre_name),
         hpg_area_name = as.factor(hpg_area_name))

holidays <- holidays %>%
  mutate(holiday_flg = as.logical(holiday_flg),
         date = ymd(calendar_date),
         calendar_date = as.character(calendar_date))

DATA VISUALIZATION

AIR VISITS
#Load the necesarry libraries 
library(ggplot2)


G1 <- air_visit %>%
  group_by(visit_date) %>%
  summarise(all_visitors = sum(visitors)) %>%
  ggplot(aes(visit_date,all_visitors)) +
  geom_line(col = "red") +
  labs(x = "All visitors", y = "Date")
## `summarise()` ungrouping output (override with `.groups` argument)
G2 <- p2 <- air_visit %>%
  ggplot(aes(visitors)) +
  geom_vline(xintercept = 20, color = "red") +
  geom_histogram(fill = "purple", bins = 30) +
  scale_x_log10()


G3<- p3 <- air_visit %>%
  mutate(wday = wday(visit_date, label = TRUE, week_start = 1)) %>%
  group_by(wday) %>%
  summarise(visits = median(visitors)) %>%
  ggplot(aes(wday, visits, fill = wday)) +
  geom_col() +
  theme(legend.position = "none", axis.text.x  = element_text(angle=45, hjust=1, vjust=0.9)) +
  labs(x = "Day of the week", y = "Median visitors") +
  scale_fill_hue()
## `summarise()` ungrouping output (override with `.groups` argument)
G4 <- air_visit %>%
  mutate(month = month(visit_date, label = TRUE)) %>%
  group_by(month) %>%
  summarise(visits = median(visitors)) %>%
  ggplot(aes(month, visits, fill = month)) +
  geom_col() +
  theme(legend.position = "none") +
  labs(x = "Month", y = "Median visitors")
## `summarise()` ungrouping output (override with `.groups` argument)
plot(G1)

plot(G2)

plot(G3)

plot(G4)

HOLIDAYS
##Plotting how many holidays there are in total 
holiday <- holidays %>%
  mutate(wday = wday(date, week_start = 1))

p1 <- holidays %>%
  ggplot(aes(holiday_flg, fill = holiday_flg)) +
  geom_bar() +
  theme(legend.position = "none")
  
p2 <- holidays %>%
  filter(date > ymd("2016-04-15") & date < ymd("2016-06-01")) %>%
  ggplot(aes(date, holiday_flg, color = holiday_flg)) +
  geom_point(size = 2) +
  theme(legend.position = "none") +
  labs(x = "2016 date")

p3 <- holidays %>%
  filter(date > ymd("2017-04-15") & date < ymd("2017-06-01")) %>%
  ggplot(aes(date, holiday_flg, color = holiday_flg)) +
  geom_point(size = 2) +
  theme(legend.position = "none") +
  labs(x = "2017 date")

plot(p1)

plot(p2)

plot(p3)

#How many holidays are there exactly?

holidays %>% summarise(frac=mean(holiday_flg))
## # A tibble: 1 x 1
##     frac
##    <dbl>
## 1 0.0677
#There seems to be about 7%  of holidays present in our data
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(forecast)
library(tidyr)
library(purrr)
library(forcats)
library(tibble)
Tranforming data into a time series 
date <- air_visit %>%
  group_by(visit_date) %>%
  summarise(all_visitors=sum(visitors))
## `summarise()` ungrouping output (override with `.groups` argument)
date.ts <- ts(date$all_visitors, frequency = 7)
autoplot(date.ts)

##Creating train and test set 
train <- ts(date.ts[1:382],frequency = 7)
test <- ts(date.ts[383:478], frequency = 7)

MODEL FORMULATION

MODEL 1 : Time series Regression Models: Simple Linear Regression 
##1A : Regular regression model 
model1 <- lm(visitors~visit_date, data = air_visit.group)
model1
## 
## Call:
## lm(formula = visitors ~ visit_date, data = air_visit.group)
## 
## Coefficients:
## (Intercept)   visit_date  
##  -452128.50        27.18
summary(model1)
## 
## Call:
## lm(formula = visitors ~ visit_date, data = air_visit.group)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11132  -2297   -529   1988   9959 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.521e+05  1.934e+04  -23.38   <2e-16 ***
## visit_date   2.718e+01  1.135e+00   23.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3424 on 476 degrees of freedom
## Multiple R-squared:  0.5464, Adjusted R-squared:  0.5455 
## F-statistic: 573.5 on 1 and 476 DF,  p-value: < 2.2e-16
checkresiduals(model1)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals
## LM test = 387.84, df = 10, p-value < 2.2e-16
##1B : TSLM 
##Create time series 

air_visit.ts <- ts(air_visit.group)
model1a <- tslm(visitors~visit_date, data = air_visit.ts)
model1a
## 
## Call:
## tslm(formula = visitors ~ visit_date, data = air_visit.ts)
## 
## Coefficients:
## (Intercept)   visit_date  
##  -452128.50        27.18
summary(model1a)
## 
## Call:
## tslm(formula = visitors ~ visit_date, data = air_visit.ts)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11132  -2297   -529   1988   9959 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.521e+05  1.934e+04  -23.38   <2e-16 ***
## visit_date   2.718e+01  1.135e+00   23.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3424 on 476 degrees of freedom
## Multiple R-squared:  0.5464, Adjusted R-squared:  0.5455 
## F-statistic: 573.5 on 1 and 476 DF,  p-value: < 2.2e-16
checkresiduals(model1a)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals from Linear regression model
## LM test = 387.84, df = 10, p-value < 2.2e-16
##Forecasting TSLM model1a <- EX-POST FORECAST
##model1a.1 <- window(air_visit.ts, start = 16801)
model1a.fit <- model1a$fitted.values
model1a.fcast <- forecast(model1a.fit)
autoplot(model1a.fcast)+ ggtitle("Forecasts of Visit Data For The Air Restaurant ")+xlab("Date")+ylab("Visitors")



MODEL 2: ETS MODELS

##Creating train and test set 
train <- ts(date.ts[1:382],frequency = 7)
test <- ts(date.ts[383:478], frequency = 7)
Model2:Not sure if i should be seperating the models then adding weights to the best one?
library(smoother)
## Loading required package: TTR
Model2 <- ets(train)
Model2
## ETS(M,Ad,M) 
## 
## Call:
##  ets(y = train) 
## 
##   Smoothing parameters:
##     alpha = 0.5916 
##     beta  = 1e-04 
##     gamma = 1e-04 
##     phi   = 0.9641 
## 
##   Initial states:
##     l = 2156.8817 
##     b = 321.3936 
##     s = 0.934 0.9336 0.8502 0.7356 0.9386 1.3499
##            1.2582
## 
##   sigma:  0.1448
## 
##      AIC     AICc      BIC 
## 7755.280 7756.269 7806.571
#Forecast
Model2.fcast<- forecast(Model2,14)
autoplot(Model2)

checkresiduals(Model2.fcast)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,Ad,M)
## Q* = 41.684, df = 3, p-value = 4.682e-09
## 
## Model df: 12.   Total lags used: 15
##Check for model accuracy 
model2.accuracy <- accuracy(Model2.fcast,test[1:14])
model2.accuracy
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  -6.032786 1596.076 822.7383 -1.895786 9.064925 0.4109862
## Test set     917.368225 1302.465 949.0908  5.751102 5.984893 0.4741036
##                   ACF1
## Training set 0.2953002
## Test set            NA
##Adding a weighted moving average
wa <- na.omit(WMA(train,14))
#forecast with weighted average 
wa.fcast<- forecast(wa,14)
autoplot(wa.fcast)

checkresiduals(wa.fcast)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,Ad,N)
## Q* = 340.93, df = 5, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 10
##Accuracy of model 
wa.accuracy  <- accuracy(wa.fcast,test[1:14])
wa.accuracy
##                       ME      RMSE       MAE        MPE      MAPE       MASE
## Training set    3.490081  340.1146  254.3016  0.0823534  2.516448  0.9432135
## Test set     2080.725828 4014.3363 2842.3760 10.1770561 17.609637 10.5424702
##                   ACF1
## Training set 0.1471929
## Test set            NA