library(readr)
library(stringr)
air_visit <- read_csv(str_c("/Users/dorothymensah/Desktop/air_visit_data.csv"))
## Parsed with column specification:
## cols(
## air_store_id = col_character(),
## visit_date = col_date(format = ""),
## visitors = col_double()
## )
air_reserve <- read_csv(str_c("/Users/dorothymensah/Desktop/air_reserve.csv"))
## Parsed with column specification:
## cols(
## air_store_id = col_character(),
## visit_datetime = col_datetime(format = ""),
## reserve_datetime = col_datetime(format = ""),
## reserve_visitors = col_double()
## )
hpg_reserve <- read_csv(str_c("/Users/dorothymensah/Desktop/hpg_reserve.csv"))
## Parsed with column specification:
## cols(
## hpg_store_id = col_character(),
## visit_datetime = col_datetime(format = ""),
## reserve_datetime = col_datetime(format = ""),
## reserve_visitors = col_double()
## )
air_store <- read_csv(str_c("/Users/dorothymensah/Desktop/air_store_info.csv"))
## Parsed with column specification:
## cols(
## air_store_id = col_character(),
## air_genre_name = col_character(),
## air_area_name = col_character(),
## latitude = col_double(),
## longitude = col_double()
## )
hpg_store <- read_csv(str_c("/Users/dorothymensah/Desktop/hpg_store_info.csv"))
## Parsed with column specification:
## cols(
## hpg_store_id = col_character(),
## hpg_genre_name = col_character(),
## hpg_area_name = col_character(),
## latitude = col_double(),
## longitude = col_double()
## )
holidays <- read_csv(str_c("/Users/dorothymensah/Desktop/date_info.csv"))
## Parsed with column specification:
## cols(
## calendar_date = col_date(format = ""),
## day_of_week = col_character(),
## holiday_flg = col_double()
## )
store_id <- read_csv(str_c("/Users/dorothymensah/Desktop/store_id_relation.csv"))
## Parsed with column specification:
## cols(
## air_store_id = col_character(),
## hpg_store_id = col_character()
## )
sample <- read_csv(str_c("/Users/dorothymensah/Desktop/sample_submission.csv"))
## Parsed with column specification:
## cols(
## id = col_character(),
## visitors = col_double()
## )
AIR VISITS
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
summary(air_visit)
## air_store_id visit_date visitors
## Length:252108 Min. :2016-01-01 Min. : 1.00
## Class :character 1st Qu.:2016-07-23 1st Qu.: 9.00
## Mode :character Median :2016-10-23 Median : 17.00
## Mean :2016-10-12 Mean : 20.97
## 3rd Qu.:2017-01-24 3rd Qu.: 29.00
## Max. :2017-04-22 Max. :877.00
##Using glimpse from the dplyr package. Works like str but shows a bit more about the dataset
glimpse(air_visit)
## Rows: 252,108
## Columns: 3
## $ air_store_id <chr> "air_ba937bf13d40fb24", "air_ba937bf13d40fb24", "air_ba9…
## $ visit_date <date> 2016-01-13, 2016-01-14, 2016-01-15, 2016-01-16, 2016-01…
## $ visitors <dbl> 25, 32, 29, 22, 6, 9, 31, 21, 18, 26, 21, 11, 24, 21, 26…
AIR RESERVE
summary(air_reserve)
## air_store_id visit_datetime reserve_datetime
## Length:92378 Min. :2016-01-01 19:00:00 Min. :2016-01-01 01:00:00
## Class :character 1st Qu.:2016-11-15 19:00:00 1st Qu.:2016-11-07 17:00:00
## Mode :character Median :2017-01-05 18:00:00 Median :2016-12-27 22:00:00
## Mean :2016-12-05 08:18:58 Mean :2016-11-27 01:13:07
## 3rd Qu.:2017-03-03 19:00:00 3rd Qu.:2017-02-26 18:00:00
## Max. :2017-05-31 21:00:00 Max. :2017-04-22 23:00:00
## reserve_visitors
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 3.000
## Mean : 4.482
## 3rd Qu.: 5.000
## Max. :100.000
glimpse(air_reserve)
## Rows: 92,378
## Columns: 4
## $ air_store_id <chr> "air_877f79706adbfb06", "air_db4b38ebe7a7ceff", "air…
## $ visit_datetime <dttm> 2016-01-01 19:00:00, 2016-01-01 19:00:00, 2016-01-0…
## $ reserve_datetime <dttm> 2016-01-01 16:00:00, 2016-01-01 19:00:00, 2016-01-0…
## $ reserve_visitors <dbl> 1, 3, 6, 2, 5, 2, 4, 2, 2, 2, 3, 3, 2, 6, 7, 41, 13,…
HPG RESERVE
summary(hpg_reserve)
## hpg_store_id visit_datetime reserve_datetime
## Length:2000320 Min. :2016-01-01 11:00:00 Min. :2016-01-01 00:00:00
## Class :character 1st Qu.:2016-06-26 19:00:00 1st Qu.:2016-06-21 12:00:00
## Mode :character Median :2016-11-19 20:00:00 Median :2016-11-10 20:00:00
## Mean :2016-10-15 06:55:20 Mean :2016-10-07 19:57:59
## 3rd Qu.:2017-02-03 19:00:00 3rd Qu.:2017-01-27 13:00:00
## Max. :2017-05-31 23:00:00 Max. :2017-04-22 23:00:00
## reserve_visitors
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 3.000
## Mean : 5.074
## 3rd Qu.: 6.000
## Max. :100.000
glimpse(hpg_reserve)
## Rows: 2,000,320
## Columns: 4
## $ hpg_store_id <chr> "hpg_c63f6f42e088e50f", "hpg_dac72789163a3f47", "hpg…
## $ visit_datetime <dttm> 2016-01-01 11:00:00, 2016-01-01 13:00:00, 2016-01-0…
## $ reserve_datetime <dttm> 2016-01-01 09:00:00, 2016-01-01 06:00:00, 2016-01-0…
## $ reserve_visitors <dbl> 1, 3, 2, 5, 13, 2, 2, 2, 2, 6, 2, 2, 2, 2, 5, 4, 2, …
AIR STORE
summary(air_store)
## air_store_id air_genre_name air_area_name latitude
## Length:829 Length:829 Length:829 Min. :33.21
## Class :character Class :character Class :character 1st Qu.:34.70
## Mode :character Mode :character Mode :character Median :35.66
## Mean :35.65
## 3rd Qu.:35.69
## Max. :44.02
## longitude
## Min. :130.2
## 1st Qu.:135.3
## Median :139.7
## Mean :137.4
## 3rd Qu.:139.8
## Max. :144.3
glimpse(air_store)
## Rows: 829
## Columns: 5
## $ air_store_id <chr> "air_0f0cdeee6c9bf3d7", "air_7cc17a324ae5c7dc", "air_f…
## $ air_genre_name <chr> "Italian/French", "Italian/French", "Italian/French", …
## $ air_area_name <chr> "Hyōgo-ken Kōbe-shi Kumoidōri", "Hyōgo-ken Kōbe-shi Ku…
## $ latitude <dbl> 34.69512, 34.69512, 34.69512, 34.69512, 35.65807, 35.6…
## $ longitude <dbl> 135.1979, 135.1979, 135.1979, 135.1979, 139.7516, 139.…
HPG STORE
summary(hpg_store)
## hpg_store_id hpg_genre_name hpg_area_name latitude
## Length:4690 Length:4690 Length:4690 Min. :33.31
## Class :character Class :character Class :character 1st Qu.:34.69
## Mode :character Mode :character Mode :character Median :35.66
## Mean :35.81
## 3rd Qu.:35.70
## Max. :43.77
## longitude
## Min. :130.3
## 1st Qu.:135.5
## Median :139.5
## Mean :137.7
## 3rd Qu.:139.7
## Max. :143.7
glimpse(hpg_store)
## Rows: 4,690
## Columns: 5
## $ hpg_store_id <chr> "hpg_6622b62385aec8bf", "hpg_e9e068dd49c5fa00", "hpg_2…
## $ hpg_genre_name <chr> "Japanese style", "Japanese style", "Japanese style", …
## $ hpg_area_name <chr> "Tōkyō-to Setagaya-ku Taishidō", "Tōkyō-to Setagaya-ku…
## $ latitude <dbl> 35.64367, 35.64367, 35.64367, 35.64367, 35.64367, 35.6…
## $ longitude <dbl> 139.6682, 139.6682, 139.6682, 139.6682, 139.6682, 139.…
HOLIDAYS
summary(holidays)
## calendar_date day_of_week holiday_flg
## Min. :2016-01-01 Length:517 Min. :0.0000
## 1st Qu.:2016-05-09 Class :character 1st Qu.:0.0000
## Median :2016-09-15 Mode :character Median :0.0000
## Mean :2016-09-15 Mean :0.0677
## 3rd Qu.:2017-01-22 3rd Qu.:0.0000
## Max. :2017-05-31 Max. :1.0000
glimpse(holidays)
## Rows: 517
## Columns: 3
## $ calendar_date <date> 2016-01-01, 2016-01-02, 2016-01-03, 2016-01-04, 2016-0…
## $ day_of_week <chr> "Friday", "Saturday", "Sunday", "Monday", "Tuesday", "W…
## $ holiday_flg <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
Store Id's
summary(store_id)
## air_store_id hpg_store_id
## Length:150 Length:150
## Class :character Class :character
## Mode :character Mode :character
glimpse(store_id)
## Rows: 150
## Columns: 2
## $ air_store_id <chr> "air_63b13c56b7201bd9", "air_a24bf50c3e90d583", "air_c7f…
## $ hpg_store_id <chr> "hpg_4bc649e72e2a239a", "hpg_c34b496d0305a809", "hpg_cd8…
Sample Data
summary(sample)
## id visitors
## Length:32019 Min. :0
## Class :character 1st Qu.:0
## Mode :character Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
glimpse(sample)
## Rows: 32,019
## Columns: 2
## $ id <chr> "air_00a91d42b08b08d9_2017-04-23", "air_00a91d42b08b08d9_201…
## $ visitors <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
sum(is.na(air_visit))
## [1] 0
sum(is.na(air_reserve))
## [1] 0
sum(is.na(hpg_reserve))
## [1] 0
sum(is.na(air_store))
## [1] 0
sum(is.na(hpg_store))
## [1] 0
sum(is.na(holidays))
## [1] 0
sum(is.na(store_id))
## [1] 0
sum(is.na(sample))
## [1] 0
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(timeDate)
air_visit <- air_visit %>%
mutate(visit_date = ymd(visit_date))
air_visit.group <- air_visit %>% group_by(visit_date) %>% summarize(visitors = sum(visitors))
## `summarise()` ungrouping output (override with `.groups` argument)
air_reserve <- air_reserve %>%
mutate(visit_datetime = ymd_hms(visit_datetime),
reserve_datetime = ymd_hms(reserve_datetime))
hpg_reserve <- hpg_reserve %>%
mutate(visit_datetime = ymd_hms(visit_datetime),
reserve_datetime = ymd_hms(reserve_datetime))
air_store <- air_store %>%
mutate(air_genre_name = as.factor(air_genre_name),
air_area_name = as.factor(air_area_name))
hpg_store <- hpg_store %>%
mutate(hpg_genre_name = as.factor(hpg_genre_name),
hpg_area_name = as.factor(hpg_area_name))
holidays <- holidays %>%
mutate(holiday_flg = as.logical(holiday_flg),
date = ymd(calendar_date),
calendar_date = as.character(calendar_date))
AIR VISITS
#Load the necesarry libraries
library(ggplot2)
G1 <- air_visit %>%
group_by(visit_date) %>%
summarise(all_visitors = sum(visitors)) %>%
ggplot(aes(visit_date,all_visitors)) +
geom_line(col = "red") +
labs(x = "All visitors", y = "Date")
## `summarise()` ungrouping output (override with `.groups` argument)
G2 <- p2 <- air_visit %>%
ggplot(aes(visitors)) +
geom_vline(xintercept = 20, color = "red") +
geom_histogram(fill = "purple", bins = 30) +
scale_x_log10()
G3<- p3 <- air_visit %>%
mutate(wday = wday(visit_date, label = TRUE, week_start = 1)) %>%
group_by(wday) %>%
summarise(visits = median(visitors)) %>%
ggplot(aes(wday, visits, fill = wday)) +
geom_col() +
theme(legend.position = "none", axis.text.x = element_text(angle=45, hjust=1, vjust=0.9)) +
labs(x = "Day of the week", y = "Median visitors") +
scale_fill_hue()
## `summarise()` ungrouping output (override with `.groups` argument)
G4 <- air_visit %>%
mutate(month = month(visit_date, label = TRUE)) %>%
group_by(month) %>%
summarise(visits = median(visitors)) %>%
ggplot(aes(month, visits, fill = month)) +
geom_col() +
theme(legend.position = "none") +
labs(x = "Month", y = "Median visitors")
## `summarise()` ungrouping output (override with `.groups` argument)
plot(G1)
plot(G2)
plot(G3)
plot(G4)
HOLIDAYS
##Plotting how many holidays there are in total
holiday <- holidays %>%
mutate(wday = wday(date, week_start = 1))
p1 <- holidays %>%
ggplot(aes(holiday_flg, fill = holiday_flg)) +
geom_bar() +
theme(legend.position = "none")
p2 <- holidays %>%
filter(date > ymd("2016-04-15") & date < ymd("2016-06-01")) %>%
ggplot(aes(date, holiday_flg, color = holiday_flg)) +
geom_point(size = 2) +
theme(legend.position = "none") +
labs(x = "2016 date")
p3 <- holidays %>%
filter(date > ymd("2017-04-15") & date < ymd("2017-06-01")) %>%
ggplot(aes(date, holiday_flg, color = holiday_flg)) +
geom_point(size = 2) +
theme(legend.position = "none") +
labs(x = "2017 date")
plot(p1)
plot(p2)
plot(p3)
#How many holidays are there exactly?
holidays %>% summarise(frac=mean(holiday_flg))
## # A tibble: 1 x 1
## frac
## <dbl>
## 1 0.0677
#There seems to be about 7% of holidays present in our data
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(forecast)
library(tidyr)
library(purrr)
library(forcats)
library(tibble)
Tranforming data into a time series
date <- air_visit %>%
group_by(visit_date) %>%
summarise(all_visitors=sum(visitors))
## `summarise()` ungrouping output (override with `.groups` argument)
date.ts <- ts(date$all_visitors, frequency = 7)
autoplot(date.ts)
##Creating train and test set
train <- ts(date.ts[1:382],frequency = 7)
test <- ts(date.ts[383:478], frequency = 7)
MODEL 1 : Time series Regression Models: Simple Linear Regression
##1A : Regular regression model
model1 <- lm(visitors~visit_date, data = air_visit.group)
model1
##
## Call:
## lm(formula = visitors ~ visit_date, data = air_visit.group)
##
## Coefficients:
## (Intercept) visit_date
## -452128.50 27.18
summary(model1)
##
## Call:
## lm(formula = visitors ~ visit_date, data = air_visit.group)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11132 -2297 -529 1988 9959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.521e+05 1.934e+04 -23.38 <2e-16 ***
## visit_date 2.718e+01 1.135e+00 23.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3424 on 476 degrees of freedom
## Multiple R-squared: 0.5464, Adjusted R-squared: 0.5455
## F-statistic: 573.5 on 1 and 476 DF, p-value: < 2.2e-16
checkresiduals(model1)
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals
## LM test = 387.84, df = 10, p-value < 2.2e-16
##1B : TSLM
##Create time series
air_visit.ts <- ts(air_visit.group)
model1a <- tslm(visitors~visit_date, data = air_visit.ts)
model1a
##
## Call:
## tslm(formula = visitors ~ visit_date, data = air_visit.ts)
##
## Coefficients:
## (Intercept) visit_date
## -452128.50 27.18
summary(model1a)
##
## Call:
## tslm(formula = visitors ~ visit_date, data = air_visit.ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11132 -2297 -529 1988 9959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.521e+05 1.934e+04 -23.38 <2e-16 ***
## visit_date 2.718e+01 1.135e+00 23.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3424 on 476 degrees of freedom
## Multiple R-squared: 0.5464, Adjusted R-squared: 0.5455
## F-statistic: 573.5 on 1 and 476 DF, p-value: < 2.2e-16
checkresiduals(model1a)
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals from Linear regression model
## LM test = 387.84, df = 10, p-value < 2.2e-16
##Forecasting TSLM model1a <- EX-POST FORECAST
##model1a.1 <- window(air_visit.ts, start = 16801)
model1a.fit <- model1a$fitted.values
model1a.fcast <- forecast(model1a.fit)
autoplot(model1a.fcast)+ ggtitle("Forecasts of Visit Data For The Air Restaurant ")+xlab("Date")+ylab("Visitors")
MODEL 2: ETS MODELS
##Creating train and test set
train <- ts(date.ts[1:382],frequency = 7)
test <- ts(date.ts[383:478], frequency = 7)
Model2:Not sure if i should be seperating the models then adding weights to the best one?
library(smoother)
## Loading required package: TTR
Model2 <- ets(train)
Model2
## ETS(M,Ad,M)
##
## Call:
## ets(y = train)
##
## Smoothing parameters:
## alpha = 0.5916
## beta = 1e-04
## gamma = 1e-04
## phi = 0.9641
##
## Initial states:
## l = 2156.8817
## b = 321.3936
## s = 0.934 0.9336 0.8502 0.7356 0.9386 1.3499
## 1.2582
##
## sigma: 0.1448
##
## AIC AICc BIC
## 7755.280 7756.269 7806.571
#Forecast
Model2.fcast<- forecast(Model2,14)
autoplot(Model2)
checkresiduals(Model2.fcast)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,Ad,M)
## Q* = 41.684, df = 3, p-value = 4.682e-09
##
## Model df: 12. Total lags used: 15
##Check for model accuracy
model2.accuracy <- accuracy(Model2.fcast,test[1:14])
model2.accuracy
## ME RMSE MAE MPE MAPE MASE
## Training set -6.032786 1596.076 822.7383 -1.895786 9.064925 0.4109862
## Test set 917.368225 1302.465 949.0908 5.751102 5.984893 0.4741036
## ACF1
## Training set 0.2953002
## Test set NA
##Adding a weighted moving average
wa <- na.omit(WMA(train,14))
#forecast with weighted average
wa.fcast<- forecast(wa,14)
autoplot(wa.fcast)
checkresiduals(wa.fcast)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,Ad,N)
## Q* = 340.93, df = 5, p-value < 2.2e-16
##
## Model df: 5. Total lags used: 10
##Accuracy of model
wa.accuracy <- accuracy(wa.fcast,test[1:14])
wa.accuracy
## ME RMSE MAE MPE MAPE MASE
## Training set 3.490081 340.1146 254.3016 0.0823534 2.516448 0.9432135
## Test set 2080.725828 4014.3363 2842.3760 10.1770561 17.609637 10.5424702
## ACF1
## Training set 0.1471929
## Test set NA