| Day of Week | Average Visitors | 25th Percentile | 50th Percentile | 75th Percentile |
|---|---|---|---|---|
| 1 Mon | 17.17701 | 7 | 14 | 24 |
| 2 Tue | 17.67214 | 8 | 14 | 24 |
| 3 Wed | 19.23012 | 8 | 16 | 27 |
| 4 Thu | 18.92270 | 8 | 16 | 26 |
| 5 Fri | 23.07274 | 10 | 19 | 32 |
| 6 Sat | 26.31369 | 12 | 22 | 36 |
| 7 Sun | 23.87336 | 9 | 19 | 33 |
ALL REGIONS
Top 10 Ranges (e.g., 72% of visitors)
Model Considerations
Full Training Dataset
Training Dataset with Knot at 7/1/2016
Kaggle Results - Auto-ARIMA, Full Training Data
LEARNINGS
Boomija, G & Anandaraj, A. & Nandhini, S & Lavanya, S. (2018). Restaurant Visitor Time Series Forecasting Using Autoregressive Integrated Moving Average. Journal of Computational and Theoretical Nanoscience. 15. 1590-1593. 10.1166/jctn.2018.7345.
Claveria, O., Monte, E. and Torra, S. (2015), “A new forecasting approach for the hospitality industry”, International Journal of Contemporary Hospitality Management, Vol. 27 No. 7, pp. 1520-1538. https://doi.org/10.1108/IJCHM-06-2014-0286
Fiori AM, Foroni I. Reservation Forecasting Models for Hospitality SMEs with a View to Enhance Their Economic Sustainability. Sustainability. 2019; 11(5):1274. https://doi.org/10.3390/su11051274
Fong-Lin Chu, Forecasting tourism demand with ARMA-based methods, Tourism Management, Volume 30, Issue 5, 2009, Pages 740-751, ISSN 0261-5177, https://doi.org/10.1016/j.tourman.2008.10.016.
Takashi Tanizaki, Tomohiro Hoshino, Takeshi Shimmura, Takeshi Takenaka, Demand forecasting in restaurants using machine learning and statistical analysis, Procedia CIRP, Volume 79, 2019, Pages 679-683, ISSN 2212-8271, https://doi.org/10.1016/j.procir.2019.02.042.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
local({
hook_source <- knitr::knit_hooks$get('source')
knitr::knit_hooks$set(source = function(x, options) {
x <- x[!grepl('# SECRET!!$', x)]
hook_source(x, options)
})
})
#INVOKE APPROPRIATE LIBRARIES
library("feasts")
library("seasonal")
library("tsibble")
library("tsibbledata")
library("dplyr")
library("ggplot2")
library("forecast")
library("fable")
library("fpp3")
library("sqldf")
library("psych")
library("PerformanceAnalytics")
library("car")
library("kableExtra")
library("glmnet")
library("ISLR")
library("leaps")
library("geosphere")
library("hts")
library("geosphere")
#**CREATE LONGITUDINAL DISTANCE DATAFRAME**#
#Define conversion variables
mTOkm <- 1000
kmTOmi <- 0.621371
#Load vector with latitude coordinates (by one-hundreth of a degree)
latitude_coord <- c()
for(i in 1:721) {
if(is.null(latitude_coord[i-1])) {
latitude_coord[i] <- -90
} else {
latitude_coord[i] <- latitude_coord[i-1]+0.25
}
}
#Find distance between degrees of longitude at each latitude point defined
latitude_lower <- c()
latitude_upper <- c()
longitude_dist <- c()
for(i in 1:length(latitude_coord)) {
latitude_lower[i] <- latitude_coord[i]
if(is.null(latitude_coord[i+1])) {
latitude_upper[i] <- 90
} else {
latitude_upper[i] <- latitude_coord[i+1]-0.00001
}
longitude_dist[i] <- distm(c(0, latitude_coord[i]), c(1, latitude_coord[i]))/mTOkm*kmTOmi
}
longitude_dist_df <- data.frame(cbind(latitude_lower,
latitude_upper,
longitude_dist))
lat_deg_distance <- distm(c(0, 0), c(0, 1))/mTOkm*kmTOmi
#IMPORT DATASETS
air_reserve <- read.csv(paste0(csv_path,
"air_reserve.csv"))
air_store_info <- read.csv(paste0(csv_path,
"air_store_info.csv"))
air_visit_data <- read.csv(paste0(csv_path,
"air_visit_data.csv"))
air_visit_data <-
air_visit_data %>%
mutate(visit_date = as.Date(visit_date))
hpg_reserve <- read.csv(paste0(csv_path,
"hpg_reserve.csv"))
hpg_store_info <- read.csv(paste0(csv_path,
"hpg_store_info.csv"))
date_info <- read.csv(paste0(csv_path,
"date_info.csv"))
date_info <-
date_info %>%
mutate(calendar_date = as.Date(calendar_date))
store_id_relation <- read.csv(paste0(csv_path,
"store_id_relation.csv"))
#TASK: Forecast total VISITS to restaurant on a certain date (air visits file)
#ITEM LIST for model:
# Reserved Visits by date (segmented by lead time in placing)
# Tally of restaurant types in an area
# Holiday flag
# Open flag
# Dummy for type of restaurant
#EVALUATION
# Look at aggregated totals of visits and reservations over time
# Look at aggregated totals by area of visits and reservations over time
# Review any null values in existing dataset (BEFORE CHANGES)
# If none, assume any missing values when gaps filled are from not being open
# Minimum Reservation Conversion rate (e.g., proof as a predictor potentially)
# Within each area, % of visits by restaurant by day
#SEASONAL PLOT: Monday - Sunday, by "Week of" Variable
#RESERVATION EXPLORATION
#air data - reserved visitors by visit date segmented by reserve lead time
air_reserve2 <-
air_reserve %>%
mutate(reserve_lead = difftime(visit_datetime, reserve_datetime, units = "days"),
visit_date = as.Date(visit_datetime),
reserve_date = as.Date(reserve_datetime))
air_reserve3 <- sqldf("
WITH DATA AS (
SELECT air_store_id
, visit_datetime
, visit_date
, reserve_datetime
, reserve_date
, CASE WHEN reserve_lead = 0 THEN reserve_visitors ELSE 0 END AS reserve0
, CASE WHEN reserve_lead = 1 THEN reserve_visitors ELSE 0 END AS reserve1
, CASE WHEN reserve_lead = 2 THEN reserve_visitors ELSE 0 END AS reserve2
, CASE WHEN reserve_lead = 3 THEN reserve_visitors ELSE 0 END AS reserve3
, CASE WHEN reserve_lead = 4 THEN reserve_visitors ELSE 0 END AS reserve4
, CASE WHEN reserve_lead = 5 THEN reserve_visitors ELSE 0 END AS reserve5
, CASE WHEN reserve_lead = 6 THEN reserve_visitors ELSE 0 END AS reserve6
, CASE WHEN reserve_lead >= 7 THEN reserve_visitors ELSE 0 END AS reserve7plus
, reserve_visitors
FROM air_reserve2
)
SELECT air_store_id
, visit_date
, SUM(reserve0) AS reserve0_visitors
, SUM(reserve1) AS reserve1_visitors
, SUM(reserve2) AS reserve2_visitors
, SUM(reserve3) AS reserve3_visitors
, SUM(reserve4) AS reserve4_visitors
, SUM(reserve5) AS reserve5_visitors
, SUM(reserve6) AS reserve6_visitors
, SUM(reserve7plus) AS reserve7p_visitors
, SUM(reserve_visitors) AS reserve_visitors
FROM DATA
GROUP BY air_store_id, visit_date
")
hpg_reserve2 <-
hpg_reserve %>%
mutate(reserve_lead = difftime(visit_datetime, reserve_datetime, units = "days"),
visit_date = as.Date(visit_datetime),
reserve_date = as.Date(reserve_datetime))
hpg_reserve3 <- sqldf("
WITH DATA AS (
SELECT hpg_store_id
, visit_datetime
, visit_date
, reserve_datetime
, reserve_date
, CASE WHEN reserve_lead = 0 THEN reserve_visitors ELSE 0 END AS reserve0
, CASE WHEN reserve_lead = 1 THEN reserve_visitors ELSE 0 END AS reserve1
, CASE WHEN reserve_lead = 2 THEN reserve_visitors ELSE 0 END AS reserve2
, CASE WHEN reserve_lead = 3 THEN reserve_visitors ELSE 0 END AS reserve3
, CASE WHEN reserve_lead = 4 THEN reserve_visitors ELSE 0 END AS reserve4
, CASE WHEN reserve_lead = 5 THEN reserve_visitors ELSE 0 END AS reserve5
, CASE WHEN reserve_lead = 6 THEN reserve_visitors ELSE 0 END AS reserve6
, CASE WHEN reserve_lead >= 7 THEN reserve_visitors ELSE 0 END AS reserve7plus
, reserve_visitors
FROM hpg_reserve2
)
SELECT hpg_store_id
, visit_date
, SUM(reserve0) AS reserve0_visitors
, SUM(reserve1) AS reserve1_visitors
, SUM(reserve2) AS reserve2_visitors
, SUM(reserve3) AS reserve3_visitors
, SUM(reserve4) AS reserve4_visitors
, SUM(reserve5) AS reserve5_visitors
, SUM(reserve6) AS reserve6_visitors
, SUM(reserve7plus) AS reserve7p_visitors
, SUM(reserve_visitors) AS reserve_visitors
FROM DATA
GROUP BY hpg_store_id, visit_date
")
#Join hpg data to air based on store_id_relation table
hpg_reserve_air <- sqldf("
WITH DATA AS (
SELECT H.hpg_store_id
, R.air_store_id
, H.visit_date
, H.reserve0_visitors
, H.reserve1_visitors
, H.reserve2_visitors
, H.reserve3_visitors
, H.reserve4_visitors
, H.reserve5_visitors
, H.reserve6_visitors
, H.reserve7p_visitors
, H.reserve_visitors
FROM hpg_reserve3 H
INNER JOIN store_id_relation R
ON R.hpg_store_id = H.hpg_store_id
)
SELECT air_store_id
, visit_date
, reserve0_visitors
, reserve1_visitors
, reserve2_visitors
, reserve3_visitors
, reserve4_visitors
, reserve5_visitors
, reserve6_visitors
, reserve7p_visitors
, reserve_visitors
FROM DATA
")
air_reserve_fin <- sqldf("
WITH DATA AS (
SELECT air_store_id
, visit_date
, reserve0_visitors
, reserve1_visitors
, reserve2_visitors
, reserve3_visitors
, reserve4_visitors
, reserve5_visitors
, reserve6_visitors
, reserve7p_visitors
, reserve_visitors
FROM air_reserve3
UNION ALL
SELECT air_store_id
, visit_date
, reserve0_visitors
, reserve1_visitors
, reserve2_visitors
, reserve3_visitors
, reserve4_visitors
, reserve5_visitors
, reserve6_visitors
, reserve7p_visitors
, reserve_visitors
FROM hpg_reserve_air
)
SELECT air_store_id
, visit_date
, SUM(reserve0_visitors) AS reserve0_visitors
, SUM(reserve1_visitors) AS reserve1_visitors
, SUM(reserve2_visitors) AS reserve2_visitors
, SUM(reserve3_visitors) AS reserve3_visitors
, SUM(reserve4_visitors) AS reserve4_visitors
, SUM(reserve5_visitors) AS reserve5_visitors
, SUM(reserve6_visitors) AS reserve6_visitors
, SUM(reserve7p_visitors) AS reserve7p_visitors
, SUM(reserve_visitors) AS reserve_visitors
FROM DATA
GROUP BY air_store_id, visit_date
")
#TOTAL RESTAURANTS BY GENRE AND AREA
store_info <- sqldf("
SELECT air_area_name as area_name
, air_genre_name as genre_name
, COUNT(air_store_id) AS restaurant_count
, AVG(latitude) AS area_lat
, AVG(longitude) AS area_lon
FROM air_store_info
GROUP BY 1,2
")
store_info$area_lat <- as.numeric(round(store_info$area_lat, 5))
store_info$area_lon <- as.numeric(round(store_info$area_lon, 4))
#Square mileage of restaurant research
mTOkm <- 1000
kmTOmi <- 0.621371
area_height <- distm(c(min(store_info$area_lon), min(store_info$area_lat)),
c(min(store_info$area_lon), max(store_info$area_lat)))/mTOkm*kmTOmi
area_width1 <- distm(c(min(store_info$area_lon), min(store_info$area_lat)),
c(max(store_info$area_lon), min(store_info$area_lat)))/mTOkm*kmTOmi
area_width2 <- distm(c(min(store_info$area_lon), max(store_info$area_lat)),
c(max(store_info$area_lon), max(store_info$area_lat)))/mTOkm*kmTOmi
avg_area_width <- mean(area_width1, area_width2)
area_sq_mile <- area_height * avg_area_width
#Create table of 25 sq mile ranges
covered_distance_lat <-
(max(store_info$area_lat) - min(store_info$area_lat)) *
lat_deg_distance
for_loop_iterations <- round(covered_distance_lat/5, 0) + 1
range_table_latlow <- c()
range_id <- c()
for(i in 1:for_loop_iterations) {
range_id[i] <- paste0("Range", i)
if(i == 1) {
range_table_latlow[i] <- min(store_info$area_lat)
} else {
range_table_latlow[i] <- range_table_latlow[i-1] + (5/lat_deg_distance)
}
}
range_table_lathigh <- c()
for(i in 1:for_loop_iterations) {
if(i != for_loop_iterations) {
range_table_lathigh[i] <- range_table_latlow[i+1] - 0.00001
} else {
range_table_lathigh[i] <- range_table_latlow[i] + (5/lat_deg_distance) - 0.00001
}
}
range_latitude_df <- data.frame(cbind(range_id,
round(range_table_latlow, 5),
round(range_table_lathigh, 5)))
colnames(range_latitude_df) <- c("range_id",
"lat_low",
"lat_high")
range_latitude_df$lat_low <- as.numeric(range_latitude_df$lat_low)
range_latitude_df$lat_high <- as.numeric(range_latitude_df$lat_high)
range_latitude_df <-
range_latitude_df %>%
mutate(lat_ref = round((lat_low + lat_high)/2, 5))
range_latitude_df <- sqldf("
SELECT LAT.range_id
, LAT.lat_low
, LAT.lat_high
, LAT.lat_ref
, LON.longitude_dist AS lon_deg_distance
FROM range_latitude_df LAT
LEFT JOIN longitude_dist_df LON
ON LAT.lat_ref BETWEEN LON.latitude_lower AND LON.latitude_upper
")
range_id2 <- c()
lat_ref_join <- c()
range_lonlow <- c()
for(i in 1:length(rownames(range_latitude_df))) {
lon_deg_distance <- range_latitude_df[i, "lon_deg_distance"]
covered_distance_lon <- (max(store_info$area_lon) - min(store_info$area_lon))*lon_deg_distance
for_loop_iterations2 <- round(covered_distance_lon/5, 0) + 1
range_id_ref <- range_latitude_df[i, "range_id"]
lat_ref <- range_latitude_df[i, "lat_ref"]
for(j in 1:for_loop_iterations2) {
range_id2[length(range_id2)+1] <- paste0(range_id_ref,"_",j)
lat_ref_join[length(lat_ref_join)+1] <- lat_ref
if(j == 1) {
range_lonlow[length(range_lonlow)+1] <- min(store_info$area_lon)
} else {
range_lonlow[length(range_lonlow)+1] <-
range_lonlow[length(range_lonlow)] + (5/lon_deg_distance)
}
}
}
range_longitude_df1 <- data.frame(cbind(range_id2,
lat_ref_join,
round(range_lonlow, 4)))
colnames(range_longitude_df1) <- c("range_id",
"lat_ref",
"lon_low")
range_longitude_df1$lon_low <- as.numeric(range_longitude_df1$lon_low)
lat_ref_join_val <- range_latitude_df$lat_ref
range_lonhigh <- c()
for(i in 1:length(lat_ref_join_val)) {
data_use <- range_longitude_df1 %>%
filter(lat_ref_join == lat_ref_join_val[i])
lon_deg_distance <- range_latitude_df[i, "lon_deg_distance"]
for(j in 1:length(rownames(data_use))) {
if(j != length(rownames(data_use))) {
range_lonhigh[length(range_lonhigh) + 1] <-
data_use[j + 1, 3] - 0.0001
} else {
range_lonhigh[length(range_lonhigh) + 1] <-
round(data_use[j, 3] + (5/lon_deg_distance) - 0.0001, 4)
}
}
}
range_longitude_df <- data.frame(cbind(range_longitude_df1,
range_lonhigh))
colnames(range_longitude_df) <- c(colnames(range_longitude_df1),
"lon_high")
range_longitude_df$lon_low <- as.numeric(range_longitude_df$lon_low)
range_longitude_df$lon_high <- as.numeric(range_longitude_df$lon_high)
range_grid_df <- sqldf("
SELECT LON.range_id
, LAT.range_id AS range_id_big
, LON.lat_ref
, LON.lon_low
, LON.lon_high
, LAT.lat_low
, LAT.lat_high
FROM range_longitude_df LON
LEFT JOIN range_latitude_df LAT
ON LAT.lat_ref = LON.lat_ref
")
store_info_ranges <- sqldf("
SELECT INFO.*
, R.range_id
FROM store_info INFO
LEFT JOIN range_grid_df R
ON INFO.area_lat BETWEEN R.lat_low AND R.lat_high
AND INFO.area_lon BETWEEN R.lon_low AND R.lon_high
")
air_store_info_ranges <- sqldf("
SELECT A.air_store_id
, A.air_genre_name
, A.air_area_name
, R.range_id as air_range_id
FROM air_store_info A
LEFT JOIN store_info_ranges R
ON R.area_name = A.air_area_name
AND R.genre_name = A.air_genre_name
")
#Different dates by store?
air_store_id_vec <- as.vector(unique(air_visit_data$air_store_id))
air_min_date <- c()
air_max_date <- c()
for(i in 1:length(air_store_id_vec)) {
data_house <- air_visit_data %>%
filter(air_store_id == air_store_id_vec[i])
air_min_date[i] <- min(data_house$visit_date)
air_max_date[i] <- max(data_house$visit_date)
}
air_restaurant_data <- sqldf("
SELECT V.air_store_id
, I.air_genre_name
, I.air_area_name
, I.air_range_id
, V.visit_date
, D.holiday_flg AS holiday_flag
, V.visitors
, R.reserve0_visitors
, R.reserve1_visitors
, R.reserve2_visitors
, R.reserve3_visitors
, R.reserve4_visitors
, R.reserve5_visitors
, R.reserve6_visitors
, R.reserve7p_visitors
, R.reserve_visitors AS reserve_visitorsT
FROM air_visit_data V
LEFT JOIN air_store_info_ranges I
ON I.air_store_id = V.air_store_id
LEFT JOIN air_reserve_fin R
ON R.air_store_id = V.air_store_id
AND R.visit_date = V.visit_date
LEFT JOIN date_info D
ON D.calendar_date = V.visit_date
")
air_restaurant_data2 <-
air_restaurant_data %>%
mutate(visit_dow = wday(visit_date, week_start = 1),
visit_week = visit_date + (1 - visit_dow),
visitors = as.numeric(visitors)) %>%
select(air_store_id,
air_genre_name,
air_area_name,
air_range_id,
visit_date,
visit_dow,
visit_week,
holiday_flag,
visitors,
reserve0_visitors,
reserve1_visitors,
reserve2_visitors,
reserve3_visitors,
reserve4_visitors,
reserve5_visitors,
reserve6_visitors,
reserve7p_visitors,
reserve_visitorsT)
air_restaurant_data3 <- sqldf("
SELECT air_store_id
, air_genre_name
, air_range_id
, visit_date
, visit_dow
, CASE WHEN visit_dow = 1 THEN '1 Mon'
WHEN visit_dow = 2 THEN '2 Tue'
WHEN visit_dow = 3 THEN '3 Wed'
WHEN visit_dow = 4 THEN '4 Thu'
WHEN visit_dow = 5 THEN '5 Fri'
WHEN visit_dow = 6 THEN '6 Sat'
ELSE '7 Sun' END AS visit_dow_desc
, visit_week
, holiday_flag
, visitors
, reserve0_visitors
, reserve1_visitors
, reserve2_visitors
, reserve3_visitors
, reserve4_visitors
, reserve5_visitors
, reserve6_visitors
, reserve7p_visitors
, reserve_visitorsT
, CASE WHEN reserve_visitorsT IS NULL THEN 1 ELSE 0 END AS no_res_flag
, 1 AS ROW_COUNT
FROM air_restaurant_data2
")
online_res_rsch <-
air_restaurant_data3 %>%
group_by(air_store_id) %>%
summarise(no_res_flag = sum(no_res_flag),
row_count = sum(ROW_COUNT)) %>%
mutate(online_res_rate = 1 - no_res_flag/row_count) %>%
ungroup()
ggplot(data = online_res_rsch,
aes(y = online_res_rate)) +
geom_boxplot(notch = FALSE) +
theme_classic() +
labs(y = "Online Reservation Rate",
title = "Distribution of Online Reservation Rates")
online_res_rsch2 <- sqldf("
SELECT air_store_id
, no_res_flag
, row_count
, online_res_rate
, CASE WHEN online_res_rate < 0.5 THEN 'N' ELSE 'Y'
END AS visit_majres_online
FROM online_res_rsch
")
air_restaurant_data4 <- sqldf("
SELECT R.air_store_id
, R.air_genre_name
, R.air_range_id
, R.visit_date
, R.visit_dow
, R.visit_dow_desc
, R.visit_week
, R.holiday_flag
, CASE WHEN I.restaurant_count = 0 THEN 'A ZERO'
WHEN I.restaurant_count BETWEEN 1 AND 5 THEN 'B 1-5'
WHEN I.restaurant_count BETWEEN 6 AND 10 THEN 'C 6-10'
WHEN I.restaurant_count BETWEEN 11 AND 15 THEN 'D 11-15'
WHEN I.restaurant_count BETWEEN 16 AND 20 THEN 'E 16-20'
WHEN I.restaurant_count BETWEEN 21 AND 25 THEN 'F 21-25'
ELSE 'G 26+' END AS similar_restaurants
, RES.visit_majres_online
, R.visitors
, CASE WHEN R.reserve0_visitors IS NULL THEN 0 ELSE R.reserve0_visitors
END AS reserve0_visitors
, CASE WHEN R.reserve1_visitors IS NULL THEN 0 ELSE R.reserve1_visitors
END AS reserve1_visitors
, CASE WHEN R.reserve2_visitors IS NULL THEN 0 ELSE R.reserve2_visitors
END AS reserve2_visitors
, CASE WHEN R.reserve3_visitors IS NULL THEN 0 ELSE R.reserve3_visitors
END AS reserve3_visitors
, CASE WHEN R.reserve4_visitors IS NULL THEN 0 ELSE R.reserve4_visitors
END AS reserve4_visitors
, CASE WHEN R.reserve5_visitors IS NULL THEN 0 ELSE R.reserve5_visitors
END AS reserve5_visitors
, CASE WHEN R.reserve6_visitors IS NULL THEN 0 ELSE R.reserve6_visitors
END AS reserve6_visitors
, CASE WHEN R.reserve7p_visitors IS NULL THEN 0 ELSE R.reserve7p_visitors
END AS reserve7p_visitors
, CASE WHEN R.reserve_visitorsT IS NULL THEN 0 ELSE R.reserve_visitorsT
END AS reserve_visitorsT
FROM air_restaurant_data3 R
LEFT JOIN (
SELECT air_range_id
, air_genre_name
, COUNT(air_store_id) AS restaurant_count
FROM air_store_info_ranges
GROUP BY 1,2
) I
ON I.air_range_id = R.air_range_id
AND I.air_genre_name = R.air_genre_name
LEFT JOIN online_res_rsch2 RES
ON RES.air_store_id = R.air_store_id
")
#Where is restaurant visitation concentrated?
range_concentration <- sqldf("
WITH DATA AS (
SELECT air_range_id
, SUM(visitors) AS visitorsT
FROM air_restaurant_data4
GROUP BY 1
),
DATA2 AS (
SELECT air_range_id
, visitorsT
, SUM(visitorsT) OVER() AS visitorsT_Japan
, CAST(visitorsT AS FLOAT)/CAST(SUM(visitorsT) OVER() AS FLOAT) AS visitation_pct
FROM DATA
ORDER BY 4 DESC
)
SELECT air_range_id
, visitorsT
, visitorsT_Japan
, visitation_pct
, SUM(visitation_pct) OVER(ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
AS visitation_pct_cum
FROM DATA2
")
air_restaurant_kable <-
air_restaurant_data4 %>%
group_by(visit_dow_desc) %>%
summarise(mean(visitors),
quantile(visitors, c(0.25)),
quantile(visitors, c(0.5)),
quantile(visitors, c(0.75)))
colnames(air_restaurant_kable) <- c("Day of Week",
"Average Visitors",
"25th Percentile",
"50th Percentile",
"75th Percentile")
kable(air_restaurant_kable) %>%
kable_styling(latex_options = "striped") %>%
kable_styling(latex_options = "HOLD_position")
ggplot(data = air_restaurant_data4,
aes(x = visit_dow_desc,
y = visitors)) +
geom_boxplot(notch=TRUE) +
theme_classic() +
labs(x = "Day of Week",
y = "Number of Visitors",
title = "Restaurant Visitors by Day of Week",
subtitle = "Boxplot, All Ranges")
ggplot(data = air_restaurant_data4,
aes(x = air_range_id,
y = visitors)) +
geom_boxplot(notch=TRUE) +
theme_classic() +
labs(x = "25-sq. Mile Range",
y = "Number of Visitors",
title = "Restaurant Visitors by 25 square-mile range",
subtitle = "Boxplot, All Ranges") +
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank())
air_restaurant_summrange <-
air_restaurant_data4 %>%
group_by(visit_date,
visit_dow,
visit_dow_desc,
visit_week,
air_range_id,
holiday_flag,
similar_restaurants,
visit_majres_online) %>%
summarize(visitorsT = sum(visitors)) %>%
ungroup()
air_restaurant_summrange %>%
group_by(visit_date) %>%
summarize(visitorsT = sum(visitorsT)) %>%
ungroup() %>%
ggplot(aes(y = visitorsT,
x = visit_date)) +
geom_line() +
labs(x = "Visit Date",
y = "Count of Visitors",
title = "Daily Restaurant Visitors",
subtitle = "All Regions") +
theme_classic()
air_restaurant_seasonal <-
air_restaurant_summrange %>%
select(visit_week,
visit_dow,
visitorsT) %>%
group_by(visit_week, visit_dow) %>%
summarize(visitorsT = sum(visitorsT)) %>%
ungroup() %>%
mutate(visit_dow = as.integer(visit_dow),
visit_week = paste("Week of ", visit_week))
ggplot(data = air_restaurant_seasonal,
aes(x = visit_dow,
y = visitorsT,
color = visit_week)) +
geom_line() +
theme_classic() +
theme(legend.position = "none") +
labs(x = "Visit Day of Week",
y = "Visitor Counts",
title = "Seasonal Plot - Visits by Weekday",
subtitle = "Monday = 1")
#Appendix Material - choose a few for the actual paper
report_rangeid_vector <- c("Range34_108",
"Range35_108",
"Range6_3",
"Range21_61",
"Range34_107",
"Range35_109",
"Range17_26",
"Range35_107",
"Range21_58",
"Range136_113")
for(i in 1:length(report_rangeid_vector)) {
ggplot_range <-
air_restaurant_summrange %>%
filter(air_range_id == report_rangeid_vector[i]) %>%
group_by(visit_date) %>%
summarize(visitorsT = sum(visitorsT)) %>%
ungroup() %>%
ggplot(aes(y = visitorsT,
x = visit_date)) +
geom_line() +
labs(x = "Visit Date",
y = "Count of Visitors",
title = "Daily Restaurant Visitors",
subtitle = paste("Range ID - ", report_rangeid_vector[i])) +
theme_classic()
print(ggplot_range)
}
for(i in 1:length(report_rangeid_vector)) {
air_restaurant_seasonal <-
air_restaurant_summrange %>%
filter(air_range_id == report_rangeid_vector[i]) %>%
select(visit_week,
visit_dow,
visitorsT) %>%
group_by(visit_week, visit_dow) %>%
summarize(visitorsT = sum(visitorsT)) %>%
ungroup() %>%
mutate(visit_dow = as.integer(visit_dow),
visit_week = paste("Week of ", visit_week))
print(
ggplot(data = air_restaurant_seasonal,
aes(x = visit_dow,
y = visitorsT,
color = visit_week)) +
geom_line() +
theme_classic() +
theme(legend.position = "none") +
labs(x = "Visit Day of Week",
y = "Visitor Counts",
title = "Seasonal Plot - Visits by Weekday",
subtitle = paste("Monday = 1; ", report_rangeid_vector[i]))
)
}
ggplot(data = air_restaurant_data4,
aes(x = as.character(holiday_flag),
y = visitors)) +
geom_boxplot(notch=TRUE) +
theme_classic() +
labs(x = "Holiday Flag",
y = "Number of Visitors",
title = "Restaurant Visitors by Holiday Indication",
subtitle = "Boxplot, All Ranges")
ggplot(data = air_restaurant_data4,
aes(x = similar_restaurants,
y = visitors)) +
geom_boxplot(notch=TRUE) +
theme_classic() +
labs(x = "Number of Similar Restaurants",
y = "Number of Visitors",
title = "Restaurant Visitors by Number of Similar Restaurants",
subtitle = "Boxplot, All Ranges")
ggplot(data = air_restaurant_data4,
aes(x = visit_majres_online,
y = visitors)) +
geom_boxplot(notch=TRUE) +
theme_classic() +
labs(x = "Majority Visitors Online Reservation Indicator",
y = "Number of Visitors",
title = "Restaurant Visitors by Indicator of Maj. Reservations Online",
subtitle = "Boxplot, All Ranges")
restaurant_type_vector <- as.vector(unique(air_restaurant_data4$air_genre_name))
for(i in 1:length(restaurant_type_vector)) {
plot_data <-
air_restaurant_data4 %>%
filter(air_genre_name == restaurant_type_vector[i])
print(
ggplot(data = plot_data,
aes(x = similar_restaurants,
y = visitors)) +
geom_boxplot(notch=FALSE) +
theme_classic() +
labs(x = "Number of Similar Restaurants",
y = "Number of Visitors",
title = "Restaurant Visitors by Number of Similar Restaurants",
subtitle = paste("Boxplot, All Ranges, ", restaurant_type_vector[i]))
)
}
#MODEL EXPLORATION
air_restaurant250 <-
air_restaurant_data4 %>%
filter(visitors >= 250) %>%
select(air_store_id) %>%
unique()
air_restaurant250_vector <- as.vector(air_restaurant250$air_store_id)
for(i in 1:length(air_restaurant250_vector)) {
ggplot_restaurant <-
air_restaurant_data4 %>%
filter(air_store_id == air_restaurant250_vector[i]) %>%
group_by(visit_date) %>%
summarize(visitorsT = sum(visitors)) %>%
ungroup() %>%
ggplot(aes(y = visitorsT,
x = visit_date)) +
geom_line() +
labs(x = "Visit Date",
y = "Count of Visitors",
title = "Daily Restaurant Visitors",
subtitle = paste("Restaurant ID - ", air_restaurant250_vector[i])) +
theme_classic()
print(ggplot_restaurant)
}
#Model Selection - ARIMA appears to be an appropriate choice
#First, have to fill gaps to turn data into tsibble
air_restaurant_data5 <-
air_restaurant_data4 %>%
select(air_store_id,
air_range_id,
visit_date,
holiday_flag,
visitors) %>%
as_tsibble(key = c(air_store_id, air_range_id),
index = visit_date) %>%
fill_gaps() %>%
data.frame()
air_restaurant_data5$visitors[is.na(air_restaurant_data5$visitors)] <- 0
air_restaurant_data6 <- sqldf("
SELECT R.air_store_id
, R.air_range_id
, R.visit_date
, CASE WHEN R.holiday_flag IS NULL THEN D.holiday_flg ELSE R.holiday_flag
END AS holiday_flag
, R.visitors
FROM air_restaurant_data5 R
LEFT JOIN date_info D
ON D.calendar_date = R.visit_date
")
air_restaurant_final <- sqldf("
SELECT air_range_id
, visit_date
, CASE WHEN holiday_flag = 0 THEN 'N' ELSE 'Y' END AS holiday_ind
, SUM(visitors) AS visitors
FROM air_restaurant_data6
GROUP BY 1,2,3
")
air_restaurant_tsibble <-
air_restaurant_final %>%
as_tsibble(key = air_range_id,
index = visit_date)
air_restaurant_tsibble %>%
autoplot(visitors) +
theme_classic() +
theme(legend.position = "none") +
labs(x = "Visit Date",
y = "Visitor Count",
title = "Daily Restaurant Visitors",
subtitle = "Japan, by 25 sq. Mile Range")
#Create hierarchical aggregate key for the data (air_range_id / air_store_id)
air_restaurant_arima <-
air_restaurant_tsibble %>%
model(ARIMAauto = ARIMA(visitors))
air_restaurant_tsibble2 <-
air_restaurant_tsibble %>%
filter(visit_date >= '2016-07-01')
air_restaurant_arima2 <-
air_restaurant_tsibble2 %>%
model(ARIMAauto = ARIMA(visitors),
ETSauto = ETS(visitors))
#Residual Plots - Range-level models
vector_model <- c("ARIMAauto")
for(i in 1:length(vector_model)) {
for(j in 1:length(report_rangeid_vector)) {
visual_residual_data <-
air_restaurant_arima %>%
select(vector_model[i]) %>%
filter(air_range_id == report_rangeid_vector[j])
visual_residual <-
visual_residual_data %>%
gg_tsresiduals() +
labs(title = "Residual Plots",
subtitle = paste(report_rangeid_vector[j],
", ",
vector_model[i]))
print(visual_residual)
}
}
vector_model2 <- c("ARIMAauto", "ETSauto")
for(i in 1:length(vector_model2)) {
for(j in 1:length(report_rangeid_vector)) {
visual_residual_data <-
air_restaurant_arima2 %>%
select(vector_model2[i]) %>%
filter(air_range_id == report_rangeid_vector[j])
visual_residual <-
visual_residual_data %>%
gg_tsresiduals() +
labs(title = "Residual Plots",
subtitle = paste(report_rangeid_vector[j],
", ",
vector_model2[i]))
print(visual_residual)
}
}
#average historical proportions for stores by Range
air_store_prop <- c()
for(i in 1:length(air_store_id_vec)) {
air_data <-
air_restaurant_data6 %>%
filter(air_store_id == air_store_id_vec[i])
air_data2 <- sqldf("
SELECT R.air_store_id
, R.air_range_id
, R.visit_date
, R.visitors
, R2.visitors AS visitorsT
FROM (
SELECT air_store_id
, air_range_id
, visit_date
, SUM(visitors) AS visitors
FROM air_data
GROUP BY 1,2,3
) R
LEFT JOIN (
SELECT air_range_id
, visit_date
, SUM(visitors) AS visitors
FROM air_restaurant_final
GROUP BY 1,2
) R2
ON R2.air_range_id = R.air_range_id
AND R2.visit_date = R.visit_date
")
air_data3 <- sqldf("
WITH DATA AS (
SELECT air_store_id
, air_range_id
, SUM(visitors) AS visitors
, SUM(visitorsT) AS visitorsT
FROM air_data2
GROUP BY 1,2
)
SELECT D.*
, CAST(visitors AS FLOAT)/CAST(visitorsT AS FLOAT) AS visitors_prop
FROM DATA D
")
air_store_prop[i] <- air_data3$visitors_prop
}
air_restaurant_data_prop <- data.frame(cbind(air_store_id_vec,
air_store_prop))
air_restaurant_data_prop2 <- sqldf("
SELECT R.air_store_id
, R.air_range_id
, PROP.air_store_prop
FROM (
SELECT DISTINCT air_store_id
, air_range_id
FROM air_restaurant_data6
) R
LEFT JOIN air_restaurant_data_prop PROP
ON PROP.air_store_id_vec = R.air_store_id
")
air_restaurant_data_prop3 <- sqldf("
WITH DATA AS (
SELECT air_store_id
, air_range_id
, air_store_prop
, SUM(air_store_prop) OVER(PARTITION BY air_range_id) AS range_store_prop
FROM air_restaurant_data_prop2
)
SELECT air_store_id
, air_range_id
, air_store_prop
, range_store_prop
, CAST(air_store_prop AS FLOAT)/CAST(range_store_prop AS FLOAT)
AS air_store_prop_wt
FROM DATA
")
#Compare RMSE accuracy on training data
air_restaurant_fitted1a <- fitted(air_restaurant_arima) %>%
mutate(MODEL = .model,
FITTED = .fitted)
air_restaurant_fitted1b <- sqldf("
SELECT F.MODEL
, P.air_store_id
, P.air_range_id
, F.visit_date
, F.FITTED
, P.air_store_prop_wt
, CAST(P.air_store_prop_wt AS FLOAT) * CAST(F.FITTED AS FLOAT) AS visitors_fitted
, CASE WHEN R.visitors IS NULL THEN 0 ELSE R.visitors END AS visitors_actual
FROM air_restaurant_data_prop3 P
LEFT JOIN air_restaurant_fitted1a F
ON F.air_range_id = P.air_range_id
INNER JOIN air_restaurant_data6 R
ON R.air_store_id = P.air_store_id
AND R.air_range_id = P.air_range_id
AND R.visit_date = F.visit_date
")
air_restaurant_fitted1c <-
air_restaurant_fitted1b %>%
mutate(ERROR = visitors_fitted - visitors_actual,
SQERROR = ERROR^2)
air_restaurant_rmse <-
air_restaurant_fitted1c %>%
group_by(MODEL) %>%
summarize(MSE = mean(SQERROR)) %>%
ungroup() %>%
mutate(RMSE = sqrt(MSE))
air_restaurant_fitted2a <- fitted(air_restaurant_arima2) %>%
mutate(MODEL = .model,
FITTED = .fitted)
air_restaurant_fitted2b <- sqldf("
SELECT F.MODEL
, P.air_store_id
, P.air_range_id
, F.visit_date
, F.FITTED
, P.air_store_prop_wt
, CAST(P.air_store_prop_wt AS FLOAT) * CAST(F.FITTED AS FLOAT) AS visitors_fitted
, CASE WHEN R.visitors IS NULL THEN 0 ELSE R.visitors END AS visitors_actual
FROM air_restaurant_data_prop3 P
LEFT JOIN air_restaurant_fitted2a F
ON F.air_range_id = P.air_range_id
LEFT JOIN air_restaurant_data6 R
ON R.air_store_id = P.air_store_id
AND R.air_range_id = P.air_range_id
AND R.visit_date = F.visit_date
")
air_restaurant_fitted2c <-
air_restaurant_fitted2b %>%
mutate(ERROR = visitors_fitted - visitors_actual,
SQERROR = ERROR^2)
air_restaurant_rmse2 <-
air_restaurant_fitted2c %>%
group_by(MODEL) %>%
summarize(MSE = mean(SQERROR)) %>%
ungroup() %>%
mutate(RMSE = sqrt(MSE))
#Pull in Submission Data
sample_submission <- read.csv(paste0(csv_path,
"sample_submission.csv"))
sample_submission_storeid <- c()
sample_submission_date <- c()
for(i in 1:length(rownames(sample_submission))) {
sample_submission_storeid[i] <- substr(sample_submission[i,1], 1, 20)
sample_submission_date[i] <- substr(sample_submission[i,1], 22, 31)
}
sample_submission_final <- data.frame(cbind(sample_submission_storeid,
sample_submission_date))
colnames(sample_submission_final) <- c("air_store_id",
"visit_date")
sample_submission_final$visit_date <-
as.Date(sample_submission_final$visit_date)
submit_tsibble <- new_data(air_restaurant_tsibble,
length(unique(sample_submission_final$visit_date)))
submit_tsibble2 <- sqldf("
SELECT S.air_range_id
, S.visit_date
, CASE WHEN D.holiday_flg = 0 THEN 'N' ELSE 'Y' END AS holiday_ind
FROM submit_tsibble S
LEFT JOIN date_info D
ON D.calendar_date = S.visit_date
") %>%
as_tsibble(key = air_range_id,
index = visit_date)
#Forecast @ the Range level
#top-down approach to individual restaurants using prop table
air_restaurant_arimafc <-
air_restaurant_arima %>%
forecast(new_data = submit_tsibble2)
for(j in 1:length(report_rangeid_vector)) {
model_fc <-
air_restaurant_arimafc %>%
filter(air_range_id == report_rangeid_vector[j])
for(i in 1:length(unique(air_restaurant_arimafc$.model))) {
model_fc2 <-
model_fc %>%
filter(.model == unique(air_restaurant_arimafc$.model)[i])
air_data_tsibble <-
air_restaurant_tsibble %>%
filter(air_range_id == report_rangeid_vector[j])
print(
model_fc2 %>%
autoplot(air_data_tsibble) +
theme_classic() +
labs(x = "Visit Date",
y = "Visitor Count",
title = "Daily Visitor Count - Forecast",
subtitle = paste(unique(air_restaurant_arimafc$.model)[i],
", ",
report_rangeid_vector[j]))
)
}
}
#SECOND MODELING VARIABLE FORECASTS
air_restaurant_arimafc2 <-
air_restaurant_arima2 %>%
forecast(new_data = submit_tsibble2)
for(j in 1:length(report_rangeid_vector)) {
model_fc <-
air_restaurant_arimafc2 %>%
filter(air_range_id == report_rangeid_vector[j])
for(i in 1:length(unique(air_restaurant_arimafc2$.model))) {
model_fc2 <-
model_fc %>%
filter(.model == unique(air_restaurant_arimafc2$.model)[i])
air_data_tsibble <-
air_restaurant_tsibble2 %>%
filter(air_range_id == report_rangeid_vector[j])
print(
model_fc2 %>%
autoplot(air_data_tsibble) +
theme_classic() +
labs(x = "Visit Date",
y = "Visitor Count",
title = "Daily Visitor Count - Forecast",
subtitle = paste(unique(air_restaurant_arimafc2$.model)[i],
", ",
report_rangeid_vector[j],
", Knotted @ 7/1/2016"))
)
}
}