setwd("C:/Users/569973/Downloads/recruit-restaurant-visitor-forecasting")
files <- list.files(path="./data/", pattern="*.csv$")
for (f in files){
filepath <- file.path("./data", paste(f))
fname <- strsplit(f, "[.]")[[1]]
assign(fname[1], fread(filepath))
}
#PREPROCESSING
# split date and time
air_reserve$visit_date <- as.Date(air_reserve$visit_datetime)
air_reserve$visit_time <- format(strptime(air_reserve$visit_datetime,
format="%Y-%m-%d %H:%M:%S"), "%H:%M")
air_reserve$reserve_date <- as.Date(air_reserve$reserve_datetime)
air_reserve$reserve_time <- format(strptime(air_reserve$reserve_datetime,
format="%Y-%m-%d %H:%M:%S"), "%H:%M")
hpg_reserve$visit_date <- as.Date(hpg_reserve$visit_datetime)
hpg_reserve$visit_time <- format(strptime(hpg_reserve$visit_datetime,
format="%Y-%m-%d %H:%M:%S"), "%H:%M")
hpg_reserve$reserve_date <- as.Date(hpg_reserve$reserve_datetime)
hpg_reserve$reserve_time <- format(strptime(hpg_reserve$reserve_datetime,
format="%Y-%m-%d %H:%M:%S"), "%H:%M")
date_info$calendar_date <- as.Date(date_info$calendar_date)
air_visit_data$visit_date <- as.Date(air_visit_data$visit_date)
# add date of week
air_reserve <- air_reserve %>%
mutate(visit_dow=wday(visit_date)) %>%
mutate(reserve_dow=wday(reserve_date))
hpg_reserve <- hpg_reserve %>%
mutate(visit_dow=wday(visit_date)) %>%
mutate(reserve_dow=wday(reserve_date))
air_visit_data <- air_visit_data %>%
mutate(visit_dow=wday(visit_date))
# add holiday flags
air_reserve <- air_reserve %>% left_join(date_info, by=c("visit_date"="calendar_date"))
hpg_reserve <- hpg_reserve %>% left_join(date_info, by=c("visit_date"="calendar_date"))
air_visit_data <- air_visit_data %>% left_join(date_info, by=c("visit_date"="calendar_date"))
air_reserve$holiday_flg <- factor(air_reserve$holiday_flg, labels=c("Not Holiday", "Holiday"))
hpg_reserve$holiday_flg <- factor(hpg_reserve$holiday_flg, labels=c("Not Holiday", "Holiday"))
air_visit_data$holiday_flg <- factor(air_visit_data$holiday_flg, labels=c("Not Holiday", "Holiday"))
head(air_visit_data)
## air_store_id visit_date visitors visit_dow day_of_week holiday_flg
## 1: air_ba937bf13d40fb24 2016-01-13 25 4 Wednesday Not Holiday
## 2: air_ba937bf13d40fb24 2016-01-14 32 5 Thursday Not Holiday
## 3: air_ba937bf13d40fb24 2016-01-15 29 6 Friday Not Holiday
## 4: air_ba937bf13d40fb24 2016-01-16 22 7 Saturday Not Holiday
## 5: air_ba937bf13d40fb24 2016-01-18 6 2 Monday Not Holiday
## 6: air_ba937bf13d40fb24 2016-01-19 9 3 Tuesday Not Holiday
visit_week_reserve <- air_reserve %>%
group_by(day_of_week) %>%
dplyr::summarize(Number=sum(reserve_visitors))
#VISITORS AND RESERVES PER DAY
options(scipen = 9)
p <- ggplot(visit_week_reserve, aes(x=day_of_week, y=Number)) +
geom_bar(stat="identity") +
xlab("") +ggtitle('Reserve visitors')+theme(plot.title = element_text(size = 10, face = "bold"))
p
visit_week_ <- air_visit_data %>%
group_by(day_of_week) %>%
dplyr::summarize(Number=sum(visitors))
options(scipen = 9)
p <- ggplot(visit_week_, aes(x=day_of_week, y=Number)) +
geom_bar(stat="identity") +
xlab("") +ggtitle('Total visitors')+theme(plot.title = element_text(size = 10, face = "bold"))
p
## [1] "air_reserve contains reservation info of 314 stores"
## [1] "air_visit contains actual visitors info of 829 stores"
## [1] "air_store_info contains actual visitors info of 829 stores"
## [1] "hpg_reserve contains reservation info of 13325 stores but only 150 belongs to the AirREGI system"
## [1] "hpg_store_info contains reservation info of 4690 stores but only 63 belongs to the AirREGI system"
#VISITORS BY GENDER
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
## `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
#VISITOR BY SEASON
holiday_info <- date_info
holiday_info <- holiday_info %>%
mutate(calendar_date = as.Date(calendar_date, format="%Y-%m-%d"),
day_of_week = factor(day_of_week,
levels = format(ISOdate(2020,6,1:7),"%A")),
holiday_flg = as.factor(holiday_flg))
vis1 <- holiday_info %>%
group_by(holiday_flg) %>%
summarise(days=n()) %>%
ggplot(aes(x=holiday_flg, y=days, fill=as.factor(days))) +
geom_bar(stat = "identity") +
geom_text(aes(label=days), size=3, position = position_stack(vjust=0.5)) +
theme_minimal() +
theme(legend.position="none",plot.title = element_text(size = 10, face = "bold")) +
ggtitle('Number Of Non Holiday / Holiday Days')
vis2 <- air_visit_data %>%
group_by(holiday_flg) %>%
summarise(days=sum(visitors)) %>%
ggplot(aes(x=holiday_flg, y=days, fill=as.factor(days))) +
geom_bar(stat = "identity") +
geom_text(aes(label=days), size=3, position = position_stack(vjust=0.5)) +
theme_minimal() +
theme(legend.position="none",plot.title = element_text(size = 10, face = "bold")) +
ggtitle('Total Visitors On Non Holiday / Holiday Days')
grid.arrange(vis1, vis2, ncol=2)
#VISITORS SEASON - MONTH -DAY
air_visit_data2 <- air_visit_data %>%
left_join(holiday_info, by=(c("visit_date" = "calendar_date"))) %>%
mutate(month_of_year = factor(strftime(visit_date, format='%B'),
levels = format(ISOdate(2020,1:12,1),"%B")))
air_visit_data2$day_of_week <- air_visit_data2$day_of_week.x
air_visit_data2$holiday_flg <- air_visit_data2$holiday_flg.x
vis3 <- air_visit_data2 %>%
group_by(day_of_week, holiday_flg) %>%
summarise(mean_visitors=round(mean(visitors))) %>%
ggplot(aes(fill=holiday_flg, y=mean_visitors, x=day_of_week)) +
geom_bar(position="stack", stat="identity") +
geom_text(aes(label=mean_visitors), size=5, position = position_stack(vjust=0.5)) +
theme_minimal() +
theme(axis.text.x=element_text(angle=45, hjust=1),
legend.position="bottom") +
ggtitle('Mean Of Visitors For Each Day Of The Week')
## `summarise()` has grouped output by 'day_of_week'. You can override using the
## `.groups` argument.
vis4 <- air_visit_data2 %>%
group_by(month_of_year, holiday_flg) %>%
summarise(mean_visitors=round(mean(visitors))) %>%
ggplot(aes(fill=holiday_flg, y=mean_visitors, x=month_of_year)) +
geom_bar(position="stack", stat="identity") +
geom_text(aes(label=mean_visitors), size=5, position=position_stack(vjust=0.5)) +
theme_minimal() +
theme(axis.text.x=element_text(angle=45, hjust=1),
legend.position="bottom") +
ggtitle('Mean Of Visitors For Each Month')
## `summarise()` has grouped output by 'month_of_year'. You can override using the
## `.groups` argument.
grid.arrange(vis3, vis4, ncol=2)
#SEASON BY GENDER
vis5 <- air_store_info %>%
group_by(air_genre_name) %>%
summarise(num_of_strs=n()) %>%
ggplot(aes(fill=air_genre_name, x=air_genre_name, y=num_of_strs)) +
geom_bar(position="stack", stat="identity") +
geom_text(aes(label=num_of_strs), size=3, position=position_stack(vjust=0.5)) +
theme_minimal() +
theme(legend.position="none") +
ggtitle('Number Of Stores For Each Genre') +
coord_flip()
air_data <- left_join(air_visit_data, air_store_info, by='air_store_id')
vis6 <- air_data %>%
group_by(air_genre_name) %>%
summarise(sum_vistors=sum(visitors)) %>%
ggplot(aes(fill=air_genre_name, x=air_genre_name, y=sum_vistors)) +
geom_bar(position="stack", stat="identity") +
geom_text(aes(label=sum_vistors), size=3, position=position_stack(vjust=0.5)) +
theme_minimal() +
theme(legend.position="none") +
ggtitle('Total Visitors For Each Genre') +
coord_flip()
grid.arrange(vis5, vis6, nrow=2)
air_data %>%
group_by(visit_date, air_genre_name) %>%
summarise(mean_visitors=round(mean(visitors))) %>%
ggplot(aes(color=air_genre_name, y=mean_visitors, x=visit_date)) +
geom_line() +
facet_wrap(~air_genre_name) +
theme(legend.position="none") +
ggtitle('Mean Of Visitors For Each Genre')
## `summarise()` has grouped output by 'visit_date'. You can override using the
## `.groups` argument.
#CORRELATION
library(TSclust)
df1 <- air_store_info %>%
left_join(air_visit_data, c("air_store_id")) %>%
dplyr::group_by(air_genre_name, visit_date) %>%
dplyr::summarize(act_visitor_tol = sum(visitors)) %>%
dcast(visit_date ~ air_genre_name) %>%
select(-c(Asian)) %>% # more than half are NA entries
select(-contains("Karaoke/Party")) %>% # more than half are NA entries
select(-contains("International cuisine")) # more than half are NA entries
# impute missing values
df1 <- na.seadec(df1, algorithm="interpolation")
tsdist <- t(select(df1, -1)) %>% scale() %>% diss("ACF", p=0.05)
hc <- hclust(tsdist)
dend <- as.dendrogram(hc)
dend <- dend %>%
dendextend::set("branches_k_color", k=4) %>%
dendextend::set("branches_lwd", 1.0) %>%
dendextend::set("labels_cex", 1.2) %>%
dendextend::set("leaves_pch", 19) %>%
dendextend::set("leaves_col", c("black")) %>%
dendextend::set("leaves_cex", 0.6) %>%
highlight_branches_col
par(cex=0.4)
ggplot(dend, horiz=TRUE)
# AIR -- split location data
air_store_info <- air_store_info %>%
separate(air_area_name, sep=" ", into=c('prefecture', 'sub-division-1', 'sub-division-2'), extra="merge")
# store visit count
air1 <- air_visit_data %>% group_by(air_store_id) %>% summarize(act_visitor_tol = sum(visitors))
air2 <- air_reserve %>% group_by(air_store_id) %>% summarize(reserve_visitor_tol = sum(reserve_visitors))
air_store_summ <- air_store_info %>%
left_join(air1, c("air_store_id")) %>%
left_join(air2, c("air_store_id")) %>%
dplyr::group_by(prefecture) %>%
summarize(actual_visitor_tol = mean(act_visitor_tol, na.rm=T),
reserve_visitor_tol = mean(reserve_visitor_tol, na.rm=T),
air_store_count = n())
# HPG -- split location data
hpg_store_summ <- hpg_store_info %>%
separate(hpg_area_name, sep=" ", into=c('prefecture', 'sub-division-1', 'sub-division-2'), extra="merge") %>%
dplyr::group_by(prefecture) %>%
summarize(hpg_store_count = n())
# rename
hpg_store_summ['prefecture'] <- c("Fukuoka", "Hiroshima", "Hokkaido", "Hyogo", "Kanagawa", "Miyagi", "Niigata", "None", "Osaka", "Osaka", "Saitama", "Shizuoka", "Tokyo")
hpg_store_summ <- hpg_store_summ %>%
dplyr::group_by(prefecture) %>%
summarize(hpg_store_count = sum(hpg_store_count))
# create sp object
jmap <- map("japan", col=1, plot=FALSE, fill=TRUE)
jmap_id <-sapply(strsplit(jmap[['names']], ":"), function(x) x[1])
p4s <- CRS("+proj=longlat +datum=WGS84")
jsp <- map2SpatialPolygons(jmap, ID=jmap_id, proj4string=p4s)
# create SpatialPolygonsDataFrame
jspdf <- data.frame(prefecture=names(jsp))
rownames(jspdf) <- names(jsp)
jspdf <- SpatialPolygonsDataFrame(jsp, jspdf)
# rename columns
air_store_summ['prefecture'] <- c("Fukuoka", "Hiroshima", "Hokkaido", "Hyogo", "Miyagi", "Niigata", "Osaka", "Shizuoka", "Tokyo")
jspdf <- sp::merge(jspdf, air_store_summ, by=c("prefecture"))
#jspdf <- sp::merge(jspdf, hpg_store_summ[hpg_store_summ!="None",], by=c("prefecture"))
## rgeos version: 0.5-9, (SVN revision 684)
## GEOS runtime version: 3.9.1-CAPI-1.14.2
## Please note that rgeos will be retired by the end of 2023,
## plan transition to sf functions using GEOS at your earliest convenience.
## GEOS using OverlayNG
## Linking to sp version: 1.4-7
## Polygon checking: TRUE
#PREFECTURE VISITORS AND CONVERSION RATE
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
## `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
air_visit_data %>%
group_by(visit_date) %>%
summarise(sum_visitors = sum(visitors)) %>%
ggplot(aes(x=visit_date, y=sum_visitors)) +
geom_line() +
theme_minimal() +
ggtitle('Total Visitors For Each Day')
# Air Reserved Visitors
air1 <- air_reserve %>%
group_by(visit_date) %>% summarise(reserve_visit_tol = sum(reserve_visitors))
air2 <- air_visit_data %>% group_by(visit_date) %>% summarise(act_visit_tol = sum(visitors))
rect <- data.frame(xmin=as.Date("2016-07-01", "%Y-%m-%d"),
xmax=as.Date("2016-07-10", "%Y-%m-%d"), ymin=-Inf, ymax=Inf)
p1 <- air_visit_data %>%
group_by(visit_date) %>% summarise(act_visitor_tol = sum(visitors)) %>%
ggplot() +
geom_line(aes(x=visit_date, y=act_visitor_tol), size=0.2, col='grey25') +
labs(title="Actual Vistiors - AirREGI") +
scale_x_date(date_breaks = "1 month") +
theme(axis.text.x = element_text(angle=90, hjust=1, size=7)) +
geom_rect(data=rect, aes(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax),
colour="blue", fill="blue",
alpha=0.05, size=0.5, linetype=2, inherit.aes = FALSE)
rect <- data.frame(xmin=as.Date("2016-08-01", "%Y-%m-%d"),
xmax=as.Date("2016-10-20", "%Y-%m-%d"), ymin=-Inf, ymax=Inf)
p2 <- air1 %>%
ggplot() +
geom_line(aes(x=visit_date, y=reserve_visit_tol), size=0.2, col='steelblue') +
geom_rect(data=rect, aes(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax),
colour="blue", fill="blue",
alpha=0.05, size=0.5, linetype=2, inherit.aes = FALSE) +
labs(title="Actual & Reserved Vistiors - AirREGI") +
scale_x_date(date_breaks = "1 month") +
theme(axis.text.x = element_text(angle=90, hjust=1))
p <- subplot(p1, p2, nrows = 2, shareX=TRUE)
ggplotly(p)
## Joining, by = "visit_date"
daily_visit <- air1 %>% left_join(air2, c("visit_date")) %>%
select(reserve_visit_tol) %>%
ts(start=c(2016,1,1), frequency=7)
# Daily Visit Rates Decomposition
p2 <- decompose(daily_visit) %>% .$trend %>%
autoplot(main="Air Visitor Trend Component", xlab="Day") +
theme(plot.title = element_text(size=8), axis.title=element_text(size=8))
p3 <- decompose(daily_visit) %>% .$seasonal %>%
autoplot(main="Air Visitor Seasonal Component", xlab="Day")+
theme(plot.title = element_text(size=8), axis.title=element_text(size=8))
p4 <- decompose(daily_visit) %>% .$random %>%
autoplot(main="Air Visitor Random Component", xlab="Day")+
theme(plot.title = element_text(size=8), axis.title=element_text(size=8))
grid.arrange(p2,p3,p4,ncol=3)
p1 <- air_visit_data %>%
group_by(visit_dow, holiday_flg) %>%
summarise(visitor_tol = sum(visitors)) %>%
ggplot(aes(color=holiday_flg)) +
facet_grid(holiday_flg~., scales="free") +
geom_line(aes(x=visit_dow, y=visitor_tol), group=1, size=0.3) +
geom_point(aes(x=visit_dow, y=visitor_tol), size=1) +
scale_color_manual(values=c("#007D00", "#C00000")) +
labs(title="Visitors by Day of Week")+
guides(color=FALSE)
## `summarise()` has grouped output by 'visit_dow'. You can override using the
## `.groups` argument.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
p2 <- air_reserve %>% group_by(reserve_time, visit_dow) %>%
summarise(visitor_tol = sum(reserve_visitors)) %>%
ggplot() +
geom_col(aes(x=reserve_time, y=visitor_tol), fill="#19417D") +
labs(title="Visitors by Day of Week & Time Slot")+
scale_y_continuous(breaks=seq(0, 65000, by=10000)) +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.9))
## `summarise()` has grouped output by 'reserve_time'. You can override using the
## `.groups` argument.
subplot(p1,p2, margin = 0.05)
# convert air_reserve_visit to ts
air_visit_ts <- ts(air_visit_data[,"visitors"], start=c(2016-01-01), frequency=7)
# taking the first difference
d.air_visit_ts <- diff(air_visit_ts)
#ACF PLOTS
## Warning: Ignoring unknown parameters: main, ylab
## Ignoring unknown parameters: main, ylab
dates <- data.frame('visit_date' = as.Date(min(air_data$visit_date):max(air_data$visit_date), origin="1970-01-01"))
store_data <- air_data %>%
filter(air_store_id == "air_09a845d5b5944b01") %>%
select(visit_date, visitors)
df_store <- left_join(dates, store_data, by='visit_date')
df_store$visitors[is.na(df_store$visitors)] <- 0
df_store <- df_store[min(which(df_store$visitors!=0)):nrow(df_store),]
TS <- ts(df_store$visitors, frequency=7)
train_TS <- head(TS, -7)
test_TS <- tail(TS, 7)
dec <- decompose(train_TS, type="additive")
plot(dec)
acf(train_TS, lag.max=10)
#STATISTICAL TEST.
adf.test(train_TS)
##
## Augmented Dickey-Fuller Test
##
## data: train_TS
## Dickey-Fuller = -3.473, Lag order = 7, p-value = 0.04502
## alternative hypothesis: stationary
#MODELING
autoplot(train_TS)
ETS MODEL.
model1 <- ets(train_TS)
predictions <- forecast(model1, h=7)
autoplot(predictions) +
autolayer(test_TS, series="Data") +
autolayer(predictions$mean, series="Forecasts")
accuracy(predictions$mean,test_TS)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -0.5084285 1.728564 1.459384 -Inf Inf -0.4961305 0
NAIVE MODEL
model2 <- naive(train_TS)
predictions <- forecast(model2, h=7)
autoplot(predictions) +
autolayer(test_TS, series="Data") +
autolayer(predictions$mean, series="Forecasts")
accuracy(predictions$mean,test_TS)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -8.428571 9.327379 8.428571 -Inf Inf 0.04567044 0
HOLT-WINTERS
model3 <- HoltWinters(train_TS)
predictions <- forecast(model3, h=7)
autoplot(predictions) +
autolayer(test_TS, series="Data") +
autolayer(predictions$mean, series="Forecasts")
accuracy(predictions$mean,test_TS)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -1.08704 2.171498 1.940995 -Inf Inf -0.5092342 0
ARIMA
model4 <- Arima(train_TS)
predictions <- forecast(model4, h=7)
autoplot(predictions) +
autolayer(test_TS, series="Data") +
autolayer(predictions$mean, series="Forecasts")
accuracy(predictions$mean,test_TS)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -4.394601 5.938998 4.975736 -Inf Inf 0.04567044 0
ETS (BOXCOX)
model5 <- ets(train_TS,lambda="auto")
## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
predictions <- forecast(model5, h=7)
accuracy(predictions$mean,test_TS)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 2.71786 6.559491 5.401853 -Inf Inf -0.3653243 0
ARIMA(BOXCOX)
model5 <- Arima(train_TS,lambda="auto")
## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
predictions <- forecast(model5, h=7)
accuracy(predictions$mean,test_TS)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 5.571095 6.855384 5.57119 -Inf Inf 0.04567044 0
#KAGGLE SUBMISSION
predicted_dates <- as.Date(as.Date('2017-04-23', format="%Y-%m-%d"):as.Date('2017-05-31', format="%Y-%m-%d"),
origin="1970-01-01")
submission <- data.frame()
store_ids <- sapply(sample_submission$id, function(str) {
str <- as.character(str)
return(substr(str, start=1, stop=(nchar(str)-11)))
}) %>% unique()
for (store_id in store_ids) {
data_ts <- air_data %>%
filter(air_store_id == store_id) %>%
select(visit_date, visitors)
df <- left_join(dates, data_ts, by='visit_date')
df$visitors[is.na(df$visitors)] <- 0
df <- df[min(which(df$visitors!=0)):nrow(df),]
TS <- ts(df$visitors, frequency=7)
model <- model1
predictions <- forecast(model, h=length(predicted_dates))$mean
prediction_id_date <- data.frame(id=paste(store_id, predicted_dates, sep='_'), visitors=predictions)
suppressWarnings(submission <- bind_rows(submission, prediction_id_date))
}
submission$visitors <- floor(submission$visitors)
write.csv(x=submission, file="final_submission.csv", quote=FALSE, row.names=FALSE)