Reference Code: https://www.kaggle.com/captcalculator/a-very-extensive-recruit-exploratory-analysis

Load the data.

setwd("C:/Users/569973/Downloads/recruit-restaurant-visitor-forecasting")
files <- list.files(path="./data/", pattern="*.csv$")

for (f in files){

  filepath <- file.path("./data", paste(f))

  fname <- strsplit(f, "[.]")[[1]]

  assign(fname[1], fread(filepath))

}

#PREPROCESSING

# split date and time

air_reserve$visit_date <- as.Date(air_reserve$visit_datetime)

air_reserve$visit_time <- format(strptime(air_reserve$visit_datetime,

                                   format="%Y-%m-%d %H:%M:%S"), "%H:%M")

 

air_reserve$reserve_date <- as.Date(air_reserve$reserve_datetime)

air_reserve$reserve_time <- format(strptime(air_reserve$reserve_datetime,

                                   format="%Y-%m-%d %H:%M:%S"), "%H:%M") 

 

hpg_reserve$visit_date <- as.Date(hpg_reserve$visit_datetime)

hpg_reserve$visit_time <- format(strptime(hpg_reserve$visit_datetime,

                                   format="%Y-%m-%d %H:%M:%S"), "%H:%M")

 

hpg_reserve$reserve_date <- as.Date(hpg_reserve$reserve_datetime)

hpg_reserve$reserve_time <- format(strptime(hpg_reserve$reserve_datetime,

                                   format="%Y-%m-%d %H:%M:%S"), "%H:%M") 

 

date_info$calendar_date <- as.Date(date_info$calendar_date)

air_visit_data$visit_date <- as.Date(air_visit_data$visit_date)

 

# add date of week

air_reserve <- air_reserve %>% 

  mutate(visit_dow=wday(visit_date)) %>% 

  mutate(reserve_dow=wday(reserve_date))

 

hpg_reserve <- hpg_reserve %>% 

  mutate(visit_dow=wday(visit_date)) %>% 

  mutate(reserve_dow=wday(reserve_date))

 

air_visit_data <- air_visit_data %>% 

  mutate(visit_dow=wday(visit_date))

 

# add holiday flags

air_reserve <- air_reserve %>% left_join(date_info, by=c("visit_date"="calendar_date"))

hpg_reserve <- hpg_reserve %>% left_join(date_info, by=c("visit_date"="calendar_date"))

air_visit_data <- air_visit_data %>% left_join(date_info, by=c("visit_date"="calendar_date"))

 

air_reserve$holiday_flg <- factor(air_reserve$holiday_flg, labels=c("Not Holiday", "Holiday"))

hpg_reserve$holiday_flg <- factor(hpg_reserve$holiday_flg, labels=c("Not Holiday", "Holiday"))

air_visit_data$holiday_flg <- factor(air_visit_data$holiday_flg, labels=c("Not Holiday", "Holiday"))

DATA DESCRIPTION

head(air_visit_data)
##            air_store_id visit_date visitors visit_dow day_of_week holiday_flg
## 1: air_ba937bf13d40fb24 2016-01-13       25         4   Wednesday Not Holiday
## 2: air_ba937bf13d40fb24 2016-01-14       32         5    Thursday Not Holiday
## 3: air_ba937bf13d40fb24 2016-01-15       29         6      Friday Not Holiday
## 4: air_ba937bf13d40fb24 2016-01-16       22         7    Saturday Not Holiday
## 5: air_ba937bf13d40fb24 2016-01-18        6         2      Monday Not Holiday
## 6: air_ba937bf13d40fb24 2016-01-19        9         3     Tuesday Not Holiday
visit_week_reserve <- air_reserve %>%
  group_by(day_of_week) %>%
  dplyr::summarize(Number=sum(reserve_visitors))

#VISITORS AND RESERVES PER DAY

options(scipen = 9)
p <- ggplot(visit_week_reserve, aes(x=day_of_week, y=Number)) +
  geom_bar(stat="identity") + 
  xlab("") +ggtitle('Reserve visitors')+theme(plot.title = element_text(size = 10, face = "bold"))

p

visit_week_ <- air_visit_data %>%
  group_by(day_of_week) %>%
  dplyr::summarize(Number=sum(visitors))
options(scipen = 9)
p <- ggplot(visit_week_, aes(x=day_of_week, y=Number)) +
  geom_bar(stat="identity") + 
  xlab("") +ggtitle('Total visitors')+theme(plot.title = element_text(size = 10, face = "bold"))

p

VISITORS PER STORE

## [1] "air_reserve contains reservation info of 314 stores"
## [1] "air_visit contains actual visitors info of 829 stores"
## [1] "air_store_info contains actual visitors info of 829 stores"
## [1] "hpg_reserve contains reservation info of 13325 stores but only 150 belongs to the AirREGI system"
## [1] "hpg_store_info contains reservation info of 4690 stores but only 63 belongs to the AirREGI system"

#VISITORS BY GENDER

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
## `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

#VISITOR BY SEASON

holiday_info <- date_info
holiday_info <- holiday_info %>%
  mutate(calendar_date = as.Date(calendar_date, format="%Y-%m-%d"),
         day_of_week = factor(day_of_week, 
                              levels = format(ISOdate(2020,6,1:7),"%A")),
         holiday_flg = as.factor(holiday_flg))
vis1 <- holiday_info %>% 
  group_by(holiday_flg) %>% 
  summarise(days=n()) %>%
  ggplot(aes(x=holiday_flg, y=days, fill=as.factor(days))) + 
  geom_bar(stat = "identity") +
  geom_text(aes(label=days), size=3, position = position_stack(vjust=0.5)) + 
  theme_minimal() + 
  theme(legend.position="none",plot.title = element_text(size = 10, face = "bold")) +
  ggtitle('Number Of Non Holiday / Holiday Days')

vis2 <- air_visit_data %>%
  group_by(holiday_flg) %>% 
  summarise(days=sum(visitors)) %>%
  ggplot(aes(x=holiday_flg, y=days, fill=as.factor(days))) + 
  geom_bar(stat = "identity") +
  geom_text(aes(label=days), size=3, position = position_stack(vjust=0.5)) + 
  theme_minimal() + 
  theme(legend.position="none",plot.title = element_text(size = 10, face = "bold")) +
  ggtitle('Total Visitors On Non Holiday / Holiday Days')

grid.arrange(vis1, vis2, ncol=2)

#VISITORS SEASON - MONTH -DAY

air_visit_data2 <- air_visit_data %>%
  left_join(holiday_info, by=(c("visit_date" = "calendar_date"))) %>%
  mutate(month_of_year = factor(strftime(visit_date, format='%B'),
                                levels = format(ISOdate(2020,1:12,1),"%B")))

air_visit_data2$day_of_week <- air_visit_data2$day_of_week.x
air_visit_data2$holiday_flg <- air_visit_data2$holiday_flg.x

vis3 <- air_visit_data2 %>% 
  group_by(day_of_week, holiday_flg) %>% 
  summarise(mean_visitors=round(mean(visitors))) %>% 
  ggplot(aes(fill=holiday_flg, y=mean_visitors, x=day_of_week)) + 
  geom_bar(position="stack", stat="identity") +
  geom_text(aes(label=mean_visitors), size=5, position = position_stack(vjust=0.5)) + 
  theme_minimal() + 
  theme(axis.text.x=element_text(angle=45, hjust=1),
        legend.position="bottom") +
  ggtitle('Mean Of Visitors For Each Day Of The Week')
## `summarise()` has grouped output by 'day_of_week'. You can override using the
## `.groups` argument.
vis4 <- air_visit_data2 %>% 
  group_by(month_of_year, holiday_flg) %>% 
  summarise(mean_visitors=round(mean(visitors))) %>% 
  ggplot(aes(fill=holiday_flg, y=mean_visitors, x=month_of_year)) + 
  geom_bar(position="stack", stat="identity") +
  geom_text(aes(label=mean_visitors), size=5, position=position_stack(vjust=0.5)) + 
  theme_minimal() + 
  theme(axis.text.x=element_text(angle=45, hjust=1),
        legend.position="bottom") +
  ggtitle('Mean Of Visitors For Each Month')
## `summarise()` has grouped output by 'month_of_year'. You can override using the
## `.groups` argument.
grid.arrange(vis3, vis4, ncol=2)

#SEASON BY GENDER

vis5 <- air_store_info %>%
  group_by(air_genre_name) %>% 
  summarise(num_of_strs=n()) %>%
  ggplot(aes(fill=air_genre_name, x=air_genre_name, y=num_of_strs)) + 
  geom_bar(position="stack", stat="identity") +
  geom_text(aes(label=num_of_strs), size=3, position=position_stack(vjust=0.5)) + 
  theme_minimal() + 
  theme(legend.position="none") +
  ggtitle('Number Of Stores For Each Genre') +
  coord_flip() 

air_data <- left_join(air_visit_data, air_store_info, by='air_store_id')

vis6 <- air_data %>%
  group_by(air_genre_name) %>% 
  summarise(sum_vistors=sum(visitors)) %>%
  ggplot(aes(fill=air_genre_name, x=air_genre_name, y=sum_vistors)) + 
  geom_bar(position="stack", stat="identity") +
  geom_text(aes(label=sum_vistors), size=3, position=position_stack(vjust=0.5)) + 
  theme_minimal() + 
  theme(legend.position="none") +
  ggtitle('Total Visitors For Each Genre') +
  coord_flip() 

grid.arrange(vis5, vis6, nrow=2)

VISITOR BY GENDER

air_data %>%
  group_by(visit_date, air_genre_name) %>% 
  summarise(mean_visitors=round(mean(visitors))) %>%
  ggplot(aes(color=air_genre_name, y=mean_visitors, x=visit_date)) + 
  geom_line() + 
  facet_wrap(~air_genre_name) + 
  theme(legend.position="none") +
  ggtitle('Mean Of Visitors For Each Genre')
## `summarise()` has grouped output by 'visit_date'. You can override using the
## `.groups` argument.

#CORRELATION

library(TSclust)

df1 <- air_store_info %>% 

  left_join(air_visit_data, c("air_store_id")) %>% 

  dplyr::group_by(air_genre_name, visit_date) %>% 

  dplyr::summarize(act_visitor_tol = sum(visitors)) %>% 

  dcast(visit_date ~ air_genre_name) %>% 

  select(-c(Asian)) %>% # more than half are NA entries

  select(-contains("Karaoke/Party")) %>% # more than half are NA entries

  select(-contains("International cuisine")) # more than half are NA entries

 

# impute missing values

df1 <- na.seadec(df1, algorithm="interpolation")

tsdist <- t(select(df1, -1)) %>% scale() %>% diss("ACF", p=0.05)

hc <- hclust(tsdist)

 

dend <- as.dendrogram(hc)

dend <- dend %>%

  dendextend::set("branches_k_color", k=4) %>% 

  dendextend::set("branches_lwd", 1.0) %>%

  dendextend::set("labels_cex", 1.2) %>% 

  dendextend::set("leaves_pch", 19) %>%  

  dendextend::set("leaves_col", c("black")) %>%

  dendextend::set("leaves_cex", 0.6) %>% 

  highlight_branches_col

 

par(cex=0.4)

ggplot(dend, horiz=TRUE) 
# AIR -- split location data

air_store_info <- air_store_info %>% 

  separate(air_area_name, sep=" ", into=c('prefecture', 'sub-division-1', 'sub-division-2'), extra="merge") 

 

# store visit count

air1 <- air_visit_data %>% group_by(air_store_id) %>% summarize(act_visitor_tol = sum(visitors))

air2 <- air_reserve %>% group_by(air_store_id) %>% summarize(reserve_visitor_tol = sum(reserve_visitors))

 

air_store_summ <- air_store_info %>% 

  left_join(air1, c("air_store_id")) %>%

  left_join(air2, c("air_store_id")) %>% 

  dplyr::group_by(prefecture) %>% 

  summarize(actual_visitor_tol = mean(act_visitor_tol, na.rm=T), 

            reserve_visitor_tol = mean(reserve_visitor_tol, na.rm=T), 

            air_store_count = n())

 

# HPG -- split location data

hpg_store_summ <- hpg_store_info %>% 

  separate(hpg_area_name, sep=" ", into=c('prefecture', 'sub-division-1', 'sub-division-2'), extra="merge") %>% 

  dplyr::group_by(prefecture) %>% 

  summarize(hpg_store_count = n())

 

# rename 

hpg_store_summ['prefecture'] <- c("Fukuoka", "Hiroshima", "Hokkaido", "Hyogo", "Kanagawa", "Miyagi", "Niigata", "None", "Osaka", "Osaka", "Saitama", "Shizuoka", "Tokyo")

 

hpg_store_summ <- hpg_store_summ %>% 

  dplyr::group_by(prefecture) %>%

  summarize(hpg_store_count = sum(hpg_store_count))
# create sp object

jmap <- map("japan", col=1, plot=FALSE, fill=TRUE)

jmap_id <-sapply(strsplit(jmap[['names']], ":"), function(x) x[1])

p4s <- CRS("+proj=longlat +datum=WGS84")

jsp <- map2SpatialPolygons(jmap, ID=jmap_id, proj4string=p4s)

 

# create SpatialPolygonsDataFrame

jspdf <- data.frame(prefecture=names(jsp))

rownames(jspdf) <- names(jsp)

jspdf <- SpatialPolygonsDataFrame(jsp, jspdf)

 

# rename columns

air_store_summ['prefecture'] <- c("Fukuoka", "Hiroshima", "Hokkaido", "Hyogo", "Miyagi", "Niigata", "Osaka", "Shizuoka", "Tokyo")

 

jspdf <- sp::merge(jspdf, air_store_summ, by=c("prefecture"))

#jspdf <- sp::merge(jspdf, hpg_store_summ[hpg_store_summ!="None",], by=c("prefecture"))
## rgeos version: 0.5-9, (SVN revision 684)
##  GEOS runtime version: 3.9.1-CAPI-1.14.2 
##  Please note that rgeos will be retired by the end of 2023,
## plan transition to sf functions using GEOS at your earliest convenience.
##  GEOS using OverlayNG
##  Linking to sp version: 1.4-7 
##  Polygon checking: TRUE

#PREFECTURE VISITORS AND CONVERSION RATE

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
## `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

First Look at the Time Series.

air_visit_data %>%
  group_by(visit_date) %>%
  summarise(sum_visitors = sum(visitors)) %>%
  ggplot(aes(x=visit_date, y=sum_visitors)) + 
  geom_line() + 
  theme_minimal() +
  ggtitle('Total Visitors For Each Day')

# Air Reserved Visitors

air1 <- air_reserve %>%

  group_by(visit_date) %>% summarise(reserve_visit_tol = sum(reserve_visitors))

air2 <- air_visit_data %>% group_by(visit_date) %>% summarise(act_visit_tol = sum(visitors))

 

rect <- data.frame(xmin=as.Date("2016-07-01", "%Y-%m-%d"),

                   xmax=as.Date("2016-07-10", "%Y-%m-%d"), ymin=-Inf, ymax=Inf)

p1 <- air_visit_data %>%

  group_by(visit_date) %>% summarise(act_visitor_tol = sum(visitors))  %>% 

  ggplot() +

  geom_line(aes(x=visit_date, y=act_visitor_tol), size=0.2, col='grey25') +

  labs(title="Actual Vistiors - AirREGI") + 

  scale_x_date(date_breaks = "1 month") + 

  theme(axis.text.x = element_text(angle=90, hjust=1, size=7)) +

  geom_rect(data=rect, aes(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax),

              colour="blue", fill="blue", 

              alpha=0.05, size=0.5, linetype=2, inherit.aes = FALSE)

 

rect <- data.frame(xmin=as.Date("2016-08-01", "%Y-%m-%d"),

                   xmax=as.Date("2016-10-20", "%Y-%m-%d"), ymin=-Inf, ymax=Inf)

p2 <- air1 %>% 

  ggplot() +

  geom_line(aes(x=visit_date, y=reserve_visit_tol), size=0.2, col='steelblue') +

  geom_rect(data=rect, aes(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax),

              colour="blue", fill="blue", 

              alpha=0.05, size=0.5, linetype=2, inherit.aes = FALSE) + 

  labs(title="Actual & Reserved Vistiors - AirREGI") + 

  scale_x_date(date_breaks = "1 month") + 

  theme(axis.text.x = element_text(angle=90, hjust=1))

 

p <- subplot(p1, p2, nrows = 2, shareX=TRUE)

ggplotly(p)
## Joining, by = "visit_date"

MODELS

Time serie decomposition

daily_visit <- air1 %>% left_join(air2, c("visit_date")) %>% 

  select(reserve_visit_tol) %>% 

  ts(start=c(2016,1,1), frequency=7)

 

# Daily Visit Rates Decomposition

p2 <- decompose(daily_visit) %>% .$trend %>%

  autoplot(main="Air Visitor Trend Component", xlab="Day") +

  theme(plot.title = element_text(size=8), axis.title=element_text(size=8))

p3 <- decompose(daily_visit) %>% .$seasonal %>%

  autoplot(main="Air Visitor Seasonal Component", xlab="Day")+

  theme(plot.title = element_text(size=8), axis.title=element_text(size=8))

p4 <- decompose(daily_visit) %>% .$random %>%

  autoplot(main="Air Visitor Random Component", xlab="Day")+

  theme(plot.title = element_text(size=8), axis.title=element_text(size=8))

 

grid.arrange(p2,p3,p4,ncol=3)

p1 <- air_visit_data %>%

  group_by(visit_dow, holiday_flg) %>%

  summarise(visitor_tol = sum(visitors)) %>%

  ggplot(aes(color=holiday_flg)) +

  facet_grid(holiday_flg~., scales="free") +

  geom_line(aes(x=visit_dow, y=visitor_tol), group=1, size=0.3) +

  geom_point(aes(x=visit_dow, y=visitor_tol), size=1) +

  scale_color_manual(values=c("#007D00", "#C00000")) +

  labs(title="Visitors by Day of Week")+

  guides(color=FALSE)
## `summarise()` has grouped output by 'visit_dow'. You can override using the
## `.groups` argument.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
p2 <- air_reserve %>% group_by(reserve_time, visit_dow) %>%

  summarise(visitor_tol = sum(reserve_visitors)) %>% 

  ggplot() + 

  geom_col(aes(x=reserve_time, y=visitor_tol), fill="#19417D") +

  labs(title="Visitors by Day of Week & Time Slot")+

  scale_y_continuous(breaks=seq(0, 65000, by=10000)) +

  theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.9))
## `summarise()` has grouped output by 'reserve_time'. You can override using the
## `.groups` argument.
subplot(p1,p2, margin = 0.05)
# convert air_reserve_visit to ts

air_visit_ts <- ts(air_visit_data[,"visitors"], start=c(2016-01-01), frequency=7) 

 

# taking the first difference

d.air_visit_ts <- diff(air_visit_ts)

#ACF PLOTS

## Warning: Ignoring unknown parameters: main, ylab
## Ignoring unknown parameters: main, ylab

TIME SERIE DECOMPOSITION

dates <- data.frame('visit_date' = as.Date(min(air_data$visit_date):max(air_data$visit_date), origin="1970-01-01"))

store_data <- air_data %>% 
    filter(air_store_id == "air_09a845d5b5944b01") %>% 
  select(visit_date, visitors)

df_store <- left_join(dates, store_data, by='visit_date')
df_store$visitors[is.na(df_store$visitors)] <- 0

df_store <- df_store[min(which(df_store$visitors!=0)):nrow(df_store),]

TS <- ts(df_store$visitors, frequency=7)

train_TS <- head(TS, -7)
test_TS <- tail(TS, 7)

dec <- decompose(train_TS, type="additive")
plot(dec)

acf(train_TS, lag.max=10)

#STATISTICAL TEST.

adf.test(train_TS)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_TS
## Dickey-Fuller = -3.473, Lag order = 7, p-value = 0.04502
## alternative hypothesis: stationary

#MODELING

autoplot(train_TS)

ETS MODEL.

model1 <- ets(train_TS)
predictions <- forecast(model1, h=7)

autoplot(predictions) + 
  autolayer(test_TS, series="Data") + 
  autolayer(predictions$mean, series="Forecasts")

accuracy(predictions$mean,test_TS)
##                  ME     RMSE      MAE  MPE MAPE       ACF1 Theil's U
## Test set -0.5084285 1.728564 1.459384 -Inf  Inf -0.4961305         0

NAIVE MODEL

model2 <- naive(train_TS)
predictions <- forecast(model2, h=7)

autoplot(predictions) + 
  autolayer(test_TS, series="Data") + 
  autolayer(predictions$mean, series="Forecasts")

accuracy(predictions$mean,test_TS)
##                 ME     RMSE      MAE  MPE MAPE       ACF1 Theil's U
## Test set -8.428571 9.327379 8.428571 -Inf  Inf 0.04567044         0

HOLT-WINTERS

model3 <- HoltWinters(train_TS)
predictions <- forecast(model3, h=7)

autoplot(predictions) + 
  autolayer(test_TS, series="Data") + 
  autolayer(predictions$mean, series="Forecasts")

accuracy(predictions$mean,test_TS)
##                ME     RMSE      MAE  MPE MAPE       ACF1 Theil's U
## Test set -1.08704 2.171498 1.940995 -Inf  Inf -0.5092342         0

ARIMA

model4 <- Arima(train_TS)
predictions <- forecast(model4, h=7)

autoplot(predictions) + 
  autolayer(test_TS, series="Data") + 
  autolayer(predictions$mean, series="Forecasts")

accuracy(predictions$mean,test_TS)
##                 ME     RMSE      MAE  MPE MAPE       ACF1 Theil's U
## Test set -4.394601 5.938998 4.975736 -Inf  Inf 0.04567044         0

ETS (BOXCOX)

model5 <- ets(train_TS,lambda="auto")
## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
predictions <- forecast(model5, h=7)
accuracy(predictions$mean,test_TS)
##               ME     RMSE      MAE  MPE MAPE       ACF1 Theil's U
## Test set 2.71786 6.559491 5.401853 -Inf  Inf -0.3653243         0

ARIMA(BOXCOX)

model5 <- Arima(train_TS,lambda="auto")
## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
predictions <- forecast(model5, h=7)
accuracy(predictions$mean,test_TS)
##                ME     RMSE     MAE  MPE MAPE       ACF1 Theil's U
## Test set 5.571095 6.855384 5.57119 -Inf  Inf 0.04567044         0

#KAGGLE SUBMISSION

predicted_dates <- as.Date(as.Date('2017-04-23', format="%Y-%m-%d"):as.Date('2017-05-31', format="%Y-%m-%d"), 
                           origin="1970-01-01")

submission <- data.frame()

store_ids <- sapply(sample_submission$id, function(str) {
  str <- as.character(str)
  return(substr(str, start=1, stop=(nchar(str)-11)))
  }) %>% unique()

for (store_id in store_ids) {
  
  data_ts <- air_data %>% 
    filter(air_store_id == store_id) %>% 
    select(visit_date, visitors)
  
  df <- left_join(dates, data_ts, by='visit_date') 
  df$visitors[is.na(df$visitors)] <- 0
  df <- df[min(which(df$visitors!=0)):nrow(df),]
  
  TS <- ts(df$visitors, frequency=7)
  model <- model1
  predictions <- forecast(model, h=length(predicted_dates))$mean
  
  prediction_id_date <- data.frame(id=paste(store_id, predicted_dates, sep='_'), visitors=predictions)
     
  suppressWarnings(submission <- bind_rows(submission, prediction_id_date))
}

submission$visitors <- floor(submission$visitors)
write.csv(x=submission, file="final_submission.csv", quote=FALSE, row.names=FALSE)