###########################################################################################################
rm(list = ls())

Load useful libraries

library(plyr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(zoo)
library(data.table)
library(fpp2)
library(seasonal)
library(matrixStats)

Import Data

df_air_reserve = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/air_reserve.csv")
df_air_store_info = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/air_store_info.csv")
df_air_visit_data = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/air_visit_data.csv")
df_date_info = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/date_info.csv")
df_hpg_reserve = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/hpg_reserve.csv")
df_hpg_store_info = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/hpg_store_info.csv")
df_sample_submission = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/sample_submission.csv")
df_store_id_relation = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/store_id_relation.csv")

Data Preparation - Part 1

Look at loaded data

df_air_visit_data %>% head()
##           air_store_id visit_date visitors
## 1 air_ba937bf13d40fb24 2016-01-13       25
## 2 air_ba937bf13d40fb24 2016-01-14       32
## 3 air_ba937bf13d40fb24 2016-01-15       29
## 4 air_ba937bf13d40fb24 2016-01-16       22
## 5 air_ba937bf13d40fb24 2016-01-18        6
## 6 air_ba937bf13d40fb24 2016-01-19        9
df_air_reserve %>% head()
##           air_store_id      visit_datetime    reserve_datetime reserve_visitors
## 1 air_877f79706adbfb06 2016-01-01 19:00:00 2016-01-01 16:00:00                1
## 2 air_db4b38ebe7a7ceff 2016-01-01 19:00:00 2016-01-01 19:00:00                3
## 3 air_db4b38ebe7a7ceff 2016-01-01 19:00:00 2016-01-01 19:00:00                6
## 4 air_877f79706adbfb06 2016-01-01 20:00:00 2016-01-01 16:00:00                2
## 5 air_db80363d35f10926 2016-01-01 20:00:00 2016-01-01 01:00:00                5
## 6 air_db80363d35f10926 2016-01-02 01:00:00 2016-01-01 16:00:00                2
df_hpg_reserve %>% head()
##           hpg_store_id      visit_datetime    reserve_datetime reserve_visitors
## 1 hpg_c63f6f42e088e50f 2016-01-01 11:00:00 2016-01-01 09:00:00                1
## 2 hpg_dac72789163a3f47 2016-01-01 13:00:00 2016-01-01 06:00:00                3
## 3 hpg_c8e24dcf51ca1eb5 2016-01-01 16:00:00 2016-01-01 14:00:00                2
## 4 hpg_24bb207e5fd49d4a 2016-01-01 17:00:00 2016-01-01 11:00:00                5
## 5 hpg_25291c542ebb3bc2 2016-01-01 17:00:00 2016-01-01 03:00:00               13
## 6 hpg_28bdf7a336ec6a7b 2016-01-01 17:00:00 2016-01-01 15:00:00                2
df_air_store_info %>% head()
##           air_store_id air_genre_name                air_area_name latitude
## 1 air_0f0cdeee6c9bf3d7 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.69512
## 2 air_7cc17a324ae5c7dc Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.69512
## 3 air_fee8dcf4d619598e Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.69512
## 4 air_a17f0778617c76e2 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.69512
## 5 air_83db5aff8f50478e Italian/French Tōkyō-to Minato-ku Shibakōen 35.65807
## 6 air_99c3eae84130c1cb Italian/French Tōkyō-to Minato-ku Shibakōen 35.65807
##   longitude
## 1  135.1979
## 2  135.1979
## 3  135.1979
## 4  135.1979
## 5  139.7516
## 6  139.7516
df_hpg_store_info %>% head
##           hpg_store_id hpg_genre_name                 hpg_area_name latitude
## 1 hpg_6622b62385aec8bf Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
## 2 hpg_e9e068dd49c5fa00 Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
## 3 hpg_2976f7acb4b3a3bc Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
## 4 hpg_e51a522e098f024c Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
## 5 hpg_e3d0e1519894f275 Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
## 6 hpg_530cd91db13b938e Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
##   longitude
## 1  139.6682
## 2  139.6682
## 3  139.6682
## 4  139.6682
## 5  139.6682
## 6  139.6682
df_store_id_relation %>% head()
##           air_store_id         hpg_store_id
## 1 air_63b13c56b7201bd9 hpg_4bc649e72e2a239a
## 2 air_a24bf50c3e90d583 hpg_c34b496d0305a809
## 3 air_c7f78b4f3cba33ff hpg_cd8ae0d9bbd58ff9
## 4 air_947eb2cae4f3e8f2 hpg_de24ea49dc25d6b8
## 5 air_965b2e0cf4119003 hpg_653238a84804d8e7
## 6 air_a38f25e3399d1b25 hpg_50378da9ffb9b6cd
df_date_info %>% head()
##   calendar_date day_of_week holiday_flg
## 1    2016-01-01      Friday           1
## 2    2016-01-02    Saturday           1
## 3    2016-01-03      Sunday           1
## 4    2016-01-04      Monday           0
## 5    2016-01-05     Tuesday           0
## 6    2016-01-06   Wednesday           0
df_sample_submission %>% head()
##                                id visitors
## 1 air_00a91d42b08b08d9_2017-04-23        0
## 2 air_00a91d42b08b08d9_2017-04-24        0
## 3 air_00a91d42b08b08d9_2017-04-25        0
## 4 air_00a91d42b08b08d9_2017-04-26        0
## 5 air_00a91d42b08b08d9_2017-04-27        0
## 6 air_00a91d42b08b08d9_2017-04-28        0

As we can see above we have the following data:

Preparing air_visit_data - Make visit_date into date type

df_air_visit_data <- df_air_visit_data %>%
  mutate(visit_date = as.Date(visit_date))

Preparing reserve data - Keep only visit date from air and hpg reserve data and make into date type

df_air_reserve <- df_air_reserve %>%
  mutate(visit_date = as.Date(visit_datetime),
         air_reserve_visitors = reserve_visitors) %>%
  select(air_store_id, visit_date, air_reserve_visitors)

df_hpg_reserve <- df_hpg_reserve %>%
  mutate(visit_date = as.Date(visit_datetime),
         hpg_reserve_visitors = reserve_visitors) %>%
  select(hpg_store_id, visit_date, hpg_reserve_visitors)

Preparing date info - get useful date related columns

df_date_info <- df_date_info %>%
  mutate(visit_date = as.Date(calendar_date)) %>%
  select(visit_date, day_of_week, holiday_flg)

Data Preparation - Part 2

Create train set and test set and combine to generate features

Create df train - get all historical dates and restaurant ids

df_train <- df_air_visit_data %>% copy()

Create df test - split restaurant id and visit date from sample submission file

df_test <- df_sample_submission %>%
  mutate(air_store_id = substr(df_sample_submission$id,1,20),
         visit_date = as.Date(substr(df_sample_submission$id,22,31))
  )

Combine to generate features

df_full <- rbind(df_train %>% select(air_store_id, visit_date, visitors),
                 df_test %>% select(air_store_id, visit_date, visitors))

Create total reservations

# get total air reservations for each store / day
df_air_reserve_tot <- df_air_reserve %>%
  group_by(air_store_id,visit_date) %>%
  summarise(air_reserve_visitors = sum(air_reserve_visitors)) %>%
  ungroup()

# get total hpg reservations for each store / day
df_hpg_reserve_tot <- df_hpg_reserve %>%
  group_by(hpg_store_id,visit_date) %>%
  summarise(hpg_reserve_visitors = sum(hpg_reserve_visitors)) %>%
  ungroup()

# join reservations and sum up to get total reservations
df_reservations <- df_air_reserve_tot %>%
  inner_join(df_store_id_relation, by = c("air_store_id")) %>%
  left_join(df_hpg_reserve_tot, by = c("hpg_store_id","visit_date")) %>%
  select(air_store_id, visit_date, air_reserve_visitors, hpg_reserve_visitors) %>%
  mutate(total_reservations = air_reserve_visitors + hpg_reserve_visitors) %>%
  select(air_store_id, visit_date, total_reservations)
## Warning: Column `air_store_id` joining factors with different levels, coercing
## to character vector
## Warning: Column `hpg_store_id` joining factors with different levels, coercing
## to character vector
# look at total reservation data
df_reservations %>% head()
## # A tibble: 6 x 3
##   air_store_id         visit_date total_reservations
##   <chr>                <date>                  <int>
## 1 air_00a91d42b08b08d9 2016-10-31                  4
## 2 air_00a91d42b08b08d9 2016-12-05                 10
## 3 air_00a91d42b08b08d9 2016-12-14                 NA
## 4 air_00a91d42b08b08d9 2016-12-17                  4
## 5 air_00a91d42b08b08d9 2016-12-20                  8
## 6 air_00a91d42b08b08d9 2017-02-18                 18

Merge historical visitors, reservation, date info, store info data into df_full

# Merge reservation datasets into historical visitors dataset
df_full <- df_full %>%
  left_join(df_reservations, by = c("air_store_id","visit_date")) %>%
  select(air_store_id, visit_date, visitors, total_reservations)
## Warning: Column `air_store_id` joining factor and character vector, coercing
## into character vector
# Merge in date info and store info
df_full <- df_full %>%
  left_join(df_air_store_info, by = c("air_store_id")) %>%
  left_join(df_date_info, by = c("visit_date")) 
## Warning: Column `air_store_id` joining character vector and factor, coercing
## into character vector
# Generate year, month, day from visit_date
df_full <- df_full %>%
  mutate(Year = as.factor(year(visit_date)),
         Month = as.factor(month(visit_date)),
         Day = as.factor(day(visit_date)),
         day_of_week = as.factor(day_of_week),
         holiday_flg = as.factor(holiday_flg))

# Create area column
GeneralArea = gsub( " .*$", "", df_full$air_area_name)
df_full$GeneralArea = as.factor(GeneralArea)
df_full$air_genre_name = as.factor(df_full$air_genre_name)

# look at df_full
df_full %>% head()
##           air_store_id visit_date visitors total_reservations air_genre_name
## 1 air_ba937bf13d40fb24 2016-01-13       25                 NA     Dining bar
## 2 air_ba937bf13d40fb24 2016-01-14       32                 NA     Dining bar
## 3 air_ba937bf13d40fb24 2016-01-15       29                 NA     Dining bar
## 4 air_ba937bf13d40fb24 2016-01-16       22                 NA     Dining bar
## 5 air_ba937bf13d40fb24 2016-01-18        6                 NA     Dining bar
## 6 air_ba937bf13d40fb24 2016-01-19        9                 NA     Dining bar
##                  air_area_name latitude longitude day_of_week holiday_flg Year
## 1 Tōkyō-to Minato-ku Shibakōen 35.65807  139.7516   Wednesday           0 2016
## 2 Tōkyō-to Minato-ku Shibakōen 35.65807  139.7516    Thursday           0 2016
## 3 Tōkyō-to Minato-ku Shibakōen 35.65807  139.7516      Friday           0 2016
## 4 Tōkyō-to Minato-ku Shibakōen 35.65807  139.7516    Saturday           0 2016
## 5 Tōkyō-to Minato-ku Shibakōen 35.65807  139.7516      Monday           0 2016
## 6 Tōkyō-to Minato-ku Shibakōen 35.65807  139.7516     Tuesday           0 2016
##   Month Day GeneralArea
## 1     1  13    Tōkyō-to
## 2     1  14    Tōkyō-to
## 3     1  15    Tōkyō-to
## 4     1  16    Tōkyō-to
## 5     1  18    Tōkyō-to
## 6     1  19    Tōkyō-to

Now split again to df train and df test depending on visit_date

# df_train
df_train <- df_train %>%
  select('air_store_id', 'visit_date') %>%
  inner_join(df_full,
             by=c('air_store_id','visit_date'))
## Warning: Column `air_store_id` joining factor and character vector, coercing
## into character vector
# df_test
df_test <- df_test %>%
  select('air_store_id', 'visit_date') %>%
  inner_join(df_full,
             by=c('air_store_id','visit_date'))

# impute missing reservation data
df_train$total_reservations[is.na(df_train$total_reservations)] <- 0
df_test$total_reservations[is.na(df_test$total_reservations)] <- 0

Data Exploration

Look at df_train and df_test descriptive statistics

df_train %>% summary()
##  air_store_id         visit_date            visitors      total_reservations
##  Length:252108      Min.   :2016-01-01   Min.   :  1.00   Min.   :  0.0000  
##  Class :character   1st Qu.:2016-07-23   1st Qu.:  9.00   1st Qu.:  0.0000  
##  Mode  :character   Median :2016-10-23   Median : 17.00   Median :  0.0000  
##                     Mean   :2016-10-12   Mean   : 20.97   Mean   :  0.5029  
##                     3rd Qu.:2017-01-24   3rd Qu.: 29.00   3rd Qu.:  0.0000  
##                     Max.   :2017-04-22   Max.   :877.00   Max.   :264.0000  
##                                                                             
##         air_genre_name                         air_area_name       latitude    
##  Izakaya       :62052   Fukuoka-ken Fukuoka-shi Daimyō: 19775   Min.   :33.21  
##  Cafe/Sweets   :52764   Tōkyō-to Shibuya-ku Shibuya   : 17352   1st Qu.:34.69  
##  Dining bar    :34192   Tōkyō-to Minato-ku Shibakōen  : 14696   Median :35.66  
##  Italian/French:30011   Tōkyō-to Shinjuku-ku Kabukichō: 12517   Mean   :35.61  
##  Bar/Cocktail  :25135   Tōkyō-to Setagaya-ku Setagaya :  8719   3rd Qu.:35.69  
##  Japanese food :18789   Tōkyō-to Chūō-ku Tsukiji      :  8370   Max.   :44.02  
##  (Other)       :29165   (Other)                       :170679                  
##    longitude        day_of_week    holiday_flg   Year            Month       
##  Min.   :130.2   Friday   :40351   0:239333    2016:174535   3      : 30570  
##  1st Qu.:135.3   Monday   :31682   1: 12775    2017: 77573   2      : 27415  
##  Median :139.7   Saturday :39262                             1      : 27020  
##  Mean   :137.4   Sunday   :29991                             4      : 23859  
##  3rd Qu.:139.8   Thursday :37996                             12     : 21515  
##  Max.   :144.3   Tuesday  :36015                             10     : 21466  
##                  Wednesday:36811                             (Other):100263  
##       Day                GeneralArea    
##  8      :  8761   Tōkyō-to     :133063  
##  14     :  8700   Fukuoka-ken  : 39645  
##  22     :  8687   Ōsaka-fu     : 22821  
##  21     :  8621   Hyōgo-ken    : 17846  
##  17     :  8619   Hokkaidō     : 13055  
##  7      :  8605   Hiroshima-ken:  9858  
##  (Other):200115   (Other)      : 15820
df_test %>% summary()
##  air_store_id         visit_date            visitors total_reservations
##  Length:32019       Min.   :2017-04-23   Min.   :0   Min.   : 0.00000  
##  Class :character   1st Qu.:2017-05-02   1st Qu.:0   1st Qu.: 0.00000  
##  Mode  :character   Median :2017-05-12   Median :0   Median : 0.00000  
##                     Mean   :2017-05-12   Mean   :0   Mean   : 0.08561  
##                     3rd Qu.:2017-05-22   3rd Qu.:0   3rd Qu.: 0.00000  
##                     Max.   :2017-05-31   Max.   :0   Max.   :54.00000  
##                                                                        
##         air_genre_name                        air_area_name      latitude    
##  Izakaya       :7605   Fukuoka-ken Fukuoka-shi Daimyō: 2457   Min.   :33.21  
##  Cafe/Sweets   :6903   Tōkyō-to Shibuya-ku Shibuya   : 2262   1st Qu.:34.70  
##  Dining bar    :4173   Tōkyō-to Minato-ku Shibakōen  : 1989   Median :35.66  
##  Italian/French:3939   Tōkyō-to Shinjuku-ku Kabukichō: 1521   Mean   :35.63  
##  Bar/Cocktail  :3081   Tōkyō-to Setagaya-ku Setagaya : 1170   3rd Qu.:35.69  
##  Japanese food :2457   Tōkyō-to Chūō-ku Tsukiji      : 1092   Max.   :44.02  
##  (Other)       :3861   (Other)                       :21528                  
##    longitude        day_of_week   holiday_flg   Year           Month      
##  Min.   :130.2   Friday   :4105   0:28735     2016:    0   5      :25451  
##  1st Qu.:135.3   Monday   :4926   1: 3284     2017:32019   4      : 6568  
##  Median :139.7   Saturday :4105                            1      :    0  
##  Mean   :137.4   Sunday   :4926                            2      :    0  
##  3rd Qu.:139.8   Thursday :4105                            3      :    0  
##  Max.   :144.3   Tuesday  :4926                            6      :    0  
##                  Wednesday:4926                            (Other):    0  
##       Day               GeneralArea   
##  23     : 1642   Tōkyō-to     :17160  
##  24     : 1642   Fukuoka-ken  : 4875  
##  25     : 1642   Ōsaka-fu     : 2886  
##  26     : 1642   Hyōgo-ken    : 2223  
##  27     : 1642   Hokkaidō     : 1716  
##  28     : 1642   Hiroshima-ken: 1248  
##  (Other):22167   (Other)      : 1911

Plots

Plot daily historical average visitors

p_daily_his_mean <- df_train %>%
  group_by(visit_date) %>%
  summarise(Visits = mean(visitors)) %>%
  ggplot(aes(visit_date, Visits, alpha=0.9)) +
  geom_line() +
  theme(legend.position = "none") +
  labs(x = "Date", y = "Average Visitors") +
  ggtitle("Historical Average Daily Visitors")

p_daily_his_mean

Plot monthly historical average visitors

p_monthly_mean <- df_train %>%
  group_by(Month) %>%
  summarise(Visits = mean(visitors)) %>%
  ggplot(aes(Month, Visits, alpha=0.9)) +
  geom_col() +
  theme(legend.position = "none") +
  labs(x = "Month", y = "Average Visitors") +
  ggtitle("Average Monthly Visitors")
p_monthly_mean

Plot historical average visitors by day of week

p_daily_mean <- df_train %>%
  group_by(day_of_week) %>%
  summarise(Visits = mean(visitors)) %>%
  ggplot(aes(day_of_week, Visits, alpha=0.9)) +
  geom_col() +
  theme(legend.position = "none") +
  labs(x = "Day of Week", y = "Average Visitors") +
  ggtitle("Average Visitors by Day of Week")
p_daily_mean

Plot historical average visitors by holiday occurence

p_holiday <- df_train %>%
  group_by(holiday_flg) %>%
  summarise(Visits = mean(visitors)) %>%
  ggplot(aes(holiday_flg, Visits, alpha=0.9)) +
  geom_col() +
  theme(legend.position = "none") +
  labs(x = "Holiday", y = "Average Visitors") +
  ggtitle("Average Visitors by Holiday indicator")
p_holiday

Plot Visitors vs Reservations

p_reserve_vs_visits <- df_train %>%
  filter(total_reservations < 100) %>%
  ggplot(aes(total_reservations, visitors)) +
  geom_point(color = "black", alpha = 0.5) +
  geom_smooth(method = "lm", color = "red") +
  ggtitle("Visitors vs Reservations")
p_reserve_vs_visits
## `geom_smooth()` using formula 'y ~ x'

Feature Engineering

In this section we will be writing the necessary code to generate historical average and weighted average features

# get combined train and test id, date, visitors to generate average predictor
df_temp = rbind(df_train %>% select(air_store_id, visit_date, Year, Month, Day, visitors),
                df_test %>% select(air_store_id, visit_date, Year, Month, Day, visitors))
# Function that generates weights
GenerateWeights<-function(alpha,n)
{
  w <- list(alpha)
  for (i in 1:n-1){
    w_0 <- alpha*(1-alpha)**i
    w[i+1] <- w_0
  }
  return(w)
}
# create dataframe that will hold generated features
df_hist_average_visitors <-
  data.frame(air_store_id=character(),
             visit_date=as.Date(character()),
             hist_avg_visitors=double(),
             hist_wa005_visitors=double(),
             hist_wa01_visitors=double(),
             hist_wa02_visitors=double(),
             hist_wa05_visitors=double(),
             stringsAsFactors=FALSE) 
# Generate historical average and weighted average features. The code below generates a new temporary
# dataframe for each unique store id with columns being different visitor lags and then does a rowMeans
# and a rowWeightedMeans to get historical average or historical weighted average respectively (using previous
# function to calculate weights) 
for (i in unique(df_temp$air_store_id)){
  
  df_temp_0  <- df_temp %>%
    filter(air_store_id == i)
  df_temp_0 <- df_temp_0[order(df_temp_0$visit_date),]
 
  df_temp_lags <- df_temp_0 %>% copy()
  for (j in 39:403){
    lag <- lag(df_temp_lags$visitors, j)
    df_temp_lags <- cbind(df_temp_lags, lag)
  }
  # get historical mean
  hist_mean <- rowMeans(df_temp_lags[7:371], na.rm = TRUE)
  # get historical weighted mean
  w005 <- GenerateWeights(0.05,365)
  w01 <- GenerateWeights(0.1,365)
  w02 <- GenerateWeights(0.2,365)
  w05 <- GenerateWeights(0.5,365)
  hist_weighted_mean_005 <- rowWeightedMeans(as.matrix(df_temp_lags[7:371]), w = as.numeric(w005), na.rm = TRUE)
  hist_weighted_mean_01 <- rowWeightedMeans(as.matrix(df_temp_lags[7:371]), w = as.numeric(w01), na.rm = TRUE)
  hist_weighted_mean_02 <- rowWeightedMeans(as.matrix(df_temp_lags[7:371]), w = as.numeric(w02), na.rm = TRUE)
  hist_weighted_mean_05 <- rowWeightedMeans(as.matrix(df_temp_lags[7:371]), w = as.numeric(w05), na.rm = TRUE)
  
  df_temp_0 <- df_temp_0 %>%
    mutate(hist_avg_visitors = hist_mean,
           hist_wa005_visitors = hist_weighted_mean_005,
           hist_wa01_visitors = hist_weighted_mean_01,
           hist_wa02_visitors = hist_weighted_mean_02,
           hist_wa05_visitors = hist_weighted_mean_05
           ) %>%
    select(air_store_id, visit_date, hist_avg_visitors,
           hist_wa005_visitors, hist_wa01_visitors, hist_wa02_visitors, hist_wa05_visitors)
  
  df_hist_average_visitors <- rbind(df_temp_0, df_hist_average_visitors)
}
# look at the generated features
df_hist_average_visitors %>% tail()
##                air_store_id visit_date hist_avg_visitors hist_wa005_visitors
## 284122 air_ba937bf13d40fb24 2017-05-26          22.93699            16.83007
## 284123 air_ba937bf13d40fb24 2017-05-27          22.92603            16.53857
## 284124 air_ba937bf13d40fb24 2017-05-28          22.86849            16.26164
## 284125 air_ba937bf13d40fb24 2017-05-29          22.89863            16.14856
## 284126 air_ba937bf13d40fb24 2017-05-30          22.93699            17.34113
## 284127 air_ba937bf13d40fb24 2017-05-31          22.97808            17.62407
##        hist_wa01_visitors hist_wa02_visitors hist_wa05_visitors
## 284122           15.58814           13.39556           9.769546
## 284123           15.12933           12.91645          10.384773
## 284124           14.71640           12.53316          10.692387
## 284125           14.64476           12.82653          12.346193
## 284126           17.18028           18.26122          26.173097
## 284127           17.76225           19.20898          24.586548

Merge with df_train and df_test

df_train <- df_train %>%
  select('air_store_id', 'visit_date') %>%
  inner_join(df_full,
             by=c('air_store_id','visit_date')) %>%
  inner_join(df_hist_average_visitors,
             by=c('air_store_id','visit_date'))

df_test <- df_test %>%
  select('air_store_id', 'visit_date') %>%
  inner_join(df_full,
             by=c('air_store_id','visit_date')) %>%
  inner_join(df_hist_average_visitors,
             by=c('air_store_id','visit_date'))

# impute missing reservation data
df_train$total_reservations[is.na(df_train$total_reservations)] <- 0
df_test$total_reservations[is.na(df_test$total_reservations)] <- 0

Models & Evaluation

Split Train - Validation set

df_train_eval <- df_train %>%
  filter(visit_date < '2017-02-01') %>%
  mutate(log_visitors = log(visitors))

df_val <- df_train %>%
  filter(visit_date >= '2017-02-01')

Evaluation Metric functions

MSE <- function(actual,pred){
  mean((actual - pred)^2)
}

Bias <- function(actual,pred){
  mean(actual) - mean(pred)
}

sMAPE <- function(actual,pred){
  mean(200*(abs(actual - pred))/(actual+pred))
}

Use total historical mean as prediction

# Fit
df_mean_preds <- df_train_eval %>%
  group_by(air_store_id) %>%
  summarise(pred_mean = mean(visitors))

# Predict
df_val <- df_val %>% left_join(df_mean_preds, by='air_store_id')

# Evaluate
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
print("Mean Pred Performance")
## [1] "Mean Pred Performance"
MSE(df_eval$visitors, df_eval$pred_mean)
## [1] 189.4146
Bias(df_eval$visitors, df_eval$pred_mean)
## [1] 0.2480329
sMAPE(df_eval$visitors, df_eval$pred_mean)
## [1] 47.1184

Linear Regression with Month, Day of Week Dummies and Holiday indicator

# Fit
fit.lm1 <- lm(visitors ~ Month + day_of_week + holiday_flg,
              data=df_train_eval)
fit.lm1 %>% summary()
## 
## Call:
## lm(formula = visitors ~ Month + day_of_week + holiday_flg, data = df_train_eval)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30.67 -11.56  -3.64   7.96 651.32 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          21.76227    0.12982 167.634  < 2e-16 ***
## Month2                1.29742    0.20969   6.187 6.13e-10 ***
## Month3                3.15489    0.20293  15.547  < 2e-16 ***
## Month4                2.44712    0.20473  11.953  < 2e-16 ***
## Month5                2.07393    0.20405  10.164  < 2e-16 ***
## Month6                1.73742    0.20344   8.540  < 2e-16 ***
## Month7                1.11302    0.14785   7.528 5.18e-14 ***
## Month8               -0.12682    0.14956  -0.848   0.3965    
## Month9               -0.05772    0.14898  -0.387   0.6984    
## Month10               0.34126    0.14777   2.309   0.0209 *  
## Month11              -0.04487    0.14879  -0.302   0.7630    
## Month12               2.97634    0.14779  20.139  < 2e-16 ***
## day_of_weekMonday    -5.94421    0.13858 -42.893  < 2e-16 ***
## day_of_weekSaturday   3.18609    0.13089  24.342  < 2e-16 ***
## day_of_weekSunday     0.80109    0.14002   5.721 1.06e-08 ***
## day_of_weekThursday  -4.14070    0.13236 -31.284  < 2e-16 ***
## day_of_weekTuesday   -5.06489    0.13332 -37.991  < 2e-16 ***
## day_of_weekWednesday -3.57537    0.13328 -26.826  < 2e-16 ***
## holiday_flg1          3.74797    0.15905  23.564  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.15 on 194216 degrees of freedom
## Multiple R-squared:  0.04364,    Adjusted R-squared:  0.04355 
## F-statistic: 492.4 on 18 and 194216 DF,  p-value: < 2.2e-16
# Predict
lm1.pred <- predict(fit.lm1, df_val)
df_val <- cbind(df_val, lm1.pred)

# Evaluate
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
print("lm1 Performance")
## [1] "lm1 Performance"
MSE(df_eval$visitors, df_eval$lm1.pred)
## [1] 281.6519
Bias(df_eval$visitors, df_eval$lm1.pred)
## [1] -1.079654
sMAPE(df_eval$visitors, df_eval$lm1.pred)
## [1] 63.40645

Linear Regression with Dummies and Additional Historical Average Predictor

# Fit
fit.lm2 <- lm(visitors ~ hist_avg_visitors + Month + day_of_week + holiday_flg,
              data=df_train_eval,
              na.action = na.omit)
fit.lm2 %>% summary()
## 
## Call:
## lm(formula = visitors ~ hist_avg_visitors + Month + day_of_week + 
##     holiday_flg, data = df_train_eval, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -109.31   -6.84   -1.08    5.45  608.76 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.97322    0.12938  22.980  < 2e-16 ***
## hist_avg_visitors     0.89495    0.00282 317.393  < 2e-16 ***
## Month2                2.23945    0.23639   9.473  < 2e-16 ***
## Month3                3.75956    0.16662  22.564  < 2e-16 ***
## Month4                2.44076    0.16764  14.559  < 2e-16 ***
## Month5                1.40402    0.16673   8.421  < 2e-16 ***
## Month6                0.89428    0.16589   5.391 7.02e-08 ***
## Month7                0.40577    0.16400   2.474  0.01335 *  
## Month8               -0.98559    0.13682  -7.203 5.90e-13 ***
## Month9               -0.35643    0.12586  -2.832  0.00463 ** 
## Month10               0.44770    0.12469   3.590  0.00033 ***
## Month11               0.22565    0.12539   1.800  0.07193 .  
## Month12               3.41379    0.12456  27.406  < 2e-16 ***
## day_of_weekMonday    -6.25746    0.11811 -52.979  < 2e-16 ***
## day_of_weekSaturday   3.33941    0.11188  29.848  < 2e-16 ***
## day_of_weekSunday     0.63494    0.11951   5.313 1.08e-07 ***
## day_of_weekThursday  -4.25548    0.11273 -37.750  < 2e-16 ***
## day_of_weekTuesday   -5.01811    0.11372 -44.126  < 2e-16 ***
## day_of_weekWednesday -3.56000    0.11368 -31.316  < 2e-16 ***
## holiday_flg1          3.66354    0.13386  27.369  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.58 on 162371 degrees of freedom
##   (31844 observations deleted due to missingness)
## Multiple R-squared:  0.4113, Adjusted R-squared:  0.4113 
## F-statistic:  5972 on 19 and 162371 DF,  p-value: < 2.2e-16
# Predict
lm2.pred <- predict(fit.lm2, df_val)
df_val <- cbind(df_val, lm2.pred)

# Evaluate
print("lm2 Performance")
## [1] "lm2 Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm2.pred)
## [1] 174.9837
Bias(df_eval$visitors, df_eval$lm2.pred)
## [1] -1.487614
sMAPE(df_eval$visitors, df_eval$lm2.pred)
## [1] 47.56116

Linear Regression with Dummies and Additional Historical Weighted Average (alpha = 0.05) Predictor

# Fit
fit.lm3a <- lm(visitors ~ hist_wa005_visitors + Month + day_of_week + holiday_flg,
               data=df_train_eval,
               na.action = na.omit)
fit.lm3a %>% summary()
## 
## Call:
## lm(formula = visitors ~ hist_wa005_visitors + Month + day_of_week + 
##     holiday_flg, data = df_train_eval, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -110.37   -6.71   -1.04    5.29  608.46 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.001338   0.127496  23.541  < 2e-16 ***
## hist_wa005_visitors   0.904811   0.002772 326.463  < 2e-16 ***
## Month2                1.975037   0.233818   8.447  < 2e-16 ***
## Month3                3.488600   0.164801  21.169  < 2e-16 ***
## Month4                1.899261   0.165827  11.453  < 2e-16 ***
## Month5                0.206002   0.165011   1.248   0.2119    
## Month6               -0.072422   0.164166  -0.441   0.6591    
## Month7                0.178200   0.162220   1.099   0.2720    
## Month8               -1.103055   0.135339  -8.150 3.66e-16 ***
## Month9               -0.318682   0.124486  -2.560   0.0105 *  
## Month10               0.904375   0.123336   7.333 2.27e-13 ***
## Month11               0.644851   0.124030   5.199 2.00e-07 ***
## Month12               3.697349   0.123210  30.008  < 2e-16 ***
## day_of_weekMonday    -6.147372   0.116822 -52.622  < 2e-16 ***
## day_of_weekSaturday   3.365573   0.110661  30.413  < 2e-16 ***
## day_of_weekSunday     0.681994   0.118200   5.770 7.95e-09 ***
## day_of_weekThursday  -4.291005   0.111497 -38.485  < 2e-16 ***
## day_of_weekTuesday   -4.943685   0.112480 -43.952  < 2e-16 ***
## day_of_weekWednesday -3.551547   0.112440 -31.586  < 2e-16 ***
## holiday_flg1          3.699869   0.132393  27.946  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.44 on 162371 degrees of freedom
##   (31844 observations deleted due to missingness)
## Multiple R-squared:  0.4241, Adjusted R-squared:  0.4241 
## F-statistic:  6294 on 19 and 162371 DF,  p-value: < 2.2e-16
# Predict
lm3a.pred <- predict(fit.lm3a, df_val)
df_val <- cbind(df_val, lm3a.pred)

# Evaluate
print("lm3a Performance")
## [1] "lm3a Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm3a.pred)
## [1] 172.9475
Bias(df_eval$visitors, df_eval$lm3a.pred)
## [1] -1.471193
sMAPE(df_eval$visitors, df_eval$lm3a.pred)
## [1] 46.6034

Linear Regression with Dummies and Additional Historical Weighted Average (alpha = 0.1) Predictor

# Fit
fit.lm3b <- lm(visitors ~ hist_wa01_visitors + Month + day_of_week + holiday_flg,
              data=df_train_eval,
              na.action = na.omit)
fit.lm3b %>% summary()
## 
## Call:
## lm(formula = visitors ~ hist_wa01_visitors + Month + day_of_week + 
##     holiday_flg, data = df_train_eval, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -107.91   -6.75   -1.09    5.30  608.16 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.209500   0.128121  25.051  < 2e-16 ***
## hist_wa01_visitors    0.883215   0.002743 321.961  < 2e-16 ***
## Month2                2.187511   0.235094   9.305  < 2e-16 ***
## Month3                3.667851   0.165701  22.135  < 2e-16 ***
## Month4                1.922978   0.166734  11.533  < 2e-16 ***
## Month5                0.112554   0.165928   0.678    0.498    
## Month6                0.140103   0.165041   0.849    0.396    
## Month7                0.675862   0.163076   4.144 3.41e-05 ***
## Month8               -0.788061   0.136049  -5.792 6.95e-09 ***
## Month9                0.031432   0.125156   0.251    0.802    
## Month10               1.337408   0.124028  10.783  < 2e-16 ***
## Month11               0.837683   0.124717   6.717 1.87e-11 ***
## Month12               3.885505   0.123892  31.362  < 2e-16 ***
## day_of_weekMonday    -6.022117   0.117460 -51.270  < 2e-16 ***
## day_of_weekSaturday   3.401265   0.111266  30.569  < 2e-16 ***
## day_of_weekSunday     0.757816   0.118847   6.376 1.82e-10 ***
## day_of_weekThursday  -4.332408   0.112108 -38.645  < 2e-16 ***
## day_of_weekTuesday   -4.861539   0.113096 -42.986  < 2e-16 ***
## day_of_weekWednesday -3.548601   0.113055 -31.388  < 2e-16 ***
## holiday_flg1          3.701853   0.133118  27.809  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.51 on 162371 degrees of freedom
##   (31844 observations deleted due to missingness)
## Multiple R-squared:  0.4178, Adjusted R-squared:  0.4177 
## F-statistic:  6133 on 19 and 162371 DF,  p-value: < 2.2e-16
# Predict
lm3b.pred <- predict(fit.lm3b, df_val)
df_val <- cbind(df_val, lm3b.pred)

# Evaluate
print("lm3b Performance")
## [1] "lm3b Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm3b.pred)
## [1] 177.647
Bias(df_eval$visitors, df_eval$lm3b.pred)
## [1] -1.392367
sMAPE(df_eval$visitors, df_eval$lm3b.pred)
## [1] 47.03719

Linear Regression with Dummies and Additional Historical Weighted Average (alpha = 0.2) Predictor

# Fit
fit.lm3c <- lm(visitors ~ hist_wa02_visitors + Month + day_of_week + holiday_flg,
              data=df_train_eval,
              na.action = na.omit)
fit.lm3c %>% summary()
## 
## Call:
## lm(formula = visitors ~ hist_wa02_visitors + Month + day_of_week + 
##     holiday_flg, data = df_train_eval, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -123.50   -6.91   -1.21    5.39  607.57 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            3.9376     0.1297  30.352  < 2e-16 ***
## hist_wa02_visitors     0.8347     0.0027 309.121  < 2e-16 ***
## Month2                 2.4283     0.2388  10.171  < 2e-16 ***
## Month3                 3.8483     0.1683  22.868  < 2e-16 ***
## Month4                 2.0198     0.1693  11.928  < 2e-16 ***
## Month5                 0.3455     0.1685   2.050  0.04032 *  
## Month6                 0.4795     0.1676   2.861  0.00422 ** 
## Month7                 1.0816     0.1656   6.531 6.53e-11 ***
## Month8                -0.4320     0.1381  -3.127  0.00177 ** 
## Month9                 0.3763     0.1271   2.961  0.00307 ** 
## Month10                1.6381     0.1260  13.003  < 2e-16 ***
## Month11                0.9633     0.1267   7.605 2.87e-14 ***
## Month12                4.0989     0.1258  32.573  < 2e-16 ***
## day_of_weekMonday     -5.7872     0.1193 -48.512  < 2e-16 ***
## day_of_weekSaturday    3.4843     0.1130  30.834  < 2e-16 ***
## day_of_weekSunday      0.9219     0.1207   7.638 2.21e-14 ***
## day_of_weekThursday   -4.4349     0.1139 -38.951  < 2e-16 ***
## day_of_weekTuesday    -4.7436     0.1149 -41.299  < 2e-16 ***
## day_of_weekWednesday  -3.5896     0.1148 -31.263  < 2e-16 ***
## holiday_flg1           3.6424     0.1352  26.942  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.71 on 162371 degrees of freedom
##   (31844 observations deleted due to missingness)
## Multiple R-squared:  0.3995, Adjusted R-squared:  0.3994 
## F-statistic:  5686 on 19 and 162371 DF,  p-value: < 2.2e-16
# Predict
lm3c.pred <- predict(fit.lm3c, df_val)
df_val <- cbind(df_val, lm3c.pred)

# Evaluate
print("lm3c Performance")
## [1] "lm3c Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm3c.pred)
## [1] 184.9788
Bias(df_eval$visitors, df_eval$lm3c.pred)
## [1] -1.30465
sMAPE(df_eval$visitors, df_eval$lm3c.pred)
## [1] 48.01632

Linear Regression with Dummies and Additional Historical Weighted Average (alpha = 0.5) Predictor

# Fit
fit.lm3d <- lm(visitors ~ hist_wa05_visitors + Month + day_of_week + holiday_flg,
              data=df_train_eval,
              na.action = na.omit)
fit.lm3d %>% summary()
## 
## Call:
## lm(formula = visitors ~ hist_wa05_visitors + Month + day_of_week + 
##     holiday_flg, data = df_train_eval, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -210.39   -7.67   -1.75    5.77  606.75 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.286198   0.134858  54.028  < 2e-16 ***
## hist_wa05_visitors    0.667569   0.002558 260.980  < 2e-16 ***
## Month2                2.617716   0.252570  10.364  < 2e-16 ***
## Month3                3.961512   0.178025  22.253  < 2e-16 ***
## Month4                2.201940   0.179123  12.293  < 2e-16 ***
## Month5                0.944442   0.178192   5.300 1.16e-07 ***
## Month6                0.985090   0.177245   5.558 2.74e-08 ***
## Month7                1.370464   0.175179   7.823 5.18e-15 ***
## Month8                0.037952   0.146111   0.260    0.795    
## Month9                0.615621   0.134465   4.578 4.69e-06 ***
## Month10               1.611125   0.133284  12.088  < 2e-16 ***
## Month11               0.969834   0.134005   7.237 4.60e-13 ***
## Month12               4.132974   0.133128  31.045  < 2e-16 ***
## day_of_weekMonday    -5.502005   0.126207 -43.595  < 2e-16 ***
## day_of_weekSaturday   3.658518   0.119545  30.604  < 2e-16 ***
## day_of_weekSunday     1.197661   0.127696   9.379  < 2e-16 ***
## day_of_weekThursday  -4.808271   0.120467 -39.914  < 2e-16 ***
## day_of_weekTuesday   -4.835508   0.121506 -39.796  < 2e-16 ***
## day_of_weekWednesday -4.019119   0.121475 -33.086  < 2e-16 ***
## holiday_flg1          3.607038   0.143018  25.221  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.44 on 162371 degrees of freedom
##   (31844 observations deleted due to missingness)
## Multiple R-squared:  0.328,  Adjusted R-squared:  0.3279 
## F-statistic:  4171 on 19 and 162371 DF,  p-value: < 2.2e-16
# Predict
lm3d.pred <- predict(fit.lm3d, df_val)
df_val <- cbind(df_val, lm3d.pred)

# Evaluate
print("lm3d Performance")
## [1] "lm3d Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm3d.pred)
## [1] 206.4613
Bias(df_eval$visitors, df_eval$lm3d.pred)
## [1] -1.252286
sMAPE(df_eval$visitors, df_eval$lm3d.pred)
## [1] 51.53464

Linear Regression with Dummies, Historical Weighted Average (alpha=0.05) and General Area new dummy

# Fit
fit.lm4 <- lm(visitors ~ hist_wa005_visitors + Month + day_of_week + holiday_flg + GeneralArea,
              data=df_train_eval,
              na.action = na.omit)
fit.lm4 %>% summary()
## 
## Call:
## lm(formula = visitors ~ hist_wa005_visitors + Month + day_of_week + 
##     holiday_flg + GeneralArea, data = df_train_eval, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -110.66   -6.71   -1.03    5.30  608.54 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.927940   0.145742  20.090  < 2e-16 ***
## hist_wa005_visitors       0.904540   0.002783 325.043  < 2e-16 ***
## Month2                    1.973429   0.233897   8.437  < 2e-16 ***
## Month3                    3.489250   0.164862  21.165  < 2e-16 ***
## Month4                    1.900034   0.165882  11.454  < 2e-16 ***
## Month5                    0.208418   0.165059   1.263   0.2067    
## Month6                   -0.070394   0.164204  -0.429   0.6681    
## Month7                    0.179682   0.162255   1.107   0.2681    
## Month8                   -1.102032   0.135340  -8.143 3.89e-16 ***
## Month9                   -0.317826   0.124489  -2.553   0.0107 *  
## Month10                   0.905014   0.123336   7.338 2.18e-13 ***
## Month11                   0.645326   0.124028   5.203 1.96e-07 ***
## Month12                   3.697290   0.123208  30.009  < 2e-16 ***
## day_of_weekMonday        -6.147698   0.116823 -52.624  < 2e-16 ***
## day_of_weekSaturday       3.363656   0.110659  30.397  < 2e-16 ***
## day_of_weekSunday         0.677486   0.118225   5.731 1.00e-08 ***
## day_of_weekThursday      -4.290398   0.111495 -38.481  < 2e-16 ***
## day_of_weekTuesday       -4.942976   0.112479 -43.946  < 2e-16 ***
## day_of_weekWednesday     -3.549584   0.112440 -31.569  < 2e-16 ***
## holiday_flg1              3.693123   0.132424  27.889  < 2e-16 ***
## GeneralAreaHiroshima-ken  0.284062   0.173485   1.637   0.1016    
## GeneralAreaHokkaidō       0.395686   0.157697   2.509   0.0121 *  
## GeneralAreaHyōgo-ken      0.174196   0.139523   1.249   0.2118    
## GeneralAreaMiyagi-ken     0.256293   0.209305   1.224   0.2208    
## GeneralAreaNiigata-ken    0.569932   0.256148   2.225   0.0261 *  
## GeneralAreaŌsaka-fu       0.196627   0.128623   1.529   0.1263    
## GeneralAreaShizuoka-ken  -0.042434   0.215625  -0.197   0.8440    
## GeneralAreaTōkyō-to       0.005900   0.088501   0.067   0.9468    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.44 on 162363 degrees of freedom
##   (31844 observations deleted due to missingness)
## Multiple R-squared:  0.4242, Adjusted R-squared:  0.4241 
## F-statistic:  4430 on 27 and 162363 DF,  p-value: < 2.2e-16
# Predict
lm4.pred <- predict(fit.lm4, df_val)
df_val <- cbind(df_val, lm4.pred)
# high p-values for a bunch of them so probably not gonna use

# Evaluate
print("lm4 Performance")
## [1] "lm4 Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm4.pred)
## [1] 173.0206
Bias(df_eval$visitors, df_eval$lm4.pred)
## [1] -1.470203
sMAPE(df_eval$visitors, df_eval$lm4.pred)
## [1] 46.60608

Linear Regression with Dummies, Historical Weighted Average (alpha=0.05) and Restaurant Genre new dummy

# Fit
fit.lm5 <- lm(visitors ~ hist_wa005_visitors + Month + day_of_week + holiday_flg + air_genre_name,
              data=df_train_eval,
              na.action = na.omit)
fit.lm5 %>% summary()
## 
## Call:
## lm(formula = visitors ~ hist_wa005_visitors + Month + day_of_week + 
##     holiday_flg + air_genre_name, data = df_train_eval, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -108.52   -6.70   -1.01    5.29  608.74 
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                                 1.269353   0.706450   1.797
## hist_wa005_visitors                         0.897191   0.002877 311.832
## Month2                                      1.987279   0.233649   8.505
## Month3                                      3.489207   0.164680  21.188
## Month4                                      1.905315   0.165711  11.498
## Month5                                      0.229307   0.164920   1.390
## Month6                                     -0.050499   0.164072  -0.308
## Month7                                      0.199635   0.162110   1.231
## Month8                                     -1.091272   0.135222  -8.070
## Month9                                     -0.310790   0.124365  -2.499
## Month10                                     0.906158   0.123215   7.354
## Month11                                     0.644158   0.123909   5.199
## Month12                                     3.691605   0.123089  29.991
## day_of_weekMonday                          -6.146289   0.116721 -52.658
## day_of_weekSaturday                         3.369929   0.110552  30.483
## day_of_weekSunday                           0.710572   0.118128   6.015
## day_of_weekThursday                        -4.294790   0.111390 -38.556
## day_of_weekTuesday                         -4.948137   0.112374 -44.033
## day_of_weekWednesday                       -3.557511   0.112336 -31.668
## holiday_flg1                                3.709026   0.132280  28.039
## air_genre_nameBar/Cocktail                  0.892744   0.699454   1.276
## air_genre_nameCafe/Sweets                   1.625506   0.693329   2.344
## air_genre_nameCreative cuisine              1.759123   0.734511   2.395
## air_genre_nameDining bar                    1.612224   0.695950   2.317
## air_genre_nameInternational cuisine         4.109411   1.174406   3.499
## air_genre_nameItalian/French                2.142525   0.695903   3.079
## air_genre_nameIzakaya                       2.420095   0.692785   3.493
## air_genre_nameJapanese food                 2.016094   0.699992   2.880
## air_genre_nameKaraoke/Party                 8.971594   0.997824   8.991
## air_genre_nameOkonomiyaki/Monja/Teppanyaki  2.045559   0.739382   2.767
## air_genre_nameOther                         2.023047   0.711384   2.844
## air_genre_nameWestern food                  1.079764   0.725010   1.489
## air_genre_nameYakiniku/Korean food          2.401852   0.714982   3.359
##                                            Pr(>|t|)    
## (Intercept)                                0.072369 .  
## hist_wa005_visitors                         < 2e-16 ***
## Month2                                      < 2e-16 ***
## Month3                                      < 2e-16 ***
## Month4                                      < 2e-16 ***
## Month5                                     0.164406    
## Month6                                     0.758247    
## Month7                                     0.218146    
## Month8                                     7.06e-16 ***
## Month9                                     0.012455 *  
## Month10                                    1.93e-13 ***
## Month11                                    2.01e-07 ***
## Month12                                     < 2e-16 ***
## day_of_weekMonday                           < 2e-16 ***
## day_of_weekSaturday                         < 2e-16 ***
## day_of_weekSunday                          1.80e-09 ***
## day_of_weekThursday                         < 2e-16 ***
## day_of_weekTuesday                          < 2e-16 ***
## day_of_weekWednesday                        < 2e-16 ***
## holiday_flg1                                < 2e-16 ***
## air_genre_nameBar/Cocktail                 0.201835    
## air_genre_nameCafe/Sweets                  0.019054 *  
## air_genre_nameCreative cuisine             0.016623 *  
## air_genre_nameDining bar                   0.020528 *  
## air_genre_nameInternational cuisine        0.000467 ***
## air_genre_nameItalian/French               0.002079 ** 
## air_genre_nameIzakaya                      0.000477 ***
## air_genre_nameJapanese food                0.003975 ** 
## air_genre_nameKaraoke/Party                 < 2e-16 ***
## air_genre_nameOkonomiyaki/Monja/Teppanyaki 0.005665 ** 
## air_genre_nameOther                        0.004458 ** 
## air_genre_nameWestern food                 0.136408    
## air_genre_nameYakiniku/Korean food         0.000782 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.43 on 162358 degrees of freedom
##   (31844 observations deleted due to missingness)
## Multiple R-squared:  0.4253, Adjusted R-squared:  0.4252 
## F-statistic:  3755 on 32 and 162358 DF,  p-value: < 2.2e-16
# Predict
lm5.pred <- predict(fit.lm5, df_val)
df_val <- cbind(df_val, lm5.pred)

# Evaluate
print("lm5 Performance")
## [1] "lm5 Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm5.pred)
## [1] 173.6027
Bias(df_eval$visitors, df_eval$lm5.pred)
## [1] -1.472677
sMAPE(df_eval$visitors, df_eval$lm5.pred)
## [1] 46.57466

Linear Regression with Dummies, Historical Weighted Average (alpha=0.05) and Total Reservations - WINNING MODEL

# Fit
fit.lm6 <- lm(visitors ~ hist_wa005_visitors + total_reservations + Month + day_of_week + holiday_flg,
              data=df_train_eval,
              na.action = na.omit)
fit.lm6 %>% summary()
## 
## Call:
## lm(formula = visitors ~ hist_wa005_visitors + total_reservations + 
##     Month + day_of_week + holiday_flg, data = df_train_eval, 
##     na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -109.78   -6.65   -1.03    5.27  608.40 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.754522   0.126770  21.728  < 2e-16 ***
## hist_wa005_visitors   0.900671   0.002755 326.948  < 2e-16 ***
## total_reservations    0.353064   0.007604  46.432  < 2e-16 ***
## Month2                2.103222   0.232298   9.054  < 2e-16 ***
## Month3                3.566850   0.163727  21.785  < 2e-16 ***
## Month4                2.028944   0.164761  12.314  < 2e-16 ***
## Month5                0.391480   0.163976   2.387  0.01697 *  
## Month6                0.133548   0.163147   0.819  0.41303    
## Month7                0.422139   0.161240   2.618  0.00884 ** 
## Month8               -0.849210   0.134561  -6.311 2.78e-10 ***
## Month9               -0.065005   0.123789  -0.525  0.59949    
## Month10               1.103557   0.122600   9.001  < 2e-16 ***
## Month11               0.691226   0.123219   5.610 2.03e-08 ***
## Month12               3.418851   0.122548  27.898  < 2e-16 ***
## day_of_weekMonday    -5.988683   0.116104 -51.580  < 2e-16 ***
## day_of_weekSaturday   3.316480   0.109939  30.167  < 2e-16 ***
## day_of_weekSunday     0.820027   0.117461   6.981 2.94e-12 ***
## day_of_weekThursday  -4.178845   0.110791 -37.718  < 2e-16 ***
## day_of_weekTuesday   -4.790355   0.111790 -42.851  < 2e-16 ***
## day_of_weekWednesday -3.425616   0.111734 -30.659  < 2e-16 ***
## holiday_flg1          3.640271   0.131530  27.676  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.36 on 162370 degrees of freedom
##   (31844 observations deleted due to missingness)
## Multiple R-squared:  0.4317, Adjusted R-squared:  0.4316 
## F-statistic:  6166 on 20 and 162370 DF,  p-value: < 2.2e-16
# Predict
lm6.pred <- predict(fit.lm6, df_val)
df_val <- cbind(df_val, lm6.pred)

# Evaluate
print("lm6 Performance")
## [1] "lm6 Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm6.pred)
## [1] 171.6071
Bias(df_eval$visitors, df_eval$lm6.pred)
## [1] -1.63508
sMAPE(df_eval$visitors, df_eval$lm6.pred)
## [1] 46.51082

As we can see from the model structures above and their respective performances, the model with the lowest MSE and sMAPE is the last proposed one with Month, Day of Week Dummies, a Holiday indicator, the weighted average and total reservations features (lm6). The robust and very simple historical average method also does very well - even better compared to the simple regression with only dummies. However, the final model has improved performance as far as MSE and sMAPE compared to both model 0 and model 1 benchmarks as well as compared to the rest - slightly more complex - models. It is worth noticing that the winning model has a higher absolute bias than the simpler model 0 as it is underpredicting more than model 0 is overpredicting. However, we would prefer a model with significantly better accuracy and slightly higher bias since the latter is adjustable.

Prediction

Generate Final Prediction and Submission file

# Final Prediction
lm_final.pred <- predict(fit.lm6, df_test)
df_forecast <- df_test %>%
  mutate(visitors = lm_final.pred)
# if prediction is negative then set to 0
df_forecast$visitors[df_forecast$visitors < 0] <- 0

# look at final forecasts
df_forecast %>% head()
##           air_store_id visit_date visitors total_reservations air_genre_name
## 1 air_00a91d42b08b08d9 2017-04-23 30.30785                  0 Italian/French
## 2 air_00a91d42b08b08d9 2017-04-24 23.52486                  0 Italian/French
## 3 air_00a91d42b08b08d9 2017-04-25 25.19798                  0 Italian/French
## 4 air_00a91d42b08b08d9 2017-04-26 27.05880                  0 Italian/French
## 5 air_00a91d42b08b08d9 2017-04-27 25.42580                  0 Italian/French
## 6 air_00a91d42b08b08d9 2017-04-28 30.11991                  0 Italian/French
##                     air_area_name latitude longitude day_of_week holiday_flg
## 1 Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536      Sunday           0
## 2 Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536      Monday           0
## 3 Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536     Tuesday           0
## 4 Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536   Wednesday           0
## 5 Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536    Thursday           0
## 6 Tōkyō-to Chiyoda-ku Kudanminami   35.694  139.7536      Friday           0
##   Year Month Day GeneralArea hist_avg_visitors hist_wa005_visitors
## 1 2017     4  23    Tōkyō-to          25.47938            27.42883
## 2 2017     4  24    Tōkyō-to          25.49231            27.45739
## 3 2017     4  25    Tōkyō-to          25.55612            27.98454
## 4 2017     4  26    Tōkyō-to          25.62437            28.53534
## 5 2017     4  27    Tōkyō-to          25.54040            27.55853
## 6 2017     4  28    Tōkyō-to          25.60804            28.13063
##   hist_wa01_visitors hist_wa02_visitors hist_wa05_visitors
## 1           26.56268           25.23260           25.72418
## 2           26.70641           25.78608           26.86209
## 3           27.83577           28.22887           32.43104
## 4           28.95219           30.38309           35.71552
## 5           26.95697           26.10647           22.35776
## 6           28.16128           28.68518           30.67888
df_forecast %>% summary()
##  air_store_id         visit_date            visitors      total_reservations
##  Length:32019       Min.   :2017-04-23   Min.   :  0.00   Min.   : 0.00000  
##  Class :character   1st Qu.:2017-05-02   1st Qu.: 12.10   1st Qu.: 0.00000  
##  Mode  :character   Median :2017-05-12   Median : 19.29   Median : 0.00000  
##                     Mean   :2017-05-12   Mean   : 21.00   Mean   : 0.08561  
##                     3rd Qu.:2017-05-22   3rd Qu.: 28.37   3rd Qu.: 0.00000  
##                     Max.   :2017-05-31   Max.   :121.86   Max.   :54.00000  
##                                          NA's   :19                         
##         air_genre_name                        air_area_name      latitude    
##  Izakaya       :7605   Fukuoka-ken Fukuoka-shi Daimyō: 2457   Min.   :33.21  
##  Cafe/Sweets   :6903   Tōkyō-to Shibuya-ku Shibuya   : 2262   1st Qu.:34.70  
##  Dining bar    :4173   Tōkyō-to Minato-ku Shibakōen  : 1989   Median :35.66  
##  Italian/French:3939   Tōkyō-to Shinjuku-ku Kabukichō: 1521   Mean   :35.63  
##  Bar/Cocktail  :3081   Tōkyō-to Setagaya-ku Setagaya : 1170   3rd Qu.:35.69  
##  Japanese food :2457   Tōkyō-to Chūō-ku Tsukiji      : 1092   Max.   :44.02  
##  (Other)       :3861   (Other)                       :21528                  
##    longitude        day_of_week   holiday_flg   Year           Month      
##  Min.   :130.2   Friday   :4105   0:28735     2016:    0   5      :25451  
##  1st Qu.:135.3   Monday   :4926   1: 3284     2017:32019   4      : 6568  
##  Median :139.7   Saturday :4105                            1      :    0  
##  Mean   :137.4   Sunday   :4926                            2      :    0  
##  3rd Qu.:139.8   Thursday :4105                            3      :    0  
##  Max.   :144.3   Tuesday  :4926                            6      :    0  
##                  Wednesday:4926                            (Other):    0  
##       Day               GeneralArea    hist_avg_visitors hist_wa005_visitors
##  23     : 1642   Tōkyō-to     :17160   Min.   :  1.186   Min.   :  1.062    
##  24     : 1642   Fukuoka-ken  : 4875   1st Qu.: 11.773   1st Qu.: 11.620    
##  25     : 1642   Ōsaka-fu     : 2886   Median : 18.992   Median : 18.987    
##  26     : 1642   Hyōgo-ken    : 2223   Mean   : 21.050   Mean   : 21.416    
##  27     : 1642   Hokkaidō     : 1716   3rd Qu.: 28.461   3rd Qu.: 29.412    
##  28     : 1642   Hiroshima-ken: 1248   Max.   :122.719   Max.   :127.829    
##  (Other):22167   (Other)      : 1911   NA's   :19        NA's   :19         
##  hist_wa01_visitors hist_wa02_visitors hist_wa05_visitors
##  Min.   :  1.027    Min.   :  1.004    Min.   :  1.00    
##  1st Qu.: 11.510    1st Qu.: 11.322    1st Qu.: 10.63    
##  Median : 19.043    Median : 18.914    Median : 18.50    
##  Mean   : 21.638    Mean   : 21.715    Mean   : 21.75    
##  3rd Qu.: 29.634    3rd Qu.: 29.609    3rd Qu.: 29.71    
##  Max.   :131.974    Max.   :175.177    Max.   :398.21    
##  NA's   :19         NA's   :19         NA's   :19
# Submission file
df_submission <- df_sample_submission %>%
  mutate(air_store_id = substr(df_sample_submission$id,1,20),
         visit_date = as.Date(substr(df_sample_submission$id,22,31))
  ) %>%
  select(id, air_store_id, visit_date) 

df_submission <- df_submission %>%
  inner_join(df_forecast, by=c('air_store_id', 'visit_date')) %>%
  select(id, visitors)

# look at final submission file - should be the same as sample submission format / dimensions
dim(df_submission)
## [1] 32019     2
dim(df_sample_submission)
## [1] 32019     2
df_submission %>% head()
##                                id visitors
## 1 air_00a91d42b08b08d9_2017-04-23 30.30785
## 2 air_00a91d42b08b08d9_2017-04-24 23.52486
## 3 air_00a91d42b08b08d9_2017-04-25 25.19798
## 4 air_00a91d42b08b08d9_2017-04-26 27.05880
## 5 air_00a91d42b08b08d9_2017-04-27 25.42580
## 6 air_00a91d42b08b08d9_2017-04-28 30.11991

Write to csv

write.csv(df_submission,"/Users/yannos/Documents/PredictiveAnalytics/Midterm/submission.csv")