###########################################################################################################
rm(list = ls())
library(plyr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(zoo)
library(data.table)
library(fpp2)
library(seasonal)
library(matrixStats)
df_air_reserve = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/air_reserve.csv")
df_air_store_info = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/air_store_info.csv")
df_air_visit_data = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/air_visit_data.csv")
df_date_info = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/date_info.csv")
df_hpg_reserve = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/hpg_reserve.csv")
df_hpg_store_info = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/hpg_store_info.csv")
df_sample_submission = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/sample_submission.csv")
df_store_id_relation = read.csv("/Users/yannos/Documents/PredictiveAnalytics/Midterm/RawData/store_id_relation.csv")
Look at loaded data
df_air_visit_data %>% head()
## air_store_id visit_date visitors
## 1 air_ba937bf13d40fb24 2016-01-13 25
## 2 air_ba937bf13d40fb24 2016-01-14 32
## 3 air_ba937bf13d40fb24 2016-01-15 29
## 4 air_ba937bf13d40fb24 2016-01-16 22
## 5 air_ba937bf13d40fb24 2016-01-18 6
## 6 air_ba937bf13d40fb24 2016-01-19 9
df_air_reserve %>% head()
## air_store_id visit_datetime reserve_datetime reserve_visitors
## 1 air_877f79706adbfb06 2016-01-01 19:00:00 2016-01-01 16:00:00 1
## 2 air_db4b38ebe7a7ceff 2016-01-01 19:00:00 2016-01-01 19:00:00 3
## 3 air_db4b38ebe7a7ceff 2016-01-01 19:00:00 2016-01-01 19:00:00 6
## 4 air_877f79706adbfb06 2016-01-01 20:00:00 2016-01-01 16:00:00 2
## 5 air_db80363d35f10926 2016-01-01 20:00:00 2016-01-01 01:00:00 5
## 6 air_db80363d35f10926 2016-01-02 01:00:00 2016-01-01 16:00:00 2
df_hpg_reserve %>% head()
## hpg_store_id visit_datetime reserve_datetime reserve_visitors
## 1 hpg_c63f6f42e088e50f 2016-01-01 11:00:00 2016-01-01 09:00:00 1
## 2 hpg_dac72789163a3f47 2016-01-01 13:00:00 2016-01-01 06:00:00 3
## 3 hpg_c8e24dcf51ca1eb5 2016-01-01 16:00:00 2016-01-01 14:00:00 2
## 4 hpg_24bb207e5fd49d4a 2016-01-01 17:00:00 2016-01-01 11:00:00 5
## 5 hpg_25291c542ebb3bc2 2016-01-01 17:00:00 2016-01-01 03:00:00 13
## 6 hpg_28bdf7a336ec6a7b 2016-01-01 17:00:00 2016-01-01 15:00:00 2
df_air_store_info %>% head()
## air_store_id air_genre_name air_area_name latitude
## 1 air_0f0cdeee6c9bf3d7 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.69512
## 2 air_7cc17a324ae5c7dc Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.69512
## 3 air_fee8dcf4d619598e Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.69512
## 4 air_a17f0778617c76e2 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.69512
## 5 air_83db5aff8f50478e Italian/French Tōkyō-to Minato-ku Shibakōen 35.65807
## 6 air_99c3eae84130c1cb Italian/French Tōkyō-to Minato-ku Shibakōen 35.65807
## longitude
## 1 135.1979
## 2 135.1979
## 3 135.1979
## 4 135.1979
## 5 139.7516
## 6 139.7516
df_hpg_store_info %>% head
## hpg_store_id hpg_genre_name hpg_area_name latitude
## 1 hpg_6622b62385aec8bf Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
## 2 hpg_e9e068dd49c5fa00 Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
## 3 hpg_2976f7acb4b3a3bc Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
## 4 hpg_e51a522e098f024c Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
## 5 hpg_e3d0e1519894f275 Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
## 6 hpg_530cd91db13b938e Japanese style Tōkyō-to Setagaya-ku Taishidō 35.64367
## longitude
## 1 139.6682
## 2 139.6682
## 3 139.6682
## 4 139.6682
## 5 139.6682
## 6 139.6682
df_store_id_relation %>% head()
## air_store_id hpg_store_id
## 1 air_63b13c56b7201bd9 hpg_4bc649e72e2a239a
## 2 air_a24bf50c3e90d583 hpg_c34b496d0305a809
## 3 air_c7f78b4f3cba33ff hpg_cd8ae0d9bbd58ff9
## 4 air_947eb2cae4f3e8f2 hpg_de24ea49dc25d6b8
## 5 air_965b2e0cf4119003 hpg_653238a84804d8e7
## 6 air_a38f25e3399d1b25 hpg_50378da9ffb9b6cd
df_date_info %>% head()
## calendar_date day_of_week holiday_flg
## 1 2016-01-01 Friday 1
## 2 2016-01-02 Saturday 1
## 3 2016-01-03 Sunday 1
## 4 2016-01-04 Monday 0
## 5 2016-01-05 Tuesday 0
## 6 2016-01-06 Wednesday 0
df_sample_submission %>% head()
## id visitors
## 1 air_00a91d42b08b08d9_2017-04-23 0
## 2 air_00a91d42b08b08d9_2017-04-24 0
## 3 air_00a91d42b08b08d9_2017-04-25 0
## 4 air_00a91d42b08b08d9_2017-04-26 0
## 5 air_00a91d42b08b08d9_2017-04-27 0
## 6 air_00a91d42b08b08d9_2017-04-28 0
As we can see above we have the following data:
Visitor data for each restaurant id for dates between 01/01/2016 and 04/22/2017.
Reservation data for each restaurant between 01/01/2016 and 05/31/2017 from 2 platforms (Hot Pepper Gourmet & AirRegi).
Date information including the day of week and a holiday flag.
Store information including restaurant location and type for each restaurant id.
A sample submission dataset with a combined restaurant id and date column for each restaurant and dates between 04/23/2017 and 05/31/2017 for which we will be providing forecasts.
Preparing air_visit_data - Make visit_date into date type
df_air_visit_data <- df_air_visit_data %>%
mutate(visit_date = as.Date(visit_date))
Preparing reserve data - Keep only visit date from air and hpg reserve data and make into date type
df_air_reserve <- df_air_reserve %>%
mutate(visit_date = as.Date(visit_datetime),
air_reserve_visitors = reserve_visitors) %>%
select(air_store_id, visit_date, air_reserve_visitors)
df_hpg_reserve <- df_hpg_reserve %>%
mutate(visit_date = as.Date(visit_datetime),
hpg_reserve_visitors = reserve_visitors) %>%
select(hpg_store_id, visit_date, hpg_reserve_visitors)
Preparing date info - get useful date related columns
df_date_info <- df_date_info %>%
mutate(visit_date = as.Date(calendar_date)) %>%
select(visit_date, day_of_week, holiday_flg)
Create df train - get all historical dates and restaurant ids
df_train <- df_air_visit_data %>% copy()
Create df test - split restaurant id and visit date from sample submission file
df_test <- df_sample_submission %>%
mutate(air_store_id = substr(df_sample_submission$id,1,20),
visit_date = as.Date(substr(df_sample_submission$id,22,31))
)
Combine to generate features
df_full <- rbind(df_train %>% select(air_store_id, visit_date, visitors),
df_test %>% select(air_store_id, visit_date, visitors))
# get total air reservations for each store / day
df_air_reserve_tot <- df_air_reserve %>%
group_by(air_store_id,visit_date) %>%
summarise(air_reserve_visitors = sum(air_reserve_visitors)) %>%
ungroup()
# get total hpg reservations for each store / day
df_hpg_reserve_tot <- df_hpg_reserve %>%
group_by(hpg_store_id,visit_date) %>%
summarise(hpg_reserve_visitors = sum(hpg_reserve_visitors)) %>%
ungroup()
# join reservations and sum up to get total reservations
df_reservations <- df_air_reserve_tot %>%
inner_join(df_store_id_relation, by = c("air_store_id")) %>%
left_join(df_hpg_reserve_tot, by = c("hpg_store_id","visit_date")) %>%
select(air_store_id, visit_date, air_reserve_visitors, hpg_reserve_visitors) %>%
mutate(total_reservations = air_reserve_visitors + hpg_reserve_visitors) %>%
select(air_store_id, visit_date, total_reservations)
## Warning: Column `air_store_id` joining factors with different levels, coercing
## to character vector
## Warning: Column `hpg_store_id` joining factors with different levels, coercing
## to character vector
# look at total reservation data
df_reservations %>% head()
## # A tibble: 6 x 3
## air_store_id visit_date total_reservations
## <chr> <date> <int>
## 1 air_00a91d42b08b08d9 2016-10-31 4
## 2 air_00a91d42b08b08d9 2016-12-05 10
## 3 air_00a91d42b08b08d9 2016-12-14 NA
## 4 air_00a91d42b08b08d9 2016-12-17 4
## 5 air_00a91d42b08b08d9 2016-12-20 8
## 6 air_00a91d42b08b08d9 2017-02-18 18
# Merge reservation datasets into historical visitors dataset
df_full <- df_full %>%
left_join(df_reservations, by = c("air_store_id","visit_date")) %>%
select(air_store_id, visit_date, visitors, total_reservations)
## Warning: Column `air_store_id` joining factor and character vector, coercing
## into character vector
# Merge in date info and store info
df_full <- df_full %>%
left_join(df_air_store_info, by = c("air_store_id")) %>%
left_join(df_date_info, by = c("visit_date"))
## Warning: Column `air_store_id` joining character vector and factor, coercing
## into character vector
# Generate year, month, day from visit_date
df_full <- df_full %>%
mutate(Year = as.factor(year(visit_date)),
Month = as.factor(month(visit_date)),
Day = as.factor(day(visit_date)),
day_of_week = as.factor(day_of_week),
holiday_flg = as.factor(holiday_flg))
# Create area column
GeneralArea = gsub( " .*$", "", df_full$air_area_name)
df_full$GeneralArea = as.factor(GeneralArea)
df_full$air_genre_name = as.factor(df_full$air_genre_name)
# look at df_full
df_full %>% head()
## air_store_id visit_date visitors total_reservations air_genre_name
## 1 air_ba937bf13d40fb24 2016-01-13 25 NA Dining bar
## 2 air_ba937bf13d40fb24 2016-01-14 32 NA Dining bar
## 3 air_ba937bf13d40fb24 2016-01-15 29 NA Dining bar
## 4 air_ba937bf13d40fb24 2016-01-16 22 NA Dining bar
## 5 air_ba937bf13d40fb24 2016-01-18 6 NA Dining bar
## 6 air_ba937bf13d40fb24 2016-01-19 9 NA Dining bar
## air_area_name latitude longitude day_of_week holiday_flg Year
## 1 Tōkyō-to Minato-ku Shibakōen 35.65807 139.7516 Wednesday 0 2016
## 2 Tōkyō-to Minato-ku Shibakōen 35.65807 139.7516 Thursday 0 2016
## 3 Tōkyō-to Minato-ku Shibakōen 35.65807 139.7516 Friday 0 2016
## 4 Tōkyō-to Minato-ku Shibakōen 35.65807 139.7516 Saturday 0 2016
## 5 Tōkyō-to Minato-ku Shibakōen 35.65807 139.7516 Monday 0 2016
## 6 Tōkyō-to Minato-ku Shibakōen 35.65807 139.7516 Tuesday 0 2016
## Month Day GeneralArea
## 1 1 13 Tōkyō-to
## 2 1 14 Tōkyō-to
## 3 1 15 Tōkyō-to
## 4 1 16 Tōkyō-to
## 5 1 18 Tōkyō-to
## 6 1 19 Tōkyō-to
# df_train
df_train <- df_train %>%
select('air_store_id', 'visit_date') %>%
inner_join(df_full,
by=c('air_store_id','visit_date'))
## Warning: Column `air_store_id` joining factor and character vector, coercing
## into character vector
# df_test
df_test <- df_test %>%
select('air_store_id', 'visit_date') %>%
inner_join(df_full,
by=c('air_store_id','visit_date'))
# impute missing reservation data
df_train$total_reservations[is.na(df_train$total_reservations)] <- 0
df_test$total_reservations[is.na(df_test$total_reservations)] <- 0
df_train %>% summary()
## air_store_id visit_date visitors total_reservations
## Length:252108 Min. :2016-01-01 Min. : 1.00 Min. : 0.0000
## Class :character 1st Qu.:2016-07-23 1st Qu.: 9.00 1st Qu.: 0.0000
## Mode :character Median :2016-10-23 Median : 17.00 Median : 0.0000
## Mean :2016-10-12 Mean : 20.97 Mean : 0.5029
## 3rd Qu.:2017-01-24 3rd Qu.: 29.00 3rd Qu.: 0.0000
## Max. :2017-04-22 Max. :877.00 Max. :264.0000
##
## air_genre_name air_area_name latitude
## Izakaya :62052 Fukuoka-ken Fukuoka-shi Daimyō: 19775 Min. :33.21
## Cafe/Sweets :52764 Tōkyō-to Shibuya-ku Shibuya : 17352 1st Qu.:34.69
## Dining bar :34192 Tōkyō-to Minato-ku Shibakōen : 14696 Median :35.66
## Italian/French:30011 Tōkyō-to Shinjuku-ku Kabukichō: 12517 Mean :35.61
## Bar/Cocktail :25135 Tōkyō-to Setagaya-ku Setagaya : 8719 3rd Qu.:35.69
## Japanese food :18789 Tōkyō-to Chūō-ku Tsukiji : 8370 Max. :44.02
## (Other) :29165 (Other) :170679
## longitude day_of_week holiday_flg Year Month
## Min. :130.2 Friday :40351 0:239333 2016:174535 3 : 30570
## 1st Qu.:135.3 Monday :31682 1: 12775 2017: 77573 2 : 27415
## Median :139.7 Saturday :39262 1 : 27020
## Mean :137.4 Sunday :29991 4 : 23859
## 3rd Qu.:139.8 Thursday :37996 12 : 21515
## Max. :144.3 Tuesday :36015 10 : 21466
## Wednesday:36811 (Other):100263
## Day GeneralArea
## 8 : 8761 Tōkyō-to :133063
## 14 : 8700 Fukuoka-ken : 39645
## 22 : 8687 Ōsaka-fu : 22821
## 21 : 8621 Hyōgo-ken : 17846
## 17 : 8619 Hokkaidō : 13055
## 7 : 8605 Hiroshima-ken: 9858
## (Other):200115 (Other) : 15820
df_test %>% summary()
## air_store_id visit_date visitors total_reservations
## Length:32019 Min. :2017-04-23 Min. :0 Min. : 0.00000
## Class :character 1st Qu.:2017-05-02 1st Qu.:0 1st Qu.: 0.00000
## Mode :character Median :2017-05-12 Median :0 Median : 0.00000
## Mean :2017-05-12 Mean :0 Mean : 0.08561
## 3rd Qu.:2017-05-22 3rd Qu.:0 3rd Qu.: 0.00000
## Max. :2017-05-31 Max. :0 Max. :54.00000
##
## air_genre_name air_area_name latitude
## Izakaya :7605 Fukuoka-ken Fukuoka-shi Daimyō: 2457 Min. :33.21
## Cafe/Sweets :6903 Tōkyō-to Shibuya-ku Shibuya : 2262 1st Qu.:34.70
## Dining bar :4173 Tōkyō-to Minato-ku Shibakōen : 1989 Median :35.66
## Italian/French:3939 Tōkyō-to Shinjuku-ku Kabukichō: 1521 Mean :35.63
## Bar/Cocktail :3081 Tōkyō-to Setagaya-ku Setagaya : 1170 3rd Qu.:35.69
## Japanese food :2457 Tōkyō-to Chūō-ku Tsukiji : 1092 Max. :44.02
## (Other) :3861 (Other) :21528
## longitude day_of_week holiday_flg Year Month
## Min. :130.2 Friday :4105 0:28735 2016: 0 5 :25451
## 1st Qu.:135.3 Monday :4926 1: 3284 2017:32019 4 : 6568
## Median :139.7 Saturday :4105 1 : 0
## Mean :137.4 Sunday :4926 2 : 0
## 3rd Qu.:139.8 Thursday :4105 3 : 0
## Max. :144.3 Tuesday :4926 6 : 0
## Wednesday:4926 (Other): 0
## Day GeneralArea
## 23 : 1642 Tōkyō-to :17160
## 24 : 1642 Fukuoka-ken : 4875
## 25 : 1642 Ōsaka-fu : 2886
## 26 : 1642 Hyōgo-ken : 2223
## 27 : 1642 Hokkaidō : 1716
## 28 : 1642 Hiroshima-ken: 1248
## (Other):22167 (Other) : 1911
Plot daily historical average visitors
p_daily_his_mean <- df_train %>%
group_by(visit_date) %>%
summarise(Visits = mean(visitors)) %>%
ggplot(aes(visit_date, Visits, alpha=0.9)) +
geom_line() +
theme(legend.position = "none") +
labs(x = "Date", y = "Average Visitors") +
ggtitle("Historical Average Daily Visitors")
p_daily_his_mean
Plot monthly historical average visitors
p_monthly_mean <- df_train %>%
group_by(Month) %>%
summarise(Visits = mean(visitors)) %>%
ggplot(aes(Month, Visits, alpha=0.9)) +
geom_col() +
theme(legend.position = "none") +
labs(x = "Month", y = "Average Visitors") +
ggtitle("Average Monthly Visitors")
p_monthly_mean
Plot historical average visitors by day of week
p_daily_mean <- df_train %>%
group_by(day_of_week) %>%
summarise(Visits = mean(visitors)) %>%
ggplot(aes(day_of_week, Visits, alpha=0.9)) +
geom_col() +
theme(legend.position = "none") +
labs(x = "Day of Week", y = "Average Visitors") +
ggtitle("Average Visitors by Day of Week")
p_daily_mean
Plot historical average visitors by holiday occurence
p_holiday <- df_train %>%
group_by(holiday_flg) %>%
summarise(Visits = mean(visitors)) %>%
ggplot(aes(holiday_flg, Visits, alpha=0.9)) +
geom_col() +
theme(legend.position = "none") +
labs(x = "Holiday", y = "Average Visitors") +
ggtitle("Average Visitors by Holiday indicator")
p_holiday
Plot Visitors vs Reservations
p_reserve_vs_visits <- df_train %>%
filter(total_reservations < 100) %>%
ggplot(aes(total_reservations, visitors)) +
geom_point(color = "black", alpha = 0.5) +
geom_smooth(method = "lm", color = "red") +
ggtitle("Visitors vs Reservations")
p_reserve_vs_visits
## `geom_smooth()` using formula 'y ~ x'
In this section we will be writing the necessary code to generate historical average and weighted average features
# get combined train and test id, date, visitors to generate average predictor
df_temp = rbind(df_train %>% select(air_store_id, visit_date, Year, Month, Day, visitors),
df_test %>% select(air_store_id, visit_date, Year, Month, Day, visitors))
# Function that generates weights
GenerateWeights<-function(alpha,n)
{
w <- list(alpha)
for (i in 1:n-1){
w_0 <- alpha*(1-alpha)**i
w[i+1] <- w_0
}
return(w)
}
# create dataframe that will hold generated features
df_hist_average_visitors <-
data.frame(air_store_id=character(),
visit_date=as.Date(character()),
hist_avg_visitors=double(),
hist_wa005_visitors=double(),
hist_wa01_visitors=double(),
hist_wa02_visitors=double(),
hist_wa05_visitors=double(),
stringsAsFactors=FALSE)
# Generate historical average and weighted average features. The code below generates a new temporary
# dataframe for each unique store id with columns being different visitor lags and then does a rowMeans
# and a rowWeightedMeans to get historical average or historical weighted average respectively (using previous
# function to calculate weights)
for (i in unique(df_temp$air_store_id)){
df_temp_0 <- df_temp %>%
filter(air_store_id == i)
df_temp_0 <- df_temp_0[order(df_temp_0$visit_date),]
df_temp_lags <- df_temp_0 %>% copy()
for (j in 39:403){
lag <- lag(df_temp_lags$visitors, j)
df_temp_lags <- cbind(df_temp_lags, lag)
}
# get historical mean
hist_mean <- rowMeans(df_temp_lags[7:371], na.rm = TRUE)
# get historical weighted mean
w005 <- GenerateWeights(0.05,365)
w01 <- GenerateWeights(0.1,365)
w02 <- GenerateWeights(0.2,365)
w05 <- GenerateWeights(0.5,365)
hist_weighted_mean_005 <- rowWeightedMeans(as.matrix(df_temp_lags[7:371]), w = as.numeric(w005), na.rm = TRUE)
hist_weighted_mean_01 <- rowWeightedMeans(as.matrix(df_temp_lags[7:371]), w = as.numeric(w01), na.rm = TRUE)
hist_weighted_mean_02 <- rowWeightedMeans(as.matrix(df_temp_lags[7:371]), w = as.numeric(w02), na.rm = TRUE)
hist_weighted_mean_05 <- rowWeightedMeans(as.matrix(df_temp_lags[7:371]), w = as.numeric(w05), na.rm = TRUE)
df_temp_0 <- df_temp_0 %>%
mutate(hist_avg_visitors = hist_mean,
hist_wa005_visitors = hist_weighted_mean_005,
hist_wa01_visitors = hist_weighted_mean_01,
hist_wa02_visitors = hist_weighted_mean_02,
hist_wa05_visitors = hist_weighted_mean_05
) %>%
select(air_store_id, visit_date, hist_avg_visitors,
hist_wa005_visitors, hist_wa01_visitors, hist_wa02_visitors, hist_wa05_visitors)
df_hist_average_visitors <- rbind(df_temp_0, df_hist_average_visitors)
}
# look at the generated features
df_hist_average_visitors %>% tail()
## air_store_id visit_date hist_avg_visitors hist_wa005_visitors
## 284122 air_ba937bf13d40fb24 2017-05-26 22.93699 16.83007
## 284123 air_ba937bf13d40fb24 2017-05-27 22.92603 16.53857
## 284124 air_ba937bf13d40fb24 2017-05-28 22.86849 16.26164
## 284125 air_ba937bf13d40fb24 2017-05-29 22.89863 16.14856
## 284126 air_ba937bf13d40fb24 2017-05-30 22.93699 17.34113
## 284127 air_ba937bf13d40fb24 2017-05-31 22.97808 17.62407
## hist_wa01_visitors hist_wa02_visitors hist_wa05_visitors
## 284122 15.58814 13.39556 9.769546
## 284123 15.12933 12.91645 10.384773
## 284124 14.71640 12.53316 10.692387
## 284125 14.64476 12.82653 12.346193
## 284126 17.18028 18.26122 26.173097
## 284127 17.76225 19.20898 24.586548
Merge with df_train and df_test
df_train <- df_train %>%
select('air_store_id', 'visit_date') %>%
inner_join(df_full,
by=c('air_store_id','visit_date')) %>%
inner_join(df_hist_average_visitors,
by=c('air_store_id','visit_date'))
df_test <- df_test %>%
select('air_store_id', 'visit_date') %>%
inner_join(df_full,
by=c('air_store_id','visit_date')) %>%
inner_join(df_hist_average_visitors,
by=c('air_store_id','visit_date'))
# impute missing reservation data
df_train$total_reservations[is.na(df_train$total_reservations)] <- 0
df_test$total_reservations[is.na(df_test$total_reservations)] <- 0
Split Train - Validation set
df_train_eval <- df_train %>%
filter(visit_date < '2017-02-01') %>%
mutate(log_visitors = log(visitors))
df_val <- df_train %>%
filter(visit_date >= '2017-02-01')
Evaluation Metric functions
MSE <- function(actual,pred){
mean((actual - pred)^2)
}
Bias <- function(actual,pred){
mean(actual) - mean(pred)
}
sMAPE <- function(actual,pred){
mean(200*(abs(actual - pred))/(actual+pred))
}
# Fit
df_mean_preds <- df_train_eval %>%
group_by(air_store_id) %>%
summarise(pred_mean = mean(visitors))
# Predict
df_val <- df_val %>% left_join(df_mean_preds, by='air_store_id')
# Evaluate
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
print("Mean Pred Performance")
## [1] "Mean Pred Performance"
MSE(df_eval$visitors, df_eval$pred_mean)
## [1] 189.4146
Bias(df_eval$visitors, df_eval$pred_mean)
## [1] 0.2480329
sMAPE(df_eval$visitors, df_eval$pred_mean)
## [1] 47.1184
# Fit
fit.lm1 <- lm(visitors ~ Month + day_of_week + holiday_flg,
data=df_train_eval)
fit.lm1 %>% summary()
##
## Call:
## lm(formula = visitors ~ Month + day_of_week + holiday_flg, data = df_train_eval)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.67 -11.56 -3.64 7.96 651.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.76227 0.12982 167.634 < 2e-16 ***
## Month2 1.29742 0.20969 6.187 6.13e-10 ***
## Month3 3.15489 0.20293 15.547 < 2e-16 ***
## Month4 2.44712 0.20473 11.953 < 2e-16 ***
## Month5 2.07393 0.20405 10.164 < 2e-16 ***
## Month6 1.73742 0.20344 8.540 < 2e-16 ***
## Month7 1.11302 0.14785 7.528 5.18e-14 ***
## Month8 -0.12682 0.14956 -0.848 0.3965
## Month9 -0.05772 0.14898 -0.387 0.6984
## Month10 0.34126 0.14777 2.309 0.0209 *
## Month11 -0.04487 0.14879 -0.302 0.7630
## Month12 2.97634 0.14779 20.139 < 2e-16 ***
## day_of_weekMonday -5.94421 0.13858 -42.893 < 2e-16 ***
## day_of_weekSaturday 3.18609 0.13089 24.342 < 2e-16 ***
## day_of_weekSunday 0.80109 0.14002 5.721 1.06e-08 ***
## day_of_weekThursday -4.14070 0.13236 -31.284 < 2e-16 ***
## day_of_weekTuesday -5.06489 0.13332 -37.991 < 2e-16 ***
## day_of_weekWednesday -3.57537 0.13328 -26.826 < 2e-16 ***
## holiday_flg1 3.74797 0.15905 23.564 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.15 on 194216 degrees of freedom
## Multiple R-squared: 0.04364, Adjusted R-squared: 0.04355
## F-statistic: 492.4 on 18 and 194216 DF, p-value: < 2.2e-16
# Predict
lm1.pred <- predict(fit.lm1, df_val)
df_val <- cbind(df_val, lm1.pred)
# Evaluate
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
print("lm1 Performance")
## [1] "lm1 Performance"
MSE(df_eval$visitors, df_eval$lm1.pred)
## [1] 281.6519
Bias(df_eval$visitors, df_eval$lm1.pred)
## [1] -1.079654
sMAPE(df_eval$visitors, df_eval$lm1.pred)
## [1] 63.40645
# Fit
fit.lm2 <- lm(visitors ~ hist_avg_visitors + Month + day_of_week + holiday_flg,
data=df_train_eval,
na.action = na.omit)
fit.lm2 %>% summary()
##
## Call:
## lm(formula = visitors ~ hist_avg_visitors + Month + day_of_week +
## holiday_flg, data = df_train_eval, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -109.31 -6.84 -1.08 5.45 608.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.97322 0.12938 22.980 < 2e-16 ***
## hist_avg_visitors 0.89495 0.00282 317.393 < 2e-16 ***
## Month2 2.23945 0.23639 9.473 < 2e-16 ***
## Month3 3.75956 0.16662 22.564 < 2e-16 ***
## Month4 2.44076 0.16764 14.559 < 2e-16 ***
## Month5 1.40402 0.16673 8.421 < 2e-16 ***
## Month6 0.89428 0.16589 5.391 7.02e-08 ***
## Month7 0.40577 0.16400 2.474 0.01335 *
## Month8 -0.98559 0.13682 -7.203 5.90e-13 ***
## Month9 -0.35643 0.12586 -2.832 0.00463 **
## Month10 0.44770 0.12469 3.590 0.00033 ***
## Month11 0.22565 0.12539 1.800 0.07193 .
## Month12 3.41379 0.12456 27.406 < 2e-16 ***
## day_of_weekMonday -6.25746 0.11811 -52.979 < 2e-16 ***
## day_of_weekSaturday 3.33941 0.11188 29.848 < 2e-16 ***
## day_of_weekSunday 0.63494 0.11951 5.313 1.08e-07 ***
## day_of_weekThursday -4.25548 0.11273 -37.750 < 2e-16 ***
## day_of_weekTuesday -5.01811 0.11372 -44.126 < 2e-16 ***
## day_of_weekWednesday -3.56000 0.11368 -31.316 < 2e-16 ***
## holiday_flg1 3.66354 0.13386 27.369 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.58 on 162371 degrees of freedom
## (31844 observations deleted due to missingness)
## Multiple R-squared: 0.4113, Adjusted R-squared: 0.4113
## F-statistic: 5972 on 19 and 162371 DF, p-value: < 2.2e-16
# Predict
lm2.pred <- predict(fit.lm2, df_val)
df_val <- cbind(df_val, lm2.pred)
# Evaluate
print("lm2 Performance")
## [1] "lm2 Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm2.pred)
## [1] 174.9837
Bias(df_eval$visitors, df_eval$lm2.pred)
## [1] -1.487614
sMAPE(df_eval$visitors, df_eval$lm2.pred)
## [1] 47.56116
# Fit
fit.lm3a <- lm(visitors ~ hist_wa005_visitors + Month + day_of_week + holiday_flg,
data=df_train_eval,
na.action = na.omit)
fit.lm3a %>% summary()
##
## Call:
## lm(formula = visitors ~ hist_wa005_visitors + Month + day_of_week +
## holiday_flg, data = df_train_eval, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -110.37 -6.71 -1.04 5.29 608.46
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.001338 0.127496 23.541 < 2e-16 ***
## hist_wa005_visitors 0.904811 0.002772 326.463 < 2e-16 ***
## Month2 1.975037 0.233818 8.447 < 2e-16 ***
## Month3 3.488600 0.164801 21.169 < 2e-16 ***
## Month4 1.899261 0.165827 11.453 < 2e-16 ***
## Month5 0.206002 0.165011 1.248 0.2119
## Month6 -0.072422 0.164166 -0.441 0.6591
## Month7 0.178200 0.162220 1.099 0.2720
## Month8 -1.103055 0.135339 -8.150 3.66e-16 ***
## Month9 -0.318682 0.124486 -2.560 0.0105 *
## Month10 0.904375 0.123336 7.333 2.27e-13 ***
## Month11 0.644851 0.124030 5.199 2.00e-07 ***
## Month12 3.697349 0.123210 30.008 < 2e-16 ***
## day_of_weekMonday -6.147372 0.116822 -52.622 < 2e-16 ***
## day_of_weekSaturday 3.365573 0.110661 30.413 < 2e-16 ***
## day_of_weekSunday 0.681994 0.118200 5.770 7.95e-09 ***
## day_of_weekThursday -4.291005 0.111497 -38.485 < 2e-16 ***
## day_of_weekTuesday -4.943685 0.112480 -43.952 < 2e-16 ***
## day_of_weekWednesday -3.551547 0.112440 -31.586 < 2e-16 ***
## holiday_flg1 3.699869 0.132393 27.946 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.44 on 162371 degrees of freedom
## (31844 observations deleted due to missingness)
## Multiple R-squared: 0.4241, Adjusted R-squared: 0.4241
## F-statistic: 6294 on 19 and 162371 DF, p-value: < 2.2e-16
# Predict
lm3a.pred <- predict(fit.lm3a, df_val)
df_val <- cbind(df_val, lm3a.pred)
# Evaluate
print("lm3a Performance")
## [1] "lm3a Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm3a.pred)
## [1] 172.9475
Bias(df_eval$visitors, df_eval$lm3a.pred)
## [1] -1.471193
sMAPE(df_eval$visitors, df_eval$lm3a.pred)
## [1] 46.6034
# Fit
fit.lm3b <- lm(visitors ~ hist_wa01_visitors + Month + day_of_week + holiday_flg,
data=df_train_eval,
na.action = na.omit)
fit.lm3b %>% summary()
##
## Call:
## lm(formula = visitors ~ hist_wa01_visitors + Month + day_of_week +
## holiday_flg, data = df_train_eval, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107.91 -6.75 -1.09 5.30 608.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.209500 0.128121 25.051 < 2e-16 ***
## hist_wa01_visitors 0.883215 0.002743 321.961 < 2e-16 ***
## Month2 2.187511 0.235094 9.305 < 2e-16 ***
## Month3 3.667851 0.165701 22.135 < 2e-16 ***
## Month4 1.922978 0.166734 11.533 < 2e-16 ***
## Month5 0.112554 0.165928 0.678 0.498
## Month6 0.140103 0.165041 0.849 0.396
## Month7 0.675862 0.163076 4.144 3.41e-05 ***
## Month8 -0.788061 0.136049 -5.792 6.95e-09 ***
## Month9 0.031432 0.125156 0.251 0.802
## Month10 1.337408 0.124028 10.783 < 2e-16 ***
## Month11 0.837683 0.124717 6.717 1.87e-11 ***
## Month12 3.885505 0.123892 31.362 < 2e-16 ***
## day_of_weekMonday -6.022117 0.117460 -51.270 < 2e-16 ***
## day_of_weekSaturday 3.401265 0.111266 30.569 < 2e-16 ***
## day_of_weekSunday 0.757816 0.118847 6.376 1.82e-10 ***
## day_of_weekThursday -4.332408 0.112108 -38.645 < 2e-16 ***
## day_of_weekTuesday -4.861539 0.113096 -42.986 < 2e-16 ***
## day_of_weekWednesday -3.548601 0.113055 -31.388 < 2e-16 ***
## holiday_flg1 3.701853 0.133118 27.809 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.51 on 162371 degrees of freedom
## (31844 observations deleted due to missingness)
## Multiple R-squared: 0.4178, Adjusted R-squared: 0.4177
## F-statistic: 6133 on 19 and 162371 DF, p-value: < 2.2e-16
# Predict
lm3b.pred <- predict(fit.lm3b, df_val)
df_val <- cbind(df_val, lm3b.pred)
# Evaluate
print("lm3b Performance")
## [1] "lm3b Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm3b.pred)
## [1] 177.647
Bias(df_eval$visitors, df_eval$lm3b.pred)
## [1] -1.392367
sMAPE(df_eval$visitors, df_eval$lm3b.pred)
## [1] 47.03719
# Fit
fit.lm3c <- lm(visitors ~ hist_wa02_visitors + Month + day_of_week + holiday_flg,
data=df_train_eval,
na.action = na.omit)
fit.lm3c %>% summary()
##
## Call:
## lm(formula = visitors ~ hist_wa02_visitors + Month + day_of_week +
## holiday_flg, data = df_train_eval, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -123.50 -6.91 -1.21 5.39 607.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9376 0.1297 30.352 < 2e-16 ***
## hist_wa02_visitors 0.8347 0.0027 309.121 < 2e-16 ***
## Month2 2.4283 0.2388 10.171 < 2e-16 ***
## Month3 3.8483 0.1683 22.868 < 2e-16 ***
## Month4 2.0198 0.1693 11.928 < 2e-16 ***
## Month5 0.3455 0.1685 2.050 0.04032 *
## Month6 0.4795 0.1676 2.861 0.00422 **
## Month7 1.0816 0.1656 6.531 6.53e-11 ***
## Month8 -0.4320 0.1381 -3.127 0.00177 **
## Month9 0.3763 0.1271 2.961 0.00307 **
## Month10 1.6381 0.1260 13.003 < 2e-16 ***
## Month11 0.9633 0.1267 7.605 2.87e-14 ***
## Month12 4.0989 0.1258 32.573 < 2e-16 ***
## day_of_weekMonday -5.7872 0.1193 -48.512 < 2e-16 ***
## day_of_weekSaturday 3.4843 0.1130 30.834 < 2e-16 ***
## day_of_weekSunday 0.9219 0.1207 7.638 2.21e-14 ***
## day_of_weekThursday -4.4349 0.1139 -38.951 < 2e-16 ***
## day_of_weekTuesday -4.7436 0.1149 -41.299 < 2e-16 ***
## day_of_weekWednesday -3.5896 0.1148 -31.263 < 2e-16 ***
## holiday_flg1 3.6424 0.1352 26.942 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.71 on 162371 degrees of freedom
## (31844 observations deleted due to missingness)
## Multiple R-squared: 0.3995, Adjusted R-squared: 0.3994
## F-statistic: 5686 on 19 and 162371 DF, p-value: < 2.2e-16
# Predict
lm3c.pred <- predict(fit.lm3c, df_val)
df_val <- cbind(df_val, lm3c.pred)
# Evaluate
print("lm3c Performance")
## [1] "lm3c Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm3c.pred)
## [1] 184.9788
Bias(df_eval$visitors, df_eval$lm3c.pred)
## [1] -1.30465
sMAPE(df_eval$visitors, df_eval$lm3c.pred)
## [1] 48.01632
# Fit
fit.lm3d <- lm(visitors ~ hist_wa05_visitors + Month + day_of_week + holiday_flg,
data=df_train_eval,
na.action = na.omit)
fit.lm3d %>% summary()
##
## Call:
## lm(formula = visitors ~ hist_wa05_visitors + Month + day_of_week +
## holiday_flg, data = df_train_eval, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -210.39 -7.67 -1.75 5.77 606.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.286198 0.134858 54.028 < 2e-16 ***
## hist_wa05_visitors 0.667569 0.002558 260.980 < 2e-16 ***
## Month2 2.617716 0.252570 10.364 < 2e-16 ***
## Month3 3.961512 0.178025 22.253 < 2e-16 ***
## Month4 2.201940 0.179123 12.293 < 2e-16 ***
## Month5 0.944442 0.178192 5.300 1.16e-07 ***
## Month6 0.985090 0.177245 5.558 2.74e-08 ***
## Month7 1.370464 0.175179 7.823 5.18e-15 ***
## Month8 0.037952 0.146111 0.260 0.795
## Month9 0.615621 0.134465 4.578 4.69e-06 ***
## Month10 1.611125 0.133284 12.088 < 2e-16 ***
## Month11 0.969834 0.134005 7.237 4.60e-13 ***
## Month12 4.132974 0.133128 31.045 < 2e-16 ***
## day_of_weekMonday -5.502005 0.126207 -43.595 < 2e-16 ***
## day_of_weekSaturday 3.658518 0.119545 30.604 < 2e-16 ***
## day_of_weekSunday 1.197661 0.127696 9.379 < 2e-16 ***
## day_of_weekThursday -4.808271 0.120467 -39.914 < 2e-16 ***
## day_of_weekTuesday -4.835508 0.121506 -39.796 < 2e-16 ***
## day_of_weekWednesday -4.019119 0.121475 -33.086 < 2e-16 ***
## holiday_flg1 3.607038 0.143018 25.221 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.44 on 162371 degrees of freedom
## (31844 observations deleted due to missingness)
## Multiple R-squared: 0.328, Adjusted R-squared: 0.3279
## F-statistic: 4171 on 19 and 162371 DF, p-value: < 2.2e-16
# Predict
lm3d.pred <- predict(fit.lm3d, df_val)
df_val <- cbind(df_val, lm3d.pred)
# Evaluate
print("lm3d Performance")
## [1] "lm3d Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm3d.pred)
## [1] 206.4613
Bias(df_eval$visitors, df_eval$lm3d.pred)
## [1] -1.252286
sMAPE(df_eval$visitors, df_eval$lm3d.pred)
## [1] 51.53464
# Fit
fit.lm4 <- lm(visitors ~ hist_wa005_visitors + Month + day_of_week + holiday_flg + GeneralArea,
data=df_train_eval,
na.action = na.omit)
fit.lm4 %>% summary()
##
## Call:
## lm(formula = visitors ~ hist_wa005_visitors + Month + day_of_week +
## holiday_flg + GeneralArea, data = df_train_eval, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -110.66 -6.71 -1.03 5.30 608.54
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.927940 0.145742 20.090 < 2e-16 ***
## hist_wa005_visitors 0.904540 0.002783 325.043 < 2e-16 ***
## Month2 1.973429 0.233897 8.437 < 2e-16 ***
## Month3 3.489250 0.164862 21.165 < 2e-16 ***
## Month4 1.900034 0.165882 11.454 < 2e-16 ***
## Month5 0.208418 0.165059 1.263 0.2067
## Month6 -0.070394 0.164204 -0.429 0.6681
## Month7 0.179682 0.162255 1.107 0.2681
## Month8 -1.102032 0.135340 -8.143 3.89e-16 ***
## Month9 -0.317826 0.124489 -2.553 0.0107 *
## Month10 0.905014 0.123336 7.338 2.18e-13 ***
## Month11 0.645326 0.124028 5.203 1.96e-07 ***
## Month12 3.697290 0.123208 30.009 < 2e-16 ***
## day_of_weekMonday -6.147698 0.116823 -52.624 < 2e-16 ***
## day_of_weekSaturday 3.363656 0.110659 30.397 < 2e-16 ***
## day_of_weekSunday 0.677486 0.118225 5.731 1.00e-08 ***
## day_of_weekThursday -4.290398 0.111495 -38.481 < 2e-16 ***
## day_of_weekTuesday -4.942976 0.112479 -43.946 < 2e-16 ***
## day_of_weekWednesday -3.549584 0.112440 -31.569 < 2e-16 ***
## holiday_flg1 3.693123 0.132424 27.889 < 2e-16 ***
## GeneralAreaHiroshima-ken 0.284062 0.173485 1.637 0.1016
## GeneralAreaHokkaidō 0.395686 0.157697 2.509 0.0121 *
## GeneralAreaHyōgo-ken 0.174196 0.139523 1.249 0.2118
## GeneralAreaMiyagi-ken 0.256293 0.209305 1.224 0.2208
## GeneralAreaNiigata-ken 0.569932 0.256148 2.225 0.0261 *
## GeneralAreaŌsaka-fu 0.196627 0.128623 1.529 0.1263
## GeneralAreaShizuoka-ken -0.042434 0.215625 -0.197 0.8440
## GeneralAreaTōkyō-to 0.005900 0.088501 0.067 0.9468
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.44 on 162363 degrees of freedom
## (31844 observations deleted due to missingness)
## Multiple R-squared: 0.4242, Adjusted R-squared: 0.4241
## F-statistic: 4430 on 27 and 162363 DF, p-value: < 2.2e-16
# Predict
lm4.pred <- predict(fit.lm4, df_val)
df_val <- cbind(df_val, lm4.pred)
# high p-values for a bunch of them so probably not gonna use
# Evaluate
print("lm4 Performance")
## [1] "lm4 Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm4.pred)
## [1] 173.0206
Bias(df_eval$visitors, df_eval$lm4.pred)
## [1] -1.470203
sMAPE(df_eval$visitors, df_eval$lm4.pred)
## [1] 46.60608
# Fit
fit.lm5 <- lm(visitors ~ hist_wa005_visitors + Month + day_of_week + holiday_flg + air_genre_name,
data=df_train_eval,
na.action = na.omit)
fit.lm5 %>% summary()
##
## Call:
## lm(formula = visitors ~ hist_wa005_visitors + Month + day_of_week +
## holiday_flg + air_genre_name, data = df_train_eval, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -108.52 -6.70 -1.01 5.29 608.74
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 1.269353 0.706450 1.797
## hist_wa005_visitors 0.897191 0.002877 311.832
## Month2 1.987279 0.233649 8.505
## Month3 3.489207 0.164680 21.188
## Month4 1.905315 0.165711 11.498
## Month5 0.229307 0.164920 1.390
## Month6 -0.050499 0.164072 -0.308
## Month7 0.199635 0.162110 1.231
## Month8 -1.091272 0.135222 -8.070
## Month9 -0.310790 0.124365 -2.499
## Month10 0.906158 0.123215 7.354
## Month11 0.644158 0.123909 5.199
## Month12 3.691605 0.123089 29.991
## day_of_weekMonday -6.146289 0.116721 -52.658
## day_of_weekSaturday 3.369929 0.110552 30.483
## day_of_weekSunday 0.710572 0.118128 6.015
## day_of_weekThursday -4.294790 0.111390 -38.556
## day_of_weekTuesday -4.948137 0.112374 -44.033
## day_of_weekWednesday -3.557511 0.112336 -31.668
## holiday_flg1 3.709026 0.132280 28.039
## air_genre_nameBar/Cocktail 0.892744 0.699454 1.276
## air_genre_nameCafe/Sweets 1.625506 0.693329 2.344
## air_genre_nameCreative cuisine 1.759123 0.734511 2.395
## air_genre_nameDining bar 1.612224 0.695950 2.317
## air_genre_nameInternational cuisine 4.109411 1.174406 3.499
## air_genre_nameItalian/French 2.142525 0.695903 3.079
## air_genre_nameIzakaya 2.420095 0.692785 3.493
## air_genre_nameJapanese food 2.016094 0.699992 2.880
## air_genre_nameKaraoke/Party 8.971594 0.997824 8.991
## air_genre_nameOkonomiyaki/Monja/Teppanyaki 2.045559 0.739382 2.767
## air_genre_nameOther 2.023047 0.711384 2.844
## air_genre_nameWestern food 1.079764 0.725010 1.489
## air_genre_nameYakiniku/Korean food 2.401852 0.714982 3.359
## Pr(>|t|)
## (Intercept) 0.072369 .
## hist_wa005_visitors < 2e-16 ***
## Month2 < 2e-16 ***
## Month3 < 2e-16 ***
## Month4 < 2e-16 ***
## Month5 0.164406
## Month6 0.758247
## Month7 0.218146
## Month8 7.06e-16 ***
## Month9 0.012455 *
## Month10 1.93e-13 ***
## Month11 2.01e-07 ***
## Month12 < 2e-16 ***
## day_of_weekMonday < 2e-16 ***
## day_of_weekSaturday < 2e-16 ***
## day_of_weekSunday 1.80e-09 ***
## day_of_weekThursday < 2e-16 ***
## day_of_weekTuesday < 2e-16 ***
## day_of_weekWednesday < 2e-16 ***
## holiday_flg1 < 2e-16 ***
## air_genre_nameBar/Cocktail 0.201835
## air_genre_nameCafe/Sweets 0.019054 *
## air_genre_nameCreative cuisine 0.016623 *
## air_genre_nameDining bar 0.020528 *
## air_genre_nameInternational cuisine 0.000467 ***
## air_genre_nameItalian/French 0.002079 **
## air_genre_nameIzakaya 0.000477 ***
## air_genre_nameJapanese food 0.003975 **
## air_genre_nameKaraoke/Party < 2e-16 ***
## air_genre_nameOkonomiyaki/Monja/Teppanyaki 0.005665 **
## air_genre_nameOther 0.004458 **
## air_genre_nameWestern food 0.136408
## air_genre_nameYakiniku/Korean food 0.000782 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.43 on 162358 degrees of freedom
## (31844 observations deleted due to missingness)
## Multiple R-squared: 0.4253, Adjusted R-squared: 0.4252
## F-statistic: 3755 on 32 and 162358 DF, p-value: < 2.2e-16
# Predict
lm5.pred <- predict(fit.lm5, df_val)
df_val <- cbind(df_val, lm5.pred)
# Evaluate
print("lm5 Performance")
## [1] "lm5 Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm5.pred)
## [1] 173.6027
Bias(df_eval$visitors, df_eval$lm5.pred)
## [1] -1.472677
sMAPE(df_eval$visitors, df_eval$lm5.pred)
## [1] 46.57466
# Fit
fit.lm6 <- lm(visitors ~ hist_wa005_visitors + total_reservations + Month + day_of_week + holiday_flg,
data=df_train_eval,
na.action = na.omit)
fit.lm6 %>% summary()
##
## Call:
## lm(formula = visitors ~ hist_wa005_visitors + total_reservations +
## Month + day_of_week + holiday_flg, data = df_train_eval,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -109.78 -6.65 -1.03 5.27 608.40
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.754522 0.126770 21.728 < 2e-16 ***
## hist_wa005_visitors 0.900671 0.002755 326.948 < 2e-16 ***
## total_reservations 0.353064 0.007604 46.432 < 2e-16 ***
## Month2 2.103222 0.232298 9.054 < 2e-16 ***
## Month3 3.566850 0.163727 21.785 < 2e-16 ***
## Month4 2.028944 0.164761 12.314 < 2e-16 ***
## Month5 0.391480 0.163976 2.387 0.01697 *
## Month6 0.133548 0.163147 0.819 0.41303
## Month7 0.422139 0.161240 2.618 0.00884 **
## Month8 -0.849210 0.134561 -6.311 2.78e-10 ***
## Month9 -0.065005 0.123789 -0.525 0.59949
## Month10 1.103557 0.122600 9.001 < 2e-16 ***
## Month11 0.691226 0.123219 5.610 2.03e-08 ***
## Month12 3.418851 0.122548 27.898 < 2e-16 ***
## day_of_weekMonday -5.988683 0.116104 -51.580 < 2e-16 ***
## day_of_weekSaturday 3.316480 0.109939 30.167 < 2e-16 ***
## day_of_weekSunday 0.820027 0.117461 6.981 2.94e-12 ***
## day_of_weekThursday -4.178845 0.110791 -37.718 < 2e-16 ***
## day_of_weekTuesday -4.790355 0.111790 -42.851 < 2e-16 ***
## day_of_weekWednesday -3.425616 0.111734 -30.659 < 2e-16 ***
## holiday_flg1 3.640271 0.131530 27.676 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.36 on 162370 degrees of freedom
## (31844 observations deleted due to missingness)
## Multiple R-squared: 0.4317, Adjusted R-squared: 0.4316
## F-statistic: 6166 on 20 and 162370 DF, p-value: < 2.2e-16
# Predict
lm6.pred <- predict(fit.lm6, df_val)
df_val <- cbind(df_val, lm6.pred)
# Evaluate
print("lm6 Performance")
## [1] "lm6 Performance"
df_eval <- df_val %>% filter(!is.na(hist_avg_visitors) & !is.na(pred_mean))
MSE(df_eval$visitors, df_eval$lm6.pred)
## [1] 171.6071
Bias(df_eval$visitors, df_eval$lm6.pred)
## [1] -1.63508
sMAPE(df_eval$visitors, df_eval$lm6.pred)
## [1] 46.51082
As we can see from the model structures above and their respective performances, the model with the lowest MSE and sMAPE is the last proposed one with Month, Day of Week Dummies, a Holiday indicator, the weighted average and total reservations features (lm6). The robust and very simple historical average method also does very well - even better compared to the simple regression with only dummies. However, the final model has improved performance as far as MSE and sMAPE compared to both model 0 and model 1 benchmarks as well as compared to the rest - slightly more complex - models. It is worth noticing that the winning model has a higher absolute bias than the simpler model 0 as it is underpredicting more than model 0 is overpredicting. However, we would prefer a model with significantly better accuracy and slightly higher bias since the latter is adjustable.
Generate Final Prediction and Submission file
# Final Prediction
lm_final.pred <- predict(fit.lm6, df_test)
df_forecast <- df_test %>%
mutate(visitors = lm_final.pred)
# if prediction is negative then set to 0
df_forecast$visitors[df_forecast$visitors < 0] <- 0
# look at final forecasts
df_forecast %>% head()
## air_store_id visit_date visitors total_reservations air_genre_name
## 1 air_00a91d42b08b08d9 2017-04-23 30.30785 0 Italian/French
## 2 air_00a91d42b08b08d9 2017-04-24 23.52486 0 Italian/French
## 3 air_00a91d42b08b08d9 2017-04-25 25.19798 0 Italian/French
## 4 air_00a91d42b08b08d9 2017-04-26 27.05880 0 Italian/French
## 5 air_00a91d42b08b08d9 2017-04-27 25.42580 0 Italian/French
## 6 air_00a91d42b08b08d9 2017-04-28 30.11991 0 Italian/French
## air_area_name latitude longitude day_of_week holiday_flg
## 1 Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536 Sunday 0
## 2 Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536 Monday 0
## 3 Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536 Tuesday 0
## 4 Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536 Wednesday 0
## 5 Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536 Thursday 0
## 6 Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536 Friday 0
## Year Month Day GeneralArea hist_avg_visitors hist_wa005_visitors
## 1 2017 4 23 Tōkyō-to 25.47938 27.42883
## 2 2017 4 24 Tōkyō-to 25.49231 27.45739
## 3 2017 4 25 Tōkyō-to 25.55612 27.98454
## 4 2017 4 26 Tōkyō-to 25.62437 28.53534
## 5 2017 4 27 Tōkyō-to 25.54040 27.55853
## 6 2017 4 28 Tōkyō-to 25.60804 28.13063
## hist_wa01_visitors hist_wa02_visitors hist_wa05_visitors
## 1 26.56268 25.23260 25.72418
## 2 26.70641 25.78608 26.86209
## 3 27.83577 28.22887 32.43104
## 4 28.95219 30.38309 35.71552
## 5 26.95697 26.10647 22.35776
## 6 28.16128 28.68518 30.67888
df_forecast %>% summary()
## air_store_id visit_date visitors total_reservations
## Length:32019 Min. :2017-04-23 Min. : 0.00 Min. : 0.00000
## Class :character 1st Qu.:2017-05-02 1st Qu.: 12.10 1st Qu.: 0.00000
## Mode :character Median :2017-05-12 Median : 19.29 Median : 0.00000
## Mean :2017-05-12 Mean : 21.00 Mean : 0.08561
## 3rd Qu.:2017-05-22 3rd Qu.: 28.37 3rd Qu.: 0.00000
## Max. :2017-05-31 Max. :121.86 Max. :54.00000
## NA's :19
## air_genre_name air_area_name latitude
## Izakaya :7605 Fukuoka-ken Fukuoka-shi Daimyō: 2457 Min. :33.21
## Cafe/Sweets :6903 Tōkyō-to Shibuya-ku Shibuya : 2262 1st Qu.:34.70
## Dining bar :4173 Tōkyō-to Minato-ku Shibakōen : 1989 Median :35.66
## Italian/French:3939 Tōkyō-to Shinjuku-ku Kabukichō: 1521 Mean :35.63
## Bar/Cocktail :3081 Tōkyō-to Setagaya-ku Setagaya : 1170 3rd Qu.:35.69
## Japanese food :2457 Tōkyō-to Chūō-ku Tsukiji : 1092 Max. :44.02
## (Other) :3861 (Other) :21528
## longitude day_of_week holiday_flg Year Month
## Min. :130.2 Friday :4105 0:28735 2016: 0 5 :25451
## 1st Qu.:135.3 Monday :4926 1: 3284 2017:32019 4 : 6568
## Median :139.7 Saturday :4105 1 : 0
## Mean :137.4 Sunday :4926 2 : 0
## 3rd Qu.:139.8 Thursday :4105 3 : 0
## Max. :144.3 Tuesday :4926 6 : 0
## Wednesday:4926 (Other): 0
## Day GeneralArea hist_avg_visitors hist_wa005_visitors
## 23 : 1642 Tōkyō-to :17160 Min. : 1.186 Min. : 1.062
## 24 : 1642 Fukuoka-ken : 4875 1st Qu.: 11.773 1st Qu.: 11.620
## 25 : 1642 Ōsaka-fu : 2886 Median : 18.992 Median : 18.987
## 26 : 1642 Hyōgo-ken : 2223 Mean : 21.050 Mean : 21.416
## 27 : 1642 Hokkaidō : 1716 3rd Qu.: 28.461 3rd Qu.: 29.412
## 28 : 1642 Hiroshima-ken: 1248 Max. :122.719 Max. :127.829
## (Other):22167 (Other) : 1911 NA's :19 NA's :19
## hist_wa01_visitors hist_wa02_visitors hist_wa05_visitors
## Min. : 1.027 Min. : 1.004 Min. : 1.00
## 1st Qu.: 11.510 1st Qu.: 11.322 1st Qu.: 10.63
## Median : 19.043 Median : 18.914 Median : 18.50
## Mean : 21.638 Mean : 21.715 Mean : 21.75
## 3rd Qu.: 29.634 3rd Qu.: 29.609 3rd Qu.: 29.71
## Max. :131.974 Max. :175.177 Max. :398.21
## NA's :19 NA's :19 NA's :19
# Submission file
df_submission <- df_sample_submission %>%
mutate(air_store_id = substr(df_sample_submission$id,1,20),
visit_date = as.Date(substr(df_sample_submission$id,22,31))
) %>%
select(id, air_store_id, visit_date)
df_submission <- df_submission %>%
inner_join(df_forecast, by=c('air_store_id', 'visit_date')) %>%
select(id, visitors)
# look at final submission file - should be the same as sample submission format / dimensions
dim(df_submission)
## [1] 32019 2
dim(df_sample_submission)
## [1] 32019 2
df_submission %>% head()
## id visitors
## 1 air_00a91d42b08b08d9_2017-04-23 30.30785
## 2 air_00a91d42b08b08d9_2017-04-24 23.52486
## 3 air_00a91d42b08b08d9_2017-04-25 25.19798
## 4 air_00a91d42b08b08d9_2017-04-26 27.05880
## 5 air_00a91d42b08b08d9_2017-04-27 25.42580
## 6 air_00a91d42b08b08d9_2017-04-28 30.11991
Write to csv
write.csv(df_submission,"/Users/yannos/Documents/PredictiveAnalytics/Midterm/submission.csv")