##Reading in Data

getwd()
## [1] "C:/Users/tlaff/OneDrive/Documents/R"
av_data <- read.csv('air_visit_data.csv')
submission_data <- read.csv('sample_submission.csv')
date_info <- read.csv('date_info.csv')
air_store_info <- read.csv('air_store_info.csv')

#av_data
head(av_data)
##           air_store_id visit_date visitors
## 1 air_ba937bf13d40fb24  1/13/2016       25
## 2 air_ba937bf13d40fb24  1/14/2016       32
## 3 air_ba937bf13d40fb24  1/15/2016       29
## 4 air_ba937bf13d40fb24  1/16/2016       22
## 5 air_ba937bf13d40fb24  1/18/2016        6
## 6 air_ba937bf13d40fb24  1/19/2016        9
  str(av_data)
## 'data.frame':    252108 obs. of  3 variables:
##  $ air_store_id: chr  "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" ...
##  $ visit_date  : chr  "1/13/2016" "1/14/2016" "1/15/2016" "1/16/2016" ...
##  $ visitors    : int  25 32 29 22 6 9 31 21 18 26 ...
#convert to date
av_data$visit_date <- as.Date.character(av_data$visit_date, "%m/%d/%Y")
sum(is.na(av_data))
## [1] 0
#submission data
head(submission_data)
##                                id visitors
## 1 air_00a91d42b08b08d9_2017-04-23        0
## 2 air_00a91d42b08b08d9_2017-04-24        0
## 3 air_00a91d42b08b08d9_2017-04-25        0
## 4 air_00a91d42b08b08d9_2017-04-26        0
## 5 air_00a91d42b08b08d9_2017-04-27        0
## 6 air_00a91d42b08b08d9_2017-04-28        0
submission_data$air_store_id <- str_sub(submission_data$id, 1,-12)
submission_data$visit_date <- str_sub(submission_data$id, 22,33)
head(submission_data)
##                                id visitors         air_store_id visit_date
## 1 air_00a91d42b08b08d9_2017-04-23        0 air_00a91d42b08b08d9 2017-04-23
## 2 air_00a91d42b08b08d9_2017-04-24        0 air_00a91d42b08b08d9 2017-04-24
## 3 air_00a91d42b08b08d9_2017-04-25        0 air_00a91d42b08b08d9 2017-04-25
## 4 air_00a91d42b08b08d9_2017-04-26        0 air_00a91d42b08b08d9 2017-04-26
## 5 air_00a91d42b08b08d9_2017-04-27        0 air_00a91d42b08b08d9 2017-04-27
## 6 air_00a91d42b08b08d9_2017-04-28        0 air_00a91d42b08b08d9 2017-04-28
str(submission_data)
## 'data.frame':    32019 obs. of  4 variables:
##  $ id          : chr  "air_00a91d42b08b08d9_2017-04-23" "air_00a91d42b08b08d9_2017-04-24" "air_00a91d42b08b08d9_2017-04-25" "air_00a91d42b08b08d9_2017-04-26" ...
##  $ visitors    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ air_store_id: chr  "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
##  $ visit_date  : chr  "2017-04-23" "2017-04-24" "2017-04-25" "2017-04-26" ...
submission_data$visit_date <- as.Date.character(submission_data$visit_date)

str(date_info)
## 'data.frame':    517 obs. of  3 variables:
##  $ calendar_date: chr  "1/1/2016" "1/2/2016" "1/3/2016" "1/4/2016" ...
##  $ day_of_week  : chr  "Friday" "Saturday" "Sunday" "Monday" ...
##  $ holiday_flg  : int  1 1 1 0 0 0 0 0 0 0 ...
head(date_info)
##   calendar_date day_of_week holiday_flg
## 1      1/1/2016      Friday           1
## 2      1/2/2016    Saturday           1
## 3      1/3/2016      Sunday           1
## 4      1/4/2016      Monday           0
## 5      1/5/2016     Tuesday           0
## 6      1/6/2016   Wednesday           0
date_info$calendar_date <- as.Date.character(date_info$calendar_date, "%m/%d/%Y")
str(date_info)
## 'data.frame':    517 obs. of  3 variables:
##  $ calendar_date: Date, format: "2016-01-01" "2016-01-02" ...
##  $ day_of_week  : chr  "Friday" "Saturday" "Sunday" "Monday" ...
##  $ holiday_flg  : int  1 1 1 0 0 0 0 0 0 0 ...
date_info$visit_date <-date_info$calendar_date

#Bind Data frames
#Join Visitor data with Date info by visit_date
visitor_df <- left_join(av_data,date_info, by = "visit_date")
head(visitor_df)
##           air_store_id visit_date visitors calendar_date day_of_week
## 1 air_ba937bf13d40fb24 2016-01-13       25    2016-01-13   Wednesday
## 2 air_ba937bf13d40fb24 2016-01-14       32    2016-01-14    Thursday
## 3 air_ba937bf13d40fb24 2016-01-15       29    2016-01-15      Friday
## 4 air_ba937bf13d40fb24 2016-01-16       22    2016-01-16    Saturday
## 5 air_ba937bf13d40fb24 2016-01-18        6    2016-01-18      Monday
## 6 air_ba937bf13d40fb24 2016-01-19        9    2016-01-19     Tuesday
##   holiday_flg
## 1           0
## 2           0
## 3           0
## 4           0
## 5           0
## 6           0
tail(visitor_df)
##                air_store_id visit_date visitors calendar_date day_of_week
## 252103 air_24e8414b9b07decb 2017-04-15        7    2017-04-15    Saturday
## 252104 air_24e8414b9b07decb 2017-04-18        6    2017-04-18     Tuesday
## 252105 air_24e8414b9b07decb 2017-04-19        6    2017-04-19   Wednesday
## 252106 air_24e8414b9b07decb 2017-04-20        7    2017-04-20    Thursday
## 252107 air_24e8414b9b07decb 2017-04-21        8    2017-04-21      Friday
## 252108 air_24e8414b9b07decb 2017-04-22        5    2017-04-22    Saturday
##        holiday_flg
## 252103           0
## 252104           0
## 252105           0
## 252106           0
## 252107           0
## 252108           0
sum(is.na(visitor_df))
## [1] 0
#Join Store info from AirRegi by air_store_id
visitor_df<- left_join(visitor_df,air_store_info, by = "air_store_id")
head(visitor_df)
##           air_store_id visit_date visitors calendar_date day_of_week
## 1 air_ba937bf13d40fb24 2016-01-13       25    2016-01-13   Wednesday
## 2 air_ba937bf13d40fb24 2016-01-14       32    2016-01-14    Thursday
## 3 air_ba937bf13d40fb24 2016-01-15       29    2016-01-15      Friday
## 4 air_ba937bf13d40fb24 2016-01-16       22    2016-01-16    Saturday
## 5 air_ba937bf13d40fb24 2016-01-18        6    2016-01-18      Monday
## 6 air_ba937bf13d40fb24 2016-01-19        9    2016-01-19     Tuesday
##   holiday_flg air_genre_name                   air_area_name latitude longitude
## 1           0     Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807  139.7516
## 2           0     Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807  139.7516
## 3           0     Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807  139.7516
## 4           0     Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807  139.7516
## 5           0     Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807  139.7516
## 6           0     Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807  139.7516
tail(visitor_df)
##                air_store_id visit_date visitors calendar_date day_of_week
## 252103 air_24e8414b9b07decb 2017-04-15        7    2017-04-15    Saturday
## 252104 air_24e8414b9b07decb 2017-04-18        6    2017-04-18     Tuesday
## 252105 air_24e8414b9b07decb 2017-04-19        6    2017-04-19   Wednesday
## 252106 air_24e8414b9b07decb 2017-04-20        7    2017-04-20    Thursday
## 252107 air_24e8414b9b07decb 2017-04-21        8    2017-04-21      Friday
## 252108 air_24e8414b9b07decb 2017-04-22        5    2017-04-22    Saturday
##        holiday_flg air_genre_name                 air_area_name latitude
## 252103           0          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
## 252104           0          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
## 252105           0          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
## 252106           0          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
## 252107           0          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
## 252108           0          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
##        longitude
## 252103   139.711
## 252104   139.711
## 252105   139.711
## 252106   139.711
## 252107   139.711
## 252108   139.711
sum(is.na(visitor_df))
## [1] 0
str(visitor_df)
## 'data.frame':    252108 obs. of  10 variables:
##  $ air_store_id  : chr  "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" ...
##  $ visit_date    : Date, format: "2016-01-13" "2016-01-14" ...
##  $ visitors      : int  25 32 29 22 6 9 31 21 18 26 ...
##  $ calendar_date : Date, format: "2016-01-13" "2016-01-14" ...
##  $ day_of_week   : chr  "Wednesday" "Thursday" "Friday" "Saturday" ...
##  $ holiday_flg   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ air_genre_name: chr  "Dining bar" "Dining bar" "Dining bar" "Dining bar" ...
##  $ air_area_name : chr  "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" ...
##  $ latitude      : num  35.7 35.7 35.7 35.7 35.7 ...
##  $ longitude     : num  140 140 140 140 140 ...
visitor_df <- subset(visitor_df, select = -c(calendar_date))
str(visitor_df)
## 'data.frame':    252108 obs. of  9 variables:
##  $ air_store_id  : chr  "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" ...
##  $ visit_date    : Date, format: "2016-01-13" "2016-01-14" ...
##  $ visitors      : int  25 32 29 22 6 9 31 21 18 26 ...
##  $ day_of_week   : chr  "Wednesday" "Thursday" "Friday" "Saturday" ...
##  $ holiday_flg   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ air_genre_name: chr  "Dining bar" "Dining bar" "Dining bar" "Dining bar" ...
##  $ air_area_name : chr  "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" ...
##  $ latitude      : num  35.7 35.7 35.7 35.7 35.7 ...
##  $ longitude     : num  140 140 140 140 140 ...
#EDA
dim(visitor_df)
## [1] 252108      9
summary(visitor_df)
##  air_store_id         visit_date            visitors      day_of_week       
##  Length:252108      Min.   :2016-01-01   Min.   :  1.00   Length:252108     
##  Class :character   1st Qu.:2016-07-23   1st Qu.:  9.00   Class :character  
##  Mode  :character   Median :2016-10-23   Median : 17.00   Mode  :character  
##                     Mean   :2016-10-12   Mean   : 20.97                     
##                     3rd Qu.:2017-01-24   3rd Qu.: 29.00                     
##                     Max.   :2017-04-22   Max.   :877.00                     
##   holiday_flg      air_genre_name     air_area_name         latitude    
##  Min.   :0.00000   Length:252108      Length:252108      Min.   :33.21  
##  1st Qu.:0.00000   Class :character   Class :character   1st Qu.:34.69  
##  Median :0.00000   Mode  :character   Mode  :character   Median :35.66  
##  Mean   :0.05067                                         Mean   :35.61  
##  3rd Qu.:0.00000                                         3rd Qu.:35.69  
##  Max.   :1.00000                                         Max.   :44.02  
##    longitude    
##  Min.   :130.2  
##  1st Qu.:135.3  
##  Median :139.7  
##  Mean   :137.4  
##  3rd Qu.:139.8  
##  Max.   :144.3
#Weekday view
visitor_df %>%
  mutate(day = wday(visit_date, label = TRUE, week_start = 1)) %>%
  group_by(day) %>%
  summarise(visits = mean(visitors)) %>%
  ggplot(aes(day, visits, fill = day)) +
  geom_col() +
  theme(legend.position = "none", axis.text.x  = element_text()) +
  labs(x = "Day of the week", y = "Mean visitors") + ggtitle("Visit Summary by Day of Week")

#Holiday Binary
visitor_df %>%
  ggplot(aes(holiday_flg, fill = holiday_flg)) +
  geom_bar() +
  theme(legend.position = "none")  

#Peak months
visitor_df %>%
  mutate(month = month(visit_date, label = TRUE)) %>%
  group_by(month) %>%
  summarise(visits = mean(visitors)) %>%
  ggplot(aes(month, visits, fill = month)) +
  geom_col() +  theme(legend.position = "none") + 
  labs(x = "Month", y = "Mean visitors") + ggtitle("Visit Summary by Month")

#Will define peak months by Mar - Jul and Dec

#Number of unique restaurants
restaurants <- unique(visitor_df$air_store_id)
length(restaurants)
## [1] 829
#829 restaurants
rest_genres <- unique(visitor_df$air_genre_name)
rest_genres
##  [1] "Dining bar"                   "Izakaya"                     
##  [3] "Cafe/Sweets"                  "Bar/Cocktail"                
##  [5] "Italian/French"               "Okonomiyaki/Monja/Teppanyaki"
##  [7] "Other"                        "International cuisine"       
##  [9] "Japanese food"                "Yakiniku/Korean food"        
## [11] "Karaoke/Party"                "Western food"                
## [13] "Creative cuisine"             "Asian"
#Aggregate Genre
agg_genre <- aggregate(visitors ~ air_genre_name, visitor_df, mean)
agg_genre$visitors <-round(agg_genre$visitors,1)
agg_genre
##                  air_genre_name visitors
## 1                         Asian     38.7
## 2                  Bar/Cocktail     13.3
## 3                   Cafe/Sweets     22.6
## 4              Creative cuisine     23.6
## 5                    Dining bar     18.7
## 6         International cuisine     25.2
## 7                Italian/French     22.6
## 8                       Izakaya     23.1
## 9                 Japanese food     19.6
## 10                Karaoke/Party     30.0
## 11 Okonomiyaki/Monja/Teppanyaki     22.6
## 12                        Other     19.9
## 13                 Western food     22.3
## 14         Yakiniku/Korean food     21.2
#Create Time-Series of Aggregated data
visitor_agg <- aggregate(x = visitor_df[c("visitors")], FUN = sum, by = list(Group.date = visitor_df$visit_date)) 
str(visitor_agg)
## 'data.frame':    478 obs. of  2 variables:
##  $ Group.date: Date, format: "2016-01-01" "2016-01-02" ...
##  $ visitors  : int  1033 1764 2368 3326 3927 4154 4431 6115 7306 6066 ...
visitor_agg_ts <- ts(visitor_agg$visitors, start = c(2016, 01), frequency = 366)
autoplot(visitor_agg_ts) + ylab("Total Visitors") + ggtitle("Aggregated Restaurant Visitor Totals by Day")

#can't decompose or show seasonality because it is a daily timeseries
#The trend is relatively steady with a leap during the summer of 2016, maybe representative of new restaurants entering the fold
#Trend appears additive, not exponential

ts_agg_display<- ggtsdisplay(visitor_agg_ts)

acf_agg_ts <- acf(visitor_agg_ts)

#Extreme autocorrelation, non stationary data
visitors_lambda <- BoxCox.lambda(visitor_agg_ts)
#No boxcox transformation
ndiffs(visitor_agg_ts)
## [1] 1
#Would difference 1 time


#Feature Creation
visitor_df_feat <- visitor_df
#create dummy variables for day of week, MorT / WoTh / FoSu, Saturday with coefficient
visitor_df_feat$MoT_flg <- ifelse(visitor_df_feat$day_of_week=="Monday"|visitor_df_feat$day_of_week=="Tuesday" ,1,0)
visitor_df_feat$WoTh_flg <- ifelse(visitor_df_feat$day_of_week=="Wednesday"|visitor_df_feat$day_of_week=="Thursday" ,1,0)
visitor_df_feat$FoSu_flg <- ifelse(visitor_df_feat$day_of_week=="Friday"|visitor_df_feat$day_of_week=="Sunday" ,1,0)
str(visitor_df_feat)
## 'data.frame':    252108 obs. of  12 variables:
##  $ air_store_id  : chr  "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" ...
##  $ visit_date    : Date, format: "2016-01-13" "2016-01-14" ...
##  $ visitors      : int  25 32 29 22 6 9 31 21 18 26 ...
##  $ day_of_week   : chr  "Wednesday" "Thursday" "Friday" "Saturday" ...
##  $ holiday_flg   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ air_genre_name: chr  "Dining bar" "Dining bar" "Dining bar" "Dining bar" ...
##  $ air_area_name : chr  "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" ...
##  $ latitude      : num  35.7 35.7 35.7 35.7 35.7 ...
##  $ longitude     : num  140 140 140 140 140 ...
##  $ MoT_flg       : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ WoTh_flg      : num  1 1 0 0 0 0 1 1 0 0 ...
##  $ FoSu_flg      : num  0 0 1 0 0 0 0 0 1 0 ...
#Create Month column, to isolate peak months
visitor_df_feat$month <- format(visitor_df_feat$visit_date,"%B")
visitor_df_feat$Peak_flg <- ifelse(visitor_df_feat$month=="March"|visitor_df_feat$month=="December"|visitor_df_feat$month=="April"|visitor_df_feat$month=="May"|visitor_df_feat$month=="June"|visitor_df_feat$month=="July",1,0)
#Create dummy variables for type of restaurant, leaving Asian out as part of coefficient
visitor_df_feat$Bar_gflg <- ifelse(visitor_df_feat$air_genre_name=="Bar/Cocktail",1,0)
visitor_df_feat$Cafe_gflg <- ifelse(visitor_df_feat$air_genre_name=="Cafe/Sweets",1,0)
visitor_df_feat$Creative_gflg <- ifelse(visitor_df_feat$air_genre_name=="Creative cuisine",1,0)
visitor_df_feat$Dining_bar_gflg <- ifelse(visitor_df_feat$air_genre_name=="Dining bar",1,0)
visitor_df_feat$Int_gflg <- ifelse(visitor_df_feat$air_genre_name=="International cuisine",1,0)
visitor_df_feat$IF_gflg <- ifelse(visitor_df_feat$air_genre_name=="Italian/French",1,0)
visitor_df_feat$Izakaya_gflg <- ifelse(visitor_df_feat$air_genre_name=="Izakaya",1,0)
visitor_df_feat$Japanese_gflg <- ifelse(visitor_df_feat$air_genre_name=="Japanese food",1,0)
visitor_df_feat$OMT_gflg <- ifelse(visitor_df_feat$air_genre_name=="Okonomiyaki/Monja/Teppanyaki",1,0)
visitor_df_feat$Other_gflg <- ifelse(visitor_df_feat$air_genre_name=="Other",1,0)
visitor_df_feat$Western_gflg <- ifelse(visitor_df_feat$air_genre_name=="Western food",1,0)
visitor_df_feat$Korean_gflg <- ifelse(visitor_df_feat$air_genre_name=="Yakiniku/Korean food",1,0)
str(visitor_df_feat)
## 'data.frame':    252108 obs. of  26 variables:
##  $ air_store_id   : chr  "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" ...
##  $ visit_date     : Date, format: "2016-01-13" "2016-01-14" ...
##  $ visitors       : int  25 32 29 22 6 9 31 21 18 26 ...
##  $ day_of_week    : chr  "Wednesday" "Thursday" "Friday" "Saturday" ...
##  $ holiday_flg    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ air_genre_name : chr  "Dining bar" "Dining bar" "Dining bar" "Dining bar" ...
##  $ air_area_name  : chr  "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" ...
##  $ latitude       : num  35.7 35.7 35.7 35.7 35.7 ...
##  $ longitude      : num  140 140 140 140 140 ...
##  $ MoT_flg        : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ WoTh_flg       : num  1 1 0 0 0 0 1 1 0 0 ...
##  $ FoSu_flg       : num  0 0 1 0 0 0 0 0 1 0 ...
##  $ month          : chr  "January" "January" "January" "January" ...
##  $ Peak_flg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Bar_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Cafe_gflg      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Creative_gflg  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dining_bar_gflg: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Int_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ IF_gflg        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Izakaya_gflg   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Japanese_gflg  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ OMT_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other_gflg     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Western_gflg   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Korean_gflg    : num  0 0 0 0 0 0 0 0 0 0 ...
#Create a numeric data frame - remove ->
visitor_df_mod <- subset(visitor_df_feat, select = -c(day_of_week, air_area_name, month, air_genre_name, air_store_id))
sum(is.na(visitor_df_mod))
## [1] 0
str(visitor_df_mod)
## 'data.frame':    252108 obs. of  21 variables:
##  $ visit_date     : Date, format: "2016-01-13" "2016-01-14" ...
##  $ visitors       : int  25 32 29 22 6 9 31 21 18 26 ...
##  $ holiday_flg    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ latitude       : num  35.7 35.7 35.7 35.7 35.7 ...
##  $ longitude      : num  140 140 140 140 140 ...
##  $ MoT_flg        : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ WoTh_flg       : num  1 1 0 0 0 0 1 1 0 0 ...
##  $ FoSu_flg       : num  0 0 1 0 0 0 0 0 1 0 ...
##  $ Peak_flg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Bar_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Cafe_gflg      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Creative_gflg  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dining_bar_gflg: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Int_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ IF_gflg        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Izakaya_gflg   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Japanese_gflg  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ OMT_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other_gflg     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Western_gflg   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Korean_gflg    : num  0 0 0 0 0 0 0 0 0 0 ...
#Create TS for TSLM
visitor_df_mod_ts <- ts(visitor_df_mod, frequency = 366)
str(visitor_df_mod_ts)
##  Time-Series [1:252108, 1:21] from 1 to 690: 16813 16814 16815 16816 16818 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:21] "visit_date" "visitors" "holiday_flg" "latitude" ...
#Includes Trend + holiday_flg + day of week flag
vfit_tslm_3 <- tslm(visitors~holiday_flg + MoT_flg + WoTh_flg + FoSu_flg, , data = visitor_df_mod_ts)
summary(vfit_tslm_3)
## 
## Call:
## tslm(formula = visitors ~ holiday_flg + MoT_flg + WoTh_flg + 
##     FoSu_flg, data = visitor_df_mod_ts)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28.64 -11.29  -3.88   7.84 858.12 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 26.15800    0.08315  314.59   <2e-16 ***
## holiday_flg  3.48109    0.14929   23.32   <2e-16 ***
## MoT_flg     -8.93802    0.10421  -85.77   <2e-16 ***
## WoTh_flg    -7.27607    0.10236  -71.09   <2e-16 ***
## FoSu_flg    -2.87282    0.10346  -27.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.42 on 252103 degrees of freedom
## Multiple R-squared:  0.03956,    Adjusted R-squared:  0.03954 
## F-statistic:  2596 on 4 and 252103 DF,  p-value: < 2.2e-16
#Layering in peak month
vfit_tslm_4 <- tslm(visitors~holiday_flg + MoT_flg + WoTh_flg + FoSu_flg + Peak_flg, , data = visitor_df_mod_ts)
summary(vfit_tslm_4)
## 
## Call:
## tslm(formula = visitors ~ holiday_flg + MoT_flg + WoTh_flg + 
##     FoSu_flg + Peak_flg, data = visitor_df_mod_ts)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29.84 -11.36  -3.37   8.03 857.06 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 25.23596    0.08848  285.21   <2e-16 ***
## holiday_flg  3.63388    0.14911   24.37   <2e-16 ***
## MoT_flg     -8.87460    0.10405  -85.29   <2e-16 ***
## WoTh_flg    -7.26724    0.10217  -71.13   <2e-16 ***
## FoSu_flg    -2.86110    0.10328  -27.70   <2e-16 ***
## Peak_flg     1.97508    0.06567   30.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.39 on 252102 degrees of freedom
## Multiple R-squared:  0.04299,    Adjusted R-squared:  0.04297 
## F-statistic:  2265 on 5 and 252102 DF,  p-value: < 2.2e-16
#Layering in Restaurant genre
vfit_tslm_5 <- tslm(visitors~holiday_flg + MoT_flg + WoTh_flg + 
                      FoSu_flg + Peak_flg + Bar_gflg + Cafe_gflg + Creative_gflg + 
                      Dining_bar_gflg + Int_gflg + IF_gflg + Izakaya_gflg +
                      Japanese_gflg + OMT_gflg + Other_gflg + Western_gflg+
                      Korean_gflg, , data = visitor_df_mod_ts)
summary(vfit_tslm_5)
## 
## Call:
## tslm(formula = visitors ~ holiday_flg + MoT_flg + WoTh_flg + 
##     FoSu_flg + Peak_flg + Bar_gflg + Cafe_gflg + Creative_gflg + 
##     Dining_bar_gflg + Int_gflg + IF_gflg + Izakaya_gflg + Japanese_gflg + 
##     OMT_gflg + Other_gflg + Western_gflg + Korean_gflg, data = visitor_df_mod_ts)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41.18 -10.98  -3.38   7.68 864.77 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      38.50071    0.50277   76.58   <2e-16 ***
## holiday_flg       3.67894    0.14640   25.13   <2e-16 ***
## MoT_flg          -8.88626    0.10216  -86.98   <2e-16 ***
## WoTh_flg         -7.26159    0.10032  -72.39   <2e-16 ***
## FoSu_flg         -2.84862    0.10140  -28.09   <2e-16 ***
## Peak_flg          1.99318    0.06448   30.91   <2e-16 ***
## Bar_gflg        -21.00221    0.50674  -41.45   <2e-16 ***
## Cafe_gflg       -11.73169    0.50138  -23.40   <2e-16 ***
## Creative_gflg   -10.66212    0.55985  -19.05   <2e-16 ***
## Dining_bar_gflg -15.50877    0.50404  -30.77   <2e-16 ***
## Int_gflg         -8.97146    0.97097   -9.24   <2e-16 ***
## IF_gflg         -11.67420    0.50510  -23.11   <2e-16 ***
## Izakaya_gflg    -11.11048    0.50066  -22.19   <2e-16 ***
## Japanese_gflg   -14.64771    0.51018  -28.71   <2e-16 ***
## OMT_gflg        -11.46343    0.56248  -20.38   <2e-16 ***
## Other_gflg      -14.38117    0.52716  -27.28   <2e-16 ***
## Western_gflg    -11.91596    0.54715  -21.78   <2e-16 ***
## Korean_gflg     -12.97055    0.53231  -24.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.09 on 252090 degrees of freedom
## Multiple R-squared:  0.07761,    Adjusted R-squared:  0.07754 
## F-statistic:  1248 on 17 and 252090 DF,  p-value: < 2.2e-16
plot(vfit_tslm_5$residuals)

#Overall R_Sqrd is quite low, though much of the difference may be explained in error term

#Simple TSLM
vfit_tslm_1 <- tslm(visitors~visit_date, data = visitor_df_mod_ts)
summary(vfit_tslm_1)
## 
## Call:
## tslm(formula = visitors ~ visit_date, data = visitor_df_mod_ts)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20.07 -11.99  -3.97   8.04 856.08 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 26.7732080  4.6304546   5.782 7.39e-09 ***
## visit_date  -0.0003394  0.0002710  -1.252     0.21    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.76 on 252106 degrees of freedom
## Multiple R-squared:  6.222e-06,  Adjusted R-squared:  2.256e-06 
## F-statistic: 1.569 on 1 and 252106 DF,  p-value: 0.2104
#Simple TSLM with holiday flag
vfit_tslm_2 <- tslm(visitors~visit_date + holiday_flg , data = visitor_df_mod_ts)
summary(vfit_tslm_2)
## 
## Call:
## tslm(formula = visitors ~ visit_date + holiday_flg, data = visitor_df_mod_ts)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -22.76 -11.85  -3.84   8.17 856.20 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.6698732  4.6285317    5.33 9.83e-08 ***
## visit_date  -0.0002248  0.0002709   -0.83    0.407    
## holiday_flg  2.8724353  0.1520934   18.89  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.75 on 252105 degrees of freedom
## Multiple R-squared:  0.001419,   Adjusted R-squared:  0.001411 
## F-statistic: 179.1 on 2 and 252105 DF,  p-value: < 2.2e-16
#Create Train & Test Sets
visitor_train_ts <- subset(visitor_agg_ts, end = 412)
visitor_test_ts <- subset(visitor_agg_ts, start = 413)

#ETS Model
vfit_ets <- ets(visitor_train_ts)
## Warning in ets(visitor_train_ts): I can't handle data with frequency greater
## than 24. Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
vfit_ets
## ETS(M,A,N) 
## 
## Call:
##  ets(y = visitor_train_ts) 
## 
##   Smoothing parameters:
##     alpha = 0.999 
##     beta  = 1e-04 
## 
##   Initial states:
##     l = 655.2775 
##     b = 439.2025 
## 
##   sigma:  0.2384
## 
##      AIC     AICc      BIC 
## 8856.739 8856.887 8876.844
#ETS (M,A,N) -> Multiplicative errors, additive trend and no seasonality
#ETS models do not handle daily frequency well
#The Alpha value of 0.9999 is extremely high which means the weighting of oberservations is heavy to the most recent
#AICc = 8856.887
vfc_ets <- forecast(vfit_ets, h = 105, level = c(95))
autoplot(vfc_ets)

vfc_ets
##           Point Forecast       Lo 95     Hi 95
## 2017.1257       14253.69    7594.958  20912.42
## 2017.1284       14676.43    4992.214  24360.64
## 2017.1311       15099.16    2901.413  27296.91
## 2017.1339       15521.90    1039.628  30004.17
## 2017.1366       15944.64    -700.293  32589.56
## 2017.1393       16367.37   -2372.364  35107.11
## 2017.1421       16790.11   -4008.294  37588.51
## 2017.1448       17212.84   -5628.629  40054.32
## 2017.1475       17635.58   -7247.656  42518.82
## 2017.1503       18058.32   -8875.872  44992.51
## 2017.1530       18481.05  -10521.335  47483.44
## 2017.1557       18903.79  -12190.472  49998.05
## 2017.1585       19326.53  -13888.572  52541.63
## 2017.1612       19749.26  -15620.114  55118.64
## 2017.1639       20172.00  -17388.988  57732.99
## 2017.1667       20594.74  -19198.644  60388.12
## 2017.1694       21017.47  -21052.209  63087.16
## 2017.1721       21440.21  -22952.561  65832.98
## 2017.1749       21862.95  -24902.392  68628.28
## 2017.1776       22285.68  -26904.253  71475.62
## 2017.1803       22708.42  -28960.592  74377.43
## 2017.1831       23131.16  -31073.777  77336.09
## 2017.1858       23553.89  -33246.124  80353.91
## 2017.1885       23976.63  -35479.912  83433.17
## 2017.1913       24399.36  -37777.397  86576.13
## 2017.1940       24822.10  -40140.828  89785.03
## 2017.1967       25244.84  -42572.455  93062.13
## 2017.1995       25667.57  -45074.535  96409.68
## 2017.2022       26090.31  -47649.347  99829.97
## 2017.2049       26513.05  -50299.193 103325.29
## 2017.2077       26935.78  -53026.405 106897.97
## 2017.2104       27358.52  -55833.352 110550.39
## 2017.2131       27781.26  -58722.442 114284.96
## 2017.2158       28203.99  -61696.130 118104.12
## 2017.2186       28626.73  -64756.920 122010.38
## 2017.2213       29049.47  -67907.369 126006.30
## 2017.2240       29472.20  -71150.089 130094.49
## 2017.2268       29894.94  -74487.754 134277.63
## 2017.2295       30317.68  -77923.102 138558.45
## 2017.2322       30740.41  -81458.936 142939.76
## 2017.2350       31163.15  -85098.129 147424.43
## 2017.2377       31585.88  -88843.629 152015.40
## 2017.2404       32008.62  -92698.456 156715.70
## 2017.2432       32431.36  -96665.714 161528.43
## 2017.2459       32854.09 -100748.584 166456.77
## 2017.2486       33276.83 -104950.334 171504.00
## 2017.2514       33699.57 -109274.321 176673.46
## 2017.2541       34122.30 -113723.991 181968.60
## 2017.2568       34545.04 -118302.885 187392.97
## 2017.2596       34967.78 -123014.641 192950.19
## 2017.2623       35390.51 -127862.995 198644.02
## 2017.2650       35813.25 -132851.791 204478.29
## 2017.2678       36235.99 -137984.974 210456.95
## 2017.2705       36658.72 -143266.602 216584.05
## 2017.2732       37081.46 -148700.846 222863.76
## 2017.2760       37504.20 -154291.993 229300.38
## 2017.2787       37926.93 -160044.450 235898.31
## 2017.2814       38349.67 -165962.747 242662.08
## 2017.2842       38772.40 -172051.542 249596.35
## 2017.2869       39195.14 -178315.624 256705.91
## 2017.2896       39617.88 -184759.915 263995.67
## 2017.2923       40040.61 -191389.478 271470.71
## 2017.2951       40463.35 -198209.518 279136.22
## 2017.2978       40886.09 -205225.386 286997.56
## 2017.3005       41308.82 -212442.583 295060.23
## 2017.3033       41731.56 -219866.768 303329.89
## 2017.3060       42154.30 -227503.757 311812.35
## 2017.3087       42577.03 -235359.531 320513.60
## 2017.3115       42999.77 -243440.242 329439.78
## 2017.3142       43422.51 -251752.212 338597.22
## 2017.3169       43845.24 -260301.943 347992.43
## 2017.3197       44267.98 -269096.123 357632.08
## 2017.3224       44690.72 -278141.624 367523.06
## 2017.3251       45113.45 -287445.517 377672.42
## 2017.3279       45536.19 -297015.068 388087.45
## 2017.3306       45958.93 -306857.752 398775.60
## 2017.3333       46381.66 -316981.253 409744.58
## 2017.3361       46804.40 -327393.472 421002.27
## 2017.3388       47227.13 -338102.533 432556.80
## 2017.3415       47649.87 -349116.790 444416.53
## 2017.3443       48072.61 -360444.830 456590.05
## 2017.3470       48495.34 -372095.486 469086.17
## 2017.3497       48918.08 -384077.838 481914.00
## 2017.3525       49340.82 -396401.221 495082.85
## 2017.3552       49763.55 -409075.235 508602.34
## 2017.3579       50186.29 -422109.749 522482.33
## 2017.3607       50609.03 -435514.912 536732.96
## 2017.3634       51031.76 -449301.157 551364.68
## 2017.3661       51454.50 -463479.211 566388.21
## 2017.3689       51877.24 -478060.106 581814.58
## 2017.3716       52299.97 -493055.180 597655.12
## 2017.3743       52722.71 -508476.094 613921.51
## 2017.3770       53145.45 -524334.835 630625.73
## 2017.3798       53568.18 -540643.728 647780.09
## 2017.3825       53990.92 -557415.446 665397.28
## 2017.3852       54413.65 -574663.017 683490.33
## 2017.3880       54836.39 -592399.836 702072.62
## 2017.3907       55259.13 -610639.675 721157.93
## 2017.3934       55681.86 -629396.692 740760.42
## 2017.3962       56104.60 -648685.446 760894.65
## 2017.3989       56527.34 -668520.902 781575.58
## 2017.4016       56950.07 -688918.448 802818.60
## 2017.4044       57372.81 -709893.904 824639.52
## 2017.4071       57795.55 -731463.534 847054.63
## 2017.4098       58218.28 -753644.060 870080.63
checkresiduals(vfc_ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,A,N)
## Q* = 2977.7, df = 78, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 82
#Passed the Ljung-box text but still apears to have autocorrelation
accuracy(vfc_ets, visitor_agg_ts)
##                      ME      RMSE       MAE        MPE     MAPE      MASE
## Training set   -399.648  2791.139  1999.147  -7.947741 20.68651 0.2562708
## Test set     -12397.822 15053.249 12791.049 -87.538742 89.50499 1.6396862
##                   ACF1 Theil's U
## Training set 0.1309531        NA
## Test set     0.8892403  4.845375
#RMSE = 15053, essentially each day's forecast has as a standard deviation of 2791 visitors

#ARIMA Model
vfit_arima <- auto.arima(visitor_train_ts)
## Warning: The chosen seasonal unit root test encountered an error when testing for the first difference.
## From stl(): series is not periodic or has less than two periods
## 0 seasonal differences will be used. Consider using a different unit root test.
vfit_arima
## Series: visitor_train_ts 
## ARIMA(5,1,3) 
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ma1     ma2      ma3
##       0.2046  -0.9608  -0.0021  -0.4692  -0.4734  -0.2706  0.5058  -0.2420
## s.e.  0.0680   0.0525   0.0926   0.0475   0.0596   0.0732  0.0502   0.0865
## 
## sigma^2 estimated as 2413885:  log likelihood=-3601.79
## AIC=7221.58   AICc=7222.03   BIC=7257.75
#ARIMA(5,1,3), no seasonality. AICc = 7222.03, 1 differencing
fc_arima <- forecast(vfit_arima, h = 105, level = c(95))
fc_arima
##           Point Forecast     Lo 95    Hi 95
## 2017.1257       13230.44 10185.303 16275.57
## 2017.1284       16800.70 12634.108 20967.29
## 2017.1311       18384.02 13983.039 22785.00
## 2017.1339       13635.19  9196.760 18073.62
## 2017.1366       10706.31  6254.440 15158.18
## 2017.1393       13276.31  8818.143 17734.47
## 2017.1421       14192.83  9690.197 18695.45
## 2017.1448       13395.66  8274.723 18516.59
## 2017.1475       15969.11 10278.116 21660.11
## 2017.1503       17440.29 11637.325 23243.26
## 2017.1530       13623.51  7780.627 19466.40
## 2017.1557       11364.05  5487.810 17240.29
## 2017.1585       13735.99  7857.145 19614.83
## 2017.1612       14491.40  8564.872 20417.93
## 2017.1639       13465.96  7161.043 19770.87
## 2017.1667       15392.57  8738.579 22046.57
## 2017.1694       16727.13  9996.517 23457.74
## 2017.1721       13673.72  6901.624 20445.82
## 2017.1749       11886.40  5062.977 18709.83
## 2017.1776       14033.31  7194.566 20872.05
## 2017.1803       14657.80  7768.679 21546.91
## 2017.1831       13527.31  6374.973 20679.65
## 2017.1858       14975.77  7582.699 22368.85
## 2017.1885       16195.78  8742.167 23649.38
## 2017.1913       13746.55  6249.112 21243.98
## 2017.1940       12305.13  4744.004 19866.26
## 2017.1967       14216.61  6627.013 21806.20
## 2017.1995       14739.45  7098.406 22380.49
## 2017.2022       13584.41  5746.934 21421.89
## 2017.2049       14677.67  6660.835 22694.51
## 2017.2077       15795.52  7726.050 23865.00
## 2017.2104       13825.89  5710.465 21941.32
## 2017.2131       12641.10  4454.683 20827.52
## 2017.2158       14322.76  6097.630 22547.89
## 2017.2186       14767.12  6490.499 23043.74
## 2017.2213       13639.64  5208.936 22070.35
## 2017.2240       14466.96  5894.576 23039.34
## 2017.2268       15490.44  6869.365 24111.51
## 2017.2295       13902.59  5233.942 22571.23
## 2017.2322       12911.35  4168.292 21654.41
## 2017.2350       14377.70  5588.859 23166.54
## 2017.2377       14761.45  5921.635 23601.26
## 2017.2404       13693.58  4728.032 22659.13
## 2017.2432       14320.22  5237.493 23402.95
## 2017.2459       15254.89  6125.395 24384.38
## 2017.2486       13971.94  4793.797 23150.08
## 2017.2514       13129.54  3876.266 22382.81
## 2017.2541       14399.50  5095.989 23703.02
## 2017.2568       14736.11  5382.411 24089.81
## 2017.2596       13745.98  4286.256 23205.70
## 2017.2623       14220.03  4659.821 23780.23
## 2017.2650       15070.58  5464.475 24676.68
## 2017.2678       14031.96  4376.635 23687.29
## 2017.2705       13306.52  3577.078 23035.96
## 2017.2732       14400.62  4618.413 24182.83
## 2017.2760       14700.08  4868.553 24531.61
## 2017.2787       13796.26  3872.863 23719.65
## 2017.2814       14153.47  4141.435 24165.51
## 2017.2842       14924.41  4866.820 24982.01
## 2017.2869       14082.29  3975.299 24189.27
## 2017.2896       13450.86  3271.796 23629.91
## 2017.2923       14389.52  4156.526 24622.51
## 2017.2951       14659.12  4377.666 24940.56
## 2017.2978       13843.83  3480.944 24206.72
## 2017.3005       14111.04  3668.228 24553.85
## 2017.3033       14806.99  4318.723 25295.25
## 2017.3060       14123.41  3585.905 24660.92
## 2017.3087       13569.25  2962.257 24176.25
## 2017.3115       14371.85  3710.659 25033.04
## 2017.3142       14616.84  3908.014 25325.67
## 2017.3169       13888.21  3105.817 24670.60
## 2017.3197       14085.75  3230.060 24941.43
## 2017.3224       14711.50  3810.396 25612.60
## 2017.3251       14156.29  3206.324 25106.26
## 2017.3279       13666.97  2650.320 24683.62
## 2017.3306       14351.30  3280.787 25421.82
## 2017.3333       14575.46  3458.076 25692.85
## 2017.3361       13929.06  2744.188 25113.94
## 2017.3388       14072.49  2819.520 25325.46
## 2017.3415       14633.00  3334.667 25931.33
## 2017.3443       14182.03  2835.373 25528.68
## 2017.3470       13748.11  2337.599 25158.62
## 2017.3497       14330.21  2866.533 25793.89
## 2017.3525       14536.24  3026.410 26046.08
## 2017.3552       13966.23  2393.685 25538.77
## 2017.3579       14067.56  2431.116 25704.01
## 2017.3607       14567.85  2886.160 26249.54
## 2017.3634       14201.75  2472.381 25931.11
## 2017.3661       13815.88  2025.395 25606.36
## 2017.3689       14309.98  2467.244 26152.72
## 2017.3716       14499.85  2611.628 26388.07
## 2017.3743       13999.67  2052.578 25946.76
## 2017.3770       14068.28  2060.740 26075.82
## 2017.3798       14513.35  2460.761 26565.93
## 2017.3825       14216.49  2116.946 26316.04
## 2017.3852       13872.80  1714.709 26030.89
## 2017.3880       14291.40  2082.093 26500.71
## 2017.3907       14466.56  2212.406 26720.72
## 2017.3934       14029.48  1719.591 26339.37
## 2017.3962       14072.71  1705.286 26440.14
## 2017.3989       14467.45  2055.258 26879.64
## 2017.4016       14227.20  1768.798 26685.60
## 2017.4044       13920.85  1406.291 26435.41
## 2017.4071       14274.85  1710.158 26839.54
## 2017.4098       14436.42  1827.498 27045.34
autoplot(fc_arima)

checkresiduals(fc_arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,1,3)
## Q* = 131.32, df = 74, p-value = 4.64e-05
## 
## Model df: 8.   Total lags used: 82
#Looks much more normally distributed with white noise residuals
accuracy(fc_arima, visitor_agg_ts)
##                      ME     RMSE      MAE        MPE      MAPE      MASE
## Training set   81.27446 1536.605  989.320 -0.4383417  9.954058 0.1268211
## Test set     1350.05740 3012.803 2244.132  5.4988128 13.222301 0.2876755
##                     ACF1 Theil's U
## Training set -0.02974393        NA
## Test set      0.52568893 0.8183462
#The ARIMA model produces much better fitting predicitions in the test set than ETS
#RMSE = 3012

#TBATS
vfit_tbats <- tbats(visitor_train_ts)
vfit_tbats
## BATS(0.052, {5,3}, 0.961, -)
## 
## Call: tbats(y = visitor_train_ts)
## 
## Parameters
##   Lambda: 0.052406
##   Alpha: 0.9130191
##   Beta: 0.04428732
##   Damping Parameter: 0.96069
##   AR coefficients: 0.147807 -0.973041 -0.047549 -0.48054 -0.489072
##   MA coefficients: -0.108566 0.502682 -0.219851
## 
## Seed States:
##            [,1]
##  [1,] 9.7074953
##  [2,] 0.2464777
##  [3,] 0.0000000
##  [4,] 0.0000000
##  [5,] 0.0000000
##  [6,] 0.0000000
##  [7,] 0.0000000
##  [8,] 0.0000000
##  [9,] 0.0000000
## [10,] 0.0000000
## attr(,"lambda")
## [1] 0.05240574
## 
## Sigma: 0.2536455
## AIC: 8513.792
#Almost a full log transformation
#ARMA terms of 5 and 3, respectively
#Dampening at a rate 0.961
#No fourier terms selected
vfc_tbats <- forecast(vfit_tbats, h = 105, level = c(95))
vfc_tbats
##           Point Forecast     Lo 95     Hi 95
## 2017.1257       13327.55  9827.659  17987.54
## 2017.1284       16805.90 10977.808  25490.04
## 2017.1311       18500.77 11710.706  28916.65
## 2017.1339       13141.10  8193.974  20835.97
## 2017.1366       10847.46  6701.863  17350.53
## 2017.1393       13439.88  8343.355  21399.44
## 2017.1421       14508.48  8902.210  23358.86
## 2017.1448       13598.15  7695.363  23635.62
## 2017.1475       16099.90  8521.114  29801.14
## 2017.1503       17410.55  9060.310  32741.02
## 2017.1530       13078.22  6673.684  25048.40
## 2017.1557       11470.62  5763.270  22289.33
## 2017.1585       14166.89  7125.284  27502.32
## 2017.1612       14948.17  7397.414  29459.98
## 2017.1639       13692.28  6372.938  28563.50
## 2017.1667       15598.90  6947.321  33890.99
## 2017.1694       16684.39  7333.877  36689.58
## 2017.1721       13148.62  5650.925  29518.53
## 2017.1749       12023.13  5063.187  27499.42
## 2017.1776       14674.02  6161.369  33653.09
## 2017.1803       15167.88  6250.620  35386.03
## 2017.1831       13722.72  5374.138  33535.39
## 2017.1858       15242.05  5778.116  38363.55
## 2017.1885       16205.90  6073.141  41216.98
## 2017.1913       13280.38  4855.614  34537.09
## 2017.1940       12495.72  4464.053  33182.44
## 2017.1967       15003.26  5330.379  40039.49
## 2017.1995       15247.25  5310.832  41421.47
## 2017.2022       13729.81  4574.886  38810.66
## 2017.2049       14997.58  4870.171  43381.13
## 2017.2077       15887.60  5103.826  46399.58
## 2017.2104       13434.54  4204.961  40154.09
## 2017.2131       12887.64  3936.391  39364.03
## 2017.2158       15198.83  4610.485  46707.22
## 2017.2186       15243.38  4531.891  47685.58
## 2017.2213       13735.20  3924.487  44500.00
## 2017.2240       14836.89  4149.229  48985.57
## 2017.2268       15672.27  4337.298  52220.63
## 2017.2295       13588.42  3661.398  46350.62
## 2017.2322       13205.42  3471.637  46024.71
## 2017.2350       15299.21  3992.112  53671.51
## 2017.2377       15195.00  3886.190  54246.21
## 2017.2404       13748.62  3390.943  50656.07
## 2017.2432       14736.49  3567.839  55175.58
## 2017.2459       15522.33  3719.630  58637.98
## 2017.2486       13729.98  3202.677  53088.89
## 2017.2514       13459.14  3064.581  53136.69
## 2017.2541       15335.14  3465.086  60943.82
## 2017.2568       15126.75  3350.938  61148.38
## 2017.2596       13772.96  2950.214  57303.37
## 2017.2623       14677.96  3092.832  61933.02
## 2017.2650       15413.29  3214.753  65605.94
## 2017.2678       13853.94  2813.186  60330.35
## 2017.2705       13659.89  2709.872  60676.52
## 2017.2732       15329.72  3017.961  68537.74
## 2017.2760       15053.33  2906.341  68423.99
## 2017.2787       13807.59  2583.440  64451.03
## 2017.2814       14647.39  2700.164  69233.79
## 2017.2842       15329.47  2797.235  73083.40
## 2017.2869       13959.13  2480.952  68042.67
## 2017.2896       13818.22  2401.768  68628.60
## 2017.2923       15299.46  2639.175  76468.98
## 2017.2951       14982.84  2535.580  76096.07
## 2017.2978       13850.31  2275.786  72099.44
## 2017.3005       14634.60  2372.093  77053.08
## 2017.3033       15261.07  2448.540  81036.88
## 2017.3060       14046.62  2196.404  76201.95
## 2017.3087       13943.37  2134.488  76984.96
## 2017.3115       15255.66  2317.972  84754.03
## 2017.3142       14919.23  2224.734  84181.18
## 2017.3169       13898.47  2015.636  80244.15
## 2017.3197       14632.41  2095.359  85368.18
## 2017.3224       15202.21  2154.829  89441.30
## 2017.3251       14118.59  1951.743  84792.63
## 2017.3279       14042.97  1902.553  85743.71
## 2017.3306       15205.76  2044.866  93408.99
## 2017.3333       14863.92  1962.511  92691.21
## 2017.3361       13949.54  1793.922  88878.49
## 2017.3388       14635.92  1859.926  94160.01
## 2017.3415       15149.47  1905.568  98279.66
## 2017.3443       14177.57  1740.551  93806.35
## 2017.3470       14123.07  1700.985  94907.32
## 2017.3497       15154.43  1811.770 102448.82
## 2017.3525       14816.96  1739.855 101634.76
## 2017.3552       14001.41  1603.556  97995.26
## 2017.3579       14641.93  1658.106 103413.75
## 2017.3607       15101.00  1692.606 107542.03
## 2017.3634       14226.02  1557.531 103240.47
## 2017.3661       14188.36  1525.394 104481.07
## 2017.3689       15104.43  1611.927 111887.07
## 2017.3716       14777.66  1549.551 111018.35
## 2017.3743       14052.45  1438.980 107587.99
## 2017.3770       14648.47  1483.933 113119.01
## 2017.3798       15055.87  1509.549 117224.36
## 2017.3825       14266.11  1398.292 113096.64
## 2017.3852       14242.35  1371.982 114471.92
## 2017.3880       15057.28  1439.761 121735.92
## 2017.3907       14745.07  1385.853 120847.30
## 2017.3934       14101.50  1295.807 117651.60
## 2017.3962       14654.41  1332.717 123269.60
## 2017.3989       15013.68  1351.322 127327.29
## 2017.4016       14299.68  1259.194 123379.50
## 2017.4044       14287.66  1237.507 124887.74
## 2017.4071       15013.68  1290.703 132006.42
## 2017.4098       14718.14  1244.183 131126.50
autoplot(vfc_tbats)

checkresiduals(vfc_tbats)

## 
##  Ljung-Box test
## 
## data:  Residuals from BATS(0.052, {5,3}, 0.961, -)
## Q* = 81.954, df = 60, p-value = 0.03139
## 
## Model df: 22.   Total lags used: 82
#White noise, residuals with relatively consistent residuals and normally distributed
accuracy(vfc_tbats, visitor_agg_ts)
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set   28.48698 1597.778 1001.513 -1.993263 10.54526 0.1283840
## Test set     1118.26458 2998.777 2335.599  3.857327 14.15857 0.2994007
##                     ACF1 Theil's U
## Training set -0.04089051        NA
## Test set      0.55109087 0.8319025
#Very similar RMSE to the ARIMA model, 2998

#TSLM model of aggregate
vts_agg <- ts(visitor_agg, frequency = 366)
vfit_tslm_agg <- tslm(visitors~Group.date, vts_agg)
summary(vfit_tslm_agg)
## 
## Call:
## tslm(formula = visitors ~ Group.date, data = vts_agg)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11132  -2297   -529   1988   9959 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.521e+05  1.934e+04  -23.38   <2e-16 ***
## Group.date   2.718e+01  1.135e+00   23.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3424 on 476 degrees of freedom
## Multiple R-squared:  0.5464, Adjusted R-squared:  0.5455 
## F-statistic: 573.5 on 1 and 476 DF,  p-value: < 2.2e-16
###Predict data prep
predict_df <- submission_data
str(predict_df)
## 'data.frame':    32019 obs. of  4 variables:
##  $ id          : chr  "air_00a91d42b08b08d9_2017-04-23" "air_00a91d42b08b08d9_2017-04-24" "air_00a91d42b08b08d9_2017-04-25" "air_00a91d42b08b08d9_2017-04-26" ...
##  $ visitors    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ air_store_id: chr  "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
##  $ visit_date  : Date, format: "2017-04-23" "2017-04-24" ...
#remove id field and null visitor field
predict_df <- subset(predict_df, select = -c(id, visitors))
predict_df <- left_join(predict_df,date_info, by = "visit_date")
head(predict_df)
##           air_store_id visit_date calendar_date day_of_week holiday_flg
## 1 air_00a91d42b08b08d9 2017-04-23    2017-04-23      Sunday           0
## 2 air_00a91d42b08b08d9 2017-04-24    2017-04-24      Monday           0
## 3 air_00a91d42b08b08d9 2017-04-25    2017-04-25     Tuesday           0
## 4 air_00a91d42b08b08d9 2017-04-26    2017-04-26   Wednesday           0
## 5 air_00a91d42b08b08d9 2017-04-27    2017-04-27    Thursday           0
## 6 air_00a91d42b08b08d9 2017-04-28    2017-04-28      Friday           0
tail(predict_df)
##               air_store_id visit_date calendar_date day_of_week holiday_flg
## 32014 air_fff68b929994bfbd 2017-05-26    2017-05-26      Friday           0
## 32015 air_fff68b929994bfbd 2017-05-27    2017-05-27    Saturday           0
## 32016 air_fff68b929994bfbd 2017-05-28    2017-05-28      Sunday           0
## 32017 air_fff68b929994bfbd 2017-05-29    2017-05-29      Monday           0
## 32018 air_fff68b929994bfbd 2017-05-30    2017-05-30     Tuesday           0
## 32019 air_fff68b929994bfbd 2017-05-31    2017-05-31   Wednesday           0
sum(is.na(predict_df))
## [1] 0
#Join Store info from AeRegi by air_store_id
predict_df<- left_join(predict_df,air_store_info, by = "air_store_id")
head(predict_df)
##           air_store_id visit_date calendar_date day_of_week holiday_flg
## 1 air_00a91d42b08b08d9 2017-04-23    2017-04-23      Sunday           0
## 2 air_00a91d42b08b08d9 2017-04-24    2017-04-24      Monday           0
## 3 air_00a91d42b08b08d9 2017-04-25    2017-04-25     Tuesday           0
## 4 air_00a91d42b08b08d9 2017-04-26    2017-04-26   Wednesday           0
## 5 air_00a91d42b08b08d9 2017-04-27    2017-04-27    Thursday           0
## 6 air_00a91d42b08b08d9 2017-04-28    2017-04-28      Friday           0
##   air_genre_name                     air_area_name latitude longitude
## 1 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami   35.694  139.7536
## 2 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami   35.694  139.7536
## 3 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami   35.694  139.7536
## 4 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami   35.694  139.7536
## 5 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami   35.694  139.7536
## 6 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami   35.694  139.7536
tail(visitor_df)
##                air_store_id visit_date visitors day_of_week holiday_flg
## 252103 air_24e8414b9b07decb 2017-04-15        7    Saturday           0
## 252104 air_24e8414b9b07decb 2017-04-18        6     Tuesday           0
## 252105 air_24e8414b9b07decb 2017-04-19        6   Wednesday           0
## 252106 air_24e8414b9b07decb 2017-04-20        7    Thursday           0
## 252107 air_24e8414b9b07decb 2017-04-21        8      Friday           0
## 252108 air_24e8414b9b07decb 2017-04-22        5    Saturday           0
##        air_genre_name                 air_area_name latitude longitude
## 252103          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322   139.711
## 252104          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322   139.711
## 252105          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322   139.711
## 252106          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322   139.711
## 252107          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322   139.711
## 252108          Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322   139.711
sum(is.na(predict_df))
## [1] 0
str(predict_df)
## 'data.frame':    32019 obs. of  9 variables:
##  $ air_store_id  : chr  "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
##  $ visit_date    : Date, format: "2017-04-23" "2017-04-24" ...
##  $ calendar_date : Date, format: "2017-04-23" "2017-04-24" ...
##  $ day_of_week   : chr  "Sunday" "Monday" "Tuesday" "Wednesday" ...
##  $ holiday_flg   : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ air_genre_name: chr  "Italian/French" "Italian/French" "Italian/French" "Italian/French" ...
##  $ air_area_name : chr  "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" ...
##  $ latitude      : num  35.7 35.7 35.7 35.7 35.7 ...
##  $ longitude     : num  140 140 140 140 140 ...
predict_df <- subset(predict_df, select = -c(calendar_date))
str(predict_df)
## 'data.frame':    32019 obs. of  8 variables:
##  $ air_store_id  : chr  "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
##  $ visit_date    : Date, format: "2017-04-23" "2017-04-24" ...
##  $ day_of_week   : chr  "Sunday" "Monday" "Tuesday" "Wednesday" ...
##  $ holiday_flg   : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ air_genre_name: chr  "Italian/French" "Italian/French" "Italian/French" "Italian/French" ...
##  $ air_area_name : chr  "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" ...
##  $ latitude      : num  35.7 35.7 35.7 35.7 35.7 ...
##  $ longitude     : num  140 140 140 140 140 ...
#Feature Creation - submission set
predict_df_feat <- predict_df
#create dummy variables for day of week, MorT / WoTh / FoSu, Saturday with coefficient
predict_df_feat$MoT_flg <- ifelse(predict_df_feat$day_of_week=="Monday"|predict_df_feat$day_of_week=="Tuesday" ,1,0)
predict_df_feat$WoTh_flg <- ifelse(predict_df_feat$day_of_week=="Wednesday"|predict_df_feat$day_of_week=="Thursday" ,1,0)
predict_df_feat$FoSu_flg <- ifelse(predict_df_feat$day_of_week=="Friday"|predict_df_feat$day_of_week=="Sunday" ,1,0)
str(predict_df_feat)
## 'data.frame':    32019 obs. of  11 variables:
##  $ air_store_id  : chr  "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
##  $ visit_date    : Date, format: "2017-04-23" "2017-04-24" ...
##  $ day_of_week   : chr  "Sunday" "Monday" "Tuesday" "Wednesday" ...
##  $ holiday_flg   : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ air_genre_name: chr  "Italian/French" "Italian/French" "Italian/French" "Italian/French" ...
##  $ air_area_name : chr  "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" ...
##  $ latitude      : num  35.7 35.7 35.7 35.7 35.7 ...
##  $ longitude     : num  140 140 140 140 140 ...
##  $ MoT_flg       : num  0 1 1 0 0 0 0 0 1 1 ...
##  $ WoTh_flg      : num  0 0 0 1 1 0 0 0 0 0 ...
##  $ FoSu_flg      : num  1 0 0 0 0 1 0 1 0 0 ...
#Create Month column, to isolate peak months
predict_df_feat$month <- format(predict_df_feat$visit_date,"%B")
predict_df_feat$Peak_flg <- ifelse(predict_df_feat$month=="March"|predict_df_feat$month=="December"|predict_df_feat$month=="April"|predict_df_feat$month=="May"|predict_df_feat$month=="June"|predict_df_feat$month=="July",1,0)
#Create dummy variables for type of restaurant, leaving Asian out as part of coefficient
predict_df_feat$Bar_gflg <- ifelse(predict_df_feat$air_genre_name=="Bar/Cocktail",1,0)
predict_df_feat$Cafe_gflg <- ifelse(predict_df_feat$air_genre_name=="Cafe/Sweets",1,0)
predict_df_feat$Creative_gflg <- ifelse(predict_df_feat$air_genre_name=="Creative cuisine",1,0)
predict_df_feat$Dining_bar_gflg <- ifelse(predict_df_feat$air_genre_name=="Dining bar",1,0)
predict_df_feat$Int_gflg <- ifelse(predict_df_feat$air_genre_name=="International cuisine",1,0)
predict_df_feat$IF_gflg <- ifelse(predict_df_feat$air_genre_name=="Italian/French",1,0)
predict_df_feat$Izakaya_gflg <- ifelse(predict_df_feat$air_genre_name=="Izakaya",1,0)
predict_df_feat$Japanese_gflg <- ifelse(predict_df_feat$air_genre_name=="Japanese food",1,0)
predict_df_feat$OMT_gflg <- ifelse(predict_df_feat$air_genre_name=="Okonomiyaki/Monja/Teppanyaki",1,0)
predict_df_feat$Other_gflg <- ifelse(predict_df_feat$air_genre_name=="Other",1,0)
predict_df_feat$Western_gflg <- ifelse(predict_df_feat$air_genre_name=="Western food",1,0)
predict_df_feat$Korean_gflg <- ifelse(predict_df_feat$air_genre_name=="Yakiniku/Korean food",1,0)
str(predict_df_feat)
## 'data.frame':    32019 obs. of  25 variables:
##  $ air_store_id   : chr  "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
##  $ visit_date     : Date, format: "2017-04-23" "2017-04-24" ...
##  $ day_of_week    : chr  "Sunday" "Monday" "Tuesday" "Wednesday" ...
##  $ holiday_flg    : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ air_genre_name : chr  "Italian/French" "Italian/French" "Italian/French" "Italian/French" ...
##  $ air_area_name  : chr  "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" ...
##  $ latitude       : num  35.7 35.7 35.7 35.7 35.7 ...
##  $ longitude      : num  140 140 140 140 140 ...
##  $ MoT_flg        : num  0 1 1 0 0 0 0 0 1 1 ...
##  $ WoTh_flg       : num  0 0 0 1 1 0 0 0 0 0 ...
##  $ FoSu_flg       : num  1 0 0 0 0 1 0 1 0 0 ...
##  $ month          : chr  "April" "April" "April" "April" ...
##  $ Peak_flg       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Bar_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Cafe_gflg      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Creative_gflg  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dining_bar_gflg: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Int_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ IF_gflg        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Izakaya_gflg   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Japanese_gflg  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ OMT_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other_gflg     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Western_gflg   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Korean_gflg    : num  0 0 0 0 0 0 0 0 0 0 ...
#Create a numeric data frame - remove ->
predict_df_submit <- subset(predict_df_feat, select = -c(day_of_week, air_area_name, month, air_genre_name, air_store_id))
str(predict_df_submit)
## 'data.frame':    32019 obs. of  20 variables:
##  $ visit_date     : Date, format: "2017-04-23" "2017-04-24" ...
##  $ holiday_flg    : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ latitude       : num  35.7 35.7 35.7 35.7 35.7 ...
##  $ longitude      : num  140 140 140 140 140 ...
##  $ MoT_flg        : num  0 1 1 0 0 0 0 0 1 1 ...
##  $ WoTh_flg       : num  0 0 0 1 1 0 0 0 0 0 ...
##  $ FoSu_flg       : num  1 0 0 0 0 1 0 1 0 0 ...
##  $ Peak_flg       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Bar_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Cafe_gflg      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Creative_gflg  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dining_bar_gflg: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Int_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ IF_gflg        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Izakaya_gflg   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Japanese_gflg  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ OMT_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other_gflg     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Western_gflg   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Korean_gflg    : num  0 0 0 0 0 0 0 0 0 0 ...
sum(is.na(predict_df_submit))
## [1] 0
#Create submission TS
predict_df_ts <- ts(predict_df_submit, frequency = 366)
str(predict_df_ts)
##  Time-Series [1:32019, 1:20] from 1 to 88.5: 17279 17280 17281 17282 17283 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:20] "visit_date" "holiday_flg" "latitude" "longitude" ...
#Make response prediction using TSLM 5
predicition1 <- predict(vfit_tslm_5, predict_df_submit, type = "response")
visitors_pred = as.vector(predicition1)
str(visitors_pred)
##  num [1:32019] 26 19.9 19.9 21.6 21.6 ...
submission_pred <- predict_df_submit
submission_pred$visitors <- round(visitors_pred,0)
str(submission_pred)
## 'data.frame':    32019 obs. of  21 variables:
##  $ visit_date     : Date, format: "2017-04-23" "2017-04-24" ...
##  $ holiday_flg    : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ latitude       : num  35.7 35.7 35.7 35.7 35.7 ...
##  $ longitude      : num  140 140 140 140 140 ...
##  $ MoT_flg        : num  0 1 1 0 0 0 0 0 1 1 ...
##  $ WoTh_flg       : num  0 0 0 1 1 0 0 0 0 0 ...
##  $ FoSu_flg       : num  1 0 0 0 0 1 0 1 0 0 ...
##  $ Peak_flg       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Bar_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Cafe_gflg      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Creative_gflg  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dining_bar_gflg: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Int_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ IF_gflg        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Izakaya_gflg   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Japanese_gflg  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ OMT_gflg       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other_gflg     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Western_gflg   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Korean_gflg    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ visitors       : num  26 20 20 22 22 26 32 26 20 20 ...
summary(submission_pred$visitors) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   11.00   20.00   22.00   22.12   26.00   44.00
#confirmed no negative counts

#aggregate prediction
visitor_agg_pred <- aggregate(x = submission_pred[c("visitors")], FUN = sum, by = list(Group.date = submission_pred$visit_date)) 
visitor_agg_pred_ts <- ts(visitor_agg_pred$visitors, start = c(2017, 113), frequency = 366)
autoplot(visitor_agg_pred_ts)

autoplot(visitor_agg_ts, series = "Train_data") + autolayer(visitor_agg_pred_ts, series = "Predict_data") + ggtitle("TSLM Forecast Predicitions (04/23-05/31)")

#Write new submission file
submission_final <- read.csv('sample_submission.csv')
submission_final$visitors <- submission_pred$visitors
head(submission_final)
##                                id visitors
## 1 air_00a91d42b08b08d9_2017-04-23       26
## 2 air_00a91d42b08b08d9_2017-04-24       20
## 3 air_00a91d42b08b08d9_2017-04-25       20
## 4 air_00a91d42b08b08d9_2017-04-26       22
## 5 air_00a91d42b08b08d9_2017-04-27       22
## 6 air_00a91d42b08b08d9_2017-04-28       26
#write.csv(submission_final, file = "./TFL_Recruit_submission.csv", row.names=FALSE)