##Reading in Data
getwd()
## [1] "C:/Users/tlaff/OneDrive/Documents/R"
av_data <- read.csv('air_visit_data.csv')
submission_data <- read.csv('sample_submission.csv')
date_info <- read.csv('date_info.csv')
air_store_info <- read.csv('air_store_info.csv')
#av_data
head(av_data)
## air_store_id visit_date visitors
## 1 air_ba937bf13d40fb24 1/13/2016 25
## 2 air_ba937bf13d40fb24 1/14/2016 32
## 3 air_ba937bf13d40fb24 1/15/2016 29
## 4 air_ba937bf13d40fb24 1/16/2016 22
## 5 air_ba937bf13d40fb24 1/18/2016 6
## 6 air_ba937bf13d40fb24 1/19/2016 9
str(av_data)
## 'data.frame': 252108 obs. of 3 variables:
## $ air_store_id: chr "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" ...
## $ visit_date : chr "1/13/2016" "1/14/2016" "1/15/2016" "1/16/2016" ...
## $ visitors : int 25 32 29 22 6 9 31 21 18 26 ...
#convert to date
av_data$visit_date <- as.Date.character(av_data$visit_date, "%m/%d/%Y")
sum(is.na(av_data))
## [1] 0
#submission data
head(submission_data)
## id visitors
## 1 air_00a91d42b08b08d9_2017-04-23 0
## 2 air_00a91d42b08b08d9_2017-04-24 0
## 3 air_00a91d42b08b08d9_2017-04-25 0
## 4 air_00a91d42b08b08d9_2017-04-26 0
## 5 air_00a91d42b08b08d9_2017-04-27 0
## 6 air_00a91d42b08b08d9_2017-04-28 0
submission_data$air_store_id <- str_sub(submission_data$id, 1,-12)
submission_data$visit_date <- str_sub(submission_data$id, 22,33)
head(submission_data)
## id visitors air_store_id visit_date
## 1 air_00a91d42b08b08d9_2017-04-23 0 air_00a91d42b08b08d9 2017-04-23
## 2 air_00a91d42b08b08d9_2017-04-24 0 air_00a91d42b08b08d9 2017-04-24
## 3 air_00a91d42b08b08d9_2017-04-25 0 air_00a91d42b08b08d9 2017-04-25
## 4 air_00a91d42b08b08d9_2017-04-26 0 air_00a91d42b08b08d9 2017-04-26
## 5 air_00a91d42b08b08d9_2017-04-27 0 air_00a91d42b08b08d9 2017-04-27
## 6 air_00a91d42b08b08d9_2017-04-28 0 air_00a91d42b08b08d9 2017-04-28
str(submission_data)
## 'data.frame': 32019 obs. of 4 variables:
## $ id : chr "air_00a91d42b08b08d9_2017-04-23" "air_00a91d42b08b08d9_2017-04-24" "air_00a91d42b08b08d9_2017-04-25" "air_00a91d42b08b08d9_2017-04-26" ...
## $ visitors : int 0 0 0 0 0 0 0 0 0 0 ...
## $ air_store_id: chr "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
## $ visit_date : chr "2017-04-23" "2017-04-24" "2017-04-25" "2017-04-26" ...
submission_data$visit_date <- as.Date.character(submission_data$visit_date)
str(date_info)
## 'data.frame': 517 obs. of 3 variables:
## $ calendar_date: chr "1/1/2016" "1/2/2016" "1/3/2016" "1/4/2016" ...
## $ day_of_week : chr "Friday" "Saturday" "Sunday" "Monday" ...
## $ holiday_flg : int 1 1 1 0 0 0 0 0 0 0 ...
head(date_info)
## calendar_date day_of_week holiday_flg
## 1 1/1/2016 Friday 1
## 2 1/2/2016 Saturday 1
## 3 1/3/2016 Sunday 1
## 4 1/4/2016 Monday 0
## 5 1/5/2016 Tuesday 0
## 6 1/6/2016 Wednesday 0
date_info$calendar_date <- as.Date.character(date_info$calendar_date, "%m/%d/%Y")
str(date_info)
## 'data.frame': 517 obs. of 3 variables:
## $ calendar_date: Date, format: "2016-01-01" "2016-01-02" ...
## $ day_of_week : chr "Friday" "Saturday" "Sunday" "Monday" ...
## $ holiday_flg : int 1 1 1 0 0 0 0 0 0 0 ...
date_info$visit_date <-date_info$calendar_date
#Bind Data frames
#Join Visitor data with Date info by visit_date
visitor_df <- left_join(av_data,date_info, by = "visit_date")
head(visitor_df)
## air_store_id visit_date visitors calendar_date day_of_week
## 1 air_ba937bf13d40fb24 2016-01-13 25 2016-01-13 Wednesday
## 2 air_ba937bf13d40fb24 2016-01-14 32 2016-01-14 Thursday
## 3 air_ba937bf13d40fb24 2016-01-15 29 2016-01-15 Friday
## 4 air_ba937bf13d40fb24 2016-01-16 22 2016-01-16 Saturday
## 5 air_ba937bf13d40fb24 2016-01-18 6 2016-01-18 Monday
## 6 air_ba937bf13d40fb24 2016-01-19 9 2016-01-19 Tuesday
## holiday_flg
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
tail(visitor_df)
## air_store_id visit_date visitors calendar_date day_of_week
## 252103 air_24e8414b9b07decb 2017-04-15 7 2017-04-15 Saturday
## 252104 air_24e8414b9b07decb 2017-04-18 6 2017-04-18 Tuesday
## 252105 air_24e8414b9b07decb 2017-04-19 6 2017-04-19 Wednesday
## 252106 air_24e8414b9b07decb 2017-04-20 7 2017-04-20 Thursday
## 252107 air_24e8414b9b07decb 2017-04-21 8 2017-04-21 Friday
## 252108 air_24e8414b9b07decb 2017-04-22 5 2017-04-22 Saturday
## holiday_flg
## 252103 0
## 252104 0
## 252105 0
## 252106 0
## 252107 0
## 252108 0
sum(is.na(visitor_df))
## [1] 0
#Join Store info from AirRegi by air_store_id
visitor_df<- left_join(visitor_df,air_store_info, by = "air_store_id")
head(visitor_df)
## air_store_id visit_date visitors calendar_date day_of_week
## 1 air_ba937bf13d40fb24 2016-01-13 25 2016-01-13 Wednesday
## 2 air_ba937bf13d40fb24 2016-01-14 32 2016-01-14 Thursday
## 3 air_ba937bf13d40fb24 2016-01-15 29 2016-01-15 Friday
## 4 air_ba937bf13d40fb24 2016-01-16 22 2016-01-16 Saturday
## 5 air_ba937bf13d40fb24 2016-01-18 6 2016-01-18 Monday
## 6 air_ba937bf13d40fb24 2016-01-19 9 2016-01-19 Tuesday
## holiday_flg air_genre_name air_area_name latitude longitude
## 1 0 Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807 139.7516
## 2 0 Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807 139.7516
## 3 0 Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807 139.7516
## 4 0 Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807 139.7516
## 5 0 Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807 139.7516
## 6 0 Dining bar TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807 139.7516
tail(visitor_df)
## air_store_id visit_date visitors calendar_date day_of_week
## 252103 air_24e8414b9b07decb 2017-04-15 7 2017-04-15 Saturday
## 252104 air_24e8414b9b07decb 2017-04-18 6 2017-04-18 Tuesday
## 252105 air_24e8414b9b07decb 2017-04-19 6 2017-04-19 Wednesday
## 252106 air_24e8414b9b07decb 2017-04-20 7 2017-04-20 Thursday
## 252107 air_24e8414b9b07decb 2017-04-21 8 2017-04-21 Friday
## 252108 air_24e8414b9b07decb 2017-04-22 5 2017-04-22 Saturday
## holiday_flg air_genre_name air_area_name latitude
## 252103 0 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
## 252104 0 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
## 252105 0 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
## 252106 0 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
## 252107 0 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
## 252108 0 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322
## longitude
## 252103 139.711
## 252104 139.711
## 252105 139.711
## 252106 139.711
## 252107 139.711
## 252108 139.711
sum(is.na(visitor_df))
## [1] 0
str(visitor_df)
## 'data.frame': 252108 obs. of 10 variables:
## $ air_store_id : chr "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" ...
## $ visit_date : Date, format: "2016-01-13" "2016-01-14" ...
## $ visitors : int 25 32 29 22 6 9 31 21 18 26 ...
## $ calendar_date : Date, format: "2016-01-13" "2016-01-14" ...
## $ day_of_week : chr "Wednesday" "Thursday" "Friday" "Saturday" ...
## $ holiday_flg : int 0 0 0 0 0 0 0 0 0 0 ...
## $ air_genre_name: chr "Dining bar" "Dining bar" "Dining bar" "Dining bar" ...
## $ air_area_name : chr "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" ...
## $ latitude : num 35.7 35.7 35.7 35.7 35.7 ...
## $ longitude : num 140 140 140 140 140 ...
visitor_df <- subset(visitor_df, select = -c(calendar_date))
str(visitor_df)
## 'data.frame': 252108 obs. of 9 variables:
## $ air_store_id : chr "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" ...
## $ visit_date : Date, format: "2016-01-13" "2016-01-14" ...
## $ visitors : int 25 32 29 22 6 9 31 21 18 26 ...
## $ day_of_week : chr "Wednesday" "Thursday" "Friday" "Saturday" ...
## $ holiday_flg : int 0 0 0 0 0 0 0 0 0 0 ...
## $ air_genre_name: chr "Dining bar" "Dining bar" "Dining bar" "Dining bar" ...
## $ air_area_name : chr "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" ...
## $ latitude : num 35.7 35.7 35.7 35.7 35.7 ...
## $ longitude : num 140 140 140 140 140 ...
#EDA
dim(visitor_df)
## [1] 252108 9
summary(visitor_df)
## air_store_id visit_date visitors day_of_week
## Length:252108 Min. :2016-01-01 Min. : 1.00 Length:252108
## Class :character 1st Qu.:2016-07-23 1st Qu.: 9.00 Class :character
## Mode :character Median :2016-10-23 Median : 17.00 Mode :character
## Mean :2016-10-12 Mean : 20.97
## 3rd Qu.:2017-01-24 3rd Qu.: 29.00
## Max. :2017-04-22 Max. :877.00
## holiday_flg air_genre_name air_area_name latitude
## Min. :0.00000 Length:252108 Length:252108 Min. :33.21
## 1st Qu.:0.00000 Class :character Class :character 1st Qu.:34.69
## Median :0.00000 Mode :character Mode :character Median :35.66
## Mean :0.05067 Mean :35.61
## 3rd Qu.:0.00000 3rd Qu.:35.69
## Max. :1.00000 Max. :44.02
## longitude
## Min. :130.2
## 1st Qu.:135.3
## Median :139.7
## Mean :137.4
## 3rd Qu.:139.8
## Max. :144.3
#Weekday view
visitor_df %>%
mutate(day = wday(visit_date, label = TRUE, week_start = 1)) %>%
group_by(day) %>%
summarise(visits = mean(visitors)) %>%
ggplot(aes(day, visits, fill = day)) +
geom_col() +
theme(legend.position = "none", axis.text.x = element_text()) +
labs(x = "Day of the week", y = "Mean visitors") + ggtitle("Visit Summary by Day of Week")

#Holiday Binary
visitor_df %>%
ggplot(aes(holiday_flg, fill = holiday_flg)) +
geom_bar() +
theme(legend.position = "none")

#Peak months
visitor_df %>%
mutate(month = month(visit_date, label = TRUE)) %>%
group_by(month) %>%
summarise(visits = mean(visitors)) %>%
ggplot(aes(month, visits, fill = month)) +
geom_col() + theme(legend.position = "none") +
labs(x = "Month", y = "Mean visitors") + ggtitle("Visit Summary by Month")

#Will define peak months by Mar - Jul and Dec
#Number of unique restaurants
restaurants <- unique(visitor_df$air_store_id)
length(restaurants)
## [1] 829
#829 restaurants
rest_genres <- unique(visitor_df$air_genre_name)
rest_genres
## [1] "Dining bar" "Izakaya"
## [3] "Cafe/Sweets" "Bar/Cocktail"
## [5] "Italian/French" "Okonomiyaki/Monja/Teppanyaki"
## [7] "Other" "International cuisine"
## [9] "Japanese food" "Yakiniku/Korean food"
## [11] "Karaoke/Party" "Western food"
## [13] "Creative cuisine" "Asian"
#Aggregate Genre
agg_genre <- aggregate(visitors ~ air_genre_name, visitor_df, mean)
agg_genre$visitors <-round(agg_genre$visitors,1)
agg_genre
## air_genre_name visitors
## 1 Asian 38.7
## 2 Bar/Cocktail 13.3
## 3 Cafe/Sweets 22.6
## 4 Creative cuisine 23.6
## 5 Dining bar 18.7
## 6 International cuisine 25.2
## 7 Italian/French 22.6
## 8 Izakaya 23.1
## 9 Japanese food 19.6
## 10 Karaoke/Party 30.0
## 11 Okonomiyaki/Monja/Teppanyaki 22.6
## 12 Other 19.9
## 13 Western food 22.3
## 14 Yakiniku/Korean food 21.2
#Create Time-Series of Aggregated data
visitor_agg <- aggregate(x = visitor_df[c("visitors")], FUN = sum, by = list(Group.date = visitor_df$visit_date))
str(visitor_agg)
## 'data.frame': 478 obs. of 2 variables:
## $ Group.date: Date, format: "2016-01-01" "2016-01-02" ...
## $ visitors : int 1033 1764 2368 3326 3927 4154 4431 6115 7306 6066 ...
visitor_agg_ts <- ts(visitor_agg$visitors, start = c(2016, 01), frequency = 366)
autoplot(visitor_agg_ts) + ylab("Total Visitors") + ggtitle("Aggregated Restaurant Visitor Totals by Day")

#can't decompose or show seasonality because it is a daily timeseries
#The trend is relatively steady with a leap during the summer of 2016, maybe representative of new restaurants entering the fold
#Trend appears additive, not exponential
ts_agg_display<- ggtsdisplay(visitor_agg_ts)

acf_agg_ts <- acf(visitor_agg_ts)

#Extreme autocorrelation, non stationary data
visitors_lambda <- BoxCox.lambda(visitor_agg_ts)
#No boxcox transformation
ndiffs(visitor_agg_ts)
## [1] 1
#Would difference 1 time
#Feature Creation
visitor_df_feat <- visitor_df
#create dummy variables for day of week, MorT / WoTh / FoSu, Saturday with coefficient
visitor_df_feat$MoT_flg <- ifelse(visitor_df_feat$day_of_week=="Monday"|visitor_df_feat$day_of_week=="Tuesday" ,1,0)
visitor_df_feat$WoTh_flg <- ifelse(visitor_df_feat$day_of_week=="Wednesday"|visitor_df_feat$day_of_week=="Thursday" ,1,0)
visitor_df_feat$FoSu_flg <- ifelse(visitor_df_feat$day_of_week=="Friday"|visitor_df_feat$day_of_week=="Sunday" ,1,0)
str(visitor_df_feat)
## 'data.frame': 252108 obs. of 12 variables:
## $ air_store_id : chr "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" ...
## $ visit_date : Date, format: "2016-01-13" "2016-01-14" ...
## $ visitors : int 25 32 29 22 6 9 31 21 18 26 ...
## $ day_of_week : chr "Wednesday" "Thursday" "Friday" "Saturday" ...
## $ holiday_flg : int 0 0 0 0 0 0 0 0 0 0 ...
## $ air_genre_name: chr "Dining bar" "Dining bar" "Dining bar" "Dining bar" ...
## $ air_area_name : chr "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" ...
## $ latitude : num 35.7 35.7 35.7 35.7 35.7 ...
## $ longitude : num 140 140 140 140 140 ...
## $ MoT_flg : num 0 0 0 0 1 1 0 0 0 0 ...
## $ WoTh_flg : num 1 1 0 0 0 0 1 1 0 0 ...
## $ FoSu_flg : num 0 0 1 0 0 0 0 0 1 0 ...
#Create Month column, to isolate peak months
visitor_df_feat$month <- format(visitor_df_feat$visit_date,"%B")
visitor_df_feat$Peak_flg <- ifelse(visitor_df_feat$month=="March"|visitor_df_feat$month=="December"|visitor_df_feat$month=="April"|visitor_df_feat$month=="May"|visitor_df_feat$month=="June"|visitor_df_feat$month=="July",1,0)
#Create dummy variables for type of restaurant, leaving Asian out as part of coefficient
visitor_df_feat$Bar_gflg <- ifelse(visitor_df_feat$air_genre_name=="Bar/Cocktail",1,0)
visitor_df_feat$Cafe_gflg <- ifelse(visitor_df_feat$air_genre_name=="Cafe/Sweets",1,0)
visitor_df_feat$Creative_gflg <- ifelse(visitor_df_feat$air_genre_name=="Creative cuisine",1,0)
visitor_df_feat$Dining_bar_gflg <- ifelse(visitor_df_feat$air_genre_name=="Dining bar",1,0)
visitor_df_feat$Int_gflg <- ifelse(visitor_df_feat$air_genre_name=="International cuisine",1,0)
visitor_df_feat$IF_gflg <- ifelse(visitor_df_feat$air_genre_name=="Italian/French",1,0)
visitor_df_feat$Izakaya_gflg <- ifelse(visitor_df_feat$air_genre_name=="Izakaya",1,0)
visitor_df_feat$Japanese_gflg <- ifelse(visitor_df_feat$air_genre_name=="Japanese food",1,0)
visitor_df_feat$OMT_gflg <- ifelse(visitor_df_feat$air_genre_name=="Okonomiyaki/Monja/Teppanyaki",1,0)
visitor_df_feat$Other_gflg <- ifelse(visitor_df_feat$air_genre_name=="Other",1,0)
visitor_df_feat$Western_gflg <- ifelse(visitor_df_feat$air_genre_name=="Western food",1,0)
visitor_df_feat$Korean_gflg <- ifelse(visitor_df_feat$air_genre_name=="Yakiniku/Korean food",1,0)
str(visitor_df_feat)
## 'data.frame': 252108 obs. of 26 variables:
## $ air_store_id : chr "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" "air_ba937bf13d40fb24" ...
## $ visit_date : Date, format: "2016-01-13" "2016-01-14" ...
## $ visitors : int 25 32 29 22 6 9 31 21 18 26 ...
## $ day_of_week : chr "Wednesday" "Thursday" "Friday" "Saturday" ...
## $ holiday_flg : int 0 0 0 0 0 0 0 0 0 0 ...
## $ air_genre_name : chr "Dining bar" "Dining bar" "Dining bar" "Dining bar" ...
## $ air_area_name : chr "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" "TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en" ...
## $ latitude : num 35.7 35.7 35.7 35.7 35.7 ...
## $ longitude : num 140 140 140 140 140 ...
## $ MoT_flg : num 0 0 0 0 1 1 0 0 0 0 ...
## $ WoTh_flg : num 1 1 0 0 0 0 1 1 0 0 ...
## $ FoSu_flg : num 0 0 1 0 0 0 0 0 1 0 ...
## $ month : chr "January" "January" "January" "January" ...
## $ Peak_flg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Bar_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Cafe_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Creative_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Dining_bar_gflg: num 1 1 1 1 1 1 1 1 1 1 ...
## $ Int_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ IF_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Izakaya_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Japanese_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OMT_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Other_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Western_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Korean_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
#Create a numeric data frame - remove ->
visitor_df_mod <- subset(visitor_df_feat, select = -c(day_of_week, air_area_name, month, air_genre_name, air_store_id))
sum(is.na(visitor_df_mod))
## [1] 0
str(visitor_df_mod)
## 'data.frame': 252108 obs. of 21 variables:
## $ visit_date : Date, format: "2016-01-13" "2016-01-14" ...
## $ visitors : int 25 32 29 22 6 9 31 21 18 26 ...
## $ holiday_flg : int 0 0 0 0 0 0 0 0 0 0 ...
## $ latitude : num 35.7 35.7 35.7 35.7 35.7 ...
## $ longitude : num 140 140 140 140 140 ...
## $ MoT_flg : num 0 0 0 0 1 1 0 0 0 0 ...
## $ WoTh_flg : num 1 1 0 0 0 0 1 1 0 0 ...
## $ FoSu_flg : num 0 0 1 0 0 0 0 0 1 0 ...
## $ Peak_flg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Bar_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Cafe_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Creative_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Dining_bar_gflg: num 1 1 1 1 1 1 1 1 1 1 ...
## $ Int_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ IF_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Izakaya_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Japanese_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OMT_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Other_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Western_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Korean_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
#Create TS for TSLM
visitor_df_mod_ts <- ts(visitor_df_mod, frequency = 366)
str(visitor_df_mod_ts)
## Time-Series [1:252108, 1:21] from 1 to 690: 16813 16814 16815 16816 16818 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:21] "visit_date" "visitors" "holiday_flg" "latitude" ...
#Includes Trend + holiday_flg + day of week flag
vfit_tslm_3 <- tslm(visitors~holiday_flg + MoT_flg + WoTh_flg + FoSu_flg, , data = visitor_df_mod_ts)
summary(vfit_tslm_3)
##
## Call:
## tslm(formula = visitors ~ holiday_flg + MoT_flg + WoTh_flg +
## FoSu_flg, data = visitor_df_mod_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.64 -11.29 -3.88 7.84 858.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.15800 0.08315 314.59 <2e-16 ***
## holiday_flg 3.48109 0.14929 23.32 <2e-16 ***
## MoT_flg -8.93802 0.10421 -85.77 <2e-16 ***
## WoTh_flg -7.27607 0.10236 -71.09 <2e-16 ***
## FoSu_flg -2.87282 0.10346 -27.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.42 on 252103 degrees of freedom
## Multiple R-squared: 0.03956, Adjusted R-squared: 0.03954
## F-statistic: 2596 on 4 and 252103 DF, p-value: < 2.2e-16
#Layering in peak month
vfit_tslm_4 <- tslm(visitors~holiday_flg + MoT_flg + WoTh_flg + FoSu_flg + Peak_flg, , data = visitor_df_mod_ts)
summary(vfit_tslm_4)
##
## Call:
## tslm(formula = visitors ~ holiday_flg + MoT_flg + WoTh_flg +
## FoSu_flg + Peak_flg, data = visitor_df_mod_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.84 -11.36 -3.37 8.03 857.06
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.23596 0.08848 285.21 <2e-16 ***
## holiday_flg 3.63388 0.14911 24.37 <2e-16 ***
## MoT_flg -8.87460 0.10405 -85.29 <2e-16 ***
## WoTh_flg -7.26724 0.10217 -71.13 <2e-16 ***
## FoSu_flg -2.86110 0.10328 -27.70 <2e-16 ***
## Peak_flg 1.97508 0.06567 30.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.39 on 252102 degrees of freedom
## Multiple R-squared: 0.04299, Adjusted R-squared: 0.04297
## F-statistic: 2265 on 5 and 252102 DF, p-value: < 2.2e-16
#Layering in Restaurant genre
vfit_tslm_5 <- tslm(visitors~holiday_flg + MoT_flg + WoTh_flg +
FoSu_flg + Peak_flg + Bar_gflg + Cafe_gflg + Creative_gflg +
Dining_bar_gflg + Int_gflg + IF_gflg + Izakaya_gflg +
Japanese_gflg + OMT_gflg + Other_gflg + Western_gflg+
Korean_gflg, , data = visitor_df_mod_ts)
summary(vfit_tslm_5)
##
## Call:
## tslm(formula = visitors ~ holiday_flg + MoT_flg + WoTh_flg +
## FoSu_flg + Peak_flg + Bar_gflg + Cafe_gflg + Creative_gflg +
## Dining_bar_gflg + Int_gflg + IF_gflg + Izakaya_gflg + Japanese_gflg +
## OMT_gflg + Other_gflg + Western_gflg + Korean_gflg, data = visitor_df_mod_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.18 -10.98 -3.38 7.68 864.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.50071 0.50277 76.58 <2e-16 ***
## holiday_flg 3.67894 0.14640 25.13 <2e-16 ***
## MoT_flg -8.88626 0.10216 -86.98 <2e-16 ***
## WoTh_flg -7.26159 0.10032 -72.39 <2e-16 ***
## FoSu_flg -2.84862 0.10140 -28.09 <2e-16 ***
## Peak_flg 1.99318 0.06448 30.91 <2e-16 ***
## Bar_gflg -21.00221 0.50674 -41.45 <2e-16 ***
## Cafe_gflg -11.73169 0.50138 -23.40 <2e-16 ***
## Creative_gflg -10.66212 0.55985 -19.05 <2e-16 ***
## Dining_bar_gflg -15.50877 0.50404 -30.77 <2e-16 ***
## Int_gflg -8.97146 0.97097 -9.24 <2e-16 ***
## IF_gflg -11.67420 0.50510 -23.11 <2e-16 ***
## Izakaya_gflg -11.11048 0.50066 -22.19 <2e-16 ***
## Japanese_gflg -14.64771 0.51018 -28.71 <2e-16 ***
## OMT_gflg -11.46343 0.56248 -20.38 <2e-16 ***
## Other_gflg -14.38117 0.52716 -27.28 <2e-16 ***
## Western_gflg -11.91596 0.54715 -21.78 <2e-16 ***
## Korean_gflg -12.97055 0.53231 -24.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.09 on 252090 degrees of freedom
## Multiple R-squared: 0.07761, Adjusted R-squared: 0.07754
## F-statistic: 1248 on 17 and 252090 DF, p-value: < 2.2e-16
plot(vfit_tslm_5$residuals)

#Overall R_Sqrd is quite low, though much of the difference may be explained in error term
#Simple TSLM
vfit_tslm_1 <- tslm(visitors~visit_date, data = visitor_df_mod_ts)
summary(vfit_tslm_1)
##
## Call:
## tslm(formula = visitors ~ visit_date, data = visitor_df_mod_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.07 -11.99 -3.97 8.04 856.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.7732080 4.6304546 5.782 7.39e-09 ***
## visit_date -0.0003394 0.0002710 -1.252 0.21
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.76 on 252106 degrees of freedom
## Multiple R-squared: 6.222e-06, Adjusted R-squared: 2.256e-06
## F-statistic: 1.569 on 1 and 252106 DF, p-value: 0.2104
#Simple TSLM with holiday flag
vfit_tslm_2 <- tslm(visitors~visit_date + holiday_flg , data = visitor_df_mod_ts)
summary(vfit_tslm_2)
##
## Call:
## tslm(formula = visitors ~ visit_date + holiday_flg, data = visitor_df_mod_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.76 -11.85 -3.84 8.17 856.20
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.6698732 4.6285317 5.33 9.83e-08 ***
## visit_date -0.0002248 0.0002709 -0.83 0.407
## holiday_flg 2.8724353 0.1520934 18.89 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.75 on 252105 degrees of freedom
## Multiple R-squared: 0.001419, Adjusted R-squared: 0.001411
## F-statistic: 179.1 on 2 and 252105 DF, p-value: < 2.2e-16
#Create Train & Test Sets
visitor_train_ts <- subset(visitor_agg_ts, end = 412)
visitor_test_ts <- subset(visitor_agg_ts, start = 413)
#ETS Model
vfit_ets <- ets(visitor_train_ts)
## Warning in ets(visitor_train_ts): I can't handle data with frequency greater
## than 24. Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
vfit_ets
## ETS(M,A,N)
##
## Call:
## ets(y = visitor_train_ts)
##
## Smoothing parameters:
## alpha = 0.999
## beta = 1e-04
##
## Initial states:
## l = 655.2775
## b = 439.2025
##
## sigma: 0.2384
##
## AIC AICc BIC
## 8856.739 8856.887 8876.844
#ETS (M,A,N) -> Multiplicative errors, additive trend and no seasonality
#ETS models do not handle daily frequency well
#The Alpha value of 0.9999 is extremely high which means the weighting of oberservations is heavy to the most recent
#AICc = 8856.887
vfc_ets <- forecast(vfit_ets, h = 105, level = c(95))
autoplot(vfc_ets)

vfc_ets
## Point Forecast Lo 95 Hi 95
## 2017.1257 14253.69 7594.958 20912.42
## 2017.1284 14676.43 4992.214 24360.64
## 2017.1311 15099.16 2901.413 27296.91
## 2017.1339 15521.90 1039.628 30004.17
## 2017.1366 15944.64 -700.293 32589.56
## 2017.1393 16367.37 -2372.364 35107.11
## 2017.1421 16790.11 -4008.294 37588.51
## 2017.1448 17212.84 -5628.629 40054.32
## 2017.1475 17635.58 -7247.656 42518.82
## 2017.1503 18058.32 -8875.872 44992.51
## 2017.1530 18481.05 -10521.335 47483.44
## 2017.1557 18903.79 -12190.472 49998.05
## 2017.1585 19326.53 -13888.572 52541.63
## 2017.1612 19749.26 -15620.114 55118.64
## 2017.1639 20172.00 -17388.988 57732.99
## 2017.1667 20594.74 -19198.644 60388.12
## 2017.1694 21017.47 -21052.209 63087.16
## 2017.1721 21440.21 -22952.561 65832.98
## 2017.1749 21862.95 -24902.392 68628.28
## 2017.1776 22285.68 -26904.253 71475.62
## 2017.1803 22708.42 -28960.592 74377.43
## 2017.1831 23131.16 -31073.777 77336.09
## 2017.1858 23553.89 -33246.124 80353.91
## 2017.1885 23976.63 -35479.912 83433.17
## 2017.1913 24399.36 -37777.397 86576.13
## 2017.1940 24822.10 -40140.828 89785.03
## 2017.1967 25244.84 -42572.455 93062.13
## 2017.1995 25667.57 -45074.535 96409.68
## 2017.2022 26090.31 -47649.347 99829.97
## 2017.2049 26513.05 -50299.193 103325.29
## 2017.2077 26935.78 -53026.405 106897.97
## 2017.2104 27358.52 -55833.352 110550.39
## 2017.2131 27781.26 -58722.442 114284.96
## 2017.2158 28203.99 -61696.130 118104.12
## 2017.2186 28626.73 -64756.920 122010.38
## 2017.2213 29049.47 -67907.369 126006.30
## 2017.2240 29472.20 -71150.089 130094.49
## 2017.2268 29894.94 -74487.754 134277.63
## 2017.2295 30317.68 -77923.102 138558.45
## 2017.2322 30740.41 -81458.936 142939.76
## 2017.2350 31163.15 -85098.129 147424.43
## 2017.2377 31585.88 -88843.629 152015.40
## 2017.2404 32008.62 -92698.456 156715.70
## 2017.2432 32431.36 -96665.714 161528.43
## 2017.2459 32854.09 -100748.584 166456.77
## 2017.2486 33276.83 -104950.334 171504.00
## 2017.2514 33699.57 -109274.321 176673.46
## 2017.2541 34122.30 -113723.991 181968.60
## 2017.2568 34545.04 -118302.885 187392.97
## 2017.2596 34967.78 -123014.641 192950.19
## 2017.2623 35390.51 -127862.995 198644.02
## 2017.2650 35813.25 -132851.791 204478.29
## 2017.2678 36235.99 -137984.974 210456.95
## 2017.2705 36658.72 -143266.602 216584.05
## 2017.2732 37081.46 -148700.846 222863.76
## 2017.2760 37504.20 -154291.993 229300.38
## 2017.2787 37926.93 -160044.450 235898.31
## 2017.2814 38349.67 -165962.747 242662.08
## 2017.2842 38772.40 -172051.542 249596.35
## 2017.2869 39195.14 -178315.624 256705.91
## 2017.2896 39617.88 -184759.915 263995.67
## 2017.2923 40040.61 -191389.478 271470.71
## 2017.2951 40463.35 -198209.518 279136.22
## 2017.2978 40886.09 -205225.386 286997.56
## 2017.3005 41308.82 -212442.583 295060.23
## 2017.3033 41731.56 -219866.768 303329.89
## 2017.3060 42154.30 -227503.757 311812.35
## 2017.3087 42577.03 -235359.531 320513.60
## 2017.3115 42999.77 -243440.242 329439.78
## 2017.3142 43422.51 -251752.212 338597.22
## 2017.3169 43845.24 -260301.943 347992.43
## 2017.3197 44267.98 -269096.123 357632.08
## 2017.3224 44690.72 -278141.624 367523.06
## 2017.3251 45113.45 -287445.517 377672.42
## 2017.3279 45536.19 -297015.068 388087.45
## 2017.3306 45958.93 -306857.752 398775.60
## 2017.3333 46381.66 -316981.253 409744.58
## 2017.3361 46804.40 -327393.472 421002.27
## 2017.3388 47227.13 -338102.533 432556.80
## 2017.3415 47649.87 -349116.790 444416.53
## 2017.3443 48072.61 -360444.830 456590.05
## 2017.3470 48495.34 -372095.486 469086.17
## 2017.3497 48918.08 -384077.838 481914.00
## 2017.3525 49340.82 -396401.221 495082.85
## 2017.3552 49763.55 -409075.235 508602.34
## 2017.3579 50186.29 -422109.749 522482.33
## 2017.3607 50609.03 -435514.912 536732.96
## 2017.3634 51031.76 -449301.157 551364.68
## 2017.3661 51454.50 -463479.211 566388.21
## 2017.3689 51877.24 -478060.106 581814.58
## 2017.3716 52299.97 -493055.180 597655.12
## 2017.3743 52722.71 -508476.094 613921.51
## 2017.3770 53145.45 -524334.835 630625.73
## 2017.3798 53568.18 -540643.728 647780.09
## 2017.3825 53990.92 -557415.446 665397.28
## 2017.3852 54413.65 -574663.017 683490.33
## 2017.3880 54836.39 -592399.836 702072.62
## 2017.3907 55259.13 -610639.675 721157.93
## 2017.3934 55681.86 -629396.692 740760.42
## 2017.3962 56104.60 -648685.446 760894.65
## 2017.3989 56527.34 -668520.902 781575.58
## 2017.4016 56950.07 -688918.448 802818.60
## 2017.4044 57372.81 -709893.904 824639.52
## 2017.4071 57795.55 -731463.534 847054.63
## 2017.4098 58218.28 -753644.060 870080.63
checkresiduals(vfc_ets)

##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,N)
## Q* = 2977.7, df = 78, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 82
#Passed the Ljung-box text but still apears to have autocorrelation
accuracy(vfc_ets, visitor_agg_ts)
## ME RMSE MAE MPE MAPE MASE
## Training set -399.648 2791.139 1999.147 -7.947741 20.68651 0.2562708
## Test set -12397.822 15053.249 12791.049 -87.538742 89.50499 1.6396862
## ACF1 Theil's U
## Training set 0.1309531 NA
## Test set 0.8892403 4.845375
#RMSE = 15053, essentially each day's forecast has as a standard deviation of 2791 visitors
#ARIMA Model
vfit_arima <- auto.arima(visitor_train_ts)
## Warning: The chosen seasonal unit root test encountered an error when testing for the first difference.
## From stl(): series is not periodic or has less than two periods
## 0 seasonal differences will be used. Consider using a different unit root test.
vfit_arima
## Series: visitor_train_ts
## ARIMA(5,1,3)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## 0.2046 -0.9608 -0.0021 -0.4692 -0.4734 -0.2706 0.5058 -0.2420
## s.e. 0.0680 0.0525 0.0926 0.0475 0.0596 0.0732 0.0502 0.0865
##
## sigma^2 estimated as 2413885: log likelihood=-3601.79
## AIC=7221.58 AICc=7222.03 BIC=7257.75
#ARIMA(5,1,3), no seasonality. AICc = 7222.03, 1 differencing
fc_arima <- forecast(vfit_arima, h = 105, level = c(95))
fc_arima
## Point Forecast Lo 95 Hi 95
## 2017.1257 13230.44 10185.303 16275.57
## 2017.1284 16800.70 12634.108 20967.29
## 2017.1311 18384.02 13983.039 22785.00
## 2017.1339 13635.19 9196.760 18073.62
## 2017.1366 10706.31 6254.440 15158.18
## 2017.1393 13276.31 8818.143 17734.47
## 2017.1421 14192.83 9690.197 18695.45
## 2017.1448 13395.66 8274.723 18516.59
## 2017.1475 15969.11 10278.116 21660.11
## 2017.1503 17440.29 11637.325 23243.26
## 2017.1530 13623.51 7780.627 19466.40
## 2017.1557 11364.05 5487.810 17240.29
## 2017.1585 13735.99 7857.145 19614.83
## 2017.1612 14491.40 8564.872 20417.93
## 2017.1639 13465.96 7161.043 19770.87
## 2017.1667 15392.57 8738.579 22046.57
## 2017.1694 16727.13 9996.517 23457.74
## 2017.1721 13673.72 6901.624 20445.82
## 2017.1749 11886.40 5062.977 18709.83
## 2017.1776 14033.31 7194.566 20872.05
## 2017.1803 14657.80 7768.679 21546.91
## 2017.1831 13527.31 6374.973 20679.65
## 2017.1858 14975.77 7582.699 22368.85
## 2017.1885 16195.78 8742.167 23649.38
## 2017.1913 13746.55 6249.112 21243.98
## 2017.1940 12305.13 4744.004 19866.26
## 2017.1967 14216.61 6627.013 21806.20
## 2017.1995 14739.45 7098.406 22380.49
## 2017.2022 13584.41 5746.934 21421.89
## 2017.2049 14677.67 6660.835 22694.51
## 2017.2077 15795.52 7726.050 23865.00
## 2017.2104 13825.89 5710.465 21941.32
## 2017.2131 12641.10 4454.683 20827.52
## 2017.2158 14322.76 6097.630 22547.89
## 2017.2186 14767.12 6490.499 23043.74
## 2017.2213 13639.64 5208.936 22070.35
## 2017.2240 14466.96 5894.576 23039.34
## 2017.2268 15490.44 6869.365 24111.51
## 2017.2295 13902.59 5233.942 22571.23
## 2017.2322 12911.35 4168.292 21654.41
## 2017.2350 14377.70 5588.859 23166.54
## 2017.2377 14761.45 5921.635 23601.26
## 2017.2404 13693.58 4728.032 22659.13
## 2017.2432 14320.22 5237.493 23402.95
## 2017.2459 15254.89 6125.395 24384.38
## 2017.2486 13971.94 4793.797 23150.08
## 2017.2514 13129.54 3876.266 22382.81
## 2017.2541 14399.50 5095.989 23703.02
## 2017.2568 14736.11 5382.411 24089.81
## 2017.2596 13745.98 4286.256 23205.70
## 2017.2623 14220.03 4659.821 23780.23
## 2017.2650 15070.58 5464.475 24676.68
## 2017.2678 14031.96 4376.635 23687.29
## 2017.2705 13306.52 3577.078 23035.96
## 2017.2732 14400.62 4618.413 24182.83
## 2017.2760 14700.08 4868.553 24531.61
## 2017.2787 13796.26 3872.863 23719.65
## 2017.2814 14153.47 4141.435 24165.51
## 2017.2842 14924.41 4866.820 24982.01
## 2017.2869 14082.29 3975.299 24189.27
## 2017.2896 13450.86 3271.796 23629.91
## 2017.2923 14389.52 4156.526 24622.51
## 2017.2951 14659.12 4377.666 24940.56
## 2017.2978 13843.83 3480.944 24206.72
## 2017.3005 14111.04 3668.228 24553.85
## 2017.3033 14806.99 4318.723 25295.25
## 2017.3060 14123.41 3585.905 24660.92
## 2017.3087 13569.25 2962.257 24176.25
## 2017.3115 14371.85 3710.659 25033.04
## 2017.3142 14616.84 3908.014 25325.67
## 2017.3169 13888.21 3105.817 24670.60
## 2017.3197 14085.75 3230.060 24941.43
## 2017.3224 14711.50 3810.396 25612.60
## 2017.3251 14156.29 3206.324 25106.26
## 2017.3279 13666.97 2650.320 24683.62
## 2017.3306 14351.30 3280.787 25421.82
## 2017.3333 14575.46 3458.076 25692.85
## 2017.3361 13929.06 2744.188 25113.94
## 2017.3388 14072.49 2819.520 25325.46
## 2017.3415 14633.00 3334.667 25931.33
## 2017.3443 14182.03 2835.373 25528.68
## 2017.3470 13748.11 2337.599 25158.62
## 2017.3497 14330.21 2866.533 25793.89
## 2017.3525 14536.24 3026.410 26046.08
## 2017.3552 13966.23 2393.685 25538.77
## 2017.3579 14067.56 2431.116 25704.01
## 2017.3607 14567.85 2886.160 26249.54
## 2017.3634 14201.75 2472.381 25931.11
## 2017.3661 13815.88 2025.395 25606.36
## 2017.3689 14309.98 2467.244 26152.72
## 2017.3716 14499.85 2611.628 26388.07
## 2017.3743 13999.67 2052.578 25946.76
## 2017.3770 14068.28 2060.740 26075.82
## 2017.3798 14513.35 2460.761 26565.93
## 2017.3825 14216.49 2116.946 26316.04
## 2017.3852 13872.80 1714.709 26030.89
## 2017.3880 14291.40 2082.093 26500.71
## 2017.3907 14466.56 2212.406 26720.72
## 2017.3934 14029.48 1719.591 26339.37
## 2017.3962 14072.71 1705.286 26440.14
## 2017.3989 14467.45 2055.258 26879.64
## 2017.4016 14227.20 1768.798 26685.60
## 2017.4044 13920.85 1406.291 26435.41
## 2017.4071 14274.85 1710.158 26839.54
## 2017.4098 14436.42 1827.498 27045.34
autoplot(fc_arima)

checkresiduals(fc_arima)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,1,3)
## Q* = 131.32, df = 74, p-value = 4.64e-05
##
## Model df: 8. Total lags used: 82
#Looks much more normally distributed with white noise residuals
accuracy(fc_arima, visitor_agg_ts)
## ME RMSE MAE MPE MAPE MASE
## Training set 81.27446 1536.605 989.320 -0.4383417 9.954058 0.1268211
## Test set 1350.05740 3012.803 2244.132 5.4988128 13.222301 0.2876755
## ACF1 Theil's U
## Training set -0.02974393 NA
## Test set 0.52568893 0.8183462
#The ARIMA model produces much better fitting predicitions in the test set than ETS
#RMSE = 3012
#TBATS
vfit_tbats <- tbats(visitor_train_ts)
vfit_tbats
## BATS(0.052, {5,3}, 0.961, -)
##
## Call: tbats(y = visitor_train_ts)
##
## Parameters
## Lambda: 0.052406
## Alpha: 0.9130191
## Beta: 0.04428732
## Damping Parameter: 0.96069
## AR coefficients: 0.147807 -0.973041 -0.047549 -0.48054 -0.489072
## MA coefficients: -0.108566 0.502682 -0.219851
##
## Seed States:
## [,1]
## [1,] 9.7074953
## [2,] 0.2464777
## [3,] 0.0000000
## [4,] 0.0000000
## [5,] 0.0000000
## [6,] 0.0000000
## [7,] 0.0000000
## [8,] 0.0000000
## [9,] 0.0000000
## [10,] 0.0000000
## attr(,"lambda")
## [1] 0.05240574
##
## Sigma: 0.2536455
## AIC: 8513.792
#Almost a full log transformation
#ARMA terms of 5 and 3, respectively
#Dampening at a rate 0.961
#No fourier terms selected
vfc_tbats <- forecast(vfit_tbats, h = 105, level = c(95))
vfc_tbats
## Point Forecast Lo 95 Hi 95
## 2017.1257 13327.55 9827.659 17987.54
## 2017.1284 16805.90 10977.808 25490.04
## 2017.1311 18500.77 11710.706 28916.65
## 2017.1339 13141.10 8193.974 20835.97
## 2017.1366 10847.46 6701.863 17350.53
## 2017.1393 13439.88 8343.355 21399.44
## 2017.1421 14508.48 8902.210 23358.86
## 2017.1448 13598.15 7695.363 23635.62
## 2017.1475 16099.90 8521.114 29801.14
## 2017.1503 17410.55 9060.310 32741.02
## 2017.1530 13078.22 6673.684 25048.40
## 2017.1557 11470.62 5763.270 22289.33
## 2017.1585 14166.89 7125.284 27502.32
## 2017.1612 14948.17 7397.414 29459.98
## 2017.1639 13692.28 6372.938 28563.50
## 2017.1667 15598.90 6947.321 33890.99
## 2017.1694 16684.39 7333.877 36689.58
## 2017.1721 13148.62 5650.925 29518.53
## 2017.1749 12023.13 5063.187 27499.42
## 2017.1776 14674.02 6161.369 33653.09
## 2017.1803 15167.88 6250.620 35386.03
## 2017.1831 13722.72 5374.138 33535.39
## 2017.1858 15242.05 5778.116 38363.55
## 2017.1885 16205.90 6073.141 41216.98
## 2017.1913 13280.38 4855.614 34537.09
## 2017.1940 12495.72 4464.053 33182.44
## 2017.1967 15003.26 5330.379 40039.49
## 2017.1995 15247.25 5310.832 41421.47
## 2017.2022 13729.81 4574.886 38810.66
## 2017.2049 14997.58 4870.171 43381.13
## 2017.2077 15887.60 5103.826 46399.58
## 2017.2104 13434.54 4204.961 40154.09
## 2017.2131 12887.64 3936.391 39364.03
## 2017.2158 15198.83 4610.485 46707.22
## 2017.2186 15243.38 4531.891 47685.58
## 2017.2213 13735.20 3924.487 44500.00
## 2017.2240 14836.89 4149.229 48985.57
## 2017.2268 15672.27 4337.298 52220.63
## 2017.2295 13588.42 3661.398 46350.62
## 2017.2322 13205.42 3471.637 46024.71
## 2017.2350 15299.21 3992.112 53671.51
## 2017.2377 15195.00 3886.190 54246.21
## 2017.2404 13748.62 3390.943 50656.07
## 2017.2432 14736.49 3567.839 55175.58
## 2017.2459 15522.33 3719.630 58637.98
## 2017.2486 13729.98 3202.677 53088.89
## 2017.2514 13459.14 3064.581 53136.69
## 2017.2541 15335.14 3465.086 60943.82
## 2017.2568 15126.75 3350.938 61148.38
## 2017.2596 13772.96 2950.214 57303.37
## 2017.2623 14677.96 3092.832 61933.02
## 2017.2650 15413.29 3214.753 65605.94
## 2017.2678 13853.94 2813.186 60330.35
## 2017.2705 13659.89 2709.872 60676.52
## 2017.2732 15329.72 3017.961 68537.74
## 2017.2760 15053.33 2906.341 68423.99
## 2017.2787 13807.59 2583.440 64451.03
## 2017.2814 14647.39 2700.164 69233.79
## 2017.2842 15329.47 2797.235 73083.40
## 2017.2869 13959.13 2480.952 68042.67
## 2017.2896 13818.22 2401.768 68628.60
## 2017.2923 15299.46 2639.175 76468.98
## 2017.2951 14982.84 2535.580 76096.07
## 2017.2978 13850.31 2275.786 72099.44
## 2017.3005 14634.60 2372.093 77053.08
## 2017.3033 15261.07 2448.540 81036.88
## 2017.3060 14046.62 2196.404 76201.95
## 2017.3087 13943.37 2134.488 76984.96
## 2017.3115 15255.66 2317.972 84754.03
## 2017.3142 14919.23 2224.734 84181.18
## 2017.3169 13898.47 2015.636 80244.15
## 2017.3197 14632.41 2095.359 85368.18
## 2017.3224 15202.21 2154.829 89441.30
## 2017.3251 14118.59 1951.743 84792.63
## 2017.3279 14042.97 1902.553 85743.71
## 2017.3306 15205.76 2044.866 93408.99
## 2017.3333 14863.92 1962.511 92691.21
## 2017.3361 13949.54 1793.922 88878.49
## 2017.3388 14635.92 1859.926 94160.01
## 2017.3415 15149.47 1905.568 98279.66
## 2017.3443 14177.57 1740.551 93806.35
## 2017.3470 14123.07 1700.985 94907.32
## 2017.3497 15154.43 1811.770 102448.82
## 2017.3525 14816.96 1739.855 101634.76
## 2017.3552 14001.41 1603.556 97995.26
## 2017.3579 14641.93 1658.106 103413.75
## 2017.3607 15101.00 1692.606 107542.03
## 2017.3634 14226.02 1557.531 103240.47
## 2017.3661 14188.36 1525.394 104481.07
## 2017.3689 15104.43 1611.927 111887.07
## 2017.3716 14777.66 1549.551 111018.35
## 2017.3743 14052.45 1438.980 107587.99
## 2017.3770 14648.47 1483.933 113119.01
## 2017.3798 15055.87 1509.549 117224.36
## 2017.3825 14266.11 1398.292 113096.64
## 2017.3852 14242.35 1371.982 114471.92
## 2017.3880 15057.28 1439.761 121735.92
## 2017.3907 14745.07 1385.853 120847.30
## 2017.3934 14101.50 1295.807 117651.60
## 2017.3962 14654.41 1332.717 123269.60
## 2017.3989 15013.68 1351.322 127327.29
## 2017.4016 14299.68 1259.194 123379.50
## 2017.4044 14287.66 1237.507 124887.74
## 2017.4071 15013.68 1290.703 132006.42
## 2017.4098 14718.14 1244.183 131126.50
autoplot(vfc_tbats)

checkresiduals(vfc_tbats)

##
## Ljung-Box test
##
## data: Residuals from BATS(0.052, {5,3}, 0.961, -)
## Q* = 81.954, df = 60, p-value = 0.03139
##
## Model df: 22. Total lags used: 82
#White noise, residuals with relatively consistent residuals and normally distributed
accuracy(vfc_tbats, visitor_agg_ts)
## ME RMSE MAE MPE MAPE MASE
## Training set 28.48698 1597.778 1001.513 -1.993263 10.54526 0.1283840
## Test set 1118.26458 2998.777 2335.599 3.857327 14.15857 0.2994007
## ACF1 Theil's U
## Training set -0.04089051 NA
## Test set 0.55109087 0.8319025
#Very similar RMSE to the ARIMA model, 2998
#TSLM model of aggregate
vts_agg <- ts(visitor_agg, frequency = 366)
vfit_tslm_agg <- tslm(visitors~Group.date, vts_agg)
summary(vfit_tslm_agg)
##
## Call:
## tslm(formula = visitors ~ Group.date, data = vts_agg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11132 -2297 -529 1988 9959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.521e+05 1.934e+04 -23.38 <2e-16 ***
## Group.date 2.718e+01 1.135e+00 23.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3424 on 476 degrees of freedom
## Multiple R-squared: 0.5464, Adjusted R-squared: 0.5455
## F-statistic: 573.5 on 1 and 476 DF, p-value: < 2.2e-16
###Predict data prep
predict_df <- submission_data
str(predict_df)
## 'data.frame': 32019 obs. of 4 variables:
## $ id : chr "air_00a91d42b08b08d9_2017-04-23" "air_00a91d42b08b08d9_2017-04-24" "air_00a91d42b08b08d9_2017-04-25" "air_00a91d42b08b08d9_2017-04-26" ...
## $ visitors : int 0 0 0 0 0 0 0 0 0 0 ...
## $ air_store_id: chr "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
## $ visit_date : Date, format: "2017-04-23" "2017-04-24" ...
#remove id field and null visitor field
predict_df <- subset(predict_df, select = -c(id, visitors))
predict_df <- left_join(predict_df,date_info, by = "visit_date")
head(predict_df)
## air_store_id visit_date calendar_date day_of_week holiday_flg
## 1 air_00a91d42b08b08d9 2017-04-23 2017-04-23 Sunday 0
## 2 air_00a91d42b08b08d9 2017-04-24 2017-04-24 Monday 0
## 3 air_00a91d42b08b08d9 2017-04-25 2017-04-25 Tuesday 0
## 4 air_00a91d42b08b08d9 2017-04-26 2017-04-26 Wednesday 0
## 5 air_00a91d42b08b08d9 2017-04-27 2017-04-27 Thursday 0
## 6 air_00a91d42b08b08d9 2017-04-28 2017-04-28 Friday 0
tail(predict_df)
## air_store_id visit_date calendar_date day_of_week holiday_flg
## 32014 air_fff68b929994bfbd 2017-05-26 2017-05-26 Friday 0
## 32015 air_fff68b929994bfbd 2017-05-27 2017-05-27 Saturday 0
## 32016 air_fff68b929994bfbd 2017-05-28 2017-05-28 Sunday 0
## 32017 air_fff68b929994bfbd 2017-05-29 2017-05-29 Monday 0
## 32018 air_fff68b929994bfbd 2017-05-30 2017-05-30 Tuesday 0
## 32019 air_fff68b929994bfbd 2017-05-31 2017-05-31 Wednesday 0
sum(is.na(predict_df))
## [1] 0
#Join Store info from AeRegi by air_store_id
predict_df<- left_join(predict_df,air_store_info, by = "air_store_id")
head(predict_df)
## air_store_id visit_date calendar_date day_of_week holiday_flg
## 1 air_00a91d42b08b08d9 2017-04-23 2017-04-23 Sunday 0
## 2 air_00a91d42b08b08d9 2017-04-24 2017-04-24 Monday 0
## 3 air_00a91d42b08b08d9 2017-04-25 2017-04-25 Tuesday 0
## 4 air_00a91d42b08b08d9 2017-04-26 2017-04-26 Wednesday 0
## 5 air_00a91d42b08b08d9 2017-04-27 2017-04-27 Thursday 0
## 6 air_00a91d42b08b08d9 2017-04-28 2017-04-28 Friday 0
## air_genre_name air_area_name latitude longitude
## 1 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami 35.694 139.7536
## 2 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami 35.694 139.7536
## 3 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami 35.694 139.7536
## 4 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami 35.694 139.7536
## 5 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami 35.694 139.7536
## 6 Italian/French TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami 35.694 139.7536
tail(visitor_df)
## air_store_id visit_date visitors day_of_week holiday_flg
## 252103 air_24e8414b9b07decb 2017-04-15 7 Saturday 0
## 252104 air_24e8414b9b07decb 2017-04-18 6 Tuesday 0
## 252105 air_24e8414b9b07decb 2017-04-19 6 Wednesday 0
## 252106 air_24e8414b9b07decb 2017-04-20 7 Thursday 0
## 252107 air_24e8414b9b07decb 2017-04-21 8 Friday 0
## 252108 air_24e8414b9b07decb 2017-04-22 5 Saturday 0
## air_genre_name air_area_name latitude longitude
## 252103 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322 139.711
## 252104 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322 139.711
## 252105 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322 139.711
## 252106 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322 139.711
## 252107 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322 139.711
## 252108 Other TÅ\215kyÅ\215-to Shibuya-ku Higashi 35.65322 139.711
sum(is.na(predict_df))
## [1] 0
str(predict_df)
## 'data.frame': 32019 obs. of 9 variables:
## $ air_store_id : chr "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
## $ visit_date : Date, format: "2017-04-23" "2017-04-24" ...
## $ calendar_date : Date, format: "2017-04-23" "2017-04-24" ...
## $ day_of_week : chr "Sunday" "Monday" "Tuesday" "Wednesday" ...
## $ holiday_flg : int 0 0 0 0 0 0 1 0 0 0 ...
## $ air_genre_name: chr "Italian/French" "Italian/French" "Italian/French" "Italian/French" ...
## $ air_area_name : chr "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" ...
## $ latitude : num 35.7 35.7 35.7 35.7 35.7 ...
## $ longitude : num 140 140 140 140 140 ...
predict_df <- subset(predict_df, select = -c(calendar_date))
str(predict_df)
## 'data.frame': 32019 obs. of 8 variables:
## $ air_store_id : chr "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
## $ visit_date : Date, format: "2017-04-23" "2017-04-24" ...
## $ day_of_week : chr "Sunday" "Monday" "Tuesday" "Wednesday" ...
## $ holiday_flg : int 0 0 0 0 0 0 1 0 0 0 ...
## $ air_genre_name: chr "Italian/French" "Italian/French" "Italian/French" "Italian/French" ...
## $ air_area_name : chr "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" ...
## $ latitude : num 35.7 35.7 35.7 35.7 35.7 ...
## $ longitude : num 140 140 140 140 140 ...
#Feature Creation - submission set
predict_df_feat <- predict_df
#create dummy variables for day of week, MorT / WoTh / FoSu, Saturday with coefficient
predict_df_feat$MoT_flg <- ifelse(predict_df_feat$day_of_week=="Monday"|predict_df_feat$day_of_week=="Tuesday" ,1,0)
predict_df_feat$WoTh_flg <- ifelse(predict_df_feat$day_of_week=="Wednesday"|predict_df_feat$day_of_week=="Thursday" ,1,0)
predict_df_feat$FoSu_flg <- ifelse(predict_df_feat$day_of_week=="Friday"|predict_df_feat$day_of_week=="Sunday" ,1,0)
str(predict_df_feat)
## 'data.frame': 32019 obs. of 11 variables:
## $ air_store_id : chr "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
## $ visit_date : Date, format: "2017-04-23" "2017-04-24" ...
## $ day_of_week : chr "Sunday" "Monday" "Tuesday" "Wednesday" ...
## $ holiday_flg : int 0 0 0 0 0 0 1 0 0 0 ...
## $ air_genre_name: chr "Italian/French" "Italian/French" "Italian/French" "Italian/French" ...
## $ air_area_name : chr "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" ...
## $ latitude : num 35.7 35.7 35.7 35.7 35.7 ...
## $ longitude : num 140 140 140 140 140 ...
## $ MoT_flg : num 0 1 1 0 0 0 0 0 1 1 ...
## $ WoTh_flg : num 0 0 0 1 1 0 0 0 0 0 ...
## $ FoSu_flg : num 1 0 0 0 0 1 0 1 0 0 ...
#Create Month column, to isolate peak months
predict_df_feat$month <- format(predict_df_feat$visit_date,"%B")
predict_df_feat$Peak_flg <- ifelse(predict_df_feat$month=="March"|predict_df_feat$month=="December"|predict_df_feat$month=="April"|predict_df_feat$month=="May"|predict_df_feat$month=="June"|predict_df_feat$month=="July",1,0)
#Create dummy variables for type of restaurant, leaving Asian out as part of coefficient
predict_df_feat$Bar_gflg <- ifelse(predict_df_feat$air_genre_name=="Bar/Cocktail",1,0)
predict_df_feat$Cafe_gflg <- ifelse(predict_df_feat$air_genre_name=="Cafe/Sweets",1,0)
predict_df_feat$Creative_gflg <- ifelse(predict_df_feat$air_genre_name=="Creative cuisine",1,0)
predict_df_feat$Dining_bar_gflg <- ifelse(predict_df_feat$air_genre_name=="Dining bar",1,0)
predict_df_feat$Int_gflg <- ifelse(predict_df_feat$air_genre_name=="International cuisine",1,0)
predict_df_feat$IF_gflg <- ifelse(predict_df_feat$air_genre_name=="Italian/French",1,0)
predict_df_feat$Izakaya_gflg <- ifelse(predict_df_feat$air_genre_name=="Izakaya",1,0)
predict_df_feat$Japanese_gflg <- ifelse(predict_df_feat$air_genre_name=="Japanese food",1,0)
predict_df_feat$OMT_gflg <- ifelse(predict_df_feat$air_genre_name=="Okonomiyaki/Monja/Teppanyaki",1,0)
predict_df_feat$Other_gflg <- ifelse(predict_df_feat$air_genre_name=="Other",1,0)
predict_df_feat$Western_gflg <- ifelse(predict_df_feat$air_genre_name=="Western food",1,0)
predict_df_feat$Korean_gflg <- ifelse(predict_df_feat$air_genre_name=="Yakiniku/Korean food",1,0)
str(predict_df_feat)
## 'data.frame': 32019 obs. of 25 variables:
## $ air_store_id : chr "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" "air_00a91d42b08b08d9" ...
## $ visit_date : Date, format: "2017-04-23" "2017-04-24" ...
## $ day_of_week : chr "Sunday" "Monday" "Tuesday" "Wednesday" ...
## $ holiday_flg : int 0 0 0 0 0 0 1 0 0 0 ...
## $ air_genre_name : chr "Italian/French" "Italian/French" "Italian/French" "Italian/French" ...
## $ air_area_name : chr "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" "TÅ\215kyÅ\215-to Chiyoda-ku Kudanminami" ...
## $ latitude : num 35.7 35.7 35.7 35.7 35.7 ...
## $ longitude : num 140 140 140 140 140 ...
## $ MoT_flg : num 0 1 1 0 0 0 0 0 1 1 ...
## $ WoTh_flg : num 0 0 0 1 1 0 0 0 0 0 ...
## $ FoSu_flg : num 1 0 0 0 0 1 0 1 0 0 ...
## $ month : chr "April" "April" "April" "April" ...
## $ Peak_flg : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Bar_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Cafe_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Creative_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Dining_bar_gflg: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Int_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ IF_gflg : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Izakaya_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Japanese_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OMT_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Other_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Western_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Korean_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
#Create a numeric data frame - remove ->
predict_df_submit <- subset(predict_df_feat, select = -c(day_of_week, air_area_name, month, air_genre_name, air_store_id))
str(predict_df_submit)
## 'data.frame': 32019 obs. of 20 variables:
## $ visit_date : Date, format: "2017-04-23" "2017-04-24" ...
## $ holiday_flg : int 0 0 0 0 0 0 1 0 0 0 ...
## $ latitude : num 35.7 35.7 35.7 35.7 35.7 ...
## $ longitude : num 140 140 140 140 140 ...
## $ MoT_flg : num 0 1 1 0 0 0 0 0 1 1 ...
## $ WoTh_flg : num 0 0 0 1 1 0 0 0 0 0 ...
## $ FoSu_flg : num 1 0 0 0 0 1 0 1 0 0 ...
## $ Peak_flg : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Bar_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Cafe_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Creative_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Dining_bar_gflg: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Int_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ IF_gflg : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Izakaya_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Japanese_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OMT_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Other_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Western_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Korean_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
sum(is.na(predict_df_submit))
## [1] 0
#Create submission TS
predict_df_ts <- ts(predict_df_submit, frequency = 366)
str(predict_df_ts)
## Time-Series [1:32019, 1:20] from 1 to 88.5: 17279 17280 17281 17282 17283 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:20] "visit_date" "holiday_flg" "latitude" "longitude" ...
#Make response prediction using TSLM 5
predicition1 <- predict(vfit_tslm_5, predict_df_submit, type = "response")
visitors_pred = as.vector(predicition1)
str(visitors_pred)
## num [1:32019] 26 19.9 19.9 21.6 21.6 ...
submission_pred <- predict_df_submit
submission_pred$visitors <- round(visitors_pred,0)
str(submission_pred)
## 'data.frame': 32019 obs. of 21 variables:
## $ visit_date : Date, format: "2017-04-23" "2017-04-24" ...
## $ holiday_flg : int 0 0 0 0 0 0 1 0 0 0 ...
## $ latitude : num 35.7 35.7 35.7 35.7 35.7 ...
## $ longitude : num 140 140 140 140 140 ...
## $ MoT_flg : num 0 1 1 0 0 0 0 0 1 1 ...
## $ WoTh_flg : num 0 0 0 1 1 0 0 0 0 0 ...
## $ FoSu_flg : num 1 0 0 0 0 1 0 1 0 0 ...
## $ Peak_flg : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Bar_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Cafe_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Creative_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Dining_bar_gflg: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Int_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ IF_gflg : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Izakaya_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Japanese_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OMT_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Other_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Western_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Korean_gflg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ visitors : num 26 20 20 22 22 26 32 26 20 20 ...
summary(submission_pred$visitors)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 20.00 22.00 22.12 26.00 44.00
#confirmed no negative counts
#aggregate prediction
visitor_agg_pred <- aggregate(x = submission_pred[c("visitors")], FUN = sum, by = list(Group.date = submission_pred$visit_date))
visitor_agg_pred_ts <- ts(visitor_agg_pred$visitors, start = c(2017, 113), frequency = 366)
autoplot(visitor_agg_pred_ts)

autoplot(visitor_agg_ts, series = "Train_data") + autolayer(visitor_agg_pred_ts, series = "Predict_data") + ggtitle("TSLM Forecast Predicitions (04/23-05/31)")

#Write new submission file
submission_final <- read.csv('sample_submission.csv')
submission_final$visitors <- submission_pred$visitors
head(submission_final)
## id visitors
## 1 air_00a91d42b08b08d9_2017-04-23 26
## 2 air_00a91d42b08b08d9_2017-04-24 20
## 3 air_00a91d42b08b08d9_2017-04-25 20
## 4 air_00a91d42b08b08d9_2017-04-26 22
## 5 air_00a91d42b08b08d9_2017-04-27 22
## 6 air_00a91d42b08b08d9_2017-04-28 26
#write.csv(submission_final, file = "./TFL_Recruit_submission.csv", row.names=FALSE)