0. Goal:

Forecast for a specific genre in an area the number of visitors in next 2 weeks

1. Aggregating and EDA

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df.visit <- read.csv("air_visit_data.csv")
df.store <- read.csv("air_store_info.csv")
df.full <- df.visit %>%
            inner_join(df.store, by = "air_store_id") %>%
            dplyr::select(visit_date, visitors, air_area_name, air_genre_name) %>%
            group_by(air_area_name, air_genre_name, visit_date) %>%
            summarise(total_visitors = sum(visitors))

How visits going of different genres

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
df.agg.genre <- df.full %>%
                  mutate(Week = as.Date(cut(as.Date(visit_date), breaks = "week"))) %>%
                  group_by(air_genre_name, Week) %>%
                  summarise(total_visitors = sum(total_visitors)) 
## Warning: package 'bindrcpp' was built under R version 3.2.5
ggplot(data = df.agg.genre,
  aes(Week, total_visitors, fill = air_genre_name)) +
  stat_summary(fun.y = sum, # adds up all observations for the week
    geom = "bar") 

2. Time Series Plot and Forecast

Set filter here:

genre_name = "Japanese food"
area_name = "Fukuoka-ken Fukuoka-shi Daimyō"
df.jpfood <- df.full %>%
                  filter(air_genre_name == genre_name & 
                           air_area_name == area_name) %>%
                  mutate(Date = as.Date(visit_date)) %>%
                  group_by(Date) %>%
                  summarise(jp_visits = sum(total_visitors))

ggplot(data = df.jpfood,
  aes(Date, jp_visits)) +
  stat_summary(fun.y = sum, # adds up all observations for the week
    geom = "line")          

Visualize total by Month:

df.jpfood.month <- df.jpfood %>%
                    mutate(Month = as.integer(format(Date, "%m"))) %>%
                    group_by(Month) %>%
                    summarise(month.visits = sum(jp_visits))

ggplot(data = df.jpfood.month,
  aes(Month, month.visits)) +
  stat_summary(fun.y = sum, # adds up all observations for the week
    geom = "line") + scale_x_continuous(breaks=c(1:12)) +
  ggtitle("Visit by month of Japanese food in Fukuoka-ken Fukuoka-shi Daimyō area") +
  ylab("Visits")

May and June has lowest visits.

By weekday:

df.jpfood.weekday <- df.jpfood %>%
                    mutate(weekday = weekdays(as.Date(Date))) %>%
                    group_by(weekday) %>%
                    summarise(weekday.visits = sum(jp_visits))
                    
df.jpfood.weekday$weekday <- factor(df.jpfood.weekday$weekday, levels= c("Monday", 
    "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday","Sunday"))

ggplot(data = df.jpfood.weekday,
  aes(weekday, weekday.visits)) +
  geom_bar(stat = "identity")          

Friday and Saturday has highest visits.

Training and Testing

Use last 3 weeks as training set:

split.date = as.Date("2017-04-01")
df.jpfood.train <- df.jpfood[df.jpfood$Date <= split.date,]
df.jpfood.test <- df.jpfood[df.jpfood$Date > split.date,]

Forecast:

library('forecast')
## Warning: package 'forecast' was built under R version 3.2.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.2.5
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: timeDate
## This is forecast 7.3
library('tseries')
## Warning: package 'tseries' was built under R version 3.2.5
arima.fit <- auto.arima(tsclean(ts(df.jpfood.train$jp_visits, frequency = 7)),
                        stepwise = FALSE, approximation = FALSE)
pred.visits <- arima.fit %>% forecast(h = 21, level = c(50,95))
pred.visits %>%
  autoplot +
  labs(x = "Time [weeks]", y = "log1p visitors vs auto.arima predictions")

So here we have: * range of lag: 1 * difference: 1 * moving average of: 2 previous terms

Error:

by log mean squared error:

\[\sqrt{\frac{1}{n}\sum_{i=1}^{n}(log (p_i+1) - log (a_i+1))^2}\]

p <- pred.visits$mean
a <- df.jpfood.test$jp_visits
sqrt(1/21 * sum((log(p+1) - log(a-1))^2))
## [1] 0.8609883

By root-mean-square deviation:

sqrt(1/21 * sum((a-p)^2))
## [1] 19.08264

Average 20 visits per day of actual visits away from prediction.

By week:

df.result <- data.frame(idx = c(1:21), a, p)
df.result.agg <- df.result %>% mutate(weekNo = idx %/% 7) %>% 
                   group_by(weekNo) %>%
                   summarise(s.a = sum(a), s.p = sum(p))
df.result.agg
## # A tibble: 4 x 3
##   weekNo   s.a       s.p
##    <dbl> <int>     <dbl>
## 1      0   167 292.32003
## 2      1   292 344.13779
## 3      2   340 345.04282
## 4      3    46  49.96349

Average:

df.result.agg %>% filter(weekNo %in% c(0:2)) %>% 
                    mutate(diff.week = s.p - s.a) %>%
                    summarise(mean(diff.week))
## # A tibble: 1 x 1
##   `mean(diff.week)`
##               <dbl>
## 1          60.83355

Prediction is in average 60 more visits higher than actual visits during last 3 weeks.

df.result.agg %>% filter(weekNo %in% c(0:2)) %>% 
                    mutate(diff.week = s.p - s.a) %>%
                    summarise(mean(diff.week) / sum(s.a))
## # A tibble: 1 x 1
##   `mean(diff.week)/sum(s.a)`
##                        <dbl>
## 1                 0.07613711

7.61% error rate.

3. Future extension

Because of time limit, I was not able to add full datasets, skipped tables like weather, location, and local market information to improve the performance. Here’s just first step. The rank of this result is more than 1600 out of 2000 teams by square root of mean log error.

In future, I will add more detailed analysis to improve forecast.