Install Packages

# Clear the workspace
rm(list = ls()) # Clear environment
gc()            # Clear unused memory
cat("\f")       # Clear the console

library("feasts")
library("seasonal")
library("tsibble")
library("tsibbledata")
library("dplyr")
library("ggplot2")
library("forecast")
library("fable")
library("weathermetrics")
library("seasonalview")
library("tibbletime")
library("fpp3")
library("fma")
library("fpp")
library("fpp2")
library("grid")
library("gridExtra")
library("lubridate")
library("seasonal")
library("zoo")
library("psych")
library("wesanderson")
library("leaflet")

setwd("/Users/spoll/OneDrive/Documents/Boston College/Predictive Analytics_Forecasting/Week 8 Final")

Load Data

raw_reserve <- read.csv("air_reserve.csv"
                           , check.names = FALSE
                           , stringsAsFactors = FALSE
                           , na.strings = ""
)

raw_visit <- read.csv("air_visit_data.csv"
                           , check.names = FALSE
                           , stringsAsFactors = FALSE
                           , na.strings = ""
)

raw_store <- read.csv("air_store_info.csv"
                           , check.names = FALSE
                           , stringsAsFactors = FALSE
                           , na.strings = ""
)

raw_date <- read.csv("date_info.csv"
                           , check.names = FALSE
                           , stringsAsFactors = FALSE
                           , na.strings = ""
)

raw_hpg_reserve <- read.csv("hpg_reserve.csv"
                           , check.names = FALSE
                           , stringsAsFactors = FALSE
                           , na.strings = ""
)

raw_hpg_store <- read.csv("hpg_store_info.csv"
                           , check.names = FALSE
                           , stringsAsFactors = FALSE
                           , na.strings = ""
)

store_id_relation <- read.csv("store_id_relation.csv"
                           , check.names = FALSE
                           , stringsAsFactors = FALSE
                           , na.strings = ""
)

sample <- read.csv("sample_submission.csv"
                           , check.names = FALSE
                           , stringsAsFactors = FALSE
                           , na.strings = ""
)

# sample$id <- gsub("[_]", " ", sample$id)
# sample <- sample %>% separate(id, c('system', 'id', 'date'), pattern )
# sample1 <- separate(sample, id, c('system', 'id', 'date'), sep = " ")
# sample <- sample1 %>% 
#   dplyr::select("date", "visitors")
# sample$date <- as.Date(sample$date)
# sample <- sample %>% distinct(.keep_all = TRUE)
# sample_ts <- ts(sample, frequency = 12)

Summary of Data

summary(raw_reserve)
##  air_store_id       visit_datetime     reserve_datetime   reserve_visitors 
##  Length:92378       Length:92378       Length:92378       Min.   :  1.000  
##  Class :character   Class :character   Class :character   1st Qu.:  2.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :  3.000  
##                                                           Mean   :  4.482  
##                                                           3rd Qu.:  5.000  
##                                                           Max.   :100.000
summary(raw_visit)
##  air_store_id        visit_date           visitors     
##  Length:252108      Length:252108      Min.   :  1.00  
##  Class :character   Class :character   1st Qu.:  9.00  
##  Mode  :character   Mode  :character   Median : 17.00  
##                                        Mean   : 20.97  
##                                        3rd Qu.: 29.00  
##                                        Max.   :877.00
summary(raw_store)
##  air_store_id       air_genre_name     air_area_name         latitude    
##  Length:829         Length:829         Length:829         Min.   :33.21  
##  Class :character   Class :character   Class :character   1st Qu.:34.70  
##  Mode  :character   Mode  :character   Mode  :character   Median :35.66  
##                                                           Mean   :35.65  
##                                                           3rd Qu.:35.69  
##                                                           Max.   :44.02  
##    longitude    
##  Min.   :130.2  
##  1st Qu.:135.3  
##  Median :139.7  
##  Mean   :137.4  
##  3rd Qu.:139.8  
##  Max.   :144.3
summary(raw_date)
##  calendar_date      day_of_week         holiday_flg    
##  Length:517         Length:517         Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :0.0000  
##                                        Mean   :0.0677  
##                                        3rd Qu.:0.0000  
##                                        Max.   :1.0000
summary(raw_hpg_reserve)
##  hpg_store_id       visit_datetime     reserve_datetime   reserve_visitors 
##  Length:1048575     Length:1048575     Length:1048575     Min.   :  1.000  
##  Class :character   Class :character   Class :character   1st Qu.:  2.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :  3.000  
##                                                           Mean   :  4.889  
##                                                           3rd Qu.:  6.000  
##                                                           Max.   :100.000
summary(raw_hpg_store)
##  hpg_store_id       hpg_genre_name     hpg_area_name         latitude    
##  Length:4690        Length:4690        Length:4690        Min.   :33.31  
##  Class :character   Class :character   Class :character   1st Qu.:34.69  
##  Mode  :character   Mode  :character   Mode  :character   Median :35.66  
##                                                           Mean   :35.81  
##                                                           3rd Qu.:35.70  
##                                                           Max.   :43.77  
##    longitude    
##  Min.   :130.3  
##  1st Qu.:135.5  
##  Median :139.5  
##  Mean   :137.7  
##  3rd Qu.:139.7  
##  Max.   :143.7
summary(store_id_relation)
##  air_store_id       hpg_store_id      
##  Length:150         Length:150        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

Descriptive Statistics About the Data

describe(raw_reserve)
##                   vars     n    mean      sd median trimmed     mad min  max
## air_store_id*        1 92378  151.34   92.55    147  150.44  121.57   1  314
## visit_datetime*      2 92378 2958.96 1188.51   3079 3054.18 1214.25   1 4975
## reserve_datetime*    3 92378 4806.34 1942.51   5075 5004.60 1752.43   1 7513
## reserve_visitors     4 92378    4.48    4.92      3    3.39    1.48   1  100
##                   range  skew kurtosis   se
## air_store_id*       313  0.10    -1.30 0.30
## visit_datetime*    4974 -0.60    -0.39 3.91
## reserve_datetime*  7512 -0.80    -0.19 6.39
## reserve_visitors     99  4.65    33.52 0.02
describe(raw_visit)
##               vars      n   mean     sd median trimmed    mad min max range
## air_store_id*    1 252108 413.70 239.70    407  413.21 309.86   1 829   828
## visit_date*      2 252108 239.75 142.68    227  238.40 186.81   1 478   477
## visitors         3 252108  20.97  16.76     17   18.82  13.34   1 877   876
##               skew kurtosis   se
## air_store_id* 0.02    -1.21 0.48
## visit_date*   0.13    -1.30 0.28
## visitors      3.31    74.26 0.03
describe(raw_store)
##                 vars   n   mean     sd median trimmed    mad    min    max
## air_store_id*      1 829 415.00 239.46 415.00  415.00 306.90   1.00 829.00
## air_genre_name*    2 829   6.26   3.13   7.00    6.02   2.97   1.00  14.00
## air_area_name*     3 829  54.10  31.14  58.00   54.90  43.00   1.00 103.00
## latitude           4 829  35.65   2.08  35.66   35.26   0.10  33.21  44.02
## longitude          5 829 137.42   3.65 139.69  137.82   0.20 130.20 144.27
##                  range  skew kurtosis   se
## air_store_id*   828.00  0.00    -1.20 8.32
## air_genre_name*  13.00  0.45    -0.41 0.11
## air_area_name*  102.00 -0.17    -1.34 1.08
## latitude         10.81  2.65     7.44 0.07
## longitude        14.08 -0.93    -0.55 0.13
describe(raw_date)
##                vars   n   mean     sd median trimmed    mad min max range skew
## calendar_date*    1 517 259.00 149.39    259     259 191.26   1 517   516 0.00
## day_of_week*      2 517   4.00   2.00      4       4   2.97   1   7     6 0.00
## holiday_flg       3 517   0.07   0.25      0       0   0.00   0   1     1 3.43
##                kurtosis   se
## calendar_date*    -1.21 6.57
## day_of_week*      -1.26 0.09
## holiday_flg        9.79 0.01
describe(raw_hpg_reserve)
##                   vars       n    mean      sd median trimmed     mad min   max
## hpg_store_id*        1 1048575 6382.07 3674.70   6387 6385.96 4722.08   1 12721
## visit_datetime*      2 1048575 3210.05 1954.17   3043 3176.60 2612.34   1  6582
## reserve_datetime*    3 1048575 4044.90 2325.80   4032 4041.95 3089.74   1  7998
## reserve_visitors     4 1048575    4.89    5.09      3    3.81    1.48   1   100
##                   range  skew kurtosis   se
## hpg_store_id*     12720 -0.01    -1.21 3.59
## visit_datetime*    6581  0.13    -1.30 1.91
## reserve_datetime*  7997  0.01    -1.25 2.27
## reserve_visitors     99  4.72    38.61 0.00
describe(raw_hpg_store)
##                 vars    n    mean      sd  median trimmed     mad    min
## hpg_store_id*      1 4690 2345.50 1354.03 2345.50 2345.50 1738.35   1.00
## hpg_genre_name*    2 4690   14.85    5.08   16.00   14.65    5.93   1.00
## hpg_area_name*     3 4690   66.05   35.10   70.00   67.16   45.96   1.00
## latitude           4 4690   35.81    2.14   35.66   35.38    1.01  33.31
## longitude          5 4690  137.68    3.20  139.50  138.09    0.78 130.34
##                     max   range  skew kurtosis    se
## hpg_store_id*   4690.00 4689.00  0.00    -1.20 19.77
## hpg_genre_name*   34.00   33.00  0.41     0.47  0.07
## hpg_area_name*   119.00  118.00 -0.19    -1.21  0.51
## latitude          43.77   10.46  2.50     6.04  0.03
## longitude        143.71   13.38 -0.98    -0.14  0.05
describe(store_id_relation)
##               vars   n mean    sd median trimmed  mad min max range skew
## air_store_id*    1 150 75.5 43.45   75.5    75.5 55.6   1 150   149    0
## hpg_store_id*    2 150 75.5 43.45   75.5    75.5 55.6   1 150   149    0
##               kurtosis   se
## air_store_id*    -1.22 3.55
## hpg_store_id*    -1.22 3.55

Reformat Dates

raw_visit <- as.data.frame(raw_visit)
raw_visit$visit_date <- mdy(raw_visit$visit_date)
class(raw_visit$visit_date)
## [1] "Date"
visit <- raw_visit %>%
  mutate(month = month(visit_date, label = TRUE)) %>%
  mutate(wday = wday(visit_date, label = TRUE))
unique(raw_reserve$air_store_id)
##   [1] "air_877f79706adbfb06" "air_db4b38ebe7a7ceff" "air_db80363d35f10926"
##   [4] "air_3bb99a1fe0583897" "air_2b8b29ddfd35018e" "air_6b15edd1b4fbb96a"
##   [7] "air_789466e488705c93" "air_f690c42545146e0a" "air_326ca454ef3558bc"
##  [10] "air_6c91a28278a16f64" "air_e270aff84ac7e4c8" "air_a083834e7ffe187e"
##  [13] "air_ee3a01f0c71a769f" "air_24b9b2a020826ede" "air_142e78ba7001da9c"
##  [16] "air_1653a6c513865af3" "air_2d3afcb91762fe01" "air_54ed43163b7596c4"
##  [19] "air_6d65542aa43b598b" "air_f911308e19d64236" "air_290e7a57b390f78e"
##  [22] "air_622375b4815cf5cb" "air_de692863bb2dd758" "air_b9e27558fb8bd5c4"
##  [25] "air_614e2f7e76dff854" "air_63e28ee0b0c955a7" "air_2a3743e37aab04b4"
##  [28] "air_670a0c1c4108bcea" "air_2cee51fa6fdf6c0d" "air_c8a657c8c5c93d69"
##  [31] "air_e55abd740f93ecc4" "air_2aab19554f91ff82" "air_3c05c8f26c611eb9"
##  [34] "air_3cad29d1a23209d2" "air_8093d0b565e9dbdf" "air_96743eee94114261"
##  [37] "air_632ba66e1f75aa28" "air_6b9fa44a9cf504a1" "air_7831b00996701c0f"
##  [40] "air_03963426c9312048" "air_4cca5666eaf5c709" "air_9d93d95720f2e831"
##  [43] "air_68c1de82037d87e6" "air_32460819c7600037" "air_346ade7d29230634"
##  [46] "air_c2c8435bdb3516d4" "air_c73d319ffabf287a" "air_5e70fe82f9e4fab6"
##  [49] "air_a563896da3777078" "air_f96765e800907c77" "air_fe22ef5a9cbef123"
##  [52] "air_f068442ebb6c246c" "air_0e1eae99b8723bc1" "air_9d452a881f7f2bb7"
##  [55] "air_81c5dff692063446" "air_87f9e1024b951f01" "air_0b184ec04c741a6a"
##  [58] "air_4ce7b17062a1bf73" "air_2c989829acbd1c6b" "air_61de73b097513f58"
##  [61] "air_673acd9fa5e0dd78" "air_97159fc4e90053fe" "air_63a750d8b4b6a976"
##  [64] "air_2a485b92210c98b5" "air_5c817ef28f236bdf" "air_68301bcb11e2f389"
##  [67] "air_b4f32bcc399da2b9" "air_bfafaed35e213fd7" "air_722297e7f26db91d"
##  [70] "air_082908692355165e" "air_9241121af22ff1d6" "air_c759b6abeb552160"
##  [73] "air_48e9fc98b62495a7" "air_37189c92b6c761ec" "air_9352c401d5adb01b"
##  [76] "air_35512c42db0868da" "air_683371d9baabf410" "air_7cc17a324ae5c7dc"
##  [79] "air_70e9e8cd55879414" "air_fbadf737162a5ce3" "air_234d3dbf7f3d5a50"
##  [82] "air_ef789667e2e6fe96" "air_1033310359ceeac1" "air_483eba479dc9910d"
##  [85] "air_f2c5a1f24279c531" "air_a55d17bd3f3033cb" "air_ab3ae0e410b20069"
##  [88] "air_54d6c25d33f5260e" "air_5f3a3ef4cba110a4" "air_cb7467aed805e7fe"
##  [91] "air_0a74a5408a0b8642" "air_2fc478dc9f0a6b31" "air_0f0cdeee6c9bf3d7"
##  [94] "air_1408dd53f31a8a65" "air_396166d47733d5c9" "air_56cebcbd6906e04c"
##  [97] "air_5f6fa1b897fe80d5" "air_bc991c51d6613745" "air_375a5241615b5e22"
## [100] "air_48f4da6223571da4" "air_7ef9a5ea5c8fe39f" "air_97cf68dc1a9beac0"
## [103] "air_b45b8e456f53942a" "air_e0e69668214ff972" "air_eec5e572b9eb9c23"
## [106] "air_b23d0f519291247d" "air_a510dcfe979f09eb" "air_93b9bb641f8fc982"
## [109] "air_b2d97bd2337c5ba7" "air_8b4a46dc521bfcfe" "air_324f7c39a8410e7c"
## [112] "air_646b93e336f0dded" "air_a24bf50c3e90d583" "air_bebd55ed63ab2422"
## [115] "air_aed3a8b49abe4a48" "air_bb4ff06cd661ee9b" "air_256be208a979e023"
## [118] "air_5485912b44f976de" "air_ab9746a0f83084b7" "air_a546cbf478a8b6e4"
## [121] "air_4b55d8aea1d2b395" "air_965b2e0cf4119003" "air_cf5ab75a0afb8af9"
## [124] "air_42c9aa6d617c5057" "air_45326ebb8dc72cfb" "air_4c727b55acdee495"
## [127] "air_b439391e72899756" "air_9b6af3db40da4ae2" "air_9ca2767761efff4d"
## [130] "air_9efaa7ded03c5a71" "air_af63df35857b16e6" "air_bedd35489e666605"
## [133] "air_c88467d88b2c8ecd" "air_0867f7bebad6a649" "air_1707a3f18bb0da07"
## [136] "air_1eeff462acb24fb7" "air_1f34e9beded2231a" "air_20add8092c9bb51d"
## [139] "air_25c583983246b7b0" "air_397d3f32a7196aa2" "air_3e93f3c81008696d"
## [142] "air_4092cfbd95a3ac1b" "air_43b65e4b05bff2d3" "air_5acc13d655a6e8b2"
## [145] "air_640cf4835f0d9ba3" "air_67f87c159d9e2ee2" "air_6ff5fca957798daa"
## [148] "air_707d4b6328f2c2df" "air_7db266904cb0d72a" "air_84f6876ff7e83ae7"
## [151] "air_a88ac559064dec08" "air_b2395df0e874078d" "air_b8a5ee69e5fdcc5b"
## [154] "air_c0385db498b391e5" "air_c47aa7493b15f297" "air_c6a164dd4060e960"
## [157] "air_c7f78b4f3cba33ff" "air_ccd19a5bc5573ae5" "air_d6b3e67261f07646"
## [160] "air_db1233ad855b34d5" "air_de803f7e324936b8" "air_e053c561f32acc28"
## [163] "air_ee3ba9af184c6c82" "air_f180301886c21375" "air_fa12b40b02fecfd8"
## [166] "air_04cae7c1bc9b2a0b" "air_12c4fb7a423df20d" "air_40f6193ea3ed1b91"
## [169] "air_465bddfed3353b23" "air_710d6537cb7623df" "air_7dacea2f22afccfb"
## [172] "air_90213bcae4afa274" "air_97b2a9f975fc702c" "air_9aa32b3db0fab3a5"
## [175] "air_9d474ec2448c700d" "air_a2b29aa7feb4e36f" "air_a38f25e3399d1b25"
## [178] "air_ca6ae8d49a2f1eaf" "air_e89735e80d614a7e" "air_f8233ad00755c35c"
## [181] "air_fea5dc9594450608" "air_de88770300008624" "air_dea0655f96947922"
## [184] "air_2c6fef1ce0e13a5a" "air_399904bdb7685ca0" "air_4beac252540f865e"
## [187] "air_6b942d5ebbc759c2" "air_b1bb1fae86617d7a" "air_bb09595bab7d5cfb"
## [190] "air_0b1e72d2d4422b20" "air_602ca92c0db34f8f" "air_3c938075889fc059"
## [193] "air_066f0221b8a4d533" "air_08cb3c4ee6cd6a22" "air_26f10355d9b4d82a"
## [196] "air_08ef81d5b7a0d13f" "air_36429b5ca4407b3e" "air_4481a87c1d7c9896"
## [199] "air_4d21676ed11f0bac" "air_5a9a6cbeeb434c08" "air_8f3b563416efc6ad"
## [202] "air_938ef91ecdde6878" "air_bcce1ea4350b7b72" "air_ecf7f141339f1d57"
## [205] "air_edd5e3d696a5811b" "air_138ff410757b845f" "air_28064154614b2e6c"
## [208] "air_3440e0ea1b70a99b" "air_0164b9927d20bcc3" "air_1c0b150f9e696a5f"
## [211] "air_6836438b543ba698" "air_83db5aff8f50478e" "air_89e7328af22efe74"
## [214] "air_90f0efbb702d77b7" "air_af03c277a167b2bd" "air_7514d90009613cd6"
## [217] "air_c225148c0fcc5c72" "air_23e1b11aee2a1407" "air_034a3d5b40d5b1b1"
## [220] "air_a257c9749d8d0ff6" "air_d1418d6fd6d634f2" "air_51281cd059d7b89b"
## [223] "air_e5cf003abcc5febb" "air_33b01025210d6007" "air_42d41eb58cad170e"
## [226] "air_4579cb0669fd411b" "air_6b2268863b14a2af" "air_754ae581ad80cc9f"
## [229] "air_b8925441167c3152" "air_fc477473134e9ae5" "air_627cabe2fe53f33f"
## [232] "air_6cbe54f0aa30b615" "air_dc71c6cc06cd1aa2" "air_bf617aa68d5f1cfa"
## [235] "air_c3585b0fba3998d0" "air_00a91d42b08b08d9" "air_831658500aa7c846"
## [238] "air_2bffb19a24d11729" "air_a9178f19da58fe99" "air_96773a6236d279b1"
## [241] "air_7420042ff75f9aca" "air_04341b588bde96cd" "air_1ba4e87ef7422183"
## [244] "air_3980af67be35afdb" "air_99b01136f451fc0e" "air_6ca1d941c8199a67"
## [247] "air_d500b48a8735fbd3" "air_258ad2619d7bff9a" "air_cfcc94797d2b5d3d"
## [250] "air_df554c4527a1cfe6" "air_91beafbba9382b0a" "air_7c7774c66fb237f7"
## [253] "air_a85f0c0c889f6b7e" "air_8e492076a1179383" "air_859feab8e3c9f98d"
## [256] "air_8f273fb9ad2fed6f" "air_1d1e8860ae04f8e9" "air_a17f0778617c76e2"
## [259] "air_084d98859256acf0" "air_32b02ba5dc2027f4" "air_1dd8f6f47480d1a2"
## [262] "air_df9355c47c5df9d3" "air_48ffd31594bc3263" "air_fdc02ec4a3d21ea4"
## [265] "air_6ced51c24fb54262" "air_645cb18b33f938cf" "air_fcfbdcf7b1f82c6e"
## [268] "air_55c3627912b9c849" "air_fcd4492c83f1c6b9" "air_e524c6a9e06cc3a1"
## [271] "air_638c35eb25e53eea" "air_4433ab8e9999915f" "air_e00fe7853c0100d6"
## [274] "air_876d7a23c47811cb" "air_15ae33469e9ea2dd" "air_c8265ecc116f2284"
## [277] "air_a9133955abccf071" "air_ade6e836ffd1da64" "air_96005f79124e12bf"
## [280] "air_950381108f839348" "air_e7fbee4e3cfe65c5" "air_eca4a5a191e8d993"
## [283] "air_97958e7fce98b6a3" "air_d477b6339b8ce69f" "air_87467487d21891dd"
## [286] "air_5bd22f9cc1426a90" "air_694571ea13fb9e0e" "air_51319e7acf0438cf"
## [289] "air_883ca28ef0ed3d55" "air_900d755ebd2f7bbd" "air_fee8dcf4d619598e"
## [292] "air_06f95ac5c33aca10" "air_8cc350fd70ee0757" "air_d07e57b21109304a"
## [295] "air_629d9935273c82ae" "air_8110d68cc869b85e" "air_28dbe91c4c9656be"
## [298] "air_63b13c56b7201bd9" "air_e01d99390355408d" "air_93ebe490d4abb8e9"
## [301] "air_6b65745d432fd77f" "air_2c6c79d597e48096" "air_1f1390a8be2272b3"
## [304] "air_cbe867adcf44e14f" "air_10bbe8acd943d8f6" "air_e657ca554b0c008c"
## [307] "air_97e0f2feec4d577a" "air_cc35590cd1da8554" "air_4de6d887a7b1c1fc"
## [310] "air_b60cc7d6aee68194" "air_f0fb0975bdc2cdf9" "air_b3a824511477a4ed"
## [313] "air_cf2229e64408d9fe" "air_e700e390226d9985"

Create a Time Series For Air Data

# Air Agg Time Series
air_agg <- aggregate(reserve_visitors ~ visit_datetime
                     , data = raw_reserve
                     , FUN = sum)
air_agg$date <- as.Date(air_agg$visit_datetime)

air_agg <- air_agg %>%  
  dplyr::select("date", "reserve_visitors")
class(air_agg$date)
## [1] "Date"
air_agg <- aggregate(reserve_visitors ~ date
                     , data = air_agg
                     , FUN = sum)
air_ts <- ts(air_agg)

summary(air_ts[,"reserve_visitors"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0   245.0   587.0   956.2  1303.0  4742.0
air_plot <- autoplot(air_ts[,"reserve_visitors"]) + 
  labs(title="Air Reservations Time Series") + 
  ylab("Reservations") +
  theme(plot.title = element_text(hjust = 0.5))
air_agg <- head(air_agg, nrow(air_agg)-20) %>% 
  dplyr::select("date", "reserve_visitors")
air_test <- tail(air_agg, 20) %>%  
  dplyr::select("date", "reserve_visitors")
air_test_ts <- ts(air_test$reserve_visitors, frequency = 365)

Create a Time Series For HPG Data

# HPG Agg Time Series
hpg_agg <- aggregate(reserve_visitors ~ visit_datetime
                     , data = raw_hpg_reserve
                     , FUN = sum)
hpg_agg$date <- as.Date(hpg_agg$visit_datetime)

hpg_agg <- hpg_agg %>%  
  dplyr::select("date", "reserve_visitors")
class(hpg_agg$date)
## [1] "Date"
hpg_agg <- aggregate(reserve_visitors ~ date
                     , data = hpg_agg
                     , FUN = sum)
hpg_ts <- ts(hpg_agg)

summary(hpg_ts[,"reserve_visitors"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     493    8094   10270   13153   19673   31062
hpg_plot <- autoplot(hpg_ts[,"reserve_visitors"]) + 
  labs(title="HPG Reservations Time Series") + 
  ylab("Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

Create a Univariate Time Series For Air Data

# UNIVARIATE TIME SERIES-----------------------------------------
# Air Univariate Time Series
air_ts2 <- ts(subset(air_agg
                          , select = "reserve_visitors")
                   , frequency = 2)

air_decomp_add <- decompose(air_ts2, type = "additive")
air_decomp_mult <- decompose(air_ts2, type = "multiplicative")
plot(air_decomp_add)

plot(air_decomp_mult)

ggtsdisplay(air_ts2) #confirmed diagnostics

Create a Univariate Time Series For HPG Data

# HPG Univariate Time Series
hpg_ts2 <- ts(subset(hpg_agg
                          , select = "reserve_visitors")
                   , frequency = 2)

hpg_decomp_add <- decompose(hpg_ts2, type = "additive")
hpg_decomp_mult <- decompose(hpg_ts2, type = "multiplicative")
plot(hpg_decomp_add)

plot(hpg_decomp_mult)

ggtsdisplay(hpg_ts2) #confirmed diagnostics

Data Exploration and Visualizations

visitplot <- plot(visitors ~  visit_date
     , data = raw_visit
     , main="Time Series of Air Visitors"
     , xlab="Date"
     , ylab="Number of Visitors"
     , col="red")

wkday_visits <- raw_visit %>%
  mutate(day = wday(visit_date
                     , label = TRUE)) %>%
  group_by(day) %>%
  summarise(visits = mean(visitors)) %>%
  ggplot(aes(day, visits
             , fill = day)) +
  geom_col(colour="black") + 
  labs(main = "Visits per Day"
       , x = "Day"
       , y = "Visits") +
  scale_fill_manual(values=wes_palette(n=12
                                       , name="GrandBudapest1"
                                       , type = "continuous"))

wkday_visits

monthly_visits <- raw_visit %>%
  mutate(month = month(visit_date
                       , label = TRUE)) %>%
  group_by(month) %>%
  summarise(visits = mean(visitors)) %>%
  ggplot(aes(month
             , visits
             , fill = month)) +
  geom_col(colour="black") +
  labs(x = "Month"
       , y = "Visits"
       , main="Visits Per Month") +
  scale_fill_manual(values=wes_palette(n=12
                                       , name="GrandBudapest1"
                                       , type = "continuous"))

monthly_visits

leaflet(raw_store) %>%
  addTiles() %>%
  addProviderTiles("CartoDB.Positron") %>%
  addMarkers(~longitude, ~latitude,
             popup = ~air_store_id, label = ~air_genre_name,
             clusterOptions = markerClusterOptions())
raw_hpg_store %>%
    group_by(hpg_genre_name) %>%
    summarise(total = n()) %>%
    ggplot(aes(reorder(hpg_genre_name
                       , total
                       , FUN = min)
               , total
               , fill = hpg_genre_name))  + 
    geom_col() + 
    coord_flip() + 
    theme(legend.position = "none") +
    scale_fill_manual(values=wes_palette(n=34
                                         , name="GrandBudapest1"
                                         , type = "continuous")) +
    ylab("Restaurant Style")

Time Series Plots

# Combine Time Series Plots
grid.arrange(air_plot, hpg_plot, nrow = 2)

Air Linear Regressions

# LINEAR REGRESSIONS ---------------------
# Air Reservation Linear Regression
air_lm <- tslm(reserve_visitors ~ date
                    , data = air_ts)

air_lm_plot <- autoplot(air_ts[,"reserve_visitors"], 
                             main = "Air Reservations Linear Regression") + 
  autolayer(fitted(air_lm)
            , series = "Fitted Linear Regression") +
  ylab("Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(air_lm)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals from Linear regression model
## LM test = 321.56, df = 10, p-value < 2.2e-16

HPG Linear Regression

# Air Linear Regression
hpg_lm <- tslm(reserve_visitors ~ date
                    , data = hpg_ts)

hpg_lm_plot <- autoplot(hpg_ts[,"reserve_visitors"], 
                             main = "HPG Reservations Linear Regression") + 
  autolayer(fitted(hpg_lm)
            , series = "Fitted Linear Regression") +
  ylab("Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

checkresiduals(hpg_lm)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals from Linear regression model
## LM test = 44.971, df = 10, p-value = 2.201e-06
# Combine Linear Reg Plots
grid.arrange(air_lm_plot, hpg_lm_plot, nrow = 2)

Linear Regression Forecasts

newdata <- data.frame(date = c(141:150))
forecast(air_lm, newdata = newdata)
##     Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
## 434      -49174.97 -54717.91 -43632.03 -57663.01 -40686.94
## 435      -49172.01 -54714.63 -43629.39 -57659.56 -40684.46
## 436      -49169.05 -54711.36 -43626.74 -57656.12 -40681.98
## 437      -49166.08 -54708.08 -43624.09 -57652.67 -40679.50
## 438      -49163.12 -54704.80 -43621.44 -57649.23 -40677.01
## 439      -49160.16 -54701.52 -43618.79 -57645.78 -40674.53
## 440      -49157.20 -54698.24 -43616.15 -57642.34 -40672.05
## 441      -49154.23 -54694.97 -43613.50 -57638.89 -40669.57
## 442      -49151.27 -54691.69 -43610.85 -57635.45 -40667.09
## 443      -49148.31 -54688.41 -43608.20 -57632.00 -40664.61
forecast(hpg_lm, newdata = newdata)
##     Point Forecast    Lo 80   Hi 80    Lo 95   Hi 95
## 133        1479089 993299.1 1964878 732967.9 2225209
## 134        1479091 993300.5 1964881 732968.9 2225212
## 135        1479093 993301.8 1964883 732969.9 2225215
## 136        1479095 993303.2 1964886 732970.9 2225218
## 137        1479097 993304.6 1964889 732971.9 2225222
## 138        1479099 993305.9 1964892 732972.9 2225225
## 139        1479101 993307.3 1964894 732973.9 2225228
## 140        1479103 993308.7 1964897 732974.9 2225231
## 141        1479105 993310.0 1964900 732975.9 2225234
## 142        1479107 993311.4 1964902 732976.9 2225237
air_lm_acc <- accuracy(air_lm)
hpg_lm_acc <- accuracy(hpg_lm)

Air Holt-Winters’

# HOLT-WINTERS' ---------------------------
# HOLT-WINTERS' AIR RESERVATIONS
air_hw <- hw(air_ts2, seasonal = "additive")
air_hw2 <- hw(air_ts2, seasonal = "multiplicative")

air_hw_plot <- autoplot(air_ts2, 
                              main = "Air Reservations Holt-Winters'") + 
  autolayer(fitted(air_hw), series = "Fitted Additive") +
  autolayer(fitted(air_hw2), series = "Fitted Multiplicative") +
  ylab("Air Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

HPG Holt-Winters’

# HOLT-WINTERS' HPG RESERVATIONS
hpg_hw <- hw(hpg_ts2, seasonal = "additive")
hpg_hw2 <- hw(hpg_ts2, seasonal = "multiplicative")

hpg_hw_plot <- autoplot(hpg_ts2, 
                              main = "HPG Reservations Holt-Winters'") + 
  autolayer(fitted(hpg_hw), series = "Fitted Additive") +
  autolayer(fitted(hpg_hw2), series = "Fitted Multiplicative") +
  ylab("Air Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(air_hw_plot, hpg_hw_plot, nrow = 2)

Holt-Winters’ Accuracy

air_hw_acc <- accuracy(air_hw)
air_hw2_acc <- accuracy(air_hw2)
hpg_hw_acc<- accuracy(hpg_hw)
hpg_hw2_acc <- accuracy(hpg_hw2)

Air Holt-Winters’ Damped

# HOLT-WINTERS' ---------------------------
# HOLT-WINTERS' DAMPED AIR RESERVATIONS
air_hwd <- hw(air_ts2, seasonal = "additive")
air_hwd2 <- hw(air_ts2, seasonal = "multiplicative")

air_hwd_plot <- autoplot(air_ts2, 
                              main = "Air Reservations Holt-Winters' Damped") + 
  autolayer(fitted(air_hwd), series = "Fitted Additive") +
  autolayer(fitted(air_hwd2), series = "Fitted Multiplicative") +
  ylab("Air Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

HPG Holt-Winters’ Damped

hpg_hwd <- hw(hpg_ts2, seasonal = "additive")
hpg_hwd2 <- hw(hpg_ts2, seasonal = "multiplicative")

hpg_hwd_plot <- autoplot(hpg_ts2, 
                              main = "HPG Reservations Holt-Winters'Damped") + 
  autolayer(fitted(hpg_hwd), series = "Fitted Additive") +
  autolayer(fitted(hpg_hwd2), series = "Fitted Multiplicative") +
  ylab("Air Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(air_hwd_plot
             , hpg_hwd_plot
             , nrow = 2)

Holt-Winters’ Damped Accuracy

air_hwd_acc <- accuracy(air_hwd)
air_hwd2_acc <- accuracy(air_hwd2)
hpg_hwd_acc <- accuracy(hpg_hwd)
hpg_hwd2_acc <- accuracy(hpg_hwd2)

Air ARIMA

# ARIMA MODELS --------------
# Air ARIMA
air_arima <- auto.arima(air_ts2)

checkresiduals(air_arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)(2,0,0)[2]
## Q* = 105.6, df = 3, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 7
air_arima_acc <- accuracy(air_arima)

air_arima_fc <- forecast(air_arima, h = 12)

accuracy(air_arima_fc)
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 1.866716 543.3904 350.7981 -61.12175 84.65856 0.5926106
##                     ACF1
## Training set 0.004705909
air_arima_fc_plot <- autoplot(air_arima_fc) +
  labs(title = "Air ARIMA Forecast using ARIMA(1,1,1)(2,0,0)[2]") +
  ylab("Air Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

HPG ARIMA

# ARIMA MODELS --------------
# HPG ARIMA
hpg_arima <- auto.arima(hpg_ts2)

checkresiduals(hpg_arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)(0,0,1)[2]
## Q* = 13.217, df = 3, p-value = 0.00419
## 
## Model df: 3.   Total lags used: 6
hpg_arima_acc <- accuracy(hpg_arima)

hpg_arima_fc <- forecast(hpg_arima, h = 12)

accuracy(hpg_arima_fc)
##                    ME     RMSE      MAE       MPE   MAPE     MASE         ACF1
## Training set 1139.111 6541.987 4866.718 -6.582705 36.316 0.613335 -0.006800734
hpg_arima_fc_plot <- autoplot(hpg_arima_fc) +
  labs(title = "HPG ARIMA Forecast using ARIMA(1,1,1)(0,0,1)[2]") +
  ylab("Air Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

# Combine Plots
grid.arrange(air_arima_fc_plot
             , hpg_arima_fc_plot
             , nrow = 2)

Air ETS

# ETS MODELS -------------------------
# Air ETS
air_ets <- ets(air_ts2)
air_ets_acc <- accuracy(air_ets)

checkresiduals(air_ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,A,A)
## Q* = 96.639, df = 3, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 9

HPG ETS

# HPG ETS
hpg_ets <- ets(hpg_ts2)
hpg_ets_acc <- accuracy(hpg_ets)
checkresiduals(hpg_ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,Ad,N)
## Q* = 55.749, df = 3, p-value = 4.751e-12
## 
## Model df: 5.   Total lags used: 8

Air ETS Forecast

# ETS FORECASTS ------------------------------
# Air ETS
air_ets_fc <- forecast(air_ets, h = 12)


accuracy(air_ets_fc)
##                     ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set -147.3028 658.9757 469.1776 -191.0366 205.9678 0.7925916 0.3235844
air_ets_fc_plot <- air_ets_fc %>% autoplot() +
  labs(title="Air Reservations Forecast Using ETS(M,A,A)") +
  ylab("Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

HPG ETS Forecast and Combined Plots

# ETS FORECASTS ------------------------------
# HPG ETS
hpg_ets_fc <- forecast(hpg_ets, h = 12)

accuracy(hpg_ets_fc)
##                   ME     RMSE      MAE       MPE   MAPE      MASE      ACF1
## Training set 99.5092 6794.227 5854.746 -20.73566 49.281 0.7378525 0.3200496
hpg_ets_fc_plot <- hpg_ets_fc %>% autoplot() +
  labs(title="HPG Reservations Forecast Using ETS(A,Ad,N)") +
  ylab("Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(air_ets_fc_plot
             , hpg_ets_fc_plot
             , nrow = 2)

Air Neural Net

air_nnet <- nnetar(air_ts2)
air_nnet
## Series: air_ts2 
## Model:  NNAR(22,1,12)[2] 
## Call:   nnetar(y = air_ts2)
## 
## Average of 20 networks, each of which is
## a 22-12-1 network with 289 weights
## options were - linear output units 
## 
## sigma^2 estimated as 7661
air_nnet_acc <- accuracy(air_nnet)

air_nnet_fc <- forecast(air_nnet, h = 36)
air_nnet_fc
##        Point Forecast
## 207.50      455.53373
## 208.00      511.15022
## 208.50      506.62641
## 209.00      440.16070
## 209.50      562.93916
## 210.00      691.72423
## 210.50     1097.90591
## 211.00     2031.25240
## 211.50     1849.76000
## 212.00     1082.21688
## 212.50      796.25988
## 213.00      683.99517
## 213.50     1289.33706
## 214.00     1296.71661
## 214.50     1572.91529
## 215.00     1519.85180
## 215.50      978.78545
## 216.00      520.50875
## 216.50      461.32333
## 217.00     1126.12485
## 217.50     2011.25325
## 218.00     2202.59225
## 218.50     1778.68831
## 219.00      662.41379
## 219.50      107.05203
## 220.00      137.38559
## 220.50      924.94033
## 221.00     1261.61091
## 221.50     1631.96377
## 222.00     1714.17919
## 222.50      860.47873
## 223.00      220.23803
## 223.50       23.89943
## 224.00      825.37037
## 224.50     1268.13648
## 225.00     1337.85238
accuracy(air_nnet_fc)
##                      ME     RMSE      MAE       MPE     MAPE       MASE
## Training set -0.1742974 87.52765 41.45957 -73.77621 81.08722 0.07003853
##                     ACF1
## Training set -0.01203742
air_nnet_fc %>% autoplot() +
  labs(title="Air Reservation Forecast using NNAR(21,1,11)[2]") +
  ylab("Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

HPG Neural Net

hpg_nnet <- nnetar(hpg_ts2)
hpg_nnet
## Series: hpg_ts2 
## Model:  NNAR(9,1,5)[2] 
## Call:   nnetar(y = hpg_ts2)
## 
## Average of 20 networks, each of which is
## a 9-5-1 network with 56 weights
## options were - linear output units 
## 
## sigma^2 estimated as 4771184
hpg_nnet_acc <- accuracy(hpg_nnet)
hpg_nnet_fc <- forecast(hpg_nnet, h = 36)
hpg_nnet_fc
##       Point Forecast
## 67.00      17226.193
## 67.50      13249.431
## 68.00      11397.835
## 68.50      12393.194
## 69.00      10080.696
## 69.50      16031.470
## 70.00      13862.505
## 70.50      21049.647
## 71.00      22376.994
## 71.50      12287.345
## 72.00       9438.065
## 72.50      10956.497
## 73.00      11267.199
## 73.50      11834.952
## 74.00      20634.336
## 74.50      21707.271
## 75.00      12892.186
## 75.50      12746.047
## 76.00      11857.071
## 76.50      12017.742
## 77.00      10634.955
## 77.50      11545.395
## 78.00       8503.348
## 78.50      14302.802
## 79.00      21482.694
## 79.50      14969.532
## 80.00       8956.686
## 80.50       8341.577
## 81.00      10774.967
## 81.50      13306.269
## 82.00      17325.911
## 82.50      20446.214
## 83.00      14603.830
## 83.50      12150.730
## 84.00      11369.650
## 84.50      11546.530
accuracy(hpg_nnet_fc)
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.08987451 2184.304 1429.475 -5.096843 11.75099 0.1801515
##                   ACF1
## Training set 0.0261221
hpg_nnet_fc %>% autoplot() +
  labs(title="HPG Reservation Forecast using NNAR(9,1,5)[2]") +
  ylab("Reservations") +
  theme(plot.title = element_text(hjust = 0.5))

Compare All Model Accuracies

rbind(air_lm_acc
      , hpg_lm_acc
      , air_hw_acc
      , air_hw2_acc
      , hpg_hw_acc
      , hpg_hw2_acc
      , air_hwd_acc
      , air_hwd2_acc
      , hpg_hwd_acc
      , hpg_hwd2_acc
      , air_arima_acc
      , hpg_arima_acc
      , air_ets_acc
      , hpg_ets_acc
      , air_nnet_acc
      , hpg_nnet_acc
      )
##                         ME       RMSE        MAE         MPE       MAPE
## Training set  5.455015e-14  845.10296  585.93570 -982.791825 1011.48091
## Training set  2.711636e-14 6951.25188 6000.01644  -40.641889   66.29348
## Training set -4.379600e+01  642.78592  430.71721  -66.440063   88.80221
## Training set -2.742310e+01  659.46225  415.58743  -35.135517   63.92213
## Training set -3.065214e+02 7259.44865 6122.80775  -23.173570   54.48020
## Training set -4.055501e+02 7314.79362 6208.43179  -28.895602   55.42954
## Training set -4.379600e+01  642.78592  430.71721  -66.440063   88.80221
## Training set -2.742310e+01  659.46225  415.58743  -35.135517   63.92213
## Training set -3.065214e+02 7259.44865 6122.80775  -23.173570   54.48020
## Training set -4.055501e+02 7314.79362 6208.43179  -28.895602   55.42954
## Training set  1.866716e+00  543.39036  350.79810  -61.121754   84.65856
## Training set  1.139111e+03 6541.98664 4866.71849   -6.582705   36.31600
## Training set -1.473028e+02  658.97571  469.17756 -191.036575  205.96783
## Training set  9.950920e+01 6794.22741 5854.74581  -20.735662   49.28100
## Training set -1.742974e-01   87.52765   41.45957  -73.776207   81.08722
## Training set  8.987451e-02 2184.30397 1429.47466   -5.096843   11.75099
##                    MASE         ACF1
## Training set 1.60706650  0.711057938
## Training set 1.10664471  0.350669971
## Training set 0.72761967  0.371255950
## Training set 0.70206061  0.218831401
## Training set 0.77163535  0.329198293
## Training set 0.78242623  0.360976759
## Training set 0.72761967  0.371255950
## Training set 0.70206061  0.218831401
## Training set 0.77163535  0.329198293
## Training set 0.78242623  0.360976759
## Training set 0.59261064  0.004705909
## Training set 0.61333495 -0.006800734
## Training set 0.79259155  0.323584441
## Training set 0.73785247  0.320049566
## Training set 0.07003853 -0.012037424
## Training set 0.18015153  0.026122101

Final Prediction

# Predictions
air_predict <- forecast(air_nnet, h = 36)
air_predict
##        Point Forecast
## 207.50      455.53373
## 208.00      511.15022
## 208.50      506.62641
## 209.00      440.16070
## 209.50      562.93916
## 210.00      691.72423
## 210.50     1097.90591
## 211.00     2031.25240
## 211.50     1849.76000
## 212.00     1082.21688
## 212.50      796.25988
## 213.00      683.99517
## 213.50     1289.33706
## 214.00     1296.71661
## 214.50     1572.91529
## 215.00     1519.85180
## 215.50      978.78545
## 216.00      520.50875
## 216.50      461.32333
## 217.00     1126.12485
## 217.50     2011.25325
## 218.00     2202.59225
## 218.50     1778.68831
## 219.00      662.41379
## 219.50      107.05203
## 220.00      137.38559
## 220.50      924.94033
## 221.00     1261.61091
## 221.50     1631.96377
## 222.00     1714.17919
## 222.50      860.47873
## 223.00      220.23803
## 223.50       23.89943
## 224.00      825.37037
## 224.50     1268.13648
## 225.00     1337.85238
air_nnet_fc %>% autoplot(main = "Air Reservations Predictions using NNAR(21,1,11)[2]") +
  ylab("Reservation") +
  theme(plot.title = element_text(hjust = 0.5))

unique_date <- as.Date(unique(air_agg$date))
final_prediction_output <- as.data.frame(cbind(unique_date, air_predict$x))
colnames(final_prediction_output) <- c("date", "reservations")
final_prediction_output$date <- as.Date(final_prediction_output$date)

write.csv(final_prediction_output, file = "Forecasting_Final_Kaggle_Pollastro.csv", row.names = FALSE)