Reading in the datasets from the Recruit Restaurant Visitor Forecasting Project

## Reading in the data from Kaggle
## Source: https://www.kaggle.com/c/recruit-restaurant-visitor-forecasting/data

## Importing the data into R Markdown
library(readr)

## Air Visit data
air=read_csv("C:/Users/bryce_anderson/Desktop/Boston College/Predictive Analytics and Forecasting/Week 3 (PA&F)/Midterm files/Imported data/air_visit_data.csv")

## Date info data
holiday <- read_csv("C:/Users/bryce_anderson/Desktop/Boston College/Predictive Analytics and Forecasting/Week 3 (PA&F)/Midterm files/Imported data/date_info.csv")

## Air Reservation data
reserve <- read_csv("C:/Users/bryce_anderson/Desktop/Boston College/Predictive Analytics and Forecasting/Week 3 (PA&F)/Midterm files/Imported data/air_reserve.csv")

HPGreserve <- read_csv("C:/Users/bryce_anderson/Desktop/Boston College/Predictive Analytics and Forecasting/Week 3 (PA&F)/Midterm files/Imported data/hpg_reserve.csv")

## Sample Submission data
sample=read_csv("C:/Users/bryce_anderson/Desktop/Boston College/Predictive Analytics and Forecasting/Week 3 (PA&F)/Midterm files/sample_submission.csv")

R Packages Utilized

library(readr)
library(ggplot2)
library(plyr) ## Used for count function
library(dplyr) ## Allows for pipes %>%
library(magrittr)
library(lubridate) ## Date manipulation
library(psych)
library(forecast)

About the Imported Files

Air - air_visit_data.csv - The main training data set used. Contains historical visitor counts for each air_store_id.

Holiday - date_info.csv - Contains the calendar date, day of the week, and whether the day was a Japanese holiday (binary)

Reserve - air_reserve.csv - Contains the reservations made through the Air system

Sample- sample_submission.csv - Acts as the test set for our data

Overview of the imported datasets

summary(air)
##  air_store_id         visit_date            visitors     
##  Length:252108      Min.   :2016-01-01   Min.   :  1.00  
##  Class :character   1st Qu.:2016-07-23   1st Qu.:  9.00  
##  Mode  :character   Median :2016-10-23   Median : 17.00  
##                     Mean   :2016-10-12   Mean   : 20.97  
##                     3rd Qu.:2017-01-24   3rd Qu.: 29.00  
##                     Max.   :2017-04-22   Max.   :877.00
summary(holiday) ## Binary flag signifying holidays
##  calendar_date        day_of_week         holiday_flg    
##  Min.   :2016-01-01   Length:517         Min.   :0.0000  
##  1st Qu.:2016-05-09   Class :character   1st Qu.:0.0000  
##  Median :2016-09-15   Mode  :character   Median :0.0000  
##  Mean   :2016-09-15                      Mean   :0.0677  
##  3rd Qu.:2017-01-22                      3rd Qu.:0.0000  
##  Max.   :2017-05-31                      Max.   :1.0000
summary(reserve)
##  air_store_id       visit_datetime               
##  Length:92378       Min.   :2016-01-01 19:00:00  
##  Class :character   1st Qu.:2016-11-15 19:00:00  
##  Mode  :character   Median :2017-01-05 18:00:00  
##                     Mean   :2016-12-05 08:18:58  
##                     3rd Qu.:2017-03-03 19:00:00  
##                     Max.   :2017-05-31 21:00:00  
##  reserve_datetime              reserve_visitors 
##  Min.   :2016-01-01 01:00:00   Min.   :  1.000  
##  1st Qu.:2016-11-07 17:00:00   1st Qu.:  2.000  
##  Median :2016-12-27 22:00:00   Median :  3.000  
##  Mean   :2016-11-27 01:13:07   Mean   :  4.482  
##  3rd Qu.:2017-02-26 18:00:00   3rd Qu.:  5.000  
##  Max.   :2017-04-22 23:00:00   Max.   :100.000
summary(HPGreserve)
##  hpg_store_id       visit_datetime               
##  Length:2000320     Min.   :2016-01-01 11:00:00  
##  Class :character   1st Qu.:2016-06-26 19:00:00  
##  Mode  :character   Median :2016-11-19 20:00:00  
##                     Mean   :2016-10-15 06:55:20  
##                     3rd Qu.:2017-02-03 19:00:00  
##                     Max.   :2017-05-31 23:00:00  
##  reserve_datetime              reserve_visitors 
##  Min.   :2016-01-01 00:00:00   Min.   :  1.000  
##  1st Qu.:2016-06-21 12:00:00   1st Qu.:  2.000  
##  Median :2016-11-10 20:00:00   Median :  3.000  
##  Mean   :2016-10-07 19:57:59   Mean   :  5.074  
##  3rd Qu.:2017-01-27 13:00:00   3rd Qu.:  6.000  
##  Max.   :2017-04-22 23:00:00   Max.   :100.000

Notes and Points of Interest in the data

Adding Year, Month, and Day of the Week components to the Air Visitors dataset

## Adding Month and Weekday components to Air data set
airNew <-  air %>%
  mutate(month = month(visit_date, label = TRUE)) %>%
  mutate(wday = wday(visit_date, label = TRUE))

## The inclusion of Year
airNew2 <-  air %>%
  mutate(year = year(visit_date)) %>%
  mutate(month = month(visit_date, label = TRUE)) %>%
  mutate(wday = wday(visit_date, label = TRUE))

##airFull <- cbind(airNew, holiday$holiday_flg)

Descriptive statistics of the AirNew and Holiday datasets

## Descriptive statistics of the data sets
describe(airNew)
##               vars      n  mean    sd median trimmed   mad min  max range
## air_store_id*    1 252108   NaN    NA     NA     NaN    NA Inf -Inf  -Inf
## visit_date       2 252108   NaN    NA     NA     NaN    NA Inf -Inf  -Inf
## visitors         3 252108 20.97 16.76     17   18.82 13.34   1  877   876
## month*           4 252108  6.21  3.68      7    6.15  5.93   1   12    11
## wday*            5 252108  4.19  1.97      4    4.23  2.97   1    7     6
##                skew kurtosis   se
## air_store_id*    NA       NA   NA
## visit_date       NA       NA   NA
## visitors       3.31    74.26 0.03
## month*         0.09    -1.41 0.01
## wday*         -0.12    -1.20 0.00
describe(holiday)
##               vars   n mean   sd median trimmed mad min  max range skew
## calendar_date    1 517  NaN   NA     NA     NaN  NA Inf -Inf  -Inf   NA
## day_of_week*     2 517  NaN   NA     NA     NaN  NA Inf -Inf  -Inf   NA
## holiday_flg      3 517 0.07 0.25      0       0   0   0    1     1 3.43
##               kurtosis   se
## calendar_date       NA   NA
## day_of_week*        NA   NA
## holiday_flg       9.79 0.01

Plotting and Visualizing the Air Visitors dataset

## Visualizing the data sets

## Plotting Visitors in Air data set
plot(visitors ~  visit_date, data=air, main="Air Visitors over Time", xlab="Visit Date", ylab="Number of Visitors", col="dodgerblue")

plot(visitors ~  month, data=airNew, main="Air Visitors over Time", xlab="Month", ylab="Number of Visitors", col="dodgerblue")

plot(visitors ~  wday, data=airNew, main="Air Visitors over Time", xlab="Day of the Week", ylab="Number of Visitors", col="dodgerblue")

## Ploting reservations in Air System
plot(reserve_visitors ~ reserve_datetime, data=reserve, main="Air Reservations over Time", ylab="Number of Reservation Visitors", xlab="Reservation Date", col="orange")

## Number of Holidays in the Training data set
holiday %>% group_by(holiday_flg) %>% tally()
## # A tibble: 2 x 2
##   holiday_flg     n
##         <dbl> <int>
## 1           0   482
## 2           1    35
holidayCountPercentage <- 35/482
holidayCountPercentage ## 7% of Training set days are holidays
## [1] 0.07261411

Model Formulation

## Model 1 and 2
model1 <- lm(visitors ~ month + wday, data=airNew)
model1
## 
## Call:
## lm(formula = visitors ~ month + wday, data = airNew)
## 
## Coefficients:
## (Intercept)      month.L      month.Q      month.C      month^4  
##    20.94728     -0.09444      0.09043      3.36297      0.82868  
##     month^5      month^6      month^7      month^8      month^9  
##     0.27668      0.67765      0.13739      0.91122     -0.42139  
##    month^10     month^11       wday.L       wday.Q       wday.C  
##     0.41569     -0.33283      3.79335      7.02989     -1.87201  
##      wday^4       wday^5       wday^6  
##     1.67142     -1.64797     -0.89596
summary(model1)
## 
## Call:
## lm(formula = visitors ~ month + wday, data = airNew)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27.45 -11.56  -3.66   8.03 856.24 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 20.94728    0.03568 587.130  < 2e-16 ***
## month.L     -0.09444    0.10793  -0.875  0.38156    
## month.Q      0.09043    0.12348   0.732  0.46394    
## month.C      3.36297    0.11494  29.257  < 2e-16 ***
## month^4      0.82868    0.11739   7.059 1.68e-12 ***
## month^5      0.27668    0.12155   2.276  0.02283 *  
## month^6      0.67765    0.11663   5.810 6.25e-09 ***
## month^7      0.13739    0.12233   1.123  0.26138    
## month^8      0.91122    0.12166   7.490 6.93e-14 ***
## month^9     -0.42139    0.12068  -3.492  0.00048 ***
## month^10     0.41569    0.13553   3.067  0.00216 ** 
## month^11    -0.33283    0.14770  -2.253  0.02423 *  
## wday.L       3.79335    0.08822  42.997  < 2e-16 ***
## wday.Q       7.02989    0.08768  80.180  < 2e-16 ***
## wday.C      -1.87201    0.08720 -21.469  < 2e-16 ***
## wday^4       1.67142    0.08692  19.230  < 2e-16 ***
## wday^5      -1.64797    0.08609 -19.143  < 2e-16 ***
## wday^6      -0.89596    0.08554 -10.474  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.4 on 252090 degrees of freedom
## Multiple R-squared:  0.04202,    Adjusted R-squared:  0.04195 
## F-statistic: 650.4 on 17 and 252090 DF,  p-value: < 2.2e-16
model2 <- lm(visitors ~ year + month + wday, data=airNew2)
model2
## 
## Call:
## lm(formula = visitors ~ year + month + wday, data = airNew2)
## 
## Coefficients:
## (Intercept)         year      month.L      month.Q      month.C  
##  1957.81684     -0.96064     -1.01556      0.50815      3.49616  
##     month^4      month^5      month^6      month^7      month^8  
##     0.52691      0.37777      0.82329     -0.04514      0.94028  
##     month^9     month^10     month^11       wday.L       wday.Q  
##    -0.29557      0.25943     -0.24556      3.79258      7.03365  
##      wday.C       wday^4       wday^5       wday^6  
##    -1.87325      1.67010     -1.64602     -0.89596
summary(model2)
## 
## Call:
## lm(formula = visitors ~ year + month + wday, data = airNew2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27.55 -11.50  -3.59   7.98 856.50 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1957.81684  221.84691   8.825  < 2e-16 ***
## year          -0.96064    0.11003  -8.731  < 2e-16 ***
## month.L       -1.01556    0.15092  -6.729 1.71e-11 ***
## month.Q        0.50815    0.13241   3.838 0.000124 ***
## month.C        3.49616    0.11594  30.156  < 2e-16 ***
## month^4        0.52691    0.12236   4.306 1.66e-05 ***
## month^5        0.37777    0.12208   3.094 0.001973 ** 
## month^6        0.82329    0.11780   6.989 2.78e-12 ***
## month^7       -0.04514    0.12409  -0.364 0.716014    
## month^8        0.94028    0.12169   7.727 1.11e-14 ***
## month^9       -0.29557    0.12151  -2.432 0.015002 *  
## month^10       0.25943    0.13669   1.898 0.057698 .  
## month^11      -0.24556    0.14801  -1.659 0.097113 .  
## wday.L         3.79258    0.08821  42.994  < 2e-16 ***
## wday.Q         7.03365    0.08766  80.234  < 2e-16 ***
## wday.C        -1.87325    0.08718 -21.486  < 2e-16 ***
## wday^4         1.67010    0.08690  19.218  < 2e-16 ***
## wday^5        -1.64602    0.08608 -19.123  < 2e-16 ***
## wday^6        -0.89596    0.08553 -10.476  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.4 on 252089 degrees of freedom
## Multiple R-squared:  0.04231,    Adjusted R-squared:  0.04224 
## F-statistic: 618.7 on 18 and 252089 DF,  p-value: < 2.2e-16
## Model 3 and 4
## Create Time series of airNew
airNewTS <- ts(airNew)
model3 <- tslm(visitors ~ month + wday, data=airNewTS)
model3
## 
## Call:
## tslm(formula = visitors ~ month + wday, data = airNewTS)
## 
## Coefficients:
## (Intercept)        month         wday  
##    17.47673     -0.00231      0.83865
summary(model3)
## 
## Call:
## tslm(formula = visitors ~ month + wday, data = airNewTS)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -22.34 -11.99  -3.97   8.01 856.18 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 17.476729   0.096070 181.917   <2e-16 ***
## month       -0.002310   0.009024  -0.256    0.798    
## wday         0.838647   0.016874  49.700   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.68 on 252105 degrees of freedom
## Multiple R-squared:  0.009703,   Adjusted R-squared:  0.009695 
## F-statistic:  1235 on 2 and 252105 DF,  p-value: < 2.2e-16
## Inclusion of Year component
airNew2TS <- ts(airNew2)

airNew2TS <- ts(airNew2)
model4 <- tslm(visitors ~ year + month + wday, data=airNew2TS)
model4
## 
## Call:
## tslm(formula = visitors ~ year + month + wday, data = airNew2TS)
## 
## Coefficients:
## (Intercept)         year        month         wday  
##   481.60174     -0.23013     -0.02201      0.83893
summary(model4)
## 
## Call:
## tslm(formula = visitors ~ year + month + wday, data = airNew2TS)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -22.52 -12.05  -3.93   7.95 856.27 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 481.60174  198.59350   2.425   0.0153 *  
## year         -0.23013    0.09847  -2.337   0.0194 *  
## month        -0.02201    0.01235  -1.782   0.0747 .  
## wday          0.83893    0.01687  49.716   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.68 on 252104 degrees of freedom
## Multiple R-squared:  0.009725,   Adjusted R-squared:  0.009713 
## F-statistic: 825.2 on 3 and 252104 DF,  p-value: < 2.2e-16

Producing a Kaggle Submission file

sample$air_store_id=substr(sample$id, 1,20)

model2 <- lm(visitors ~ year + month + wday, data=airNew2)

myagg=aggregate(visitors~air_store_id, FUN=mean, data=air)

mymerge=merge(sample, myagg, by="air_store_id", all.x=TRUE)
mymerge$visitors.x=NULL
mymerge$visitors=mymerge$visitors.y
mymerge$visitors.y=NULL
mymerge$air_store_id=NULL
## Produces .csv file for submission
write.csv(mymerge, "KaggleSubmission.csv", row.names=FALSE)