## Reading in the data from Kaggle
## Source: https://www.kaggle.com/c/recruit-restaurant-visitor-forecasting/data
## Importing the data into R Markdown
library(readr)
## Air Visit data
air=read_csv("C:/Users/bryce_anderson/Desktop/Boston College/Predictive Analytics and Forecasting/Week 3 (PA&F)/Midterm files/Imported data/air_visit_data.csv")
## Date info data
holiday <- read_csv("C:/Users/bryce_anderson/Desktop/Boston College/Predictive Analytics and Forecasting/Week 3 (PA&F)/Midterm files/Imported data/date_info.csv")
## Air Reservation data
reserve <- read_csv("C:/Users/bryce_anderson/Desktop/Boston College/Predictive Analytics and Forecasting/Week 3 (PA&F)/Midterm files/Imported data/air_reserve.csv")
HPGreserve <- read_csv("C:/Users/bryce_anderson/Desktop/Boston College/Predictive Analytics and Forecasting/Week 3 (PA&F)/Midterm files/Imported data/hpg_reserve.csv")
## Sample Submission data
sample=read_csv("C:/Users/bryce_anderson/Desktop/Boston College/Predictive Analytics and Forecasting/Week 3 (PA&F)/Midterm files/sample_submission.csv")
library(readr)
library(ggplot2)
library(plyr) ## Used for count function
library(dplyr) ## Allows for pipes %>%
library(magrittr)
library(lubridate) ## Date manipulation
library(psych)
library(forecast)
About the Imported Files
Air - air_visit_data.csv - The main training data set used. Contains historical visitor counts for each air_store_id.
Holiday - date_info.csv - Contains the calendar date, day of the week, and whether the day was a Japanese holiday (binary)
Reserve - air_reserve.csv - Contains the reservations made through the Air system
Sample- sample_submission.csv - Acts as the test set for our data
summary(air)
## air_store_id visit_date visitors
## Length:252108 Min. :2016-01-01 Min. : 1.00
## Class :character 1st Qu.:2016-07-23 1st Qu.: 9.00
## Mode :character Median :2016-10-23 Median : 17.00
## Mean :2016-10-12 Mean : 20.97
## 3rd Qu.:2017-01-24 3rd Qu.: 29.00
## Max. :2017-04-22 Max. :877.00
summary(holiday) ## Binary flag signifying holidays
## calendar_date day_of_week holiday_flg
## Min. :2016-01-01 Length:517 Min. :0.0000
## 1st Qu.:2016-05-09 Class :character 1st Qu.:0.0000
## Median :2016-09-15 Mode :character Median :0.0000
## Mean :2016-09-15 Mean :0.0677
## 3rd Qu.:2017-01-22 3rd Qu.:0.0000
## Max. :2017-05-31 Max. :1.0000
summary(reserve)
## air_store_id visit_datetime
## Length:92378 Min. :2016-01-01 19:00:00
## Class :character 1st Qu.:2016-11-15 19:00:00
## Mode :character Median :2017-01-05 18:00:00
## Mean :2016-12-05 08:18:58
## 3rd Qu.:2017-03-03 19:00:00
## Max. :2017-05-31 21:00:00
## reserve_datetime reserve_visitors
## Min. :2016-01-01 01:00:00 Min. : 1.000
## 1st Qu.:2016-11-07 17:00:00 1st Qu.: 2.000
## Median :2016-12-27 22:00:00 Median : 3.000
## Mean :2016-11-27 01:13:07 Mean : 4.482
## 3rd Qu.:2017-02-26 18:00:00 3rd Qu.: 5.000
## Max. :2017-04-22 23:00:00 Max. :100.000
summary(HPGreserve)
## hpg_store_id visit_datetime
## Length:2000320 Min. :2016-01-01 11:00:00
## Class :character 1st Qu.:2016-06-26 19:00:00
## Mode :character Median :2016-11-19 20:00:00
## Mean :2016-10-15 06:55:20
## 3rd Qu.:2017-02-03 19:00:00
## Max. :2017-05-31 23:00:00
## reserve_datetime reserve_visitors
## Min. :2016-01-01 00:00:00 Min. : 1.000
## 1st Qu.:2016-06-21 12:00:00 1st Qu.: 2.000
## Median :2016-11-10 20:00:00 Median : 3.000
## Mean :2016-10-07 19:57:59 Mean : 5.074
## 3rd Qu.:2017-01-27 13:00:00 3rd Qu.: 6.000
## Max. :2017-04-22 23:00:00 Max. :100.000
Notes and Points of Interest in the data
Training Data - Spans from 2016 until April 2017. Omits days when the restuarants were closed.
Test set - Covers last week of April (4/23/17) through May 2017
Golden Week - The Japanese Holiday week is included in the Test set (April 29 - May 7). Restuarants are closed on these days and therefore there were no visitors.
## Adding Month and Weekday components to Air data set
airNew <- air %>%
mutate(month = month(visit_date, label = TRUE)) %>%
mutate(wday = wday(visit_date, label = TRUE))
## The inclusion of Year
airNew2 <- air %>%
mutate(year = year(visit_date)) %>%
mutate(month = month(visit_date, label = TRUE)) %>%
mutate(wday = wday(visit_date, label = TRUE))
##airFull <- cbind(airNew, holiday$holiday_flg)
## Descriptive statistics of the data sets
describe(airNew)
## vars n mean sd median trimmed mad min max range
## air_store_id* 1 252108 NaN NA NA NaN NA Inf -Inf -Inf
## visit_date 2 252108 NaN NA NA NaN NA Inf -Inf -Inf
## visitors 3 252108 20.97 16.76 17 18.82 13.34 1 877 876
## month* 4 252108 6.21 3.68 7 6.15 5.93 1 12 11
## wday* 5 252108 4.19 1.97 4 4.23 2.97 1 7 6
## skew kurtosis se
## air_store_id* NA NA NA
## visit_date NA NA NA
## visitors 3.31 74.26 0.03
## month* 0.09 -1.41 0.01
## wday* -0.12 -1.20 0.00
describe(holiday)
## vars n mean sd median trimmed mad min max range skew
## calendar_date 1 517 NaN NA NA NaN NA Inf -Inf -Inf NA
## day_of_week* 2 517 NaN NA NA NaN NA Inf -Inf -Inf NA
## holiday_flg 3 517 0.07 0.25 0 0 0 0 1 1 3.43
## kurtosis se
## calendar_date NA NA
## day_of_week* NA NA
## holiday_flg 9.79 0.01
## Visualizing the data sets
## Plotting Visitors in Air data set
plot(visitors ~ visit_date, data=air, main="Air Visitors over Time", xlab="Visit Date", ylab="Number of Visitors", col="dodgerblue")
plot(visitors ~ month, data=airNew, main="Air Visitors over Time", xlab="Month", ylab="Number of Visitors", col="dodgerblue")
plot(visitors ~ wday, data=airNew, main="Air Visitors over Time", xlab="Day of the Week", ylab="Number of Visitors", col="dodgerblue")
## Ploting reservations in Air System
plot(reserve_visitors ~ reserve_datetime, data=reserve, main="Air Reservations over Time", ylab="Number of Reservation Visitors", xlab="Reservation Date", col="orange")
## Number of Holidays in the Training data set
holiday %>% group_by(holiday_flg) %>% tally()
## # A tibble: 2 x 2
## holiday_flg n
## <dbl> <int>
## 1 0 482
## 2 1 35
holidayCountPercentage <- 35/482
holidayCountPercentage ## 7% of Training set days are holidays
## [1] 0.07261411
## Visualizing Visitors per Day of the Week
WeekDayCount <- air %>%
mutate(wday = wday(visit_date, label = TRUE)) %>%
group_by(wday) %>%
summarise(visits = mean(visitors)) %>%
ggplot(aes(wday, visits, fill = wday)) +
geom_col(colour="dodgerblue") + labs(x = "Day of the week", y = "Mean visitor Count", main="Mean Air Visitor Count by Day of Week")
WeekDayCount
## Visualizing Visitors per Month
MonthCount <- air %>%
mutate(month = month(visit_date, label = TRUE)) %>%
group_by(month) %>%
summarise(visits = mean(visitors)) %>%
ggplot(aes(month, visits, fill = month)) +
geom_col(colour="dodgerblue") +
labs(x = "Month", y = "Mean visitor Count", main="Mean Air Visitor Count by Day of Week")
MonthCount
## Model 1 and 2
model1 <- lm(visitors ~ month + wday, data=airNew)
model1
##
## Call:
## lm(formula = visitors ~ month + wday, data = airNew)
##
## Coefficients:
## (Intercept) month.L month.Q month.C month^4
## 20.94728 -0.09444 0.09043 3.36297 0.82868
## month^5 month^6 month^7 month^8 month^9
## 0.27668 0.67765 0.13739 0.91122 -0.42139
## month^10 month^11 wday.L wday.Q wday.C
## 0.41569 -0.33283 3.79335 7.02989 -1.87201
## wday^4 wday^5 wday^6
## 1.67142 -1.64797 -0.89596
summary(model1)
##
## Call:
## lm(formula = visitors ~ month + wday, data = airNew)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.45 -11.56 -3.66 8.03 856.24
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.94728 0.03568 587.130 < 2e-16 ***
## month.L -0.09444 0.10793 -0.875 0.38156
## month.Q 0.09043 0.12348 0.732 0.46394
## month.C 3.36297 0.11494 29.257 < 2e-16 ***
## month^4 0.82868 0.11739 7.059 1.68e-12 ***
## month^5 0.27668 0.12155 2.276 0.02283 *
## month^6 0.67765 0.11663 5.810 6.25e-09 ***
## month^7 0.13739 0.12233 1.123 0.26138
## month^8 0.91122 0.12166 7.490 6.93e-14 ***
## month^9 -0.42139 0.12068 -3.492 0.00048 ***
## month^10 0.41569 0.13553 3.067 0.00216 **
## month^11 -0.33283 0.14770 -2.253 0.02423 *
## wday.L 3.79335 0.08822 42.997 < 2e-16 ***
## wday.Q 7.02989 0.08768 80.180 < 2e-16 ***
## wday.C -1.87201 0.08720 -21.469 < 2e-16 ***
## wday^4 1.67142 0.08692 19.230 < 2e-16 ***
## wday^5 -1.64797 0.08609 -19.143 < 2e-16 ***
## wday^6 -0.89596 0.08554 -10.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.4 on 252090 degrees of freedom
## Multiple R-squared: 0.04202, Adjusted R-squared: 0.04195
## F-statistic: 650.4 on 17 and 252090 DF, p-value: < 2.2e-16
model2 <- lm(visitors ~ year + month + wday, data=airNew2)
model2
##
## Call:
## lm(formula = visitors ~ year + month + wday, data = airNew2)
##
## Coefficients:
## (Intercept) year month.L month.Q month.C
## 1957.81684 -0.96064 -1.01556 0.50815 3.49616
## month^4 month^5 month^6 month^7 month^8
## 0.52691 0.37777 0.82329 -0.04514 0.94028
## month^9 month^10 month^11 wday.L wday.Q
## -0.29557 0.25943 -0.24556 3.79258 7.03365
## wday.C wday^4 wday^5 wday^6
## -1.87325 1.67010 -1.64602 -0.89596
summary(model2)
##
## Call:
## lm(formula = visitors ~ year + month + wday, data = airNew2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.55 -11.50 -3.59 7.98 856.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1957.81684 221.84691 8.825 < 2e-16 ***
## year -0.96064 0.11003 -8.731 < 2e-16 ***
## month.L -1.01556 0.15092 -6.729 1.71e-11 ***
## month.Q 0.50815 0.13241 3.838 0.000124 ***
## month.C 3.49616 0.11594 30.156 < 2e-16 ***
## month^4 0.52691 0.12236 4.306 1.66e-05 ***
## month^5 0.37777 0.12208 3.094 0.001973 **
## month^6 0.82329 0.11780 6.989 2.78e-12 ***
## month^7 -0.04514 0.12409 -0.364 0.716014
## month^8 0.94028 0.12169 7.727 1.11e-14 ***
## month^9 -0.29557 0.12151 -2.432 0.015002 *
## month^10 0.25943 0.13669 1.898 0.057698 .
## month^11 -0.24556 0.14801 -1.659 0.097113 .
## wday.L 3.79258 0.08821 42.994 < 2e-16 ***
## wday.Q 7.03365 0.08766 80.234 < 2e-16 ***
## wday.C -1.87325 0.08718 -21.486 < 2e-16 ***
## wday^4 1.67010 0.08690 19.218 < 2e-16 ***
## wday^5 -1.64602 0.08608 -19.123 < 2e-16 ***
## wday^6 -0.89596 0.08553 -10.476 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.4 on 252089 degrees of freedom
## Multiple R-squared: 0.04231, Adjusted R-squared: 0.04224
## F-statistic: 618.7 on 18 and 252089 DF, p-value: < 2.2e-16
## Model 3 and 4
## Create Time series of airNew
airNewTS <- ts(airNew)
model3 <- tslm(visitors ~ month + wday, data=airNewTS)
model3
##
## Call:
## tslm(formula = visitors ~ month + wday, data = airNewTS)
##
## Coefficients:
## (Intercept) month wday
## 17.47673 -0.00231 0.83865
summary(model3)
##
## Call:
## tslm(formula = visitors ~ month + wday, data = airNewTS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.34 -11.99 -3.97 8.01 856.18
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.476729 0.096070 181.917 <2e-16 ***
## month -0.002310 0.009024 -0.256 0.798
## wday 0.838647 0.016874 49.700 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.68 on 252105 degrees of freedom
## Multiple R-squared: 0.009703, Adjusted R-squared: 0.009695
## F-statistic: 1235 on 2 and 252105 DF, p-value: < 2.2e-16
## Inclusion of Year component
airNew2TS <- ts(airNew2)
airNew2TS <- ts(airNew2)
model4 <- tslm(visitors ~ year + month + wday, data=airNew2TS)
model4
##
## Call:
## tslm(formula = visitors ~ year + month + wday, data = airNew2TS)
##
## Coefficients:
## (Intercept) year month wday
## 481.60174 -0.23013 -0.02201 0.83893
summary(model4)
##
## Call:
## tslm(formula = visitors ~ year + month + wday, data = airNew2TS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.52 -12.05 -3.93 7.95 856.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 481.60174 198.59350 2.425 0.0153 *
## year -0.23013 0.09847 -2.337 0.0194 *
## month -0.02201 0.01235 -1.782 0.0747 .
## wday 0.83893 0.01687 49.716 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.68 on 252104 degrees of freedom
## Multiple R-squared: 0.009725, Adjusted R-squared: 0.009713
## F-statistic: 825.2 on 3 and 252104 DF, p-value: < 2.2e-16
sample$air_store_id=substr(sample$id, 1,20)
model2 <- lm(visitors ~ year + month + wday, data=airNew2)
myagg=aggregate(visitors~air_store_id, FUN=mean, data=air)
mymerge=merge(sample, myagg, by="air_store_id", all.x=TRUE)
mymerge$visitors.x=NULL
mymerge$visitors=mymerge$visitors.y
mymerge$visitors.y=NULL
mymerge$air_store_id=NULL
## Produces .csv file for submission
write.csv(mymerge, "KaggleSubmission.csv", row.names=FALSE)