Midterm

loading in the libraries I will use below

library(forecast)

## Warning: package 'forecast' was built under R version 4.0.4

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(ggplot2)
library(TTR)

## Warning: package 'TTR' was built under R version 4.0.4

library(stats)
library(readr)
library(fpp)

## Warning: package 'fpp' was built under R version 4.0.5

## Loading required package: fma

## Warning: package 'fma' was built under R version 4.0.5

## Loading required package: expsmooth

## Warning: package 'expsmooth' was built under R version 4.0.5

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: tseries

## Warning: package 'tseries' was built under R version 4.0.4

library(fpp2)

## Warning: package 'fpp2' was built under R version 4.0.5

## 
## Attaching package: 'fpp2'

## The following objects are masked from 'package:fpp':
## 
##     ausair, ausbeer, austa, austourists, debitcards, departures,
##     elecequip, euretail, guinearice, oil, sunspotarea, usmelec

library(seasonal)

## Warning: package 'seasonal' was built under R version 4.0.5

library(lubridate)

## Warning: package 'lubridate' was built under R version 4.0.5

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(psych)

## Warning: package 'psych' was built under R version 4.0.5

## 
## Attaching package: 'psych'

## The following object is masked from 'package:seasonal':
## 
##     outlier

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(fastDummies)

## Warning: package 'fastDummies' was built under R version 4.0.5

Here I will load in the files that I need

airvisitdata <- read.csv("C:\\Users\\wanga\\OneDrive\\Desktop\\PREDICTIVE ANALYTICS\\MIDTERM\\air_visit_data.csv")
datainfo <- read.csv("C:\\Users\\wanga\\OneDrive\\Desktop\\PREDICTIVE ANALYTICS\\MIDTERM\\date_info.csv")
airstoreinfo <- read.csv("C:\\Users\\wanga\\OneDrive\\Desktop\\PREDICTIVE ANALYTICS\\MIDTERM\\air_store_info.csv")
samplesubmission <- read.csv("C:\\Users\\wanga\\OneDrive\\Desktop\\PREDICTIVE ANALYTICS\\MIDTERM\\sample_submission.csv")

starting with simple summaries

head(airvisitdata)

##           air_store_id visit_date visitors
## 1 air_ba937bf13d40fb24 2016-01-13       25
## 2 air_ba937bf13d40fb24 2016-01-14       32
## 3 air_ba937bf13d40fb24 2016-01-15       29
## 4 air_ba937bf13d40fb24 2016-01-16       22
## 5 air_ba937bf13d40fb24 2016-01-18        6
## 6 air_ba937bf13d40fb24 2016-01-19        9

head(datainfo)

##   calendar_date day_of_week holiday_flg
## 1    2016-01-01      Friday           1
## 2    2016-01-02    Saturday           1
## 3    2016-01-03      Sunday           1
## 4    2016-01-04      Monday           0
## 5    2016-01-05     Tuesday           0
## 6    2016-01-06   Wednesday           0

head(airstoreinfo)

##           air_store_id air_genre_name                   air_area_name latitude
## 1 air_0f0cdeee6c9bf3d7 Italian/French HyÅ\215go-ken KÅ\215be-shi KumoidÅ\215ri 34.69512
## 2 air_7cc17a324ae5c7dc Italian/French HyÅ\215go-ken KÅ\215be-shi KumoidÅ\215ri 34.69512
## 3 air_fee8dcf4d619598e Italian/French HyÅ\215go-ken KÅ\215be-shi KumoidÅ\215ri 34.69512
## 4 air_a17f0778617c76e2 Italian/French HyÅ\215go-ken KÅ\215be-shi KumoidÅ\215ri 34.69512
## 5 air_83db5aff8f50478e Italian/French TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807
## 6 air_99c3eae84130c1cb Italian/French TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807
##   longitude
## 1  135.1979
## 2  135.1979
## 3  135.1979
## 4  135.1979
## 5  139.7516
## 6  139.7516

head(samplesubmission)

##                                id visitors
## 1 air_00a91d42b08b08d9_2017-04-23        0
## 2 air_00a91d42b08b08d9_2017-04-24        0
## 3 air_00a91d42b08b08d9_2017-04-25        0
## 4 air_00a91d42b08b08d9_2017-04-26        0
## 5 air_00a91d42b08b08d9_2017-04-27        0
## 6 air_00a91d42b08b08d9_2017-04-28        0

I currently dont see a decent way to use the air store info file so i will leave it out for the time being.

I want to check how many necessary results we will need for the sample submission

summary(samplesubmission)

##       id               visitors
##  Length:32019       Min.   :0  
##  Class :character   1st Qu.:0  
##  Mode  :character   Median :0  
##                     Mean   :0  
##                     3rd Qu.:0  
##                     Max.   :0

summary(airvisitdata)

##  air_store_id        visit_date           visitors     
##  Length:252108      Length:252108      Min.   :  1.00  
##  Class :character   Class :character   1st Qu.:  9.00  
##  Mode  :character   Mode  :character   Median : 17.00  
##                                        Mean   : 20.97  
##                                        3rd Qu.: 29.00  
##                                        Max.   :877.00

summary(datainfo)

##  calendar_date      day_of_week         holiday_flg    
##  Length:517         Length:517         Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :0.0000  
##                                        Mean   :0.0677  
##                                        3rd Qu.:0.0000  
##                                        Max.   :1.0000

summary(airstoreinfo)

##  air_store_id       air_genre_name     air_area_name         latitude    
##  Length:829         Length:829         Length:829         Min.   :33.21  
##  Class :character   Class :character   Class :character   1st Qu.:34.70  
##  Mode  :character   Mode  :character   Mode  :character   Median :35.66  
##                                                           Mean   :35.65  
##                                                           3rd Qu.:35.69  
##                                                           Max.   :44.02  
##    longitude    
##  Min.   :130.2  
##  1st Qu.:135.3  
##  Median :139.7  
##  Mean   :137.4  
##  3rd Qu.:139.8  
##  Max.   :144.3

Some exploratory analysis. I removed the unique() command but there are 829 different restaurants

store_obs <- 252108/829

This means that there is an average of a little less than a year of observations for each restaurant. This checks out then as the calendar_date variable only has about a year and a half of observations

airvisitdata$visit_date1 <- time(airvisitdata$visit_date)
plot(airvisitdata$visitors ~ airvisitdata$visit_date1)

This is not very helpful. Will need to fix something in order to make a plot usable. Additionally, the visit date has 252k observations. That would be like ~690 years. We will need to setup a time series to find anything useful

Lets try out the merge function. We are looking to congregate all of our needed data into one frame

summary(airvisitdata)

##  air_store_id        visit_date           visitors       visit_date1    
##  Length:252108      Length:252108      Min.   :  1.00   Min.   :     1  
##  Class :character   Class :character   1st Qu.:  9.00   1st Qu.: 63028  
##  Mode  :character   Mode  :character   Median : 17.00   Median :126055  
##                                        Mean   : 20.97   Mean   :126055  
##                                        3rd Qu.: 29.00   3rd Qu.:189081  
##                                        Max.   :877.00   Max.   :252108

summary(datainfo)

##  calendar_date      day_of_week         holiday_flg    
##  Length:517         Length:517         Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :0.0000  
##                                        Mean   :0.0677  
##                                        3rd Qu.:0.0000  
##                                        Max.   :1.0000

mergetest1 <- merge(airvisitdata, datainfo, by.x = "visit_date", by.y = "calendar_date")
head(mergetest1)

##   visit_date         air_store_id visitors visit_date1 day_of_week holiday_flg
## 1 2016-01-01 air_c31472d14e29cee8        3      184318      Friday           1
## 2 2016-01-01 air_05c325d315cc17f5       29      171315      Friday           1
## 3 2016-01-01 air_81c5dff692063446        7       65992      Friday           1
## 4 2016-01-01 air_a083834e7ffe187e       27      180079      Friday           1
## 5 2016-01-01 air_36bcf77d3382d36e       34      204402      Friday           1
## 6 2016-01-01 air_09a845d5b5944b01       56       29556      Friday           1

summary(mergetest1)

##   visit_date        air_store_id          visitors       visit_date1    
##  Length:252108      Length:252108      Min.   :  1.00   Min.   :     1  
##  Class :character   Class :character   1st Qu.:  9.00   1st Qu.: 63028  
##  Mode  :character   Mode  :character   Median : 17.00   Median :126055  
##                                        Mean   : 20.97   Mean   :126055  
##                                        3rd Qu.: 29.00   3rd Qu.:189081  
##                                        Max.   :877.00   Max.   :252108  
##  day_of_week         holiday_flg     
##  Length:252108      Min.   :0.00000  
##  Class :character   1st Qu.:0.00000  
##  Mode  :character   Median :0.00000  
##                     Mean   :0.05067  
##                     3rd Qu.:0.00000  
##                     Max.   :1.00000

We have 6 variables (would be 5 but I created an extra one for the plot which probably wont be used). But columns include visit date, store id, # visitors, day of the week and a holiday dummy variable which might be really useful later.

might be able to make some boxplots to map out relations

boxplot(mergetest1$visitors~mergetest1$day_of_week)

boxplot(mergetest1$visitors~mergetest1$day_of_week, outline = FALSE)

boxplot(mergetest1$visitors~mergetest1$holiday_flg)

boxplot(mergetest1$visitors~mergetest1$holiday_flg, outline = FALSE)

Next, lets try the aggregate function. Sample submission model got great results just using mean so we can try the same.

mergeagg <- aggregate(visitors ~ visit_date, FUN = mean, data = mergetest1)
summary(mergeagg)

##   visit_date           visitors    
##  Length:478         Min.   :13.13  
##  Class :character   1st Qu.:17.85  
##  Mode  :character   Median :20.60  
##                     Mean   :21.15  
##                     3rd Qu.:24.22  
##                     Max.   :31.27

Cool! Looks like 478 observations with a reasonable average (similar to summary(airvisitdata) statistics)

What if we tried aggregate and included store id?

mergeaggairstoreid <- aggregate(visitors ~ air_store_id, FUN = mean, data = mergetest1)
summary(mergeaggairstoreid)

##  air_store_id          visitors      
##  Length:829         Min.   :  1.188  
##  Class :character   1st Qu.: 11.812  
##  Mode  :character   Median : 19.183  
##                     Mean   : 21.204  
##                     3rd Qu.: 28.645  
##                     Max.   :115.471

For the air_store_id above, we note that there are 829 different means.

Adding air_store_id removes the mean Lets try time series plot using this newly aggregated data

tsmergetest2 <- ts(mergeagg$visitors, start = 2016, frequency = 365)
autoplot(tsmergetest2)

lets try air store id plot as well

tsmergeairstoreid <- ts(mergeaggairstoreid$visitors, frequency = 7)
autoplot(tsmergeairstoreid)

Now we might be getting somewhere. Lets try geometric mean since we were told to try that out as well

geomergeagg <- aggregate(visitors ~ visit_date, FUN = geometric.mean, data = mergetest1)
tsgeomergetest <- ts(geomergeagg$visitors, start = 2016, frequency = 365)
autoplot(tsgeomergetest)

Same trend but slightly lower mean values.

Could try decomposition? Might show us something useful but I dont know yet.

Also didnt work, maybe I need to use different frequency in my time series? Trying a weekly frequency

tsmergetest3 <- ts(mergeagg$visitors, frequency = 7)
autoplot(tsmergetest3)

additive decomposition (tsmergetest3 is weekly frequency)

amerge3 <- decompose(tsmergetest3, type = "additive")
autoplot(amerge3)

robust stl decomposition

stlmerge3 <- tsmergetest3 %>% stl(s.window = "periodic", robust = TRUE)
autoplot(stlmerge3)

Perhaps the STL model is better since the trend is a bit smoother while having essentially the same level of detail

We might begin to try linear regression just to see what we get

lm1 <- tslm(tsmergetest3 ~ trend + season)
summary(lm1)

## 
## Call:
## tslm(formula = tsmergetest3 ~ trend + season)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9732 -1.3230 -0.3687  0.9704 11.3975 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 23.6970408  0.3272741  72.407  < 2e-16 ***
## trend       -0.0016425  0.0007528  -2.182   0.0296 *  
## season2      3.2221963  0.3866256   8.334 8.66e-16 ***
## season3      0.7969265  0.3880453   2.054   0.0406 *  
## season4     -5.7605415  0.3880439 -14.845  < 2e-16 ***
## season5     -5.3658377  0.3880439 -13.828  < 2e-16 ***
## season6     -3.9593271  0.3880453 -10.203  < 2e-16 ***
## season7     -4.1449065  0.3880483 -10.681  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.271 on 470 degrees of freedom
## Multiple R-squared:  0.6739, Adjusted R-squared:  0.6691 
## F-statistic: 138.8 on 7 and 470 DF,  p-value: < 2.2e-16

checkresiduals(lm1)

## 
##  Breusch-Godfrey test for serial correlation of order up to 14
## 
## data:  Residuals from Linear regression model
## LM test = 192.87, df = 14, p-value < 2.2e-16

what happens without trend

lm1notrend <- tslm(tsmergetest3 ~ season)
summary(lm1notrend)

## 
## Call:
## tslm(formula = tsmergetest3 ~ season)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7892 -1.3604 -0.3524  0.9510 11.1848 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  23.3045     0.2745  84.906  < 2e-16 ***
## season2       3.2206     0.3882   8.297 1.13e-15 ***
## season3       0.7994     0.3896   2.052   0.0407 *  
## season4      -5.7597     0.3896 -14.784  < 2e-16 ***
## season5      -5.3667     0.3896 -13.775  < 2e-16 ***
## season6      -3.9618     0.3896 -10.169  < 2e-16 ***
## season7      -4.1490     0.3896 -10.650  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.28 on 471 degrees of freedom
## Multiple R-squared:  0.6706, Adjusted R-squared:  0.6664 
## F-statistic: 159.8 on 6 and 471 DF,  p-value: < 2.2e-16

lets try forecasting

fclm1 <- rep(0, 35)
tsfclm1 <- ts(fclm1, frequency = 7)
mergeforecast <- forecast(lm1, newdata = tsfclm1)

## Warning in forecast.lm(lm1, newdata = tsfclm1): newdata column names not
## specified, defaulting to first variable required.

autoplot(mergeforecast) + xlim(60, 80)

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

## Warning: Removed 413 row(s) containing missing values (geom_path).

summary(mergeforecast)

## 
## Forecast method: Linear regression model
## 
## Model Information:
## 
## Call:
## tslm(formula = tsmergetest3 ~ trend + season)
## 
## Coefficients:
## (Intercept)        trend      season2      season3      season4      season5  
##   23.697041    -0.001642     3.222196     0.796926    -5.760541    -5.365838  
##     season6      season7  
##   -3.959327    -4.144906  
## 
## 
## Error measures:
##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -1.677256e-16 2.251823 1.616953 -1.126531 7.670276 0.8918741
##                   ACF1
## Training set 0.5585678
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 69.28571       23.70722 20.76223 26.65220 19.19796 28.21647
## 69.42857       17.14810 14.20312 20.09309 12.63885 21.65736
## 69.57143       17.54117 14.59618 20.48615 13.03191 22.05042
## 69.71429       18.94603 16.00105 21.89102 14.43678 23.45529
## 69.85714       18.75881 15.81382 21.70380 14.24956 23.26806
## 70.00000       22.90208 19.95713 25.84703 18.39288 27.41127
## 70.14286       26.12263 23.17768 29.06758 21.61344 30.63182
## 70.28571       23.69572 20.75019 26.64125 19.18563 28.20580
## 70.42857       17.13661 14.19108 20.08214 12.62652 21.64669
## 70.57143       17.52967 14.58414 20.47520 13.01958 22.03975
## 70.71429       18.93454 15.98901 21.88007 14.42445 23.44462
## 70.85714       18.74731 15.80178 21.69285 14.23723 23.25740
## 71.00000       22.89058 19.94508 25.83608 18.38054 27.40062
## 71.14286       26.11113 23.16563 29.05663 21.60109 30.62117
## 71.28571       23.68422 20.73813 26.63031 19.17328 28.19516
## 71.42857       17.12511 14.17902 20.07120 12.61417 21.63605
## 71.57143       17.51817 14.57208 20.46426 13.00723 22.02911
## 71.71429       18.92304 15.97695 21.86913 14.41210 23.43398
## 71.85714       18.73582 15.78973 21.68191 14.22488 23.24676
## 72.00000       22.87908 19.93301 25.82515 18.36818 27.38999
## 72.14286       26.09964 23.15357 29.04570 21.58873 30.61054
## 72.28571       23.67272 20.72606 26.61939 19.16090 28.18454
## 72.42857       17.11361 14.16695 20.06028 12.60179 21.62543
## 72.57143       17.50667 14.56001 20.45334 12.99485 22.01849
## 72.71429       18.91154 15.96488 21.85821 14.39972 23.42336
## 72.85714       18.72432 15.77766 21.67098 14.21250 23.23614
## 73.00000       22.86758 19.92093 25.81423 18.35579 27.37938
## 73.14286       26.08814 23.14149 29.03479 21.57634 30.59993
## 73.28571       23.66123 20.71397 26.60848 19.14850 28.17395
## 73.42857       17.10211 14.15486 20.04937 12.58939 21.61484
## 73.57143       17.49518 14.54792 20.44243 12.98245 22.00790
## 73.71429       18.90004 15.95279 21.84730 14.38732 23.41277
## 73.85714       18.71282 15.76557 21.66008 14.20010 23.22554
## 74.00000       22.85609 19.90884 25.80333 18.34337 27.36880
## 74.14286       26.07664 23.12939 29.02389 21.56393 30.58935

Looks alright, it essentially is able to capture our weekly trend (less restaurant goers during start, more at the end)

Lets try some other forecasting methods?

stlfdrift <- stlf(tsmergetest3, s.window = "periodic", robust = TRUE, method = "rwdrift")
autoplot(stlfdrift)

autoplot(stlfdrift) + xlim(60, 80)

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

## Warning: Removed 413 row(s) containing missing values (geom_path).

summary(stlfdrift)

## 
## Forecast method: STL +  Random walk with drift
## 
## Model Information:
## Call: rwf(y = x, h = h, drift = TRUE, level = level) 
## 
## Drift: 0.005  (se 0.1001)
## Residual sd: 2.1857 
## 
## Error measures:
##                        ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 5.697621e-16 2.183425 1.414258 -0.518423 6.802913 0.7800721
##                    ACF1
## Training set -0.1110387
## 
## Forecasts:
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## 69.28571       24.11090 21.309787 26.91201 19.826969 28.39482
## 69.42857       17.03421 13.068698 20.99973 10.969480 23.09895
## 69.57143       18.10754 13.245719 22.96937 10.672025 25.54306
## 69.71429       19.31955 13.699746 24.93936 10.724799 27.91431
## 69.85714       19.19199 12.902310 25.48167  9.572756 28.81122
## 70.00000       23.93825 17.041098 30.83541 13.389965 34.48654
## 70.14286       27.48336 20.025852 34.94086 16.078088 38.88862
## 70.28571       24.14593 16.165276 32.12659 11.940573 36.35129
## 70.42857       17.06925  8.595746 25.54275  4.110145 30.02835
## 70.57143       18.14258  9.201519 27.08364  4.468407 31.81675
## 70.71429       19.35459  9.967484 28.74169  4.998250 33.71093
## 70.85714       19.22702  9.412451 29.04160  4.216930 34.23712
## 71.00000       23.97329 13.747496 34.19908  8.334289 39.61229
## 71.14286       27.51839 16.895739 38.14104 11.272447 43.76433

STLF drift method with weekly frequency time series data provides similar results. So far tsmergetest3 has been the most effective

here is a naive model

stlfnaive <- stlf(tsmergetest3, s.window = "periodic", robust = TRUE, method = "naive")
autoplot(stlfnaive)

autoplot(stlfnaive) + xlim(60, 80)

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

## Warning: Removed 413 row(s) containing missing values (geom_path).

summary(stlfnaive)

## 
## Forecast method: STL +  Random walk
## 
## Model Information:
## Call: rwf(y = x, h = h, drift = FALSE, level = level) 
## 
## Residual sd: 2.1834 
## 
## Error measures:
##                       ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 0.005005011 2.183431 1.414457 -0.4939261 6.802882 0.7801821
##                    ACF1
## Training set -0.1110387
## 
## Forecasts:
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## 69.28571       24.10589 21.307713 26.90407 19.826446 28.38534
## 69.42857       17.02420 13.066982 20.98143 10.972155 23.07625
## 69.57143       18.09253 13.245940 22.93912 10.680312 25.50475
## 69.71429       19.29953 13.703176 24.89589 10.740643 27.85842
## 69.85714       19.16696 12.910045 25.42388  9.597832 28.73609
## 70.00000       23.90822 17.054113 30.76233 13.425766 34.39068
## 70.14286       27.44832 20.045035 34.85161 16.125972 38.77067
## 70.28571       24.10589 16.191446 32.02034 12.001792 36.20999
## 70.42857       17.02420  8.629668 25.41874  4.185868 29.86254
## 70.57143       18.09253  9.243910 26.94115  4.559734 31.62532
## 70.71429       19.29953 10.019024 28.58004  5.106219 33.49285
## 70.85714       19.16696  9.473787 28.86014  4.342530 33.99140
## 71.00000       23.90822 13.819246 33.99720  8.478464 39.33798
## 71.14286       27.44832 16.978493 37.91815 11.436102 43.46054

The naive model provides almost the exact same results

Now, maybe we can make a better linear model? I will try to figure out some way to incorporate the holiday flag variable. Initial thoughts is that it would have a positive coefficient (ie. people go out to eat during holidays)

summary(mergeagg)

##   visit_date           visitors    
##  Length:478         Min.   :13.13  
##  Class :character   1st Qu.:17.85  
##  Mode  :character   Median :20.60  
##                     Mean   :21.15  
##                     3rd Qu.:24.22  
##                     Max.   :31.27

Just a quick recap above for our aggregate data frame. We have the visit date and the average number of visitors per day

Here, we begin to reformat the visit date info and turn it into a more complex time series

mergetest1$visit_date <-as.Date(mergetest1$visit_date, format=c('%Y-%m-%d'))
mergetest1$Month <- as.factor(month(mergetest1$visit_date))
mergetest1$Month <- as.numeric(as.factor(mergetest1$Month))
mergetest1$visit_date1 = NULL
head(mergetest1)

##   visit_date         air_store_id visitors day_of_week holiday_flg Month
## 1 2016-01-01 air_c31472d14e29cee8        3      Friday           1     1
## 2 2016-01-01 air_05c325d315cc17f5       29      Friday           1     1
## 3 2016-01-01 air_81c5dff692063446        7      Friday           1     1
## 4 2016-01-01 air_a083834e7ffe187e       27      Friday           1     1
## 5 2016-01-01 air_36bcf77d3382d36e       34      Friday           1     1
## 6 2016-01-01 air_09a845d5b5944b01       56      Friday           1     1

Above, we now have turned visit_date into time and added a monthly flag from 1 to 12

creating dummy variables for month

mergetest1 <- dummy_cols(mergetest1, select_columns = "Month")
head(mergetest1)

##   visit_date         air_store_id visitors day_of_week holiday_flg Month
## 1 2016-01-01 air_c31472d14e29cee8        3      Friday           1     1
## 2 2016-01-01 air_05c325d315cc17f5       29      Friday           1     1
## 3 2016-01-01 air_81c5dff692063446        7      Friday           1     1
## 4 2016-01-01 air_a083834e7ffe187e       27      Friday           1     1
## 5 2016-01-01 air_36bcf77d3382d36e       34      Friday           1     1
## 6 2016-01-01 air_09a845d5b5944b01       56      Friday           1     1
##   Month_1 Month_2 Month_3 Month_4 Month_5 Month_6 Month_7 Month_8 Month_9
## 1       1       0       0       0       0       0       0       0       0
## 2       1       0       0       0       0       0       0       0       0
## 3       1       0       0       0       0       0       0       0       0
## 4       1       0       0       0       0       0       0       0       0
## 5       1       0       0       0       0       0       0       0       0
## 6       1       0       0       0       0       0       0       0       0
##   Month_10 Month_11 Month_12
## 1        0        0        0
## 2        0        0        0
## 3        0        0        0
## 4        0        0        0
## 5        0        0        0
## 6        0        0        0

summary(mergetest1)

##    visit_date         air_store_id          visitors      day_of_week       
##  Min.   :2016-01-01   Length:252108      Min.   :  1.00   Length:252108     
##  1st Qu.:2016-07-23   Class :character   1st Qu.:  9.00   Class :character  
##  Median :2016-10-23   Mode  :character   Median : 17.00   Mode  :character  
##  Mean   :2016-10-12                      Mean   : 20.97                     
##  3rd Qu.:2017-01-24                      3rd Qu.: 29.00                     
##  Max.   :2017-04-22                      Max.   :877.00                     
##   holiday_flg          Month           Month_1          Month_2      
##  Min.   :0.00000   Min.   : 1.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.: 3.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median : 7.000   Median :0.0000   Median :0.0000  
##  Mean   :0.05067   Mean   : 6.208   Mean   :0.1072   Mean   :0.1087  
##  3rd Qu.:0.00000   3rd Qu.:10.000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :12.000   Max.   :1.0000   Max.   :1.0000  
##     Month_3          Month_4           Month_5           Month_6       
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.1213   Mean   :0.09464   Mean   :0.03237   Mean   :0.03271  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##     Month_7           Month_8           Month_9           Month_10      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.08499   Mean   :0.08201   Mean   :0.08262   Mean   :0.08515  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##     Month_11        Month_12      
##  Min.   :0.000   Min.   :0.00000  
##  1st Qu.:0.000   1st Qu.:0.00000  
##  Median :0.000   Median :0.00000  
##  Mean   :0.083   Mean   :0.08534  
##  3rd Qu.:0.000   3rd Qu.:0.00000  
##  Max.   :1.000   Max.   :1.00000

Now, above we have added dummy variables which will help us create more complex linear models

So lets try another lm now with the new dummy variables

lm2 <- lm(visitors~visit_date+holiday_flg+Month_2+
          Month_3+Month_4+Month_5+Month_6+Month_7+
          Month_8+Month_9+Month_10+Month_11+Month_12, data = mergetest1)
summary(lm2)

## 
## Call:
## lm(formula = visitors ~ visit_date + holiday_flg + Month_2 + 
##     Month_3 + Month_4 + Month_5 + Month_6 + Month_7 + Month_8 + 
##     Month_9 + Month_10 + Month_11 + Month_12, data = mergetest1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24.83 -12.13  -3.87   8.12 854.89 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 55.7366770  5.2403515  10.636  < 2e-16 ***
## visit_date  -0.0021031  0.0003067  -6.858 7.01e-12 ***
## holiday_flg  2.9761154  0.1534097  19.400  < 2e-16 ***
## Month_2      0.6162385  0.1434554   4.296 1.74e-05 ***
## Month_3      2.6175421  0.1406553  18.610  < 2e-16 ***
## Month_4      2.1017658  0.1497246  14.038  < 2e-16 ***
## Month_5      1.6297222  0.2158005   7.552 4.30e-14 ***
## Month_6      1.2664427  0.2133947   5.935 2.95e-09 ***
## Month_7      1.2070385  0.1551855   7.778 7.39e-15 ***
## Month_8     -0.5097829  0.1554993  -3.278  0.00104 ** 
## Month_9     -0.1255054  0.1542208  -0.814  0.41576    
## Month_10     0.4481443  0.1528223   2.932  0.00336 ** 
## Month_11    -0.1709310  0.1542427  -1.108  0.26778    
## Month_12     3.1440027  0.1541984  20.389  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.71 on 252094 degrees of freedom
## Multiple R-squared:  0.006202,   Adjusted R-squared:  0.006151 
## F-statistic:   121 on 13 and 252094 DF,  p-value: < 2.2e-16

Incredibly low adj.R squared. Normal linear model doesnt do much for us. Its almost necessary that we use the tslm() function in order to use the season + trend predictors

need to figure out how to get tslm() model working for mergetest1 data to use trend and seasonal. We have used tsmergetest1 in order to compute our time series using the average visits of all stores combined

maybe i need to create new data frame with useful variables. Here we convert visit date info into actual time data

mergeagg$visit_date <-as.Date(mergeagg$visit_date, format=c('%Y-%m-%d'))
mergeagg$Month <- as.factor(month(mergeagg$visit_date))
mergeagg$Month <- as.numeric(as.factor(mergeagg$Month))
mergeagg <- dummy_cols(mergeagg, select_columns = "Month")
head(mergeagg)

##   visit_date visitors Month Month_1 Month_2 Month_3 Month_4 Month_5 Month_6
## 1 2016-01-01 21.52083     1       1       0       0       0       0       0
## 2 2016-01-02 28.00000     1       1       0       0       0       0       0
## 3 2016-01-03 29.23457     1       1       0       0       0       0       0
## 4 2016-01-04 21.18471     1       1       0       0       0       0       0
## 5 2016-01-05 17.00000     1       1       0       0       0       0       0
## 6 2016-01-06 16.61600     1       1       0       0       0       0       0
##   Month_7 Month_8 Month_9 Month_10 Month_11 Month_12
## 1       0       0       0        0        0        0
## 2       0       0       0        0        0        0
## 3       0       0       0        0        0        0
## 4       0       0       0        0        0        0
## 5       0       0       0        0        0        0
## 6       0       0       0        0        0        0

summary(mergeagg)

##    visit_date            visitors         Month           Month_1      
##  Min.   :2016-01-01   Min.   :13.13   Min.   : 1.000   Min.   :0.0000  
##  1st Qu.:2016-04-29   1st Qu.:17.85   1st Qu.: 3.000   1st Qu.:0.0000  
##  Median :2016-08-26   Median :20.60   Median : 5.000   Median :0.0000  
##  Mean   :2016-08-26   Mean   :21.15   Mean   : 5.548   Mean   :0.1297  
##  3rd Qu.:2016-12-23   3rd Qu.:24.22   3rd Qu.: 9.000   3rd Qu.:0.0000  
##  Max.   :2017-04-22   Max.   :31.27   Max.   :12.000   Max.   :1.0000  
##     Month_2          Month_3          Month_4          Month_5       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.1192   Mean   :0.1297   Mean   :0.1088   Mean   :0.06485  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##     Month_6           Month_7           Month_8           Month_9       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.06276   Mean   :0.06485   Mean   :0.06485   Mean   :0.06276  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##     Month_10          Month_11          Month_12      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.06485   Mean   :0.06276   Mean   :0.06485  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000

By looking over that summary, we can see that mergeagg is now a data frame which contains dummy variables for the months, average visitors per day and the date that statistic was recorded on

Here, we convert mergeagg into time series data with a weekly frequency

tsmergetest5 <- ts(mergeagg, frequency = 7)
head(tsmergetest5)

## Time Series:
## Start = c(1, 1) 
## End = c(1, 6) 
## Frequency = 7 
##          visit_date visitors Month Month_1 Month_2 Month_3 Month_4 Month_5
## 1.000000      16801 21.52083     1       1       0       0       0       0
## 1.142857      16802 28.00000     1       1       0       0       0       0
## 1.285714      16803 29.23457     1       1       0       0       0       0
## 1.428571      16804 21.18471     1       1       0       0       0       0
## 1.571429      16805 17.00000     1       1       0       0       0       0
## 1.714286      16806 16.61600     1       1       0       0       0       0
##          Month_6 Month_7 Month_8 Month_9 Month_10 Month_11 Month_12
## 1.000000       0       0       0       0        0        0        0
## 1.142857       0       0       0       0        0        0        0
## 1.285714       0       0       0       0        0        0        0
## 1.428571       0       0       0       0        0        0        0
## 1.571429       0       0       0       0        0        0        0
## 1.714286       0       0       0       0        0        0        0

autoplot(tsmergetest5)

tslm() using months as dummy variables

lm3 <- tslm(visitors ~ visit_date + season + Month_2+
          Month_3+Month_4+Month_5+Month_6+Month_7+
          Month_8+Month_9+Month_10+Month_11 + Month_12, data = tsmergetest5)
summary(lm3)

## 
## Call:
## tslm(formula = visitors ~ visit_date + season + Month_2 + Month_3 + 
##     Month_4 + Month_5 + Month_6 + Month_7 + Month_8 + Month_9 + 
##     Month_10 + Month_11 + Month_12, data = tsmergetest5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0153 -1.0922 -0.2242  0.7093 12.1625 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 59.737643  12.480020   4.787 2.29e-06 ***
## visit_date  -0.002183   0.000734  -2.974 0.003094 ** 
## season2      3.231050   0.347636   9.294  < 2e-16 ***
## season3      0.851229   0.349018   2.439 0.015109 *  
## season4     -5.696396   0.349102 -16.317  < 2e-16 ***
## season5     -5.329111   0.349061 -15.267  < 2e-16 ***
## season6     -3.945275   0.349103 -11.301  < 2e-16 ***
## season7     -4.174325   0.348936 -11.963  < 2e-16 ***
## Month_2      0.213291   0.375136   0.569 0.569926    
## Month_3      2.297744   0.369495   6.219 1.13e-09 ***
## Month_4      1.378286   0.386512   3.566 0.000401 ***
## Month_5      1.544538   0.451369   3.422 0.000678 ***
## Month_6      0.825379   0.454827   1.815 0.070221 .  
## Month_7      0.411927   0.449065   0.917 0.359468    
## Month_8     -0.433943   0.449743  -0.965 0.335118    
## Month_9     -0.496864   0.456332  -1.089 0.276804    
## Month_10    -0.151024   0.454003  -0.333 0.739551    
## Month_11    -0.334229   0.462830  -0.722 0.470574    
## Month_12     2.721453   0.462865   5.880 7.92e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.041 on 459 degrees of freedom
## Multiple R-squared:  0.7427, Adjusted R-squared:  0.7327 
## F-statistic: 73.62 on 18 and 459 DF,  p-value: < 2.2e-16

checkresiduals(lm3)

## 
##  Breusch-Godfrey test for serial correlation of order up to 22
## 
## data:  Residuals from Linear regression model
## LM test = 144.9, df = 22, p-value < 2.2e-16

Our adj.R squared has gone up compared to the last tslm() model. Trend showed as NA and had no effect on adj.R squared so i removed it

we need to find someway to extract date data and holiday flag data to move to our ts() for further analysis

summary(mergetest1)

##    visit_date         air_store_id          visitors      day_of_week       
##  Min.   :2016-01-01   Length:252108      Min.   :  1.00   Length:252108     
##  1st Qu.:2016-07-23   Class :character   1st Qu.:  9.00   Class :character  
##  Median :2016-10-23   Mode  :character   Median : 17.00   Mode  :character  
##  Mean   :2016-10-12                      Mean   : 20.97                     
##  3rd Qu.:2017-01-24                      3rd Qu.: 29.00                     
##  Max.   :2017-04-22                      Max.   :877.00                     
##   holiday_flg          Month           Month_1          Month_2      
##  Min.   :0.00000   Min.   : 1.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.: 3.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median : 7.000   Median :0.0000   Median :0.0000  
##  Mean   :0.05067   Mean   : 6.208   Mean   :0.1072   Mean   :0.1087  
##  3rd Qu.:0.00000   3rd Qu.:10.000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :12.000   Max.   :1.0000   Max.   :1.0000  
##     Month_3          Month_4           Month_5           Month_6       
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.1213   Mean   :0.09464   Mean   :0.03237   Mean   :0.03271  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##     Month_7           Month_8           Month_9           Month_10      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.08499   Mean   :0.08201   Mean   :0.08262   Mean   :0.08515  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##     Month_11        Month_12      
##  Min.   :0.000   Min.   :0.00000  
##  1st Qu.:0.000   1st Qu.:0.00000  
##  Median :0.000   Median :0.00000  
##  Mean   :0.083   Mean   :0.08534  
##  3rd Qu.:0.000   3rd Qu.:0.00000  
##  Max.   :1.000   Max.   :1.00000

plot(mergetest1$holiday_flg~mergetest1$visit_date)

table(mergetest1$holiday_flg)

## 
##      0      1 
## 239333  12775

making sure datainfo will work by converting datainfo$calendar_date into time data

datainfo$calendar_date <-as.Date(datainfo$calendar_date, format=c('%Y-%m-%d'))
summary(datainfo)

##  calendar_date        day_of_week         holiday_flg    
##  Min.   :2016-01-01   Length:517         Min.   :0.0000  
##  1st Qu.:2016-05-09   Class :character   1st Qu.:0.0000  
##  Median :2016-09-15   Mode  :character   Median :0.0000  
##  Mean   :2016-09-15                      Mean   :0.0677  
##  3rd Qu.:2017-01-22                      3rd Qu.:0.0000  
##  Max.   :2017-05-31                      Max.   :1.0000

Now calendar date should be able to be merged

binding holiday flag to rest mergeagg

mergeaggtest1 <- merge(mergeagg, datainfo, by.x = "visit_date", by.y = "calendar_date")
summary(mergeaggtest1)

##    visit_date            visitors         Month           Month_1      
##  Min.   :2016-01-01   Min.   :13.13   Min.   : 1.000   Min.   :0.0000  
##  1st Qu.:2016-04-29   1st Qu.:17.85   1st Qu.: 3.000   1st Qu.:0.0000  
##  Median :2016-08-26   Median :20.60   Median : 5.000   Median :0.0000  
##  Mean   :2016-08-26   Mean   :21.15   Mean   : 5.548   Mean   :0.1297  
##  3rd Qu.:2016-12-23   3rd Qu.:24.22   3rd Qu.: 9.000   3rd Qu.:0.0000  
##  Max.   :2017-04-22   Max.   :31.27   Max.   :12.000   Max.   :1.0000  
##     Month_2          Month_3          Month_4          Month_5       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.1192   Mean   :0.1297   Mean   :0.1088   Mean   :0.06485  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##     Month_6           Month_7           Month_8           Month_9       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.06276   Mean   :0.06485   Mean   :0.06485   Mean   :0.06276  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##     Month_10          Month_11          Month_12       day_of_week       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Length:478        
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   Class :character  
##  Median :0.00000   Median :0.00000   Median :0.00000   Mode  :character  
##  Mean   :0.06485   Mean   :0.06276   Mean   :0.06485                     
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000                     
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000                     
##   holiday_flg     
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.06485  
##  3rd Qu.:0.00000  
##  Max.   :1.00000

Now in the code above, we have merged our previous aggregate data with the date info which has holiday flags in it

another time series using the newly formed df

tsmergetest6 <- ts(mergeaggtest1, frequency = 7)
lmtsmergetest6 <- tslm(visitors ~ season + holiday_flg + Month_2+
          Month_3+Month_4+Month_5+Month_6+Month_7+
          Month_8+Month_9+Month_10+Month_11 + Month_12, data = tsmergetest6)
summary(lmtsmergetest6)

## 
## Call:
## tslm(formula = visitors ~ season + holiday_flg + Month_2 + Month_3 + 
##     Month_4 + Month_5 + Month_6 + Month_7 + Month_8 + Month_9 + 
##     Month_10 + Month_11 + Month_12, data = tsmergetest6)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.6291  -0.8610  -0.0583   0.8643   8.5140 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.13809    0.30727  72.047  < 2e-16 ***
## season2      3.22599    0.30693  10.511  < 2e-16 ***
## season3      0.84602    0.30814   2.746  0.00628 ** 
## season4     -6.00182    0.30925 -19.408  < 2e-16 ***
## season5     -5.21151    0.30835 -16.901  < 2e-16 ***
## season6     -3.83748    0.30836 -12.445  < 2e-16 ***
## season7     -4.30692    0.30826 -13.972  < 2e-16 ***
## holiday_flg  4.07930    0.34331  11.882  < 2e-16 ***
## Month_2      0.54318    0.33237   1.634  0.10288    
## Month_3      2.49247    0.32509   7.667 1.06e-13 ***
## Month_4      1.69289    0.34108   4.963 9.78e-07 ***
## Month_5      1.81441    0.39664   4.574 6.15e-06 ***
## Month_6      1.41900    0.40347   3.517  0.00048 ***
## Month_7      0.80593    0.39785   2.026  0.04337 *  
## Month_8     -0.50012    0.39660  -1.261  0.20794    
## Month_9     -0.37242    0.40156  -0.927  0.35419    
## Month_10     0.05202    0.39791   0.131  0.89605    
## Month_11    -0.35486    0.40151  -0.884  0.37725    
## Month_12     2.39083    0.39660   6.028 3.41e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.802 on 459 degrees of freedom
## Multiple R-squared:  0.7995, Adjusted R-squared:  0.7916 
## F-statistic: 101.7 on 18 and 459 DF,  p-value: < 2.2e-16

checkresiduals(lmtsmergetest6)

## 
##  Breusch-Godfrey test for serial correlation of order up to 22
## 
## data:  Residuals from Linear regression model
## LM test = 117.97, df = 22, p-value = 4.073e-15

Above, we use the new data frame which includes months and holiday flags to create a tslm() Once again, trend showed NA and did not change the adj.R squared value when removing it so it is not in the final model

NOTE TO SELF: TSMERGETEST6 IS MOST UP TO DATE TSLM(). NEED BETTER VARIABLE NAMES IN THE FUTURE

decomposition on the above time series? idk if this is going to work

amerge6 <- decompose(tsmergetest6, type = "additive")
autoplot(amerge6)

I am not quite sure how to read this as it is a multivariate time series or maybe it is even just completely wrong

another tslm(), same data set, different variables (no months)

lmtsmergetest7 <- tslm(visitors ~  season + holiday_flg, data = tsmergetest6)
summary(lmtsmergetest7)

## 
## Call:
## tslm(formula = visitors ~ season + holiday_flg, data = tsmergetest6)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9807 -1.1904 -0.1493  1.0324  8.7229 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  23.0792     0.2505  92.133   <2e-16 ***
## season2       3.2206     0.3528   9.128   <2e-16 ***
## season3       0.7961     0.3541   2.248    0.025 *  
## season4      -6.0488     0.3553 -17.025   <2e-16 ***
## season5      -5.2557     0.3543 -14.834   <2e-16 ***
## season6      -3.8508     0.3543 -10.869   <2e-16 ***
## season7      -4.2666     0.3543 -12.042   <2e-16 ***
## holiday_flg   3.8860     0.3884  10.004   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.072 on 470 degrees of freedom
## Multiple R-squared:  0.7285, Adjusted R-squared:  0.7244 
## F-statistic: 180.1 on 7 and 470 DF,  p-value: < 2.2e-16

checkresiduals(lmtsmergetest7)

## 
##  Breusch-Godfrey test for serial correlation of order up to 14
## 
## data:  Residuals from Linear regression model
## LM test = 202.62, df = 14, p-value < 2.2e-16

The adjusted R-squared dropped a bit when removing months. Down from 0.79 to 0.72

lets add the last few holiday flags and predict forecast

fc_holiday_flg <- datainfo$holiday_flg[479:517]
hfforecast <- forecast(lmtsmergetest7, newdata = fc_holiday_flg)

## Warning in forecast.lm(lmtsmergetest7, newdata = fc_holiday_flg): newdata column
## names not specified, defaulting to first variable required.

autoplot(hfforecast) + xlim(60, 80)

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

## Warning: Removed 413 row(s) containing missing values (geom_path).

summary(hfforecast)

## 
## Forecast method: Linear regression model
## 
## Model Information:
## 
## Call:
## tslm(formula = visitors ~ season + holiday_flg, data = tsmergetest6)
## 
## Coefficients:
## (Intercept)      season2      season3      season4      season5      season6  
##     23.0792       3.2206       0.7961      -6.0488      -5.2557      -3.8508  
##     season7  holiday_flg  
##     -4.2666       3.8860  
## 
## 
## Error measures:
##                         ME    RMSE      MAE        MPE     MAPE      MASE
## Training set -1.751594e-16 2.05496 1.493419 -0.9357917 7.072272 0.8237356
##                   ACF1
## Training set 0.5479932
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 69.28571       23.87529 21.19605 26.55453 19.77294 27.97763
## 69.42857       17.03044 14.35055 19.71033 12.92709 21.13378
## 69.57143       17.82353 15.14441 20.50265 13.72137 21.92570
## 69.71429       19.22840 16.54928 21.90752 15.12624 23.33056
## 69.85714       18.81259 16.13315 21.49203 14.70993 22.91524
## 70.00000       23.07921 20.40026 25.75816 18.97730 27.18112
## 70.14286       30.18579 27.46614 32.90544 26.02157 34.35001
## 70.28571       23.87529 21.19605 26.55453 19.77294 27.97763
## 70.42857       17.03044 14.35055 19.71033 12.92709 21.13378
## 70.57143       17.82353 15.14441 20.50265 13.72137 21.92570
## 70.71429       23.11443 20.39201 25.83685 18.94596 27.28290
## 70.85714       22.69862 19.98126 25.41598 18.53790 26.85934
## 71.00000       26.96524 24.24559 29.68489 22.80102 31.12946
## 71.14286       26.29976 23.62081 28.97872 22.19785 30.40167
## 71.28571       23.87529 21.19605 26.55453 19.77294 27.97763
## 71.42857       17.03044 14.35055 19.71033 12.92709 21.13378
## 71.57143       17.82353 15.14441 20.50265 13.72137 21.92570
## 71.71429       19.22840 16.54928 21.90752 15.12624 23.33056
## 71.85714       18.81259 16.13315 21.49203 14.70993 22.91524
## 72.00000       23.07921 20.40026 25.75816 18.97730 27.18112
## 72.14286       26.29976 23.62081 28.97872 22.19785 30.40167
## 72.28571       23.87529 21.19605 26.55453 19.77294 27.97763
## 72.42857       17.03044 14.35055 19.71033 12.92709 21.13378
## 72.57143       17.82353 15.14441 20.50265 13.72137 21.92570
## 72.71429       19.22840 16.54928 21.90752 15.12624 23.33056
## 72.85714       18.81259 16.13315 21.49203 14.70993 22.91524
## 73.00000       23.07921 20.40026 25.75816 18.97730 27.18112
## 73.14286       26.29976 23.62081 28.97872 22.19785 30.40167
## 73.28571       23.87529 21.19605 26.55453 19.77294 27.97763
## 73.42857       17.03044 14.35055 19.71033 12.92709 21.13378
## 73.57143       17.82353 15.14441 20.50265 13.72137 21.92570
## 73.71429       19.22840 16.54928 21.90752 15.12624 23.33056
## 73.85714       18.81259 16.13315 21.49203 14.70993 22.91524
## 74.00000       23.07921 20.40026 25.75816 18.97730 27.18112
## 74.14286       26.29976 23.62081 28.97872 22.19785 30.40167
## 74.28571       23.87529 21.19605 26.55453 19.77294 27.97763
## 74.42857       17.03044 14.35055 19.71033 12.92709 21.13378
## 74.57143       17.82353 15.14441 20.50265 13.72137 21.92570
## 74.71429       19.22840 16.54928 21.90752 15.12624 23.33056

direct comparison with our earlier models

autoplot(mergeforecast) + xlim(60, 80)

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

## Warning: Removed 413 row(s) containing missing values (geom_path).

summary(mergeforecast)

## 
## Forecast method: Linear regression model
## 
## Model Information:
## 
## Call:
## tslm(formula = tsmergetest3 ~ trend + season)
## 
## Coefficients:
## (Intercept)        trend      season2      season3      season4      season5  
##   23.697041    -0.001642     3.222196     0.796926    -5.760541    -5.365838  
##     season6      season7  
##   -3.959327    -4.144906  
## 
## 
## Error measures:
##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -1.677256e-16 2.251823 1.616953 -1.126531 7.670276 0.8918741
##                   ACF1
## Training set 0.5585678
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 69.28571       23.70722 20.76223 26.65220 19.19796 28.21647
## 69.42857       17.14810 14.20312 20.09309 12.63885 21.65736
## 69.57143       17.54117 14.59618 20.48615 13.03191 22.05042
## 69.71429       18.94603 16.00105 21.89102 14.43678 23.45529
## 69.85714       18.75881 15.81382 21.70380 14.24956 23.26806
## 70.00000       22.90208 19.95713 25.84703 18.39288 27.41127
## 70.14286       26.12263 23.17768 29.06758 21.61344 30.63182
## 70.28571       23.69572 20.75019 26.64125 19.18563 28.20580
## 70.42857       17.13661 14.19108 20.08214 12.62652 21.64669
## 70.57143       17.52967 14.58414 20.47520 13.01958 22.03975
## 70.71429       18.93454 15.98901 21.88007 14.42445 23.44462
## 70.85714       18.74731 15.80178 21.69285 14.23723 23.25740
## 71.00000       22.89058 19.94508 25.83608 18.38054 27.40062
## 71.14286       26.11113 23.16563 29.05663 21.60109 30.62117
## 71.28571       23.68422 20.73813 26.63031 19.17328 28.19516
## 71.42857       17.12511 14.17902 20.07120 12.61417 21.63605
## 71.57143       17.51817 14.57208 20.46426 13.00723 22.02911
## 71.71429       18.92304 15.97695 21.86913 14.41210 23.43398
## 71.85714       18.73582 15.78973 21.68191 14.22488 23.24676
## 72.00000       22.87908 19.93301 25.82515 18.36818 27.38999
## 72.14286       26.09964 23.15357 29.04570 21.58873 30.61054
## 72.28571       23.67272 20.72606 26.61939 19.16090 28.18454
## 72.42857       17.11361 14.16695 20.06028 12.60179 21.62543
## 72.57143       17.50667 14.56001 20.45334 12.99485 22.01849
## 72.71429       18.91154 15.96488 21.85821 14.39972 23.42336
## 72.85714       18.72432 15.77766 21.67098 14.21250 23.23614
## 73.00000       22.86758 19.92093 25.81423 18.35579 27.37938
## 73.14286       26.08814 23.14149 29.03479 21.57634 30.59993
## 73.28571       23.66123 20.71397 26.60848 19.14850 28.17395
## 73.42857       17.10211 14.15486 20.04937 12.58939 21.61484
## 73.57143       17.49518 14.54792 20.44243 12.98245 22.00790
## 73.71429       18.90004 15.95279 21.84730 14.38732 23.41277
## 73.85714       18.71282 15.76557 21.66008 14.20010 23.22554
## 74.00000       22.85609 19.90884 25.80333 18.34337 27.36880
## 74.14286       26.07664 23.12939 29.02389 21.56393 30.58935

We notice that most of the points are similar but the variables flagged as holidays have slightly higher predictions as expected

Lets begin working on the sample submission part now. Here, we are dividing up the id into both air_store_id and visit_date.

samplesubmission$air_store_id <- substr(samplesubmission$id, 1,20)
samplesubmission$visit_date <- substr(samplesubmission$id, 22,31)
samplesubmission$visit_date <- as.Date(samplesubmission$visit_date, format=c('%Y-%m-%d'))
head(samplesubmission)

##                                id visitors         air_store_id visit_date
## 1 air_00a91d42b08b08d9_2017-04-23        0 air_00a91d42b08b08d9 2017-04-23
## 2 air_00a91d42b08b08d9_2017-04-24        0 air_00a91d42b08b08d9 2017-04-24
## 3 air_00a91d42b08b08d9_2017-04-25        0 air_00a91d42b08b08d9 2017-04-25
## 4 air_00a91d42b08b08d9_2017-04-26        0 air_00a91d42b08b08d9 2017-04-26
## 5 air_00a91d42b08b08d9_2017-04-27        0 air_00a91d42b08b08d9 2017-04-27
## 6 air_00a91d42b08b08d9_2017-04-28        0 air_00a91d42b08b08d9 2017-04-28

breaking up id into store and date was successful. Now we need to start implementing the as.date, month and holiday flags.

summary(datainfo)

##  calendar_date        day_of_week         holiday_flg    
##  Min.   :2016-01-01   Length:517         Min.   :0.0000  
##  1st Qu.:2016-05-09   Class :character   1st Qu.:0.0000  
##  Median :2016-09-15   Mode  :character   Median :0.0000  
##  Mean   :2016-09-15                      Mean   :0.0677  
##  3rd Qu.:2017-01-22                      3rd Qu.:0.0000  
##  Max.   :2017-05-31                      Max.   :1.0000

We can see that datainfo$calendar_date is in time format (the same as samplesubmission) so there shouldnt be issues merging

samplemerge <- merge(samplesubmission, datainfo, by.x = "visit_date", by.y = "calendar_date")
head(samplemerge)

##   visit_date                              id visitors         air_store_id
## 1 2017-04-23 air_00a91d42b08b08d9_2017-04-23        0 air_00a91d42b08b08d9
## 2 2017-04-23 air_ec0fad2def4dcff0_2017-04-23        0 air_ec0fad2def4dcff0
## 3 2017-04-23 air_f2c5a1f24279c531_2017-04-23        0 air_f2c5a1f24279c531
## 4 2017-04-23 air_e270aff84ac7e4c8_2017-04-23        0 air_e270aff84ac7e4c8
## 5 2017-04-23 air_dad0b6a36138f309_2017-04-23        0 air_dad0b6a36138f309
## 6 2017-04-23 air_cfdeb326418194ff_2017-04-23        0 air_cfdeb326418194ff
##   day_of_week holiday_flg
## 1      Sunday           0
## 2      Sunday           0
## 3      Sunday           0
## 4      Sunday           0
## 5      Sunday           0
## 6      Sunday           0

summary(samplemerge)

##    visit_date              id               visitors air_store_id      
##  Min.   :2017-04-23   Length:32019       Min.   :0   Length:32019      
##  1st Qu.:2017-05-02   Class :character   1st Qu.:0   Class :character  
##  Median :2017-05-12   Mode  :character   Median :0   Mode  :character  
##  Mean   :2017-05-12                      Mean   :0                     
##  3rd Qu.:2017-05-22                      3rd Qu.:0                     
##  Max.   :2017-05-31                      Max.   :0                     
##  day_of_week         holiday_flg    
##  Length:32019       Min.   :0.0000  
##  Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :0.0000  
##                     Mean   :0.1026  
##                     3rd Qu.:0.0000  
##                     Max.   :1.0000

So now we have a new data frame (?) called “samplemerge” and it contains visit_date, id (needed for submission), visitors (needed for submission), air store id, day of the week and holiday flag.

Data is classified by date first and foremost.This means that observation 1 and 2 are no longer consecutive days of the same restaurant but rather different restaurants on the same day.

Now there shouldnt be any issue predicting the tslm() model which only uses season and holiday flag as predictors (lmtsmergetest7).

However, if we are to use the more complex model “lmtsmergetest6” we will need to add dummy variables for Months 2 through 12.

Data frame works. Lets add month factors. 6568 break for april to august transition

samplemerge$Month <- as.factor(month(samplemerge$visit_date))
samplemerge$Month <- as.numeric(as.factor(samplemerge$Month))
samplemerge <- dummy_cols(samplemerge, select_columns = "Month")
samplemerge$Month_3 <- rep(0,32019)
samplemerge$Month_4 <- rep(0,32019)
samplemerge$Month_5 <- rep(0,32019)
samplemerge$Month_6 <- rep(0,32019)
samplemerge$Month_7 <- rep(0,32019)
samplemerge$Month_8 <- rep(0,32019)
samplemerge$Month_9 <- rep(0,32019)
samplemerge$Month_10 <- rep(0,32019)
samplemerge$Month_11 <- rep(0,32019)
samplemerge$Month_12 <- rep(0,32019)
head(samplemerge)

##   visit_date                              id visitors         air_store_id
## 1 2017-04-23 air_00a91d42b08b08d9_2017-04-23        0 air_00a91d42b08b08d9
## 2 2017-04-23 air_ec0fad2def4dcff0_2017-04-23        0 air_ec0fad2def4dcff0
## 3 2017-04-23 air_f2c5a1f24279c531_2017-04-23        0 air_f2c5a1f24279c531
## 4 2017-04-23 air_e270aff84ac7e4c8_2017-04-23        0 air_e270aff84ac7e4c8
## 5 2017-04-23 air_dad0b6a36138f309_2017-04-23        0 air_dad0b6a36138f309
## 6 2017-04-23 air_cfdeb326418194ff_2017-04-23        0 air_cfdeb326418194ff
##   day_of_week holiday_flg Month Month_1 Month_2 Month_3 Month_4 Month_5 Month_6
## 1      Sunday           0     1       1       0       0       0       0       0
## 2      Sunday           0     1       1       0       0       0       0       0
## 3      Sunday           0     1       1       0       0       0       0       0
## 4      Sunday           0     1       1       0       0       0       0       0
## 5      Sunday           0     1       1       0       0       0       0       0
## 6      Sunday           0     1       1       0       0       0       0       0
##   Month_7 Month_8 Month_9 Month_10 Month_11 Month_12
## 1       0       0       0        0        0        0
## 2       0       0       0        0        0        0
## 3       0       0       0        0        0        0
## 4       0       0       0        0        0        0
## 5       0       0       0        0        0        0
## 6       0       0       0        0        0        0

above, i have added all 12 dummy variable months. Now we need to change month 1 and 2 to 4 and 5 and then make month 1 and 2 0 all the way through

samplemerge$Month_4 <- samplemerge$Month_1
samplemerge$Month_5 <- samplemerge$Month_2
samplemerge$Month_1 <- rep(0,32019)
samplemerge$Month_2 <- rep(0,32019)

perfect! now we have visit date, id, visitors (to be imputed), air_store_id, day of week, holiday flag and month dummy variables

summary(samplemerge)

##    visit_date              id               visitors air_store_id      
##  Min.   :2017-04-23   Length:32019       Min.   :0   Length:32019      
##  1st Qu.:2017-05-02   Class :character   1st Qu.:0   Class :character  
##  Median :2017-05-12   Mode  :character   Median :0   Mode  :character  
##  Mean   :2017-05-12                      Mean   :0                     
##  3rd Qu.:2017-05-22                      3rd Qu.:0                     
##  Max.   :2017-05-31                      Max.   :0                     
##  day_of_week         holiday_flg         Month          Month_1     Month_2 
##  Length:32019       Min.   :0.0000   Min.   :1.000   Min.   :0   Min.   :0  
##  Class :character   1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:0   1st Qu.:0  
##  Mode  :character   Median :0.0000   Median :2.000   Median :0   Median :0  
##                     Mean   :0.1026   Mean   :1.795   Mean   :0   Mean   :0  
##                     3rd Qu.:0.0000   3rd Qu.:2.000   3rd Qu.:0   3rd Qu.:0  
##                     Max.   :1.0000   Max.   :2.000   Max.   :0   Max.   :0  
##     Month_3     Month_4          Month_5          Month_6     Month_7 
##  Min.   :0   Min.   :0.0000   Min.   :0.0000   Min.   :0   Min.   :0  
##  1st Qu.:0   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0   1st Qu.:0  
##  Median :0   Median :0.0000   Median :1.0000   Median :0   Median :0  
##  Mean   :0   Mean   :0.2051   Mean   :0.7949   Mean   :0   Mean   :0  
##  3rd Qu.:0   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0   3rd Qu.:0  
##  Max.   :0   Max.   :1.0000   Max.   :1.0000   Max.   :0   Max.   :0  
##     Month_8     Month_9     Month_10    Month_11    Month_12
##  Min.   :0   Min.   :0   Min.   :0   Min.   :0   Min.   :0  
##  1st Qu.:0   1st Qu.:0   1st Qu.:0   1st Qu.:0   1st Qu.:0  
##  Median :0   Median :0   Median :0   Median :0   Median :0  
##  Mean   :0   Mean   :0   Mean   :0   Mean   :0   Mean   :0  
##  3rd Qu.:0   3rd Qu.:0   3rd Qu.:0   3rd Qu.:0   3rd Qu.:0  
##  Max.   :0   Max.   :0   Max.   :0   Max.   :0   Max.   :0

samplefc <- forecast(lmtsmergetest6, newdata = samplemerge)
write.csv(samplefc, "test4.csv", row.names = FALSE)

using the forecast with months, holiday flags and no trend i recieved a score of 0.91. This actually is infinitely better than some of my other attempts so ill take it. I however that the results repeated over and over. I guess this makes sense when you think about it; without any downwards or upwards trend the cycle would just repeat itself over and over.

However, what if we tried adding trend?

I can see already just by examining the spread sheet that over enough rows, the trend will eventually drop our point forecast into the negatives.

samplefc1 <- forecast(lmtsmergetest7, newdata = samplemerge)
write.csv(samplefc, "test1.csv", row.names = FALSE)

Similarly, I received the exact same score of 0.91131 on my linear model which forecasted without monthly dummy variables.

I figured since I reached this point that I could attempt to add dummy variables to indicate if it is the weekend or not (Friday, Saturday and Sunday). I believe that maybe this could add some explanatory power and improve my model. I will refer back to mergeaggtest1 to do so.

mergeaggtest1$IsWeekend <- rep(0,478)
for (i in 1:length(mergeaggtest1$IsWeekend)){
  if (mergeaggtest1$day_of_week[i] == "Friday"){
    mergeaggtest1$IsWeekend[i] <- 1
  } else if (mergeaggtest1$day_of_week[i] == "Saturday"){
    mergeaggtest1$IsWeekend[i] <- 1
  } else if (mergeaggtest1$day_of_week[i] == "Sunday"){
    mergeaggtest1$IsWeekend[i] <- 1
  } else {
    mergeaggtest1$IsWeekend[i] <- 0
  }
}
summary(mergeaggtest1)

##    visit_date            visitors         Month           Month_1      
##  Min.   :2016-01-01   Min.   :13.13   Min.   : 1.000   Min.   :0.0000  
##  1st Qu.:2016-04-29   1st Qu.:17.85   1st Qu.: 3.000   1st Qu.:0.0000  
##  Median :2016-08-26   Median :20.60   Median : 5.000   Median :0.0000  
##  Mean   :2016-08-26   Mean   :21.15   Mean   : 5.548   Mean   :0.1297  
##  3rd Qu.:2016-12-23   3rd Qu.:24.22   3rd Qu.: 9.000   3rd Qu.:0.0000  
##  Max.   :2017-04-22   Max.   :31.27   Max.   :12.000   Max.   :1.0000  
##     Month_2          Month_3          Month_4          Month_5       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.1192   Mean   :0.1297   Mean   :0.1088   Mean   :0.06485  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##     Month_6           Month_7           Month_8           Month_9       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.06276   Mean   :0.06485   Mean   :0.06485   Mean   :0.06276  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##     Month_10          Month_11          Month_12       day_of_week       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Length:478        
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   Class :character  
##  Median :0.00000   Median :0.00000   Median :0.00000   Mode  :character  
##  Mean   :0.06485   Mean   :0.06276   Mean   :0.06485                     
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000                     
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000                     
##   holiday_flg        IsWeekend    
##  Min.   :0.00000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:0.000  
##  Median :0.00000   Median :0.000  
##  Mean   :0.06485   Mean   :0.431  
##  3rd Qu.:0.00000   3rd Qu.:1.000  
##  Max.   :1.00000   Max.   :1.000

Neat, now we got weekend dummy to work. Lets see how it affects our model

tsmergetest8 <- ts(mergeaggtest1, frequency = 7)
lmtsmergetest8 <- tslm(visitors ~ season + holiday_flg + IsWeekend + Month_2+
          Month_3+Month_4+Month_5+Month_6+Month_7+
          Month_8+Month_9+Month_10+Month_11 + Month_12, data = tsmergetest8)
summary(lmtsmergetest8)

## 
## Call:
## tslm(formula = visitors ~ season + holiday_flg + IsWeekend + 
##     Month_2 + Month_3 + Month_4 + Month_5 + Month_6 + Month_7 + 
##     Month_8 + Month_9 + Month_10 + Month_11 + Month_12, data = tsmergetest8)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.6291  -0.8610  -0.0583   0.8643   8.5140 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.13809    0.30727  72.047  < 2e-16 ***
## season2      3.22599    0.30693  10.511  < 2e-16 ***
## season3      0.84602    0.30814   2.746  0.00628 ** 
## season4     -6.00182    0.30925 -19.408  < 2e-16 ***
## season5     -5.21151    0.30835 -16.901  < 2e-16 ***
## season6     -3.83748    0.30836 -12.445  < 2e-16 ***
## season7     -4.30692    0.30826 -13.972  < 2e-16 ***
## holiday_flg  4.07930    0.34331  11.882  < 2e-16 ***
## IsWeekend         NA         NA      NA       NA    
## Month_2      0.54318    0.33237   1.634  0.10288    
## Month_3      2.49247    0.32509   7.667 1.06e-13 ***
## Month_4      1.69289    0.34108   4.963 9.78e-07 ***
## Month_5      1.81441    0.39664   4.574 6.15e-06 ***
## Month_6      1.41900    0.40347   3.517  0.00048 ***
## Month_7      0.80593    0.39785   2.026  0.04337 *  
## Month_8     -0.50012    0.39660  -1.261  0.20794    
## Month_9     -0.37242    0.40156  -0.927  0.35419    
## Month_10     0.05202    0.39791   0.131  0.89605    
## Month_11    -0.35486    0.40151  -0.884  0.37725    
## Month_12     2.39083    0.39660   6.028 3.41e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.802 on 459 degrees of freedom
## Multiple R-squared:  0.7995, Adjusted R-squared:  0.7916 
## F-statistic: 101.7 on 18 and 459 DF,  p-value: < 2.2e-16

checkresiduals(lmtsmergetest8)

## 
##  Breusch-Godfrey test for serial correlation of order up to 23
## 
## data:  Residuals from Linear regression model
## LM test = 118, df = 23, p-value = 9.502e-15

Couldnt seem to get this to work.Maybe in the future I will get this

samplefc1 <- forecast(lm1, newdata = samplemerge)
write.csv(samplefc, "test2.csv", row.names = FALSE)

Looking over this linear model, my point forecast also drops into the negatives when including trend. This means that it is considering all 32019 observations to be predicted as one restaurant instead of each individual restaurant having its own separate trend.

Lets try the naive drift model here

samplefc2 <- forecast(stlfdrift, newdata = samplemerge)
write.csv(samplefc, "naivedrift.csv", row.names = FALSE)

Still recieving negative point values later on in the forecast

Lets try naive with no drift stlfnaive

samplefc3 <- forecast(stlfnaive, newdata = samplemerge)
write.csv(samplefc, "naive.csv", row.names = FALSE)

baseline example by professor. score: 0.63490

samplesubmission$air_store_id=substr(samplesubmission$id, 1,20)
myagg=aggregate(visitors~air_store_id, FUN=mean, data=airvisitdata)
mymerge=merge(samplesubmission, myagg, by="air_store_id", all.x=TRUE)
mymerge$visitors.x=NULL
mymerge$visitors=mymerge$visitors.y
mymerge$visitors.y=NULL
mymerge$air_store_id=NULL
write.csv(mymerge, "sillysubmit.csv", row.names=FALSE)

geometric mean. score: 0.59820

samplesubmission$air_store_id=substr(samplesubmission$id, 1,20)
myagg=aggregate(visitors~air_store_id, FUN=geometric.mean, data=airvisitdata)
mymerge=merge(samplesubmission, myagg, by="air_store_id", all.x=TRUE)
mymerge$visitors.x=NULL
mymerge$visitors=mymerge$visitors.y
mymerge$visit_date=NULL
mymerge$visitors.y=NULL
mymerge$air_store_id = NULL
write.csv(mymerge, "sillysubmit2.csv", row.names=FALSE)