loading in the libraries I will use below
library(forecast)
## Warning: package 'forecast' was built under R version 4.0.4
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggplot2)
library(TTR)
## Warning: package 'TTR' was built under R version 4.0.4
library(stats)
library(readr)
library(fpp)
## Warning: package 'fpp' was built under R version 4.0.5
## Loading required package: fma
## Warning: package 'fma' was built under R version 4.0.5
## Loading required package: expsmooth
## Warning: package 'expsmooth' was built under R version 4.0.5
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: tseries
## Warning: package 'tseries' was built under R version 4.0.4
library(fpp2)
## Warning: package 'fpp2' was built under R version 4.0.5
##
## Attaching package: 'fpp2'
## The following objects are masked from 'package:fpp':
##
## ausair, ausbeer, austa, austourists, debitcards, departures,
## elecequip, euretail, guinearice, oil, sunspotarea, usmelec
library(seasonal)
## Warning: package 'seasonal' was built under R version 4.0.5
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.5
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(psych)
## Warning: package 'psych' was built under R version 4.0.5
##
## Attaching package: 'psych'
## The following object is masked from 'package:seasonal':
##
## outlier
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.0.5
Here I will load in the files that I need
airvisitdata <- read.csv("C:\\Users\\wanga\\OneDrive\\Desktop\\PREDICTIVE ANALYTICS\\MIDTERM\\air_visit_data.csv")
datainfo <- read.csv("C:\\Users\\wanga\\OneDrive\\Desktop\\PREDICTIVE ANALYTICS\\MIDTERM\\date_info.csv")
airstoreinfo <- read.csv("C:\\Users\\wanga\\OneDrive\\Desktop\\PREDICTIVE ANALYTICS\\MIDTERM\\air_store_info.csv")
samplesubmission <- read.csv("C:\\Users\\wanga\\OneDrive\\Desktop\\PREDICTIVE ANALYTICS\\MIDTERM\\sample_submission.csv")
starting with simple summaries
head(airvisitdata)
## air_store_id visit_date visitors
## 1 air_ba937bf13d40fb24 2016-01-13 25
## 2 air_ba937bf13d40fb24 2016-01-14 32
## 3 air_ba937bf13d40fb24 2016-01-15 29
## 4 air_ba937bf13d40fb24 2016-01-16 22
## 5 air_ba937bf13d40fb24 2016-01-18 6
## 6 air_ba937bf13d40fb24 2016-01-19 9
head(datainfo)
## calendar_date day_of_week holiday_flg
## 1 2016-01-01 Friday 1
## 2 2016-01-02 Saturday 1
## 3 2016-01-03 Sunday 1
## 4 2016-01-04 Monday 0
## 5 2016-01-05 Tuesday 0
## 6 2016-01-06 Wednesday 0
head(airstoreinfo)
## air_store_id air_genre_name air_area_name latitude
## 1 air_0f0cdeee6c9bf3d7 Italian/French HyÅ\215go-ken KÅ\215be-shi KumoidÅ\215ri 34.69512
## 2 air_7cc17a324ae5c7dc Italian/French HyÅ\215go-ken KÅ\215be-shi KumoidÅ\215ri 34.69512
## 3 air_fee8dcf4d619598e Italian/French HyÅ\215go-ken KÅ\215be-shi KumoidÅ\215ri 34.69512
## 4 air_a17f0778617c76e2 Italian/French HyÅ\215go-ken KÅ\215be-shi KumoidÅ\215ri 34.69512
## 5 air_83db5aff8f50478e Italian/French TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807
## 6 air_99c3eae84130c1cb Italian/French TÅ\215kyÅ\215-to Minato-ku ShibakÅ\215en 35.65807
## longitude
## 1 135.1979
## 2 135.1979
## 3 135.1979
## 4 135.1979
## 5 139.7516
## 6 139.7516
head(samplesubmission)
## id visitors
## 1 air_00a91d42b08b08d9_2017-04-23 0
## 2 air_00a91d42b08b08d9_2017-04-24 0
## 3 air_00a91d42b08b08d9_2017-04-25 0
## 4 air_00a91d42b08b08d9_2017-04-26 0
## 5 air_00a91d42b08b08d9_2017-04-27 0
## 6 air_00a91d42b08b08d9_2017-04-28 0
I currently dont see a decent way to use the air store info file so i will leave it out for the time being.
I want to check how many necessary results we will need for the sample submission
summary(samplesubmission)
## id visitors
## Length:32019 Min. :0
## Class :character 1st Qu.:0
## Mode :character Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
summary(airvisitdata)
## air_store_id visit_date visitors
## Length:252108 Length:252108 Min. : 1.00
## Class :character Class :character 1st Qu.: 9.00
## Mode :character Mode :character Median : 17.00
## Mean : 20.97
## 3rd Qu.: 29.00
## Max. :877.00
summary(datainfo)
## calendar_date day_of_week holiday_flg
## Length:517 Length:517 Min. :0.0000
## Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Median :0.0000
## Mean :0.0677
## 3rd Qu.:0.0000
## Max. :1.0000
summary(airstoreinfo)
## air_store_id air_genre_name air_area_name latitude
## Length:829 Length:829 Length:829 Min. :33.21
## Class :character Class :character Class :character 1st Qu.:34.70
## Mode :character Mode :character Mode :character Median :35.66
## Mean :35.65
## 3rd Qu.:35.69
## Max. :44.02
## longitude
## Min. :130.2
## 1st Qu.:135.3
## Median :139.7
## Mean :137.4
## 3rd Qu.:139.8
## Max. :144.3
Some exploratory analysis. I removed the unique() command but there are 829 different restaurants
store_obs <- 252108/829
This means that there is an average of a little less than a year of observations for each restaurant. This checks out then as the calendar_date variable only has about a year and a half of observations
airvisitdata$visit_date1 <- time(airvisitdata$visit_date)
plot(airvisitdata$visitors ~ airvisitdata$visit_date1)
This is not very helpful. Will need to fix something in order to make a plot usable. Additionally, the visit date has 252k observations. That would be like ~690 years. We will need to setup a time series to find anything useful
Lets try out the merge function. We are looking to congregate all of our needed data into one frame
summary(airvisitdata)
## air_store_id visit_date visitors visit_date1
## Length:252108 Length:252108 Min. : 1.00 Min. : 1
## Class :character Class :character 1st Qu.: 9.00 1st Qu.: 63028
## Mode :character Mode :character Median : 17.00 Median :126055
## Mean : 20.97 Mean :126055
## 3rd Qu.: 29.00 3rd Qu.:189081
## Max. :877.00 Max. :252108
summary(datainfo)
## calendar_date day_of_week holiday_flg
## Length:517 Length:517 Min. :0.0000
## Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Median :0.0000
## Mean :0.0677
## 3rd Qu.:0.0000
## Max. :1.0000
mergetest1 <- merge(airvisitdata, datainfo, by.x = "visit_date", by.y = "calendar_date")
head(mergetest1)
## visit_date air_store_id visitors visit_date1 day_of_week holiday_flg
## 1 2016-01-01 air_c31472d14e29cee8 3 184318 Friday 1
## 2 2016-01-01 air_05c325d315cc17f5 29 171315 Friday 1
## 3 2016-01-01 air_81c5dff692063446 7 65992 Friday 1
## 4 2016-01-01 air_a083834e7ffe187e 27 180079 Friday 1
## 5 2016-01-01 air_36bcf77d3382d36e 34 204402 Friday 1
## 6 2016-01-01 air_09a845d5b5944b01 56 29556 Friday 1
summary(mergetest1)
## visit_date air_store_id visitors visit_date1
## Length:252108 Length:252108 Min. : 1.00 Min. : 1
## Class :character Class :character 1st Qu.: 9.00 1st Qu.: 63028
## Mode :character Mode :character Median : 17.00 Median :126055
## Mean : 20.97 Mean :126055
## 3rd Qu.: 29.00 3rd Qu.:189081
## Max. :877.00 Max. :252108
## day_of_week holiday_flg
## Length:252108 Min. :0.00000
## Class :character 1st Qu.:0.00000
## Mode :character Median :0.00000
## Mean :0.05067
## 3rd Qu.:0.00000
## Max. :1.00000
We have 6 variables (would be 5 but I created an extra one for the plot which probably wont be used). But columns include visit date, store id, # visitors, day of the week and a holiday dummy variable which might be really useful later.
might be able to make some boxplots to map out relations
boxplot(mergetest1$visitors~mergetest1$day_of_week)
boxplot(mergetest1$visitors~mergetest1$day_of_week, outline = FALSE)
boxplot(mergetest1$visitors~mergetest1$holiday_flg)
boxplot(mergetest1$visitors~mergetest1$holiday_flg, outline = FALSE)
Next, lets try the aggregate function. Sample submission model got great results just using mean so we can try the same.
mergeagg <- aggregate(visitors ~ visit_date, FUN = mean, data = mergetest1)
summary(mergeagg)
## visit_date visitors
## Length:478 Min. :13.13
## Class :character 1st Qu.:17.85
## Mode :character Median :20.60
## Mean :21.15
## 3rd Qu.:24.22
## Max. :31.27
Cool! Looks like 478 observations with a reasonable average (similar to summary(airvisitdata) statistics)
What if we tried aggregate and included store id?
mergeaggairstoreid <- aggregate(visitors ~ air_store_id, FUN = mean, data = mergetest1)
summary(mergeaggairstoreid)
## air_store_id visitors
## Length:829 Min. : 1.188
## Class :character 1st Qu.: 11.812
## Mode :character Median : 19.183
## Mean : 21.204
## 3rd Qu.: 28.645
## Max. :115.471
For the air_store_id above, we note that there are 829 different means.
Adding air_store_id removes the mean Lets try time series plot using this newly aggregated data
tsmergetest2 <- ts(mergeagg$visitors, start = 2016, frequency = 365)
autoplot(tsmergetest2)
lets try air store id plot as well
tsmergeairstoreid <- ts(mergeaggairstoreid$visitors, frequency = 7)
autoplot(tsmergeairstoreid)
Now we might be getting somewhere. Lets try geometric mean since we were told to try that out as well
geomergeagg <- aggregate(visitors ~ visit_date, FUN = geometric.mean, data = mergetest1)
tsgeomergetest <- ts(geomergeagg$visitors, start = 2016, frequency = 365)
autoplot(tsgeomergetest)
Same trend but slightly lower mean values.
Could try decomposition? Might show us something useful but I dont know yet.
Also didnt work, maybe I need to use different frequency in my time series? Trying a weekly frequency
tsmergetest3 <- ts(mergeagg$visitors, frequency = 7)
autoplot(tsmergetest3)
additive decomposition (tsmergetest3 is weekly frequency)
amerge3 <- decompose(tsmergetest3, type = "additive")
autoplot(amerge3)
robust stl decomposition
stlmerge3 <- tsmergetest3 %>% stl(s.window = "periodic", robust = TRUE)
autoplot(stlmerge3)
Perhaps the STL model is better since the trend is a bit smoother while having essentially the same level of detail
We might begin to try linear regression just to see what we get
lm1 <- tslm(tsmergetest3 ~ trend + season)
summary(lm1)
##
## Call:
## tslm(formula = tsmergetest3 ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9732 -1.3230 -0.3687 0.9704 11.3975
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.6970408 0.3272741 72.407 < 2e-16 ***
## trend -0.0016425 0.0007528 -2.182 0.0296 *
## season2 3.2221963 0.3866256 8.334 8.66e-16 ***
## season3 0.7969265 0.3880453 2.054 0.0406 *
## season4 -5.7605415 0.3880439 -14.845 < 2e-16 ***
## season5 -5.3658377 0.3880439 -13.828 < 2e-16 ***
## season6 -3.9593271 0.3880453 -10.203 < 2e-16 ***
## season7 -4.1449065 0.3880483 -10.681 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.271 on 470 degrees of freedom
## Multiple R-squared: 0.6739, Adjusted R-squared: 0.6691
## F-statistic: 138.8 on 7 and 470 DF, p-value: < 2.2e-16
checkresiduals(lm1)
##
## Breusch-Godfrey test for serial correlation of order up to 14
##
## data: Residuals from Linear regression model
## LM test = 192.87, df = 14, p-value < 2.2e-16
what happens without trend
lm1notrend <- tslm(tsmergetest3 ~ season)
summary(lm1notrend)
##
## Call:
## tslm(formula = tsmergetest3 ~ season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7892 -1.3604 -0.3524 0.9510 11.1848
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.3045 0.2745 84.906 < 2e-16 ***
## season2 3.2206 0.3882 8.297 1.13e-15 ***
## season3 0.7994 0.3896 2.052 0.0407 *
## season4 -5.7597 0.3896 -14.784 < 2e-16 ***
## season5 -5.3667 0.3896 -13.775 < 2e-16 ***
## season6 -3.9618 0.3896 -10.169 < 2e-16 ***
## season7 -4.1490 0.3896 -10.650 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.28 on 471 degrees of freedom
## Multiple R-squared: 0.6706, Adjusted R-squared: 0.6664
## F-statistic: 159.8 on 6 and 471 DF, p-value: < 2.2e-16
lets try forecasting
fclm1 <- rep(0, 35)
tsfclm1 <- ts(fclm1, frequency = 7)
mergeforecast <- forecast(lm1, newdata = tsfclm1)
## Warning in forecast.lm(lm1, newdata = tsfclm1): newdata column names not
## specified, defaulting to first variable required.
autoplot(mergeforecast) + xlim(60, 80)
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 413 row(s) containing missing values (geom_path).
summary(mergeforecast)
##
## Forecast method: Linear regression model
##
## Model Information:
##
## Call:
## tslm(formula = tsmergetest3 ~ trend + season)
##
## Coefficients:
## (Intercept) trend season2 season3 season4 season5
## 23.697041 -0.001642 3.222196 0.796926 -5.760541 -5.365838
## season6 season7
## -3.959327 -4.144906
##
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1.677256e-16 2.251823 1.616953 -1.126531 7.670276 0.8918741
## ACF1
## Training set 0.5585678
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 69.28571 23.70722 20.76223 26.65220 19.19796 28.21647
## 69.42857 17.14810 14.20312 20.09309 12.63885 21.65736
## 69.57143 17.54117 14.59618 20.48615 13.03191 22.05042
## 69.71429 18.94603 16.00105 21.89102 14.43678 23.45529
## 69.85714 18.75881 15.81382 21.70380 14.24956 23.26806
## 70.00000 22.90208 19.95713 25.84703 18.39288 27.41127
## 70.14286 26.12263 23.17768 29.06758 21.61344 30.63182
## 70.28571 23.69572 20.75019 26.64125 19.18563 28.20580
## 70.42857 17.13661 14.19108 20.08214 12.62652 21.64669
## 70.57143 17.52967 14.58414 20.47520 13.01958 22.03975
## 70.71429 18.93454 15.98901 21.88007 14.42445 23.44462
## 70.85714 18.74731 15.80178 21.69285 14.23723 23.25740
## 71.00000 22.89058 19.94508 25.83608 18.38054 27.40062
## 71.14286 26.11113 23.16563 29.05663 21.60109 30.62117
## 71.28571 23.68422 20.73813 26.63031 19.17328 28.19516
## 71.42857 17.12511 14.17902 20.07120 12.61417 21.63605
## 71.57143 17.51817 14.57208 20.46426 13.00723 22.02911
## 71.71429 18.92304 15.97695 21.86913 14.41210 23.43398
## 71.85714 18.73582 15.78973 21.68191 14.22488 23.24676
## 72.00000 22.87908 19.93301 25.82515 18.36818 27.38999
## 72.14286 26.09964 23.15357 29.04570 21.58873 30.61054
## 72.28571 23.67272 20.72606 26.61939 19.16090 28.18454
## 72.42857 17.11361 14.16695 20.06028 12.60179 21.62543
## 72.57143 17.50667 14.56001 20.45334 12.99485 22.01849
## 72.71429 18.91154 15.96488 21.85821 14.39972 23.42336
## 72.85714 18.72432 15.77766 21.67098 14.21250 23.23614
## 73.00000 22.86758 19.92093 25.81423 18.35579 27.37938
## 73.14286 26.08814 23.14149 29.03479 21.57634 30.59993
## 73.28571 23.66123 20.71397 26.60848 19.14850 28.17395
## 73.42857 17.10211 14.15486 20.04937 12.58939 21.61484
## 73.57143 17.49518 14.54792 20.44243 12.98245 22.00790
## 73.71429 18.90004 15.95279 21.84730 14.38732 23.41277
## 73.85714 18.71282 15.76557 21.66008 14.20010 23.22554
## 74.00000 22.85609 19.90884 25.80333 18.34337 27.36880
## 74.14286 26.07664 23.12939 29.02389 21.56393 30.58935
Looks alright, it essentially is able to capture our weekly trend (less restaurant goers during start, more at the end)
Lets try some other forecasting methods?
stlfdrift <- stlf(tsmergetest3, s.window = "periodic", robust = TRUE, method = "rwdrift")
autoplot(stlfdrift)
autoplot(stlfdrift) + xlim(60, 80)
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 413 row(s) containing missing values (geom_path).
summary(stlfdrift)
##
## Forecast method: STL + Random walk with drift
##
## Model Information:
## Call: rwf(y = x, h = h, drift = TRUE, level = level)
##
## Drift: 0.005 (se 0.1001)
## Residual sd: 2.1857
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 5.697621e-16 2.183425 1.414258 -0.518423 6.802913 0.7800721
## ACF1
## Training set -0.1110387
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 69.28571 24.11090 21.309787 26.91201 19.826969 28.39482
## 69.42857 17.03421 13.068698 20.99973 10.969480 23.09895
## 69.57143 18.10754 13.245719 22.96937 10.672025 25.54306
## 69.71429 19.31955 13.699746 24.93936 10.724799 27.91431
## 69.85714 19.19199 12.902310 25.48167 9.572756 28.81122
## 70.00000 23.93825 17.041098 30.83541 13.389965 34.48654
## 70.14286 27.48336 20.025852 34.94086 16.078088 38.88862
## 70.28571 24.14593 16.165276 32.12659 11.940573 36.35129
## 70.42857 17.06925 8.595746 25.54275 4.110145 30.02835
## 70.57143 18.14258 9.201519 27.08364 4.468407 31.81675
## 70.71429 19.35459 9.967484 28.74169 4.998250 33.71093
## 70.85714 19.22702 9.412451 29.04160 4.216930 34.23712
## 71.00000 23.97329 13.747496 34.19908 8.334289 39.61229
## 71.14286 27.51839 16.895739 38.14104 11.272447 43.76433
STLF drift method with weekly frequency time series data provides similar results. So far tsmergetest3 has been the most effective
here is a naive model
stlfnaive <- stlf(tsmergetest3, s.window = "periodic", robust = TRUE, method = "naive")
autoplot(stlfnaive)
autoplot(stlfnaive) + xlim(60, 80)
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 413 row(s) containing missing values (geom_path).
summary(stlfnaive)
##
## Forecast method: STL + Random walk
##
## Model Information:
## Call: rwf(y = x, h = h, drift = FALSE, level = level)
##
## Residual sd: 2.1834
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.005005011 2.183431 1.414457 -0.4939261 6.802882 0.7801821
## ACF1
## Training set -0.1110387
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 69.28571 24.10589 21.307713 26.90407 19.826446 28.38534
## 69.42857 17.02420 13.066982 20.98143 10.972155 23.07625
## 69.57143 18.09253 13.245940 22.93912 10.680312 25.50475
## 69.71429 19.29953 13.703176 24.89589 10.740643 27.85842
## 69.85714 19.16696 12.910045 25.42388 9.597832 28.73609
## 70.00000 23.90822 17.054113 30.76233 13.425766 34.39068
## 70.14286 27.44832 20.045035 34.85161 16.125972 38.77067
## 70.28571 24.10589 16.191446 32.02034 12.001792 36.20999
## 70.42857 17.02420 8.629668 25.41874 4.185868 29.86254
## 70.57143 18.09253 9.243910 26.94115 4.559734 31.62532
## 70.71429 19.29953 10.019024 28.58004 5.106219 33.49285
## 70.85714 19.16696 9.473787 28.86014 4.342530 33.99140
## 71.00000 23.90822 13.819246 33.99720 8.478464 39.33798
## 71.14286 27.44832 16.978493 37.91815 11.436102 43.46054
The naive model provides almost the exact same results
Now, maybe we can make a better linear model? I will try to figure out some way to incorporate the holiday flag variable. Initial thoughts is that it would have a positive coefficient (ie. people go out to eat during holidays)
summary(mergeagg)
## visit_date visitors
## Length:478 Min. :13.13
## Class :character 1st Qu.:17.85
## Mode :character Median :20.60
## Mean :21.15
## 3rd Qu.:24.22
## Max. :31.27
Just a quick recap above for our aggregate data frame. We have the visit date and the average number of visitors per day
Here, we begin to reformat the visit date info and turn it into a more complex time series
mergetest1$visit_date <-as.Date(mergetest1$visit_date, format=c('%Y-%m-%d'))
mergetest1$Month <- as.factor(month(mergetest1$visit_date))
mergetest1$Month <- as.numeric(as.factor(mergetest1$Month))
mergetest1$visit_date1 = NULL
head(mergetest1)
## visit_date air_store_id visitors day_of_week holiday_flg Month
## 1 2016-01-01 air_c31472d14e29cee8 3 Friday 1 1
## 2 2016-01-01 air_05c325d315cc17f5 29 Friday 1 1
## 3 2016-01-01 air_81c5dff692063446 7 Friday 1 1
## 4 2016-01-01 air_a083834e7ffe187e 27 Friday 1 1
## 5 2016-01-01 air_36bcf77d3382d36e 34 Friday 1 1
## 6 2016-01-01 air_09a845d5b5944b01 56 Friday 1 1
Above, we now have turned visit_date into time and added a monthly flag from 1 to 12
creating dummy variables for month
mergetest1 <- dummy_cols(mergetest1, select_columns = "Month")
head(mergetest1)
## visit_date air_store_id visitors day_of_week holiday_flg Month
## 1 2016-01-01 air_c31472d14e29cee8 3 Friday 1 1
## 2 2016-01-01 air_05c325d315cc17f5 29 Friday 1 1
## 3 2016-01-01 air_81c5dff692063446 7 Friday 1 1
## 4 2016-01-01 air_a083834e7ffe187e 27 Friday 1 1
## 5 2016-01-01 air_36bcf77d3382d36e 34 Friday 1 1
## 6 2016-01-01 air_09a845d5b5944b01 56 Friday 1 1
## Month_1 Month_2 Month_3 Month_4 Month_5 Month_6 Month_7 Month_8 Month_9
## 1 1 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 0 0 0 0
## 3 1 0 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0 0 0
## 5 1 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0 0 0
## Month_10 Month_11 Month_12
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
summary(mergetest1)
## visit_date air_store_id visitors day_of_week
## Min. :2016-01-01 Length:252108 Min. : 1.00 Length:252108
## 1st Qu.:2016-07-23 Class :character 1st Qu.: 9.00 Class :character
## Median :2016-10-23 Mode :character Median : 17.00 Mode :character
## Mean :2016-10-12 Mean : 20.97
## 3rd Qu.:2017-01-24 3rd Qu.: 29.00
## Max. :2017-04-22 Max. :877.00
## holiday_flg Month Month_1 Month_2
## Min. :0.00000 Min. : 1.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median : 7.000 Median :0.0000 Median :0.0000
## Mean :0.05067 Mean : 6.208 Mean :0.1072 Mean :0.1087
## 3rd Qu.:0.00000 3rd Qu.:10.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :12.000 Max. :1.0000 Max. :1.0000
## Month_3 Month_4 Month_5 Month_6
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.1213 Mean :0.09464 Mean :0.03237 Mean :0.03271
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## Month_7 Month_8 Month_9 Month_10
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.08499 Mean :0.08201 Mean :0.08262 Mean :0.08515
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## Month_11 Month_12
## Min. :0.000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.00000
## Median :0.000 Median :0.00000
## Mean :0.083 Mean :0.08534
## 3rd Qu.:0.000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.00000
Now, above we have added dummy variables which will help us create more complex linear models
So lets try another lm now with the new dummy variables
lm2 <- lm(visitors~visit_date+holiday_flg+Month_2+
Month_3+Month_4+Month_5+Month_6+Month_7+
Month_8+Month_9+Month_10+Month_11+Month_12, data = mergetest1)
summary(lm2)
##
## Call:
## lm(formula = visitors ~ visit_date + holiday_flg + Month_2 +
## Month_3 + Month_4 + Month_5 + Month_6 + Month_7 + Month_8 +
## Month_9 + Month_10 + Month_11 + Month_12, data = mergetest1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.83 -12.13 -3.87 8.12 854.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.7366770 5.2403515 10.636 < 2e-16 ***
## visit_date -0.0021031 0.0003067 -6.858 7.01e-12 ***
## holiday_flg 2.9761154 0.1534097 19.400 < 2e-16 ***
## Month_2 0.6162385 0.1434554 4.296 1.74e-05 ***
## Month_3 2.6175421 0.1406553 18.610 < 2e-16 ***
## Month_4 2.1017658 0.1497246 14.038 < 2e-16 ***
## Month_5 1.6297222 0.2158005 7.552 4.30e-14 ***
## Month_6 1.2664427 0.2133947 5.935 2.95e-09 ***
## Month_7 1.2070385 0.1551855 7.778 7.39e-15 ***
## Month_8 -0.5097829 0.1554993 -3.278 0.00104 **
## Month_9 -0.1255054 0.1542208 -0.814 0.41576
## Month_10 0.4481443 0.1528223 2.932 0.00336 **
## Month_11 -0.1709310 0.1542427 -1.108 0.26778
## Month_12 3.1440027 0.1541984 20.389 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.71 on 252094 degrees of freedom
## Multiple R-squared: 0.006202, Adjusted R-squared: 0.006151
## F-statistic: 121 on 13 and 252094 DF, p-value: < 2.2e-16
Incredibly low adj.R squared. Normal linear model doesnt do much for us. Its almost necessary that we use the tslm() function in order to use the season + trend predictors
need to figure out how to get tslm() model working for mergetest1 data to use trend and seasonal. We have used tsmergetest1 in order to compute our time series using the average visits of all stores combined
maybe i need to create new data frame with useful variables. Here we convert visit date info into actual time data
mergeagg$visit_date <-as.Date(mergeagg$visit_date, format=c('%Y-%m-%d'))
mergeagg$Month <- as.factor(month(mergeagg$visit_date))
mergeagg$Month <- as.numeric(as.factor(mergeagg$Month))
mergeagg <- dummy_cols(mergeagg, select_columns = "Month")
head(mergeagg)
## visit_date visitors Month Month_1 Month_2 Month_3 Month_4 Month_5 Month_6
## 1 2016-01-01 21.52083 1 1 0 0 0 0 0
## 2 2016-01-02 28.00000 1 1 0 0 0 0 0
## 3 2016-01-03 29.23457 1 1 0 0 0 0 0
## 4 2016-01-04 21.18471 1 1 0 0 0 0 0
## 5 2016-01-05 17.00000 1 1 0 0 0 0 0
## 6 2016-01-06 16.61600 1 1 0 0 0 0 0
## Month_7 Month_8 Month_9 Month_10 Month_11 Month_12
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
summary(mergeagg)
## visit_date visitors Month Month_1
## Min. :2016-01-01 Min. :13.13 Min. : 1.000 Min. :0.0000
## 1st Qu.:2016-04-29 1st Qu.:17.85 1st Qu.: 3.000 1st Qu.:0.0000
## Median :2016-08-26 Median :20.60 Median : 5.000 Median :0.0000
## Mean :2016-08-26 Mean :21.15 Mean : 5.548 Mean :0.1297
## 3rd Qu.:2016-12-23 3rd Qu.:24.22 3rd Qu.: 9.000 3rd Qu.:0.0000
## Max. :2017-04-22 Max. :31.27 Max. :12.000 Max. :1.0000
## Month_2 Month_3 Month_4 Month_5
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.1192 Mean :0.1297 Mean :0.1088 Mean :0.06485
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## Month_6 Month_7 Month_8 Month_9
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.06276 Mean :0.06485 Mean :0.06485 Mean :0.06276
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## Month_10 Month_11 Month_12
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.06485 Mean :0.06276 Mean :0.06485
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000
By looking over that summary, we can see that mergeagg is now a data frame which contains dummy variables for the months, average visitors per day and the date that statistic was recorded on
Here, we convert mergeagg into time series data with a weekly frequency
tsmergetest5 <- ts(mergeagg, frequency = 7)
head(tsmergetest5)
## Time Series:
## Start = c(1, 1)
## End = c(1, 6)
## Frequency = 7
## visit_date visitors Month Month_1 Month_2 Month_3 Month_4 Month_5
## 1.000000 16801 21.52083 1 1 0 0 0 0
## 1.142857 16802 28.00000 1 1 0 0 0 0
## 1.285714 16803 29.23457 1 1 0 0 0 0
## 1.428571 16804 21.18471 1 1 0 0 0 0
## 1.571429 16805 17.00000 1 1 0 0 0 0
## 1.714286 16806 16.61600 1 1 0 0 0 0
## Month_6 Month_7 Month_8 Month_9 Month_10 Month_11 Month_12
## 1.000000 0 0 0 0 0 0 0
## 1.142857 0 0 0 0 0 0 0
## 1.285714 0 0 0 0 0 0 0
## 1.428571 0 0 0 0 0 0 0
## 1.571429 0 0 0 0 0 0 0
## 1.714286 0 0 0 0 0 0 0
autoplot(tsmergetest5)
tslm() using months as dummy variables
lm3 <- tslm(visitors ~ visit_date + season + Month_2+
Month_3+Month_4+Month_5+Month_6+Month_7+
Month_8+Month_9+Month_10+Month_11 + Month_12, data = tsmergetest5)
summary(lm3)
##
## Call:
## tslm(formula = visitors ~ visit_date + season + Month_2 + Month_3 +
## Month_4 + Month_5 + Month_6 + Month_7 + Month_8 + Month_9 +
## Month_10 + Month_11 + Month_12, data = tsmergetest5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.0153 -1.0922 -0.2242 0.7093 12.1625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.737643 12.480020 4.787 2.29e-06 ***
## visit_date -0.002183 0.000734 -2.974 0.003094 **
## season2 3.231050 0.347636 9.294 < 2e-16 ***
## season3 0.851229 0.349018 2.439 0.015109 *
## season4 -5.696396 0.349102 -16.317 < 2e-16 ***
## season5 -5.329111 0.349061 -15.267 < 2e-16 ***
## season6 -3.945275 0.349103 -11.301 < 2e-16 ***
## season7 -4.174325 0.348936 -11.963 < 2e-16 ***
## Month_2 0.213291 0.375136 0.569 0.569926
## Month_3 2.297744 0.369495 6.219 1.13e-09 ***
## Month_4 1.378286 0.386512 3.566 0.000401 ***
## Month_5 1.544538 0.451369 3.422 0.000678 ***
## Month_6 0.825379 0.454827 1.815 0.070221 .
## Month_7 0.411927 0.449065 0.917 0.359468
## Month_8 -0.433943 0.449743 -0.965 0.335118
## Month_9 -0.496864 0.456332 -1.089 0.276804
## Month_10 -0.151024 0.454003 -0.333 0.739551
## Month_11 -0.334229 0.462830 -0.722 0.470574
## Month_12 2.721453 0.462865 5.880 7.92e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.041 on 459 degrees of freedom
## Multiple R-squared: 0.7427, Adjusted R-squared: 0.7327
## F-statistic: 73.62 on 18 and 459 DF, p-value: < 2.2e-16
checkresiduals(lm3)
##
## Breusch-Godfrey test for serial correlation of order up to 22
##
## data: Residuals from Linear regression model
## LM test = 144.9, df = 22, p-value < 2.2e-16
Our adj.R squared has gone up compared to the last tslm() model. Trend showed as NA and had no effect on adj.R squared so i removed it
we need to find someway to extract date data and holiday flag data to move to our ts() for further analysis
summary(mergetest1)
## visit_date air_store_id visitors day_of_week
## Min. :2016-01-01 Length:252108 Min. : 1.00 Length:252108
## 1st Qu.:2016-07-23 Class :character 1st Qu.: 9.00 Class :character
## Median :2016-10-23 Mode :character Median : 17.00 Mode :character
## Mean :2016-10-12 Mean : 20.97
## 3rd Qu.:2017-01-24 3rd Qu.: 29.00
## Max. :2017-04-22 Max. :877.00
## holiday_flg Month Month_1 Month_2
## Min. :0.00000 Min. : 1.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median : 7.000 Median :0.0000 Median :0.0000
## Mean :0.05067 Mean : 6.208 Mean :0.1072 Mean :0.1087
## 3rd Qu.:0.00000 3rd Qu.:10.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :12.000 Max. :1.0000 Max. :1.0000
## Month_3 Month_4 Month_5 Month_6
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.1213 Mean :0.09464 Mean :0.03237 Mean :0.03271
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## Month_7 Month_8 Month_9 Month_10
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.08499 Mean :0.08201 Mean :0.08262 Mean :0.08515
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## Month_11 Month_12
## Min. :0.000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.00000
## Median :0.000 Median :0.00000
## Mean :0.083 Mean :0.08534
## 3rd Qu.:0.000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.00000
plot(mergetest1$holiday_flg~mergetest1$visit_date)
table(mergetest1$holiday_flg)
##
## 0 1
## 239333 12775
making sure datainfo will work by converting datainfo$calendar_date into time data
datainfo$calendar_date <-as.Date(datainfo$calendar_date, format=c('%Y-%m-%d'))
summary(datainfo)
## calendar_date day_of_week holiday_flg
## Min. :2016-01-01 Length:517 Min. :0.0000
## 1st Qu.:2016-05-09 Class :character 1st Qu.:0.0000
## Median :2016-09-15 Mode :character Median :0.0000
## Mean :2016-09-15 Mean :0.0677
## 3rd Qu.:2017-01-22 3rd Qu.:0.0000
## Max. :2017-05-31 Max. :1.0000
Now calendar date should be able to be merged
binding holiday flag to rest mergeagg
mergeaggtest1 <- merge(mergeagg, datainfo, by.x = "visit_date", by.y = "calendar_date")
summary(mergeaggtest1)
## visit_date visitors Month Month_1
## Min. :2016-01-01 Min. :13.13 Min. : 1.000 Min. :0.0000
## 1st Qu.:2016-04-29 1st Qu.:17.85 1st Qu.: 3.000 1st Qu.:0.0000
## Median :2016-08-26 Median :20.60 Median : 5.000 Median :0.0000
## Mean :2016-08-26 Mean :21.15 Mean : 5.548 Mean :0.1297
## 3rd Qu.:2016-12-23 3rd Qu.:24.22 3rd Qu.: 9.000 3rd Qu.:0.0000
## Max. :2017-04-22 Max. :31.27 Max. :12.000 Max. :1.0000
## Month_2 Month_3 Month_4 Month_5
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.1192 Mean :0.1297 Mean :0.1088 Mean :0.06485
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## Month_6 Month_7 Month_8 Month_9
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.06276 Mean :0.06485 Mean :0.06485 Mean :0.06276
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## Month_10 Month_11 Month_12 day_of_week
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Length:478
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 Class :character
## Median :0.00000 Median :0.00000 Median :0.00000 Mode :character
## Mean :0.06485 Mean :0.06276 Mean :0.06485
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000
## holiday_flg
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.06485
## 3rd Qu.:0.00000
## Max. :1.00000
Now in the code above, we have merged our previous aggregate data with the date info which has holiday flags in it
another time series using the newly formed df
tsmergetest6 <- ts(mergeaggtest1, frequency = 7)
lmtsmergetest6 <- tslm(visitors ~ season + holiday_flg + Month_2+
Month_3+Month_4+Month_5+Month_6+Month_7+
Month_8+Month_9+Month_10+Month_11 + Month_12, data = tsmergetest6)
summary(lmtsmergetest6)
##
## Call:
## tslm(formula = visitors ~ season + holiday_flg + Month_2 + Month_3 +
## Month_4 + Month_5 + Month_6 + Month_7 + Month_8 + Month_9 +
## Month_10 + Month_11 + Month_12, data = tsmergetest6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.6291 -0.8610 -0.0583 0.8643 8.5140
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.13809 0.30727 72.047 < 2e-16 ***
## season2 3.22599 0.30693 10.511 < 2e-16 ***
## season3 0.84602 0.30814 2.746 0.00628 **
## season4 -6.00182 0.30925 -19.408 < 2e-16 ***
## season5 -5.21151 0.30835 -16.901 < 2e-16 ***
## season6 -3.83748 0.30836 -12.445 < 2e-16 ***
## season7 -4.30692 0.30826 -13.972 < 2e-16 ***
## holiday_flg 4.07930 0.34331 11.882 < 2e-16 ***
## Month_2 0.54318 0.33237 1.634 0.10288
## Month_3 2.49247 0.32509 7.667 1.06e-13 ***
## Month_4 1.69289 0.34108 4.963 9.78e-07 ***
## Month_5 1.81441 0.39664 4.574 6.15e-06 ***
## Month_6 1.41900 0.40347 3.517 0.00048 ***
## Month_7 0.80593 0.39785 2.026 0.04337 *
## Month_8 -0.50012 0.39660 -1.261 0.20794
## Month_9 -0.37242 0.40156 -0.927 0.35419
## Month_10 0.05202 0.39791 0.131 0.89605
## Month_11 -0.35486 0.40151 -0.884 0.37725
## Month_12 2.39083 0.39660 6.028 3.41e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.802 on 459 degrees of freedom
## Multiple R-squared: 0.7995, Adjusted R-squared: 0.7916
## F-statistic: 101.7 on 18 and 459 DF, p-value: < 2.2e-16
checkresiduals(lmtsmergetest6)
##
## Breusch-Godfrey test for serial correlation of order up to 22
##
## data: Residuals from Linear regression model
## LM test = 117.97, df = 22, p-value = 4.073e-15
Above, we use the new data frame which includes months and holiday flags to create a tslm() Once again, trend showed NA and did not change the adj.R squared value when removing it so it is not in the final model
NOTE TO SELF: TSMERGETEST6 IS MOST UP TO DATE TSLM(). NEED BETTER VARIABLE NAMES IN THE FUTURE
decomposition on the above time series? idk if this is going to work
amerge6 <- decompose(tsmergetest6, type = "additive")
autoplot(amerge6)
I am not quite sure how to read this as it is a multivariate time series or maybe it is even just completely wrong
another tslm(), same data set, different variables (no months)
lmtsmergetest7 <- tslm(visitors ~ season + holiday_flg, data = tsmergetest6)
summary(lmtsmergetest7)
##
## Call:
## tslm(formula = visitors ~ season + holiday_flg, data = tsmergetest6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9807 -1.1904 -0.1493 1.0324 8.7229
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.0792 0.2505 92.133 <2e-16 ***
## season2 3.2206 0.3528 9.128 <2e-16 ***
## season3 0.7961 0.3541 2.248 0.025 *
## season4 -6.0488 0.3553 -17.025 <2e-16 ***
## season5 -5.2557 0.3543 -14.834 <2e-16 ***
## season6 -3.8508 0.3543 -10.869 <2e-16 ***
## season7 -4.2666 0.3543 -12.042 <2e-16 ***
## holiday_flg 3.8860 0.3884 10.004 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.072 on 470 degrees of freedom
## Multiple R-squared: 0.7285, Adjusted R-squared: 0.7244
## F-statistic: 180.1 on 7 and 470 DF, p-value: < 2.2e-16
checkresiduals(lmtsmergetest7)
##
## Breusch-Godfrey test for serial correlation of order up to 14
##
## data: Residuals from Linear regression model
## LM test = 202.62, df = 14, p-value < 2.2e-16
The adjusted R-squared dropped a bit when removing months. Down from 0.79 to 0.72
lets add the last few holiday flags and predict forecast
fc_holiday_flg <- datainfo$holiday_flg[479:517]
hfforecast <- forecast(lmtsmergetest7, newdata = fc_holiday_flg)
## Warning in forecast.lm(lmtsmergetest7, newdata = fc_holiday_flg): newdata column
## names not specified, defaulting to first variable required.
autoplot(hfforecast) + xlim(60, 80)
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 413 row(s) containing missing values (geom_path).
summary(hfforecast)
##
## Forecast method: Linear regression model
##
## Model Information:
##
## Call:
## tslm(formula = visitors ~ season + holiday_flg, data = tsmergetest6)
##
## Coefficients:
## (Intercept) season2 season3 season4 season5 season6
## 23.0792 3.2206 0.7961 -6.0488 -5.2557 -3.8508
## season7 holiday_flg
## -4.2666 3.8860
##
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1.751594e-16 2.05496 1.493419 -0.9357917 7.072272 0.8237356
## ACF1
## Training set 0.5479932
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 69.28571 23.87529 21.19605 26.55453 19.77294 27.97763
## 69.42857 17.03044 14.35055 19.71033 12.92709 21.13378
## 69.57143 17.82353 15.14441 20.50265 13.72137 21.92570
## 69.71429 19.22840 16.54928 21.90752 15.12624 23.33056
## 69.85714 18.81259 16.13315 21.49203 14.70993 22.91524
## 70.00000 23.07921 20.40026 25.75816 18.97730 27.18112
## 70.14286 30.18579 27.46614 32.90544 26.02157 34.35001
## 70.28571 23.87529 21.19605 26.55453 19.77294 27.97763
## 70.42857 17.03044 14.35055 19.71033 12.92709 21.13378
## 70.57143 17.82353 15.14441 20.50265 13.72137 21.92570
## 70.71429 23.11443 20.39201 25.83685 18.94596 27.28290
## 70.85714 22.69862 19.98126 25.41598 18.53790 26.85934
## 71.00000 26.96524 24.24559 29.68489 22.80102 31.12946
## 71.14286 26.29976 23.62081 28.97872 22.19785 30.40167
## 71.28571 23.87529 21.19605 26.55453 19.77294 27.97763
## 71.42857 17.03044 14.35055 19.71033 12.92709 21.13378
## 71.57143 17.82353 15.14441 20.50265 13.72137 21.92570
## 71.71429 19.22840 16.54928 21.90752 15.12624 23.33056
## 71.85714 18.81259 16.13315 21.49203 14.70993 22.91524
## 72.00000 23.07921 20.40026 25.75816 18.97730 27.18112
## 72.14286 26.29976 23.62081 28.97872 22.19785 30.40167
## 72.28571 23.87529 21.19605 26.55453 19.77294 27.97763
## 72.42857 17.03044 14.35055 19.71033 12.92709 21.13378
## 72.57143 17.82353 15.14441 20.50265 13.72137 21.92570
## 72.71429 19.22840 16.54928 21.90752 15.12624 23.33056
## 72.85714 18.81259 16.13315 21.49203 14.70993 22.91524
## 73.00000 23.07921 20.40026 25.75816 18.97730 27.18112
## 73.14286 26.29976 23.62081 28.97872 22.19785 30.40167
## 73.28571 23.87529 21.19605 26.55453 19.77294 27.97763
## 73.42857 17.03044 14.35055 19.71033 12.92709 21.13378
## 73.57143 17.82353 15.14441 20.50265 13.72137 21.92570
## 73.71429 19.22840 16.54928 21.90752 15.12624 23.33056
## 73.85714 18.81259 16.13315 21.49203 14.70993 22.91524
## 74.00000 23.07921 20.40026 25.75816 18.97730 27.18112
## 74.14286 26.29976 23.62081 28.97872 22.19785 30.40167
## 74.28571 23.87529 21.19605 26.55453 19.77294 27.97763
## 74.42857 17.03044 14.35055 19.71033 12.92709 21.13378
## 74.57143 17.82353 15.14441 20.50265 13.72137 21.92570
## 74.71429 19.22840 16.54928 21.90752 15.12624 23.33056
direct comparison with our earlier models
autoplot(mergeforecast) + xlim(60, 80)
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 413 row(s) containing missing values (geom_path).
summary(mergeforecast)
##
## Forecast method: Linear regression model
##
## Model Information:
##
## Call:
## tslm(formula = tsmergetest3 ~ trend + season)
##
## Coefficients:
## (Intercept) trend season2 season3 season4 season5
## 23.697041 -0.001642 3.222196 0.796926 -5.760541 -5.365838
## season6 season7
## -3.959327 -4.144906
##
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1.677256e-16 2.251823 1.616953 -1.126531 7.670276 0.8918741
## ACF1
## Training set 0.5585678
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 69.28571 23.70722 20.76223 26.65220 19.19796 28.21647
## 69.42857 17.14810 14.20312 20.09309 12.63885 21.65736
## 69.57143 17.54117 14.59618 20.48615 13.03191 22.05042
## 69.71429 18.94603 16.00105 21.89102 14.43678 23.45529
## 69.85714 18.75881 15.81382 21.70380 14.24956 23.26806
## 70.00000 22.90208 19.95713 25.84703 18.39288 27.41127
## 70.14286 26.12263 23.17768 29.06758 21.61344 30.63182
## 70.28571 23.69572 20.75019 26.64125 19.18563 28.20580
## 70.42857 17.13661 14.19108 20.08214 12.62652 21.64669
## 70.57143 17.52967 14.58414 20.47520 13.01958 22.03975
## 70.71429 18.93454 15.98901 21.88007 14.42445 23.44462
## 70.85714 18.74731 15.80178 21.69285 14.23723 23.25740
## 71.00000 22.89058 19.94508 25.83608 18.38054 27.40062
## 71.14286 26.11113 23.16563 29.05663 21.60109 30.62117
## 71.28571 23.68422 20.73813 26.63031 19.17328 28.19516
## 71.42857 17.12511 14.17902 20.07120 12.61417 21.63605
## 71.57143 17.51817 14.57208 20.46426 13.00723 22.02911
## 71.71429 18.92304 15.97695 21.86913 14.41210 23.43398
## 71.85714 18.73582 15.78973 21.68191 14.22488 23.24676
## 72.00000 22.87908 19.93301 25.82515 18.36818 27.38999
## 72.14286 26.09964 23.15357 29.04570 21.58873 30.61054
## 72.28571 23.67272 20.72606 26.61939 19.16090 28.18454
## 72.42857 17.11361 14.16695 20.06028 12.60179 21.62543
## 72.57143 17.50667 14.56001 20.45334 12.99485 22.01849
## 72.71429 18.91154 15.96488 21.85821 14.39972 23.42336
## 72.85714 18.72432 15.77766 21.67098 14.21250 23.23614
## 73.00000 22.86758 19.92093 25.81423 18.35579 27.37938
## 73.14286 26.08814 23.14149 29.03479 21.57634 30.59993
## 73.28571 23.66123 20.71397 26.60848 19.14850 28.17395
## 73.42857 17.10211 14.15486 20.04937 12.58939 21.61484
## 73.57143 17.49518 14.54792 20.44243 12.98245 22.00790
## 73.71429 18.90004 15.95279 21.84730 14.38732 23.41277
## 73.85714 18.71282 15.76557 21.66008 14.20010 23.22554
## 74.00000 22.85609 19.90884 25.80333 18.34337 27.36880
## 74.14286 26.07664 23.12939 29.02389 21.56393 30.58935
We notice that most of the points are similar but the variables flagged as holidays have slightly higher predictions as expected
Lets begin working on the sample submission part now. Here, we are dividing up the id into both air_store_id and visit_date.
samplesubmission$air_store_id <- substr(samplesubmission$id, 1,20)
samplesubmission$visit_date <- substr(samplesubmission$id, 22,31)
samplesubmission$visit_date <- as.Date(samplesubmission$visit_date, format=c('%Y-%m-%d'))
head(samplesubmission)
## id visitors air_store_id visit_date
## 1 air_00a91d42b08b08d9_2017-04-23 0 air_00a91d42b08b08d9 2017-04-23
## 2 air_00a91d42b08b08d9_2017-04-24 0 air_00a91d42b08b08d9 2017-04-24
## 3 air_00a91d42b08b08d9_2017-04-25 0 air_00a91d42b08b08d9 2017-04-25
## 4 air_00a91d42b08b08d9_2017-04-26 0 air_00a91d42b08b08d9 2017-04-26
## 5 air_00a91d42b08b08d9_2017-04-27 0 air_00a91d42b08b08d9 2017-04-27
## 6 air_00a91d42b08b08d9_2017-04-28 0 air_00a91d42b08b08d9 2017-04-28
breaking up id into store and date was successful. Now we need to start implementing the as.date, month and holiday flags.
summary(datainfo)
## calendar_date day_of_week holiday_flg
## Min. :2016-01-01 Length:517 Min. :0.0000
## 1st Qu.:2016-05-09 Class :character 1st Qu.:0.0000
## Median :2016-09-15 Mode :character Median :0.0000
## Mean :2016-09-15 Mean :0.0677
## 3rd Qu.:2017-01-22 3rd Qu.:0.0000
## Max. :2017-05-31 Max. :1.0000
We can see that datainfo$calendar_date is in time format (the same as samplesubmission) so there shouldnt be issues merging
samplemerge <- merge(samplesubmission, datainfo, by.x = "visit_date", by.y = "calendar_date")
head(samplemerge)
## visit_date id visitors air_store_id
## 1 2017-04-23 air_00a91d42b08b08d9_2017-04-23 0 air_00a91d42b08b08d9
## 2 2017-04-23 air_ec0fad2def4dcff0_2017-04-23 0 air_ec0fad2def4dcff0
## 3 2017-04-23 air_f2c5a1f24279c531_2017-04-23 0 air_f2c5a1f24279c531
## 4 2017-04-23 air_e270aff84ac7e4c8_2017-04-23 0 air_e270aff84ac7e4c8
## 5 2017-04-23 air_dad0b6a36138f309_2017-04-23 0 air_dad0b6a36138f309
## 6 2017-04-23 air_cfdeb326418194ff_2017-04-23 0 air_cfdeb326418194ff
## day_of_week holiday_flg
## 1 Sunday 0
## 2 Sunday 0
## 3 Sunday 0
## 4 Sunday 0
## 5 Sunday 0
## 6 Sunday 0
summary(samplemerge)
## visit_date id visitors air_store_id
## Min. :2017-04-23 Length:32019 Min. :0 Length:32019
## 1st Qu.:2017-05-02 Class :character 1st Qu.:0 Class :character
## Median :2017-05-12 Mode :character Median :0 Mode :character
## Mean :2017-05-12 Mean :0
## 3rd Qu.:2017-05-22 3rd Qu.:0
## Max. :2017-05-31 Max. :0
## day_of_week holiday_flg
## Length:32019 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :0.0000
## Mean :0.1026
## 3rd Qu.:0.0000
## Max. :1.0000
So now we have a new data frame (?) called “samplemerge” and it contains visit_date, id (needed for submission), visitors (needed for submission), air store id, day of the week and holiday flag.
Data is classified by date first and foremost.This means that observation 1 and 2 are no longer consecutive days of the same restaurant but rather different restaurants on the same day.
Now there shouldnt be any issue predicting the tslm() model which only uses season and holiday flag as predictors (lmtsmergetest7).
However, if we are to use the more complex model “lmtsmergetest6” we will need to add dummy variables for Months 2 through 12.
Data frame works. Lets add month factors. 6568 break for april to august transition
samplemerge$Month <- as.factor(month(samplemerge$visit_date))
samplemerge$Month <- as.numeric(as.factor(samplemerge$Month))
samplemerge <- dummy_cols(samplemerge, select_columns = "Month")
samplemerge$Month_3 <- rep(0,32019)
samplemerge$Month_4 <- rep(0,32019)
samplemerge$Month_5 <- rep(0,32019)
samplemerge$Month_6 <- rep(0,32019)
samplemerge$Month_7 <- rep(0,32019)
samplemerge$Month_8 <- rep(0,32019)
samplemerge$Month_9 <- rep(0,32019)
samplemerge$Month_10 <- rep(0,32019)
samplemerge$Month_11 <- rep(0,32019)
samplemerge$Month_12 <- rep(0,32019)
head(samplemerge)
## visit_date id visitors air_store_id
## 1 2017-04-23 air_00a91d42b08b08d9_2017-04-23 0 air_00a91d42b08b08d9
## 2 2017-04-23 air_ec0fad2def4dcff0_2017-04-23 0 air_ec0fad2def4dcff0
## 3 2017-04-23 air_f2c5a1f24279c531_2017-04-23 0 air_f2c5a1f24279c531
## 4 2017-04-23 air_e270aff84ac7e4c8_2017-04-23 0 air_e270aff84ac7e4c8
## 5 2017-04-23 air_dad0b6a36138f309_2017-04-23 0 air_dad0b6a36138f309
## 6 2017-04-23 air_cfdeb326418194ff_2017-04-23 0 air_cfdeb326418194ff
## day_of_week holiday_flg Month Month_1 Month_2 Month_3 Month_4 Month_5 Month_6
## 1 Sunday 0 1 1 0 0 0 0 0
## 2 Sunday 0 1 1 0 0 0 0 0
## 3 Sunday 0 1 1 0 0 0 0 0
## 4 Sunday 0 1 1 0 0 0 0 0
## 5 Sunday 0 1 1 0 0 0 0 0
## 6 Sunday 0 1 1 0 0 0 0 0
## Month_7 Month_8 Month_9 Month_10 Month_11 Month_12
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
above, i have added all 12 dummy variable months. Now we need to change month 1 and 2 to 4 and 5 and then make month 1 and 2 0 all the way through
samplemerge$Month_4 <- samplemerge$Month_1
samplemerge$Month_5 <- samplemerge$Month_2
samplemerge$Month_1 <- rep(0,32019)
samplemerge$Month_2 <- rep(0,32019)
perfect! now we have visit date, id, visitors (to be imputed), air_store_id, day of week, holiday flag and month dummy variables
summary(samplemerge)
## visit_date id visitors air_store_id
## Min. :2017-04-23 Length:32019 Min. :0 Length:32019
## 1st Qu.:2017-05-02 Class :character 1st Qu.:0 Class :character
## Median :2017-05-12 Mode :character Median :0 Mode :character
## Mean :2017-05-12 Mean :0
## 3rd Qu.:2017-05-22 3rd Qu.:0
## Max. :2017-05-31 Max. :0
## day_of_week holiday_flg Month Month_1 Month_2
## Length:32019 Min. :0.0000 Min. :1.000 Min. :0 Min. :0
## Class :character 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:0 1st Qu.:0
## Mode :character Median :0.0000 Median :2.000 Median :0 Median :0
## Mean :0.1026 Mean :1.795 Mean :0 Mean :0
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:0 3rd Qu.:0
## Max. :1.0000 Max. :2.000 Max. :0 Max. :0
## Month_3 Month_4 Month_5 Month_6 Month_7
## Min. :0 Min. :0.0000 Min. :0.0000 Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0 1st Qu.:0
## Median :0 Median :0.0000 Median :1.0000 Median :0 Median :0
## Mean :0 Mean :0.2051 Mean :0.7949 Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :1.0000 Max. :1.0000 Max. :0 Max. :0
## Month_8 Month_9 Month_10 Month_11 Month_12
## Min. :0 Min. :0 Min. :0 Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0 1st Qu.:0 1st Qu.:0 1st Qu.:0
## Median :0 Median :0 Median :0 Median :0 Median :0
## Mean :0 Mean :0 Mean :0 Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0 Max. :0 Max. :0 Max. :0
samplefc <- forecast(lmtsmergetest6, newdata = samplemerge)
write.csv(samplefc, "test4.csv", row.names = FALSE)
using the forecast with months, holiday flags and no trend i recieved a score of 0.91. This actually is infinitely better than some of my other attempts so ill take it. I however that the results repeated over and over. I guess this makes sense when you think about it; without any downwards or upwards trend the cycle would just repeat itself over and over.
However, what if we tried adding trend?
I can see already just by examining the spread sheet that over enough rows, the trend will eventually drop our point forecast into the negatives.
samplefc1 <- forecast(lmtsmergetest7, newdata = samplemerge)
write.csv(samplefc, "test1.csv", row.names = FALSE)
Similarly, I received the exact same score of 0.91131 on my linear model which forecasted without monthly dummy variables.
I figured since I reached this point that I could attempt to add dummy variables to indicate if it is the weekend or not (Friday, Saturday and Sunday). I believe that maybe this could add some explanatory power and improve my model. I will refer back to mergeaggtest1 to do so.
mergeaggtest1$IsWeekend <- rep(0,478)
for (i in 1:length(mergeaggtest1$IsWeekend)){
if (mergeaggtest1$day_of_week[i] == "Friday"){
mergeaggtest1$IsWeekend[i] <- 1
} else if (mergeaggtest1$day_of_week[i] == "Saturday"){
mergeaggtest1$IsWeekend[i] <- 1
} else if (mergeaggtest1$day_of_week[i] == "Sunday"){
mergeaggtest1$IsWeekend[i] <- 1
} else {
mergeaggtest1$IsWeekend[i] <- 0
}
}
summary(mergeaggtest1)
## visit_date visitors Month Month_1
## Min. :2016-01-01 Min. :13.13 Min. : 1.000 Min. :0.0000
## 1st Qu.:2016-04-29 1st Qu.:17.85 1st Qu.: 3.000 1st Qu.:0.0000
## Median :2016-08-26 Median :20.60 Median : 5.000 Median :0.0000
## Mean :2016-08-26 Mean :21.15 Mean : 5.548 Mean :0.1297
## 3rd Qu.:2016-12-23 3rd Qu.:24.22 3rd Qu.: 9.000 3rd Qu.:0.0000
## Max. :2017-04-22 Max. :31.27 Max. :12.000 Max. :1.0000
## Month_2 Month_3 Month_4 Month_5
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.1192 Mean :0.1297 Mean :0.1088 Mean :0.06485
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## Month_6 Month_7 Month_8 Month_9
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.06276 Mean :0.06485 Mean :0.06485 Mean :0.06276
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## Month_10 Month_11 Month_12 day_of_week
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Length:478
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 Class :character
## Median :0.00000 Median :0.00000 Median :0.00000 Mode :character
## Mean :0.06485 Mean :0.06276 Mean :0.06485
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000
## holiday_flg IsWeekend
## Min. :0.00000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:0.000
## Median :0.00000 Median :0.000
## Mean :0.06485 Mean :0.431
## 3rd Qu.:0.00000 3rd Qu.:1.000
## Max. :1.00000 Max. :1.000
Neat, now we got weekend dummy to work. Lets see how it affects our model
tsmergetest8 <- ts(mergeaggtest1, frequency = 7)
lmtsmergetest8 <- tslm(visitors ~ season + holiday_flg + IsWeekend + Month_2+
Month_3+Month_4+Month_5+Month_6+Month_7+
Month_8+Month_9+Month_10+Month_11 + Month_12, data = tsmergetest8)
summary(lmtsmergetest8)
##
## Call:
## tslm(formula = visitors ~ season + holiday_flg + IsWeekend +
## Month_2 + Month_3 + Month_4 + Month_5 + Month_6 + Month_7 +
## Month_8 + Month_9 + Month_10 + Month_11 + Month_12, data = tsmergetest8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.6291 -0.8610 -0.0583 0.8643 8.5140
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.13809 0.30727 72.047 < 2e-16 ***
## season2 3.22599 0.30693 10.511 < 2e-16 ***
## season3 0.84602 0.30814 2.746 0.00628 **
## season4 -6.00182 0.30925 -19.408 < 2e-16 ***
## season5 -5.21151 0.30835 -16.901 < 2e-16 ***
## season6 -3.83748 0.30836 -12.445 < 2e-16 ***
## season7 -4.30692 0.30826 -13.972 < 2e-16 ***
## holiday_flg 4.07930 0.34331 11.882 < 2e-16 ***
## IsWeekend NA NA NA NA
## Month_2 0.54318 0.33237 1.634 0.10288
## Month_3 2.49247 0.32509 7.667 1.06e-13 ***
## Month_4 1.69289 0.34108 4.963 9.78e-07 ***
## Month_5 1.81441 0.39664 4.574 6.15e-06 ***
## Month_6 1.41900 0.40347 3.517 0.00048 ***
## Month_7 0.80593 0.39785 2.026 0.04337 *
## Month_8 -0.50012 0.39660 -1.261 0.20794
## Month_9 -0.37242 0.40156 -0.927 0.35419
## Month_10 0.05202 0.39791 0.131 0.89605
## Month_11 -0.35486 0.40151 -0.884 0.37725
## Month_12 2.39083 0.39660 6.028 3.41e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.802 on 459 degrees of freedom
## Multiple R-squared: 0.7995, Adjusted R-squared: 0.7916
## F-statistic: 101.7 on 18 and 459 DF, p-value: < 2.2e-16
checkresiduals(lmtsmergetest8)
##
## Breusch-Godfrey test for serial correlation of order up to 23
##
## data: Residuals from Linear regression model
## LM test = 118, df = 23, p-value = 9.502e-15
Couldnt seem to get this to work.Maybe in the future I will get this
samplefc1 <- forecast(lm1, newdata = samplemerge)
write.csv(samplefc, "test2.csv", row.names = FALSE)
Looking over this linear model, my point forecast also drops into the negatives when including trend. This means that it is considering all 32019 observations to be predicted as one restaurant instead of each individual restaurant having its own separate trend.
Lets try the naive drift model here
samplefc2 <- forecast(stlfdrift, newdata = samplemerge)
write.csv(samplefc, "naivedrift.csv", row.names = FALSE)
Still recieving negative point values later on in the forecast
Lets try naive with no drift stlfnaive
samplefc3 <- forecast(stlfnaive, newdata = samplemerge)
write.csv(samplefc, "naive.csv", row.names = FALSE)
baseline example by professor. score: 0.63490
samplesubmission$air_store_id=substr(samplesubmission$id, 1,20)
myagg=aggregate(visitors~air_store_id, FUN=mean, data=airvisitdata)
mymerge=merge(samplesubmission, myagg, by="air_store_id", all.x=TRUE)
mymerge$visitors.x=NULL
mymerge$visitors=mymerge$visitors.y
mymerge$visitors.y=NULL
mymerge$air_store_id=NULL
write.csv(mymerge, "sillysubmit.csv", row.names=FALSE)
geometric mean. score: 0.59820
samplesubmission$air_store_id=substr(samplesubmission$id, 1,20)
myagg=aggregate(visitors~air_store_id, FUN=geometric.mean, data=airvisitdata)
mymerge=merge(samplesubmission, myagg, by="air_store_id", all.x=TRUE)
mymerge$visitors.x=NULL
mymerge$visitors=mymerge$visitors.y
mymerge$visit_date=NULL
mymerge$visitors.y=NULL
mymerge$air_store_id = NULL
write.csv(mymerge, "sillysubmit2.csv", row.names=FALSE)