Here is the code to recipricate my results used in the paper.
#--Processing/EDA--#
data<-read.csv('train.csv')
str(data)
## 'data.frame': 913000 obs. of 4 variables:
## $ date : Factor w/ 1826 levels "2013-01-01","2013-01-02",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ store: int 1 1 1 1 1 1 1 1 1 1 ...
## $ item : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sales: int 13 11 14 13 10 12 10 9 12 9 ...
data$date<- as.Date(data$date)
str(data)
## 'data.frame': 913000 obs. of 4 variables:
## $ date : Date, format: "2013-01-01" "2013-01-02" ...
## $ store: int 1 1 1 1 1 1 1 1 1 1 ...
## $ item : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sales: int 13 11 14 13 10 12 10 9 12 9 ...
head(data)
## date store item sales
## 1 2013-01-01 1 1 13
## 2 2013-01-02 1 1 11
## 3 2013-01-03 1 1 14
## 4 2013-01-04 1 1 13
## 5 2013-01-05 1 1 10
## 6 2013-01-06 1 1 12
store1<- data%>%filter(store== 3 & item== 5)
store1$months<-months(store1$date)
store1$months=factor(store1$months, levels= month.name, ordered = TRUE)
store1$Years<- format(store1$date, format="%y")
store1$Years<- as.integer(store1$Years)
store1$Years=store1$Years+2000
head(store1)
## date store item sales months Years
## 1 2013-01-01 3 5 13 January 2013
## 2 2013-01-02 3 5 9 January 2013
## 3 2013-01-03 3 5 10 January 2013
## 4 2013-01-04 3 5 20 January 2013
## 5 2013-01-05 3 5 13 January 2013
## 6 2013-01-06 3 5 12 January 2013
d1<-aggregate(sales~ months + Years, store1, sum)
d1$sales<- round(d1$sales,0)
d1$date <- as.yearmon(paste(d1$Years, d1$months), "%Y %B" )
ggplot(d1, aes(x= date, y= sales))+
geom_line ()+ ggtitle("sales for item 5 in store 3 over five years")+
theme_minimal()+ scale_x_continuous()
#--Test-Data-#
d2<-d1[49:60,]
d2
## months Years sales date
## 49 January 2017 515 Jan 2017
## 50 February 2017 490 Feb 2017
## 51 March 2017 652 Mar 2017
## 52 April 2017 731 Apr 2017
## 53 May 2017 821 May 2017
## 54 June 2017 866 Jun 2017
## 55 July 2017 937 Jul 2017
## 56 August 2017 807 Aug 2017
## 57 September 2017 773 Sep 2017
## 58 October 2017 730 Oct 2017
## 59 November 2017 727 Nov 2017
## 60 December 2017 616 Dec 2017
#--Naive method--#
mape <- function(y, yhat){
mean(abs((y - yhat)/y))}
naive<- d1[37:48,]
naivemape<-mape(naive$sales, d2$sales)*100
naivemape<-round(naivemape,2)
cat(naivemape, "%")
## 6.46 %
mad(naive$sales, d2$sales)
## [1] 64.4931
#-- linear regression---#
d1$Jan<-d1$Feb<-d1$Mar<-d1$Apr<-d1$May<-d1$Jun<-d1$Jul<-d1$Aug<-d1$Sep<-d1$Oct<-d1$Nov<- c(rep(0,length(d1$sales)))
for( i in 1:60){
if (d1$months[i]== "January") d1$Jan[i]<- 1
if (d1$months[i]== "February") d1$Feb[i]<- 1
if (d1$months[i]== "March") d1$Mar[i]<- 1
if (d1$months[i]== "April") d1$Apr[i]<- 1
if (d1$months[i]== "May") d1$May[i]<- 1
if (d1$months[i]== "June") d1$Jun[i]<- 1
if (d1$months[i]== "July") d1$Jul[i]<- 1
if (d1$months[i]== "August") d1$Aug[i]<- 1
if (d1$months[i]== "September") d1$Sep[i]<- 1
if (d1$months[i]== "October") d1$Oct[i]<- 1
if (d1$months[i]== "November") d1$Nov[i]<- 1
}
d1$tpi<- 1:60
am<- d1[1:48,]
fit<-lm(sales~ tpi+Jan+Feb+Mar+Apr+May+Jun+Jul+Aug+Sep+Oct+Nov, data= am)
summary(fit)
##
## Call:
## lm(formula = sales ~ tpi + Jan + Feb + Mar + Apr + May + Jun +
## Jul + Aug + Sep + Oct + Nov, data = am)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.11 -11.62 3.25 15.98 67.05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 335.5208 17.0730 19.652 < 2e-16 ***
## tpi 4.2993 0.3086 13.930 7.64e-16 ***
## Jan 11.0424 20.5679 0.537 0.595
## Feb 3.2431 20.5192 0.158 0.875
## Mar 140.9438 20.4750 6.884 5.40e-08 ***
## Apr 203.8944 20.4354 9.977 8.99e-12 ***
## May 270.3451 20.4004 13.252 3.34e-15 ***
## Jun 283.0458 20.3701 13.895 8.24e-16 ***
## Jul 367.4965 20.3443 18.064 < 2e-16 ***
## Aug 270.1972 20.3233 13.295 3.04e-15 ***
## Sep 179.3979 20.3068 8.834 1.96e-10 ***
## Oct 139.0986 20.2951 6.854 5.90e-08 ***
## Nov 196.2993 20.2881 9.676 1.99e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.69 on 35 degrees of freedom
## Multiple R-squared: 0.9664, Adjusted R-squared: 0.9548
## F-statistic: 83.77 on 12 and 35 DF, p-value: < 2.2e-16
d2<-d1[49:60,]
d3<-predict(fit, d2)
linmape<-mape(d3, d2$sales)*100
linmape<-round(linmape,2)
cat(linmape,"%")
## 4.3 %
mad(d3, d2$sale)
## [1] 40.40085