library(readr)
library(dplyr)
library(tidyr)
library(plotly)
library(TTR)
library(forecast)
library(caret)
library(plyr)
detach("package:plyr", unload = TRUE)

First of all we read all the data available on Kaggle:

# train <- read_csv("train.csv")
# saveRDS(train, file = "train.rds")
# producto_tabla <- read_csv("producto_tabla.csv")

First exporatory analysis and subsetting Takis into a smaller csv:

# dim(train)
# 
# head(train)
# 
# por_producto <- train %>%
#   select(Semana, Producto_ID, Venta_uni_hoy, Dev_uni_proxima, Demanda_uni_equil)
# 
# producto_tabla %>%
#    separate(NombreProducto,
#            c("Name", "Grams","Brand", "hola", "a", "b", "c", "d"))
# 
# takis <- producto_tabla[which(grepl("Takis Fuego", producto_tabla$NombreProducto)),]
# 
# takis$Producto_ID
# 
# takis1<- por_producto[which(por_producto$Producto_ID == "30110"),]
# takis2<- por_producto[which(por_producto$Producto_ID == "42453"),]
# takis3<- por_producto[which(por_producto$Producto_ID == "42672"),]
# 
# write.csv(takis2, file = "takis62g.csv", row.names = F)

Now we can work with only Takis 62g with a smaller and faster file, and we’ll try to do a time series analysis to spot any trends or seasonalities:

takis <- read_csv("takis62g.csv")
takis_per_week <- takis %>%
  arrange(Semana) %>%
  group_by(Semana) %>%
  summarise(
    ventas = sum(Venta_uni_hoy),
    devoluciones = sum(Dev_uni_proxima),
    demanda = sum(Demanda_uni_equil)) %>%
  select(Semana, ventas, devoluciones, demanda)

plot_ly(takis_per_week, x = ~takis_per_week$Semana, y = ~takis_per_week$demanda, name = 'Demanda', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~takis_per_week$ventas, name = 'ventas', mode = 'lines') %>%
  add_trace(y = ~takis_per_week$devoluciones, name = 'devoluciones', mode = 'lines') %>%
  layout(title = "Takis Demand Jan-Mar",
  xaxis = list(title = "Week"),
  yaxis = list (title = "Demand"))

For the time series analysis I chose to set the frequency as 2-weekly. A weekly frequency (4) would have been perfect to spot seasonalities along the month (as in 1st week after getting paid through the end of the month), however we don’t have enough data to treat it like this.

takis_ts <- ts(takis_per_week$demanda, frequency = 2)
takis_ts
## Time Series:
## Start = c(1, 1) 
## End = c(4, 1) 
## Frequency = 2 
## [1] 63188 64419 64627 57759 61610 58605 59190
takis_components <- decompose(takis_ts)
plot(takis_components)

Although we have very few data we can spot:

Time series modeling

Time Series Linear Model

As we spot trend and seasonality, we’ll try to predict demand for the last 3 weeks with a time series linear model:

train <- takis_per_week[1:4, 4]
test <- takis_per_week[5:7, 4]

train_ts <- ts(train, frequency = 2)
train_components <- decompose(train_ts)
test_ts <- ts(test, frequency = 2)

modelts <- tslm(train_ts ~ trend + season)
summary(modelts)
## 
## Call:
## tslm(formula = train_ts ~ trend + season)
## 
## Residuals:
## Time Series:
## Start = c(1, 1) 
## End = c(2, 2) 
## Frequency = 2 
##     1     2     3     4 
## -2025  2025  2025 -2025 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    66518       4960  13.412   0.0474 *
## trend          -1305       2025  -0.645   0.6355  
## season2        -1513       4528  -0.334   0.7946  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4050 on 1 degrees of freedom
## Multiple R-squared:  0.4737, Adjusted R-squared:  -0.5789 
## F-statistic:  0.45 on 2 and 1 DF,  p-value: 0.7255
predicteddemand <- forecast(train_ts, h = 3)
autoplot(predicteddemand)

error_analysis <- cbind(test, predicteddemand)
error_analysis$error <- error_analysis$`Point Forecast` - error_analysis$demanda
error_analysis$errorperc <- (error_analysis$error / error_analysis$demanda)*100
error_analysis
##      demanda Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95 error errorperc
## 3.00   61610          63188 59503.67 66872.33 57553.31 68822.69  1578  2.561273
## 3.50   58605          63188 59503.67 66872.33 57553.31 68822.69  4583  7.820152
## 4.00   59190          63188 59503.67 66872.33 57553.31 68822.69  3998  6.754519

However, unlike we though the forecast is flat and well above actual demand. Therefore we’ll have to use a different method in a more “manual” way.

Linear model

As we spotted a decreasing trend, first we’ll try to predict the last 3 weeks demand with only this decreasing trend by week:

train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]

fit <- train(demanda ~ Semana, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
##         RMSE     Rsquared          MAE 
## 3068.0135213    0.5768469 2931.0666667
# coef(lm(demanda ~ Semana, data = train))

# test
# predicteddemand
lmsemana_error <- cbind(test, predicteddemand)
lmsemana_error$error <- lmsemana_error$demanda - lmsemana_error$predicteddemand
lmsemana_error$demanda <- lmsemana_error$predicteddemand
lmsemana_error$error_perc <- (lmsemana_error$error / lmsemana_error$demanda)*100

lmsemana_error
##   Semana ventas devoluciones demanda predicteddemand  error error_perc
## 1      7  63706         2290 58478.5         58478.5 3131.5   5.354960
## 2      8  60967         2458 56870.6         56870.6 1734.4   3.049730
## 3      9  61602         2625 55262.7         55262.7 3927.3   7.106602
semanaplot <- rbind(takis_per_week[,1:4], lmsemana_error[,1:4])
semanaplot[1:7, "actual"] <- "actual"
semanaplot[8:10, "actual"] <- "prediction"

ggplot(semanaplot, aes(x = Semana, y = demanda, colour = actual)) + geom_abline(intercept = 69733.8, slope = -1607.9, size = 0.01) + geom_point()

In this first try, although the trend is captured, there are 2 things missing:

  • the 2-week seasonality isn’t represented

  • our predictions are well below the actual demand, most probably due to the effect of week 6

To replicate this seasonality we started by inserting a lag variable. We started with lag 1, as it seems that the previous week’s demand has an effect on this week’s demand

takis_per_week$lag1 <- lag(takis_per_week$demanda, n = 1L)

train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]

fit <- train(demanda ~ Semana + lag1, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
##         RMSE     Rsquared          MAE 
## 5.672551e+04 1.073052e-01 5.533187e+04
# coef(lm(demanda ~ Semana + payday, data = train))

# test
# predicteddemand
lmlag1_error <- cbind(test, predicteddemand)
lmlag1_error$error <- lmlag1_error$demanda - lmlag1_error$predicteddemand
lmlag1_error$error_perc <- (lmlag1_error$error / lmlag1_error$demanda)*100


lmlag1_error
##   Semana ventas devoluciones demanda  lag1 predicteddemand    error error_perc
## 1      7  63706         2290   61610 57759        1946.937 59663.06   96.83990
## 2      8  60967         2458   58605 61610       20277.244 38327.76   65.40015
## 3      9  61602         2625   59190 58605       -8814.791 68004.79  114.89236
lmlag1_error$demanda <- lmlag1_error$predicteddemand

lag1plot <- rbind(takis_per_week[,1:4], lmlag1_error[,1:4])
lag1plot[1:7, "actual"] <- "actual"
lag1plot[8:10, "actual"] <- "prediction"

ggplot(lag1plot, aes(x = Semana, y = demanda, colour = actual)) + geom_point()

We see that this new variable allows us to mimic the fluctuations of actual data, but the predictions are well below the actual demand.

As a 2nd idea, as we saw that peaks in demand could be related to paydays, we’ll input it manually as a categorical variable:

takis_per_week$payday <- c(1,0,1,0,1,0,1)

train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]

fit <- train(demanda ~ Semana + payday, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
##         RMSE     Rsquared          MAE 
## 1626.8802135    0.9860505 1619.5833333
# coef(lm(demanda ~ Semana + payday, data = train))

# test
# predicteddemand
lmpayday_error <- cbind(test, predicteddemand)
lmpayday_error$error <- lmpayday_error$demanda - lmpayday_error$predicteddemand
lmpayday_error$error_perc <- (lmpayday_error$error / lmpayday_error$demanda)*100

lmpayday_error
##   Semana ventas devoluciones demanda  lag1 payday predicteddemand   error
## 1      7  63706         2290   61610 57759      1        59991.75 1618.25
## 2      8  60967         2458   58605 61610      0        57173.25 1431.75
## 3      9  61602         2625   59190 58605      1        57381.25 1808.75
##   error_perc
## 1   2.626603
## 2   2.443051
## 3   3.055837
lmpayday_error$demanda <- lmpayday_error$predicteddemand

paydayplot <- rbind(takis_per_week[,1:4], lmpayday_error[,1:4])
paydayplot[1:7, "actual"] <- "actual"
paydayplot[8:10, "actual"] <- "prediction"

ggplot(paydayplot, aes(x = Semana, y = demanda, colour = actual)) + geom_point()

We can see that our metrics have improved dramatically:

  • we have now an R2 of 0.986,

  • our predictions have almost the same shape as actual data,

  • and RMSE and MAE have been reduced to half compared to the previous model.

However, our predictions are still a 3% below actual demand.

Even if we change the train and test sizes the performance improves even more, although we shouldn’t rely on this 100% as it might be overfitted:

train <- takis_per_week[1:5,]
test <- takis_per_week[6:7,]

fit <- train(demanda ~ Semana + payday, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
##     RMSE Rsquared      MAE 
## 326.3243   1.0000 243.0333
# coef(lm(demanda ~ Semana + payday, data = train))

# test
# predicteddemand
lmpayday_error <- cbind(test, predicteddemand)
lmpayday_error$error <- lmpayday_error$demanda - lmpayday_error$predicteddemand
lmpayday_error$error_perc <- (lmpayday_error$error / lmpayday_error$demanda)*100

lmpayday_error
##   Semana ventas devoluciones demanda  lag1 payday predicteddemand     error
## 1      8  60967         2458   58605 61610      0        58144.20 460.80000
## 2      9  61602         2625   59190 58605      1        59215.27 -25.26667
##    error_perc
## 1  0.78628103
## 2 -0.04268739
lmpayday_error$demanda <- lmpayday_error$predicteddemand

paydayplot <- rbind(takis_per_week[,1:4], lmpayday_error[,1:4])
paydayplot[1:7, "actual"] <- "actual"
paydayplot[8:9, "actual"] <- "prediction"

ggplot(paydayplot, aes(x = Semana, y = demanda, colour = actual)) + geom_point()