library(readr)
library(dplyr)
library(tidyr)
library(plotly)
library(TTR)
library(forecast)
library(caret)
library(plyr)
detach("package:plyr", unload = TRUE)

First of all we read all the data available on Kaggle:

# train <- read_csv("train.csv")
# saveRDS(train, file = "train.rds")
# producto_tabla <- read_csv("producto_tabla.csv")

First exporatory analysis and subsetting Takis into a smaller csv:

# dim(train)
# 
# head(train)
# 
# por_producto <- train %>%
#   select(Semana, Producto_ID, Venta_uni_hoy, Dev_uni_proxima, Demanda_uni_equil)
# 
# producto_tabla %>%
#    separate(NombreProducto,
#            c("Name", "Grams","Brand", "hola", "a", "b", "c", "d"))
# 
# takis <- producto_tabla[which(grepl("Takis Fuego", producto_tabla$NombreProducto)),]
# 
# takis$Producto_ID
# 
# takis1<- por_producto[which(por_producto$Producto_ID == "30110"),]
# takis2<- por_producto[which(por_producto$Producto_ID == "42453"),]
# takis3<- por_producto[which(por_producto$Producto_ID == "42672"),]
# 
# write.csv(takis2, file = "takis62g.csv", row.names = F)

Now we can work with only Takis 62g with a smaller and faster file, and we’ll try to do a time series analysis to spot any trends or seasonalities:

takis <- read_csv("takis62g.csv")
takis_per_week <- takis %>%
  arrange(Semana) %>%
  group_by(Semana) %>%
  summarise(
    ventas = sum(Venta_uni_hoy),
    devoluciones = sum(Dev_uni_proxima),
    demanda = sum(Demanda_uni_equil)) %>%
  select(Semana, ventas, devoluciones, demanda)

plot_ly(takis_per_week, x = ~takis_per_week$Semana, y = ~takis_per_week$demanda, name = 'Demanda', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~takis_per_week$ventas, name = 'ventas', mode = 'lines') %>%
  add_trace(y = ~takis_per_week$devoluciones, name = 'devoluciones', mode = 'lines') %>%
  layout(title = "Takis Demand Jan-Mar",
  xaxis = list(title = "Week"),
  yaxis = list (title = "Demand"))

For the time series analysis I chose to set the frequency as 2-weekly. A weekly frequency (4) would have been perfect to spot seasonalities along the month (as in 1st week after getting paid through the end of the month), however we don’t have enough data to treat it like this.

takis_ts <- ts(takis_per_week$demanda, frequency = 2)
takis_ts
## Time Series:
## Start = c(1, 1) 
## End = c(4, 1) 
## Frequency = 2 
## [1] 63188 64419 64627 57759 61610 58605 59190
takis_components <- decompose(takis_ts)
plot(takis_components)

Although we have very few data we can spot:

Time series modeling

Time Series Linear Model

As we spot trend and seasonality, we’ll try to predict demand for the last 3 weeks with a time series linear model:

train <- takis_per_week[1:4, 4]
test <- takis_per_week[5:7, 4]

train_ts <- ts(train, frequency = 2)
train_components <- decompose(train_ts)
test_ts <- ts(test, frequency = 2)

modelts <- tslm(train_ts ~ trend + season)
summary(modelts)
## 
## Call:
## tslm(formula = train_ts ~ trend + season)
## 
## Residuals:
## Time Series:
## Start = c(1, 1) 
## End = c(2, 2) 
## Frequency = 2 
##     1     2     3     4 
## -2025  2025  2025 -2025 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    66518       4960  13.412   0.0474 *
## trend          -1305       2025  -0.645   0.6355  
## season2        -1513       4528  -0.334   0.7946  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4050 on 1 degrees of freedom
## Multiple R-squared:  0.4737, Adjusted R-squared:  -0.5789 
## F-statistic:  0.45 on 2 and 1 DF,  p-value: 0.7255
predicteddemand <- forecast(train_ts, h = 3)
autoplot(predicteddemand)

error_analysis <- cbind(test, predicteddemand)
error_analysis$error <- error_analysis$`Point Forecast` - error_analysis$demanda
error_analysis$errorperc <- (error_analysis$error / error_analysis$demanda)*100
error_analysis
##      demanda Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95 error errorperc
## 3.00   61610          63188 59503.67 66872.33 57553.31 68822.69  1578  2.561273
## 3.50   58605          63188 59503.67 66872.33 57553.31 68822.69  4583  7.820152
## 4.00   59190          63188 59503.67 66872.33 57553.31 68822.69  3998  6.754519

However, unlike we though the forecast is flat and well above actual demand. Therefore we’ll have to use a different method in a more “manual” way.

Linear model

As we spotted a decreasing trend, first we’ll try to predict the last 3 weeks demand with only this decreasing trend by week:

train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]

fit <- train(demanda ~ Semana, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
##         RMSE     Rsquared          MAE 
## 3068.0135213    0.5768469 2931.0666667
# coef(lm(demanda ~ Semana, data = train))

# test
# predicteddemand
lmsemana_error <- cbind(test, predicteddemand)
lmsemana_error$error <- lmsemana_error$demanda - lmsemana_error$predicteddemand
lmsemana_error$demanda <- lmsemana_error$predicteddemand
lmsemana_error$error_perc <- (lmsemana_error$error / lmsemana_error$demanda)*100

lmsemana_error
##   Semana ventas devoluciones demanda predicteddemand  error error_perc
## 1      7  63706         2290 58478.5         58478.5 3131.5   5.354960
## 2      8  60967         2458 56870.6         56870.6 1734.4   3.049730
## 3      9  61602         2625 55262.7         55262.7 3927.3   7.106602
semanaplot <- rbind(takis_per_week[,1:4], lmsemana_error[,1:4])
semanaplot[1:7, "actual"] <- "actual"
semanaplot[8:10, "actual"] <- "prediction"

ggplot(semanaplot, aes(x = Semana, y = demanda, colour = actual)) + geom_abline(intercept = 69733.8, slope = -1607.9, size = 0.01) + geom_point()

In this first try, although the trend is captured, there are 2 things missing:

  • the 2-week seasonality isn’t represented

  • our predictions are well below the actual demand, most probably due to the effect of week 6

To replicate this seasonality we though of different ideas:

  • we started by inserting a lag variable. We started with lag 1, as it seems that the previous week’s demand has an effect on this week’s demand:
takis_per_week$lag1 <- lag(takis_per_week$demanda, n = 1L)
ggplot(takis_per_week, aes(x = lag1, y = demanda)) + geom_point()

  • However, we can’t spot a linear correlation, so we also tried with lag 2:
takis_per_week$lag2 <- lag(takis_per_week$demanda, n = 2L)
ggplot(takis_per_week, aes(x = lag2, y = demanda)) + geom_point()

Lag 2 seems more correlated, so we tried using it for our prediction:

train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]

fit <- train(demanda ~ Semana + lag2, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
##         RMSE     Rsquared          MAE 
## 1.646282e+04 5.768469e-01 1.577867e+04
# coef(lm(demanda ~ Semana, data = train))

# test
# predicteddemand
lag2_error <- cbind(test, predicteddemand)
lag2_error$error <- lag2_error$demanda - lag2_error$predicteddemand
lag2_error$demanda <- lag2_error$predicteddemand
lag2_error$error_perc <- (lag2_error$error / lag2_error$demanda)*100

lag2_error
##   Semana ventas devoluciones demanda  lag1  lag2 predicteddemand error
## 1      7  63706         2290   50891 57759 64627           50891 10719
## 2      8  60967         2458   44023 61610 57759           44023 14582
## 3      9  61602         2625   37155 58605 61610           37155 22035
##   error_perc
## 1   21.06266
## 2   33.12359
## 3   59.30561
lag2plot <- rbind(takis_per_week[,1:4], lag2_error[,1:4])
lag2plot[1:7, "actual"] <- "actual"
lag2plot[8:10, "actual"] <- "prediction"

ggplot(lag2plot, aes(x = Semana, y = demanda, colour = actual)) + geom_point()

However, predictions seem very off and below actual demand and doesn’t replicate seasonality either. So we went for a diffetent idea:

  • We thought that % of returns could impact clients’ orders, as they would be more cautious after a week with a lot of returns:
takis_per_week$porcentaje_devol <- ((takis_per_week$devoluciones / takis_per_week$ventas)*100)
takis_per_week$lag_devoluciones <- lag(takis_per_week$porcentaje_devol)
ggplot(takis_per_week, aes(x = lag_devoluciones, y = demanda)) + geom_point()

There doesn’t seem to be much correlation either, so we tried with our 3rd idea:

  • as we saw that peaks in demand could be related to paydays, we’ll input it manually as a categorical variable:
takis_per_week$payday <- c(1,0,1,0,1,0,1)

train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]

fit <- train(demanda ~ Semana + payday, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
##         RMSE     Rsquared          MAE 
## 1626.8802135    0.9860505 1619.5833333
# coef(lm(demanda ~ Semana + payday, data = train))

# test
# predicteddemand
lmpayday_error <- cbind(test, predicteddemand)
lmpayday_error$error <- lmpayday_error$demanda - lmpayday_error$predicteddemand
lmpayday_error$error_perc <- (lmpayday_error$error / lmpayday_error$demanda)*100

lmpayday_error
##   Semana ventas devoluciones demanda  lag1  lag2 porcentaje_devol
## 1      7  63706         2290   61610 57759 64627         3.594638
## 2      8  60967         2458   58605 61610 57759         4.031689
## 3      9  61602         2625   59190 58605 61610         4.261225
##   lag_devoluciones payday predicteddemand   error error_perc
## 1         3.025767      1        59991.75 1618.25   2.626603
## 2         3.594638      0        57173.25 1431.75   2.443051
## 3         4.031689      1        57381.25 1808.75   3.055837
lmpayday_error$demanda <- lmpayday_error$predicteddemand

paydayplot <- rbind(takis_per_week[,1:4], lmpayday_error[,1:4])
paydayplot[1:7, "actual"] <- "actual"
paydayplot[8:10, "actual"] <- "prediction"

ggplot(paydayplot, aes(x = Semana, y = demanda, colour = actual)) + geom_point()

We can see that our metrics have improved dramatically:

  • we have now an R2 of 0.986,

  • our predictions have almost the same shape as actual data,

  • and RMSE and MAE have been reduced to half compared to the previous model.

However, although only by 3% our predictions are still always below actual demand.