library(readr)
library(dplyr)
library(tidyr)
library(plotly)
library(TTR)
library(forecast)
library(caret)
library(plyr)
detach("package:plyr", unload = TRUE)
First of all we read all the data available on Kaggle:
# train <- read_csv("train.csv")
# saveRDS(train, file = "train.rds")
# producto_tabla <- read_csv("producto_tabla.csv")
First exporatory analysis and subsetting Takis into a smaller csv:
# dim(train)
#
# head(train)
#
# por_producto <- train %>%
# select(Semana, Producto_ID, Venta_uni_hoy, Dev_uni_proxima, Demanda_uni_equil)
#
# producto_tabla %>%
# separate(NombreProducto,
# c("Name", "Grams","Brand", "hola", "a", "b", "c", "d"))
#
# takis <- producto_tabla[which(grepl("Takis Fuego", producto_tabla$NombreProducto)),]
#
# takis$Producto_ID
#
# takis1<- por_producto[which(por_producto$Producto_ID == "30110"),]
# takis2<- por_producto[which(por_producto$Producto_ID == "42453"),]
# takis3<- por_producto[which(por_producto$Producto_ID == "42672"),]
#
# write.csv(takis2, file = "takis62g.csv", row.names = F)
Now we can work with only Takis 62g with a smaller and faster file, and we’ll try to do a time series analysis to spot any trends or seasonalities:
takis <- read_csv("takis62g.csv")
takis_per_week <- takis %>%
arrange(Semana) %>%
group_by(Semana) %>%
summarise(
ventas = sum(Venta_uni_hoy),
devoluciones = sum(Dev_uni_proxima),
demanda = sum(Demanda_uni_equil)) %>%
select(Semana, ventas, devoluciones, demanda)
plot_ly(takis_per_week, x = ~takis_per_week$Semana, y = ~takis_per_week$demanda, name = 'Demanda', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~takis_per_week$ventas, name = 'ventas', mode = 'lines') %>%
add_trace(y = ~takis_per_week$devoluciones, name = 'devoluciones', mode = 'lines') %>%
layout(title = "Takis Demand Jan-Mar",
xaxis = list(title = "Week"),
yaxis = list (title = "Demand"))
For the time series analysis I chose to set the frequency as 2-weekly. A weekly frequency (4) would have been perfect to spot seasonalities along the month (as in 1st week after getting paid through the end of the month), however we don’t have enough data to treat it like this.
takis_ts <- ts(takis_per_week$demanda, frequency = 2)
takis_ts
## Time Series:
## Start = c(1, 1)
## End = c(4, 1)
## Frequency = 2
## [1] 63188 64419 64627 57759 61610 58605 59190
takis_components <- decompose(takis_ts)
plot(takis_components)
Although we have very few data we can spot:
As we spot trend and seasonality, we’ll try to predict demand for the last 3 weeks with a time series linear model:
train <- takis_per_week[1:4, 4]
test <- takis_per_week[5:7, 4]
train_ts <- ts(train, frequency = 2)
train_components <- decompose(train_ts)
test_ts <- ts(test, frequency = 2)
modelts <- tslm(train_ts ~ trend + season)
summary(modelts)
##
## Call:
## tslm(formula = train_ts ~ trend + season)
##
## Residuals:
## Time Series:
## Start = c(1, 1)
## End = c(2, 2)
## Frequency = 2
## 1 2 3 4
## -2025 2025 2025 -2025
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66518 4960 13.412 0.0474 *
## trend -1305 2025 -0.645 0.6355
## season2 -1513 4528 -0.334 0.7946
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4050 on 1 degrees of freedom
## Multiple R-squared: 0.4737, Adjusted R-squared: -0.5789
## F-statistic: 0.45 on 2 and 1 DF, p-value: 0.7255
predicteddemand <- forecast(train_ts, h = 3)
autoplot(predicteddemand)
error_analysis <- cbind(test, predicteddemand)
error_analysis$error <- error_analysis$`Point Forecast` - error_analysis$demanda
error_analysis$errorperc <- (error_analysis$error / error_analysis$demanda)*100
error_analysis
## demanda Point Forecast Lo 80 Hi 80 Lo 95 Hi 95 error errorperc
## 3.00 61610 63188 59503.67 66872.33 57553.31 68822.69 1578 2.561273
## 3.50 58605 63188 59503.67 66872.33 57553.31 68822.69 4583 7.820152
## 4.00 59190 63188 59503.67 66872.33 57553.31 68822.69 3998 6.754519
However, unlike we though the forecast is flat and well above actual demand. Therefore we’ll have to use a different method in a more “manual” way.
As we spotted a decreasing trend, first we’ll try to predict the last 3 weeks demand with only this decreasing trend by week:
train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]
fit <- train(demanda ~ Semana, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
## RMSE Rsquared MAE
## 3068.0135213 0.5768469 2931.0666667
# coef(lm(demanda ~ Semana, data = train))
# test
# predicteddemand
lmsemana_error <- cbind(test, predicteddemand)
lmsemana_error$error <- lmsemana_error$demanda - lmsemana_error$predicteddemand
lmsemana_error$demanda <- lmsemana_error$predicteddemand
lmsemana_error$error_perc <- (lmsemana_error$error / lmsemana_error$demanda)*100
lmsemana_error
## Semana ventas devoluciones demanda predicteddemand error error_perc
## 1 7 63706 2290 58478.5 58478.5 3131.5 5.354960
## 2 8 60967 2458 56870.6 56870.6 1734.4 3.049730
## 3 9 61602 2625 55262.7 55262.7 3927.3 7.106602
semanaplot <- rbind(takis_per_week[,1:4], lmsemana_error[,1:4])
semanaplot[1:7, "actual"] <- "actual"
semanaplot[8:10, "actual"] <- "prediction"
ggplot(semanaplot, aes(x = Semana, y = demanda, colour = actual)) + geom_abline(intercept = 69733.8, slope = -1607.9, size = 0.01) + geom_point()
In this first try, although the trend is captured, there are 2 things missing:
the 2-week seasonality isn’t represented
our predictions are well below the actual demand, most probably due to the effect of week 6
To replicate this seasonality we started by inserting a lag variable. We started with lag 1, as it seems that the previous week’s demand has an effect on this week’s demand
takis_per_week$lag1 <- lag(takis_per_week$demanda, n = 1L)
train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]
fit <- train(demanda ~ Semana + lag1, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
## RMSE Rsquared MAE
## 5.672551e+04 1.073052e-01 5.533187e+04
# coef(lm(demanda ~ Semana + payday, data = train))
# test
# predicteddemand
lmlag1_error <- cbind(test, predicteddemand)
lmlag1_error$error <- lmlag1_error$demanda - lmlag1_error$predicteddemand
lmlag1_error$error_perc <- (lmlag1_error$error / lmlag1_error$demanda)*100
lmlag1_error
## Semana ventas devoluciones demanda lag1 predicteddemand error error_perc
## 1 7 63706 2290 61610 57759 1946.937 59663.06 96.83990
## 2 8 60967 2458 58605 61610 20277.244 38327.76 65.40015
## 3 9 61602 2625 59190 58605 -8814.791 68004.79 114.89236
lmlag1_error$demanda <- lmlag1_error$predicteddemand
lag1plot <- rbind(takis_per_week[,1:4], lmlag1_error[,1:4])
lag1plot[1:7, "actual"] <- "actual"
lag1plot[8:10, "actual"] <- "prediction"
ggplot(lag1plot, aes(x = Semana, y = demanda, colour = actual)) + geom_point()
We see that this new variable allows us to mimic the fluctuations of actual data, but the predictions are well below the actual demand.
As a 2nd idea, as we saw that peaks in demand could be related to paydays, we’ll input it manually as a categorical variable:
takis_per_week$payday <- c(1,0,1,0,1,0,1)
train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]
fit <- train(demanda ~ Semana + payday, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
## RMSE Rsquared MAE
## 1626.8802135 0.9860505 1619.5833333
# coef(lm(demanda ~ Semana + payday, data = train))
# test
# predicteddemand
lmpayday_error <- cbind(test, predicteddemand)
lmpayday_error$error <- lmpayday_error$demanda - lmpayday_error$predicteddemand
lmpayday_error$error_perc <- (lmpayday_error$error / lmpayday_error$demanda)*100
lmpayday_error
## Semana ventas devoluciones demanda lag1 payday predicteddemand error
## 1 7 63706 2290 61610 57759 1 59991.75 1618.25
## 2 8 60967 2458 58605 61610 0 57173.25 1431.75
## 3 9 61602 2625 59190 58605 1 57381.25 1808.75
## error_perc
## 1 2.626603
## 2 2.443051
## 3 3.055837
lmpayday_error$demanda <- lmpayday_error$predicteddemand
paydayplot <- rbind(takis_per_week[,1:4], lmpayday_error[,1:4])
paydayplot[1:7, "actual"] <- "actual"
paydayplot[8:10, "actual"] <- "prediction"
ggplot(paydayplot, aes(x = Semana, y = demanda, colour = actual)) + geom_point()
We can see that our metrics have improved dramatically:
we have now an R2 of 0.986,
our predictions have almost the same shape as actual data,
and RMSE and MAE have been reduced to half compared to the previous model.
However, our predictions are still a 3% below actual demand.
Even if we change the train and test sizes the performance improves even more, although we shouldn’t rely on this 100% as it might be overfitted:
train <- takis_per_week[1:5,]
test <- takis_per_week[6:7,]
fit <- train(demanda ~ Semana + payday, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
## RMSE Rsquared MAE
## 326.3243 1.0000 243.0333
# coef(lm(demanda ~ Semana + payday, data = train))
# test
# predicteddemand
lmpayday_error <- cbind(test, predicteddemand)
lmpayday_error$error <- lmpayday_error$demanda - lmpayday_error$predicteddemand
lmpayday_error$error_perc <- (lmpayday_error$error / lmpayday_error$demanda)*100
lmpayday_error
## Semana ventas devoluciones demanda lag1 payday predicteddemand error
## 1 8 60967 2458 58605 61610 0 58144.20 460.80000
## 2 9 61602 2625 59190 58605 1 59215.27 -25.26667
## error_perc
## 1 0.78628103
## 2 -0.04268739
lmpayday_error$demanda <- lmpayday_error$predicteddemand
paydayplot <- rbind(takis_per_week[,1:4], lmpayday_error[,1:4])
paydayplot[1:7, "actual"] <- "actual"
paydayplot[8:9, "actual"] <- "prediction"
ggplot(paydayplot, aes(x = Semana, y = demanda, colour = actual)) + geom_point()