library(readr)
library(dplyr)
library(tidyr)
library(plotly)
library(TTR)
library(forecast)
library(caret)
library(plyr)
detach("package:plyr", unload = TRUE)
First of all we read all the data available on Kaggle:
# train <- read_csv("train.csv")
# saveRDS(train, file = "train.rds")
# producto_tabla <- read_csv("producto_tabla.csv")
First exporatory analysis and subsetting Takis into a smaller csv:
# dim(train)
#
# head(train)
#
# por_producto <- train %>%
# select(Semana, Producto_ID, Venta_uni_hoy, Dev_uni_proxima, Demanda_uni_equil)
#
# producto_tabla %>%
# separate(NombreProducto,
# c("Name", "Grams","Brand", "hola", "a", "b", "c", "d"))
#
# takis <- producto_tabla[which(grepl("Takis Fuego", producto_tabla$NombreProducto)),]
#
# takis$Producto_ID
#
# takis1<- por_producto[which(por_producto$Producto_ID == "30110"),]
# takis2<- por_producto[which(por_producto$Producto_ID == "42453"),]
# takis3<- por_producto[which(por_producto$Producto_ID == "42672"),]
#
# write.csv(takis2, file = "takis62g.csv", row.names = F)
Now we can work with only Takis 62g with a smaller and faster file, and we’ll try to do a time series analysis to spot any trends or seasonalities:
takis <- read_csv("takis62g.csv")
takis_per_week <- takis %>%
arrange(Semana) %>%
group_by(Semana) %>%
summarise(
ventas = sum(Venta_uni_hoy),
devoluciones = sum(Dev_uni_proxima),
demanda = sum(Demanda_uni_equil)) %>%
select(Semana, ventas, devoluciones, demanda)
plot_ly(takis_per_week, x = ~takis_per_week$Semana, y = ~takis_per_week$demanda, name = 'Demanda', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~takis_per_week$ventas, name = 'ventas', mode = 'lines') %>%
add_trace(y = ~takis_per_week$devoluciones, name = 'devoluciones', mode = 'lines') %>%
layout(title = "Takis Demand Jan-Mar",
xaxis = list(title = "Week"),
yaxis = list (title = "Demand"))
For the time series analysis I chose to set the frequency as 2-weekly. A weekly frequency (4) would have been perfect to spot seasonalities along the month (as in 1st week after getting paid through the end of the month), however we don’t have enough data to treat it like this.
takis_ts <- ts(takis_per_week$demanda, frequency = 2)
takis_ts
## Time Series:
## Start = c(1, 1)
## End = c(4, 1)
## Frequency = 2
## [1] 63188 64419 64627 57759 61610 58605 59190
takis_components <- decompose(takis_ts)
plot(takis_components)
Although we have very few data we can spot:
As we spot trend and seasonality, we’ll try to predict demand for the last 3 weeks with a time series linear model:
train <- takis_per_week[1:4, 4]
test <- takis_per_week[5:7, 4]
train_ts <- ts(train, frequency = 2)
train_components <- decompose(train_ts)
test_ts <- ts(test, frequency = 2)
modelts <- tslm(train_ts ~ trend + season)
summary(modelts)
##
## Call:
## tslm(formula = train_ts ~ trend + season)
##
## Residuals:
## Time Series:
## Start = c(1, 1)
## End = c(2, 2)
## Frequency = 2
## 1 2 3 4
## -2025 2025 2025 -2025
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66518 4960 13.412 0.0474 *
## trend -1305 2025 -0.645 0.6355
## season2 -1513 4528 -0.334 0.7946
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4050 on 1 degrees of freedom
## Multiple R-squared: 0.4737, Adjusted R-squared: -0.5789
## F-statistic: 0.45 on 2 and 1 DF, p-value: 0.7255
predicteddemand <- forecast(train_ts, h = 3)
autoplot(predicteddemand)
error_analysis <- cbind(test, predicteddemand)
error_analysis$error <- error_analysis$`Point Forecast` - error_analysis$demanda
error_analysis$errorperc <- (error_analysis$error / error_analysis$demanda)*100
error_analysis
## demanda Point Forecast Lo 80 Hi 80 Lo 95 Hi 95 error errorperc
## 3.00 61610 63188 59503.67 66872.33 57553.31 68822.69 1578 2.561273
## 3.50 58605 63188 59503.67 66872.33 57553.31 68822.69 4583 7.820152
## 4.00 59190 63188 59503.67 66872.33 57553.31 68822.69 3998 6.754519
However, unlike we though the forecast is flat and well above actual demand. Therefore we’ll have to use a different method in a more “manual” way.
As we spotted a decreasing trend, first we’ll try to predict the last 3 weeks demand with only this decreasing trend by week:
train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]
fit <- train(demanda ~ Semana, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
## RMSE Rsquared MAE
## 3068.0135213 0.5768469 2931.0666667
# coef(lm(demanda ~ Semana, data = train))
# test
# predicteddemand
lmsemana_error <- cbind(test, predicteddemand)
lmsemana_error$error <- lmsemana_error$demanda - lmsemana_error$predicteddemand
lmsemana_error$demanda <- lmsemana_error$predicteddemand
lmsemana_error$error_perc <- (lmsemana_error$error / lmsemana_error$demanda)*100
lmsemana_error
## Semana ventas devoluciones demanda predicteddemand error error_perc
## 1 7 63706 2290 58478.5 58478.5 3131.5 5.354960
## 2 8 60967 2458 56870.6 56870.6 1734.4 3.049730
## 3 9 61602 2625 55262.7 55262.7 3927.3 7.106602
semanaplot <- rbind(takis_per_week[,1:4], lmsemana_error[,1:4])
semanaplot[1:7, "actual"] <- "actual"
semanaplot[8:10, "actual"] <- "prediction"
ggplot(semanaplot, aes(x = Semana, y = demanda, colour = actual)) + geom_abline(intercept = 69733.8, slope = -1607.9, size = 0.01) + geom_point()
In this first try, although the trend is captured, there are 2 things missing:
the 2-week seasonality isn’t represented
our predictions are well below the actual demand, most probably due to the effect of week 6
To replicate this seasonality we though of different ideas:
takis_per_week$lag1 <- lag(takis_per_week$demanda, n = 1L)
ggplot(takis_per_week, aes(x = lag1, y = demanda)) + geom_point()
takis_per_week$lag2 <- lag(takis_per_week$demanda, n = 2L)
ggplot(takis_per_week, aes(x = lag2, y = demanda)) + geom_point()
Lag 2 seems more correlated, so we tried using it for our prediction:
train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]
fit <- train(demanda ~ Semana + lag2, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
## RMSE Rsquared MAE
## 1.646282e+04 5.768469e-01 1.577867e+04
# coef(lm(demanda ~ Semana, data = train))
# test
# predicteddemand
lag2_error <- cbind(test, predicteddemand)
lag2_error$error <- lag2_error$demanda - lag2_error$predicteddemand
lag2_error$demanda <- lag2_error$predicteddemand
lag2_error$error_perc <- (lag2_error$error / lag2_error$demanda)*100
lag2_error
## Semana ventas devoluciones demanda lag1 lag2 predicteddemand error
## 1 7 63706 2290 50891 57759 64627 50891 10719
## 2 8 60967 2458 44023 61610 57759 44023 14582
## 3 9 61602 2625 37155 58605 61610 37155 22035
## error_perc
## 1 21.06266
## 2 33.12359
## 3 59.30561
lag2plot <- rbind(takis_per_week[,1:4], lag2_error[,1:4])
lag2plot[1:7, "actual"] <- "actual"
lag2plot[8:10, "actual"] <- "prediction"
ggplot(lag2plot, aes(x = Semana, y = demanda, colour = actual)) + geom_point()
However, predictions seem very off and below actual demand and doesn’t replicate seasonality either. So we went for a diffetent idea:
takis_per_week$porcentaje_devol <- ((takis_per_week$devoluciones / takis_per_week$ventas)*100)
takis_per_week$lag_devoluciones <- lag(takis_per_week$porcentaje_devol)
ggplot(takis_per_week, aes(x = lag_devoluciones, y = demanda)) + geom_point()
There doesn’t seem to be much correlation either, so we tried with our 3rd idea:
takis_per_week$payday <- c(1,0,1,0,1,0,1)
train <- takis_per_week[1:4,]
test <- takis_per_week[5:7,]
fit <- train(demanda ~ Semana + payday, data = train, method = "lm", na.action = na.pass)
predicteddemand <- predict(fit, test)
postResample(pred = predicteddemand, obs = test$demanda)
## RMSE Rsquared MAE
## 1626.8802135 0.9860505 1619.5833333
# coef(lm(demanda ~ Semana + payday, data = train))
# test
# predicteddemand
lmpayday_error <- cbind(test, predicteddemand)
lmpayday_error$error <- lmpayday_error$demanda - lmpayday_error$predicteddemand
lmpayday_error$error_perc <- (lmpayday_error$error / lmpayday_error$demanda)*100
lmpayday_error
## Semana ventas devoluciones demanda lag1 lag2 porcentaje_devol
## 1 7 63706 2290 61610 57759 64627 3.594638
## 2 8 60967 2458 58605 61610 57759 4.031689
## 3 9 61602 2625 59190 58605 61610 4.261225
## lag_devoluciones payday predicteddemand error error_perc
## 1 3.025767 1 59991.75 1618.25 2.626603
## 2 3.594638 0 57173.25 1431.75 2.443051
## 3 4.031689 1 57381.25 1808.75 3.055837
lmpayday_error$demanda <- lmpayday_error$predicteddemand
paydayplot <- rbind(takis_per_week[,1:4], lmpayday_error[,1:4])
paydayplot[1:7, "actual"] <- "actual"
paydayplot[8:10, "actual"] <- "prediction"
ggplot(paydayplot, aes(x = Semana, y = demanda, colour = actual)) + geom_point()
We can see that our metrics have improved dramatically:
we have now an R2 of 0.986,
our predictions have almost the same shape as actual data,
and RMSE and MAE have been reduced to half compared to the previous model.
However, although only by 3% our predictions are still always below actual demand.