library(readr)
library(dplyr)
library(tidyr)
library(plotly)
library(TTR)

First of all we read the data available on Kaggle:

# train <- read_csv("train.csv")
# saveRDS(train, file = "train.rds")
# train <- readRDS(file = "train.rds")
# cliente_tabla <- read_csv("cliente_tabla.csv")

First exporatory analysis and subsetting different client types into a lighter file:

# dim(train)
# 
# head(train)
# 
# per_client_type <- train %>%
#   select(Semana, Canal_ID, Venta_uni_hoy, Dev_uni_proxima, Demanda_uni_equil)
# 
# per_client_type_grouped <- per_client_type %>%
#   arrange(Semana, Canal_ID) %>%
#   group_by(Semana, Canal_ID) %>%
#   summarise(
#     ventas = sum(Venta_uni_hoy),
#     devoluciones = sum(Dev_uni_proxima),
#     demanda = sum(Demanda_uni_equil)) %>%
#   select(Semana, Canal_ID, ventas, devoluciones, demanda)
# 
# saveRDS(per_client_type_grouped, file = "per_client_type_grouped.rds")
per_client_type_grouped <- readRDS(file = "per_client_type_grouped.rds")
per_client_type_grouped
## # A tibble: 63 x 5
## # Groups:   Semana [7]
##    Semana Canal_ID   ventas devoluciones  demanda
##     <dbl>    <dbl>    <dbl>        <dbl>    <dbl>
##  1      3        1 56697977       835051 56154862
##  2      3        2  7319278       166617  7181704
##  3      3        4  7044839        38016  7018552
##  4      3        5  3031814       161603  2903645
##  5      3        6   454008         5888   451766
##  6      3        7  1493844        14941  1489526
##  7      3        8   475460        35512   469635
##  8      3        9     4302            0     4302
##  9      3       11  1998474        28427  1990317
## 10      4        1 57970962       839777 57415516
## # ... with 53 more rows

Now we can work with sales grouped by Canal ID and we spot that Canal 1 takes most of the demand:

ggplot(per_client_type_grouped, aes(x = Semana, y = demanda)) + geom_line() + facet_wrap(~ Canal_ID)

# ggplot(per_client_type_grouped, aes(x = Semana, y = ventas)) + geom_line() + facet_wrap(~ Canal_ID)
# 
# ggplot(per_client_type_grouped, aes(x = Semana, y = devoluciones)) + geom_line() + facet_wrap(~ Canal_ID)
per_client_type_grouped$Canal_ID <- as.factor(per_client_type_grouped$Canal_ID)
ggplot(per_client_type_grouped, aes(x = Semana, y = demanda, fill = Canal_ID)) + geom_col(colour = "black")

To have an idea about this distribution we might want to see which clients fall into each category (to develop further to see why they have grouped like this, could be that some canals are grouped by size rather than store type?):

# sales_per_client <- train %>%
#   select(Canal_ID, Cliente_ID, Demanda_uni_equil) %>%
#   arrange(Canal_ID, Cliente_ID) %>%
#   group_by(Canal_ID, Cliente_ID) %>%
#   summarise(
#     demanda = sum(Demanda_uni_equil)
#     )
# 
# sales_per_client <- merge(sales_per_client, cliente_tabla, by= 'Cliente_ID')
# saveRDS(sales_per_client, file = "sales_per_client.rds")
sales_per_client <- readRDS(file = "sales_per_client.rds")

Canal ID Summary

Most categories include many “no identificados”, we’ll have to see what to do with them

Canal ID 1: we can see that this canal includes mostly markets (such as Juchiluca) and supermarkets (Chedraui, Bodega Aurrera) and quite a few non-identified customers:

head(sales_per_client %>%
  filter(Canal_ID == 1) %>%
  arrange(desc(demanda)), n = 10L)
##    Cliente_ID Canal_ID demanda                    NombreCliente
## 1      587268        1   80477                        JUCHILUCA
## 2      406878        1   52727                CHEDRAUI TUXPAM 1
## 3     7448474        1   49135          BODEGA AURRERA JUCHITAN
## 4      158633        1   46516                     LA SUPREMA 2
## 5      133289        1   45410                         TLAPANGO
## 6     4667634        1   44234                  NO IDENTIFICADO
## 7      134764        1   44137        CENTRO COMERCIAL DON NICO
## 8     4242474        1   43411                  NO IDENTIFICADO
## 9     4667610        1   41960                  NO IDENTIFICADO
## 10     152082        1   41705 COMERCIALIZADORA GUANZA SA DE CV

Canal ID 2: this canal includes more supermarkets, including Chedraui again and others like Wal-Mart and Mega Comercial Mexicana

head(sales_per_client %>%
  filter(Canal_ID == 2) %>%
  arrange(desc(demanda)), n = 10L)
##    Cliente_ID Canal_ID demanda                                 NombreCliente
## 1     1200400        2  163591                              CHEDRAUI POLANCO
## 2     5903732        2  162633 WAL MART SUPER CENTER DOMINGO DIEZ CUERNAVACA
## 3       19260        2  154782                              WAL MART TEPEYAC
## 4       19174        2  125018                           WAL MART MIRAMONTES
## 5       20959        2  114458                                WAL MART LOMAS
## 6       23849        2  109021                            WAL MART CLOUTHIER
## 7       26201        2  108070                             WAL MART VERACRUZ
## 8       17815        2  106258                        WAL MART PLAZA ORIENTE
## 9       16063        2  105437                              WAL MART TLALPAN
## 10      22534        2  104248               MEGA COMERCIAL MEXICANA MIXCOAC

Canal ID 4: this canal includes again more supermarkets, including mainly Oxxo, more Bodega Aurrera and others:

head(sales_per_client %>%
  filter(Canal_ID == 4) %>%
  arrange(desc(demanda)))
##   Cliente_ID Canal_ID demanda               NombreCliente
## 1    1216931        4  232472            OXXO TECNOLOGICO
## 2    2502084        4  181407 CEDIS CADENA COMERCIAL OXXO
## 3      36161        4  100873           OXXO PUEBLA FINSA
## 4    2387266        4   74489               OXXO ALEMANIA
## 5      22905        4   56245                 OXXO MADERO
## 6    8356583        4   55412                OXXO SINALOA

Canal ID 5: the top clients are Remisión from different states. On Kaggle users agree that these must be internal clients

head(sales_per_client %>%
  filter(Canal_ID == 5) %>%
  arrange(desc(demanda)))
##   Cliente_ID Canal_ID  demanda                    NombreCliente
## 1     653378        5 17866224                  PUEBLA REMISION
## 2     653039        5   893756    QUERETARO DE ARTEAGA REMISION
## 3     652850        5   440039                 JALISCO REMISION
## 4     653124        5   207547              TAMAULIPAS REMISION
## 5     653037        5   163985                  OAXACA REMISION
## 6    2082738        5   143785 CLIENTE 2274 TUXTLA GUTIERREZ AV

Canal ID 6: mainly “no identificado”, vendor machines, schools and cafes

head(sales_per_client %>%
  filter(Canal_ID == 6) %>%
  arrange(desc(demanda)))
##   Cliente_ID Canal_ID demanda      NombreCliente
## 1    4430691        6   27934    NO IDENTIFICADO
## 2    4350858        6   24496    NO IDENTIFICADO
## 3    4494362        6   21249    NO IDENTIFICADO
## 4    1373210        6   16709          VENDI MAX
## 5      20008        6   15666 CAFETERIA CECIT 10
## 6    4433208        6   15570    NO IDENTIFICADO

Canal ID 7:

head(sales_per_client %>%
  filter(Canal_ID == 7) %>%
  arrange(desc(demanda)))
##   Cliente_ID Canal_ID demanda                      NombreCliente
## 1    4414498        7   91450                    NO IDENTIFICADO
## 2    1491016        7   85418          EMBOTELLADORA DE CULIACAN
## 3     155559        7   55030                            PAN ROL
## 4     430503        7   52133 PLAZA COMERCIAL ORIENTE S A DE C V
## 5    1598228        7   43593                   LA COSMOPOLITANA
## 6    1013817        7   41490                    OTBT S A DE C V

Canal ID 8: mainly “no identificado”, bus stations, restaurants

head(sales_per_client %>%
  filter(Canal_ID == 8) %>%
  arrange(desc(demanda)))
##   Cliente_ID Canal_ID demanda                 NombreCliente
## 1     933857        8  130424                  PRIMERA PLUS
## 2    4593591        8   74271               NO IDENTIFICADO
## 3    2384654        8   71586     VENTA DE EMPLEADO FABRICA
## 4     827594        8   69264 MC DONALDS ANTONIO CUAUTITLAN
## 5    2096890        8   58945  MARIANO OTERO VENTA EMPLEADO
## 6    1050905        8   57092                     LOS PATOS

Canal ID 9: Carls Jr & Burger King

head(sales_per_client %>%
  filter(Canal_ID == 9) %>%
  arrange(desc(demanda)))
##   Cliente_ID Canal_ID demanda             NombreCliente
## 1     312258        9    3295      CARLS JR INSURGENTES
## 2     168621        9    2485             CARLS JR OTAY
## 3     168510        9    2453   CARLS JR AGUASCALIENTES
## 4    4452840        9    2125           NO IDENTIFICADO
## 5    4249188        9    1990           NO IDENTIFICADO
## 6     167355        9    1905 CARLS JR OTAY UNIVERSIDAD

Canal ID 11: mainly bottling companies –> see which products are sold to them

head(sales_per_client %>%
  filter(Canal_ID == 11) %>%
  arrange(desc(demanda)))
##   Cliente_ID Canal_ID demanda                              NombreCliente
## 1    7727092       11   73653               BEBIDAS MUNDIALES S A DE C V
## 2    7727092       11   73653              BEBIDAS MUNDIALES S A  DE C V
## 3    1383948       11   71112  BEBIDAS ENVASADAS DEL PACIFICO S A DE C V
## 4    1383948       11   71112 BEBIDAS ENVASADAS DEL PACIFICO S A  DE C V
## 5     872839       11   69572                     AUTOBUSES DE LA PIEDAD
## 6    9847617       11   55100                      COCA COLA PUENTE ALTO

Many classifications can be done, for which we’d need further exploration. For now we’ll classify them as:

per_client_type_grouped$customer_type <- ifelse(
  per_client_type_grouped$Canal_ID < 5, "supermarket", ifelse(
    per_client_type_grouped$Canal_ID == 5, "intermediary", ifelse(
        per_client_type_grouped$Canal_ID == 7, "intermediary", ifelse(
          per_client_type_grouped$Canal_ID == 11, "intermediary", "restaurant"
    ))))
supermarket_perweek <- per_client_type_grouped %>%
  select(Semana, ventas, devoluciones, demanda, customer_type) %>%
  filter(customer_type == "supermarket") %>%
  summarise(
    ventas = sum(ventas),
    devoluciones = sum(devoluciones),
    demanda = sum(demanda)
  )

intermediary_perweek <- per_client_type_grouped %>%
  select(Semana, ventas, devoluciones, demanda, customer_type) %>%
  filter(customer_type == "intermediary") %>%
  summarise(
    ventas = sum(ventas),
    devoluciones = sum(devoluciones),
    demanda = sum(demanda)
  )

restaurant_perweek <- per_client_type_grouped %>%
  select(Semana, ventas, devoluciones, demanda, customer_type) %>%
  filter(customer_type == "restaurant") %>%
  summarise(
    ventas = sum(ventas),
    devoluciones = sum(devoluciones),
    demanda = sum(demanda)
  )

Interestingly enough we can see that demand in supermarket has very similar trends, seasonality and random component as Takis:

supermarket_ts <- ts(supermarket_perweek$demanda, frequency = 2)

supermarket_components <- decompose(supermarket_ts)
plot(supermarket_components)

Intermediary channel behaves differently in terms of trend and seasonality, but also has the same random fall at the 2nd week of February, we should have a look at what happened.

intermediary_ts <- ts(intermediary_perweek$demanda, frequency = 2)

intermediary_components <- decompose(intermediary_ts)
plot(intermediary_components)

Quite the opposite way as supermarkets and Takis, demand in restaurants is growing, and has the same two-week seasonality and the same random drop:

restaurant_ts <- ts(restaurant_perweek$demanda, frequency = 2)

restaurant_components <- decompose(restaurant_ts)
plot(restaurant_components)