library(readr)
library(dplyr)
library(tidyr)
library(plotly)
library(TTR)
First of all we read the data available on Kaggle:
# train <- read_csv("train.csv")
# saveRDS(train, file = "train.rds")
# train <- readRDS(file = "train.rds")
# cliente_tabla <- read_csv("cliente_tabla.csv")
First exporatory analysis and subsetting different client types into a lighter file:
# dim(train)
#
# head(train)
#
# per_client_type <- train %>%
# select(Semana, Canal_ID, Venta_uni_hoy, Dev_uni_proxima, Demanda_uni_equil)
#
# per_client_type_grouped <- per_client_type %>%
# arrange(Semana, Canal_ID) %>%
# group_by(Semana, Canal_ID) %>%
# summarise(
# ventas = sum(Venta_uni_hoy),
# devoluciones = sum(Dev_uni_proxima),
# demanda = sum(Demanda_uni_equil)) %>%
# select(Semana, Canal_ID, ventas, devoluciones, demanda)
#
# saveRDS(per_client_type_grouped, file = "per_client_type_grouped.rds")
per_client_type_grouped <- readRDS(file = "per_client_type_grouped.rds")
per_client_type_grouped
## # A tibble: 63 x 5
## # Groups: Semana [7]
## Semana Canal_ID ventas devoluciones demanda
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 1 56697977 835051 56154862
## 2 3 2 7319278 166617 7181704
## 3 3 4 7044839 38016 7018552
## 4 3 5 3031814 161603 2903645
## 5 3 6 454008 5888 451766
## 6 3 7 1493844 14941 1489526
## 7 3 8 475460 35512 469635
## 8 3 9 4302 0 4302
## 9 3 11 1998474 28427 1990317
## 10 4 1 57970962 839777 57415516
## # ... with 53 more rows
Now we can work with sales grouped by Canal ID and we spot that Canal 1 takes most of the demand:
ggplot(per_client_type_grouped, aes(x = Semana, y = demanda)) + geom_line() + facet_wrap(~ Canal_ID)
# ggplot(per_client_type_grouped, aes(x = Semana, y = ventas)) + geom_line() + facet_wrap(~ Canal_ID)
#
# ggplot(per_client_type_grouped, aes(x = Semana, y = devoluciones)) + geom_line() + facet_wrap(~ Canal_ID)
per_client_type_grouped$Canal_ID <- as.factor(per_client_type_grouped$Canal_ID)
ggplot(per_client_type_grouped, aes(x = Semana, y = demanda, fill = Canal_ID)) + geom_col(colour = "black")
To have an idea about this distribution we might want to see which clients fall into each category (to develop further to see why they have grouped like this, could be that some canals are grouped by size rather than store type?):
# sales_per_client <- train %>%
# select(Canal_ID, Cliente_ID, Demanda_uni_equil) %>%
# arrange(Canal_ID, Cliente_ID) %>%
# group_by(Canal_ID, Cliente_ID) %>%
# summarise(
# demanda = sum(Demanda_uni_equil)
# )
#
# sales_per_client <- merge(sales_per_client, cliente_tabla, by= 'Cliente_ID')
# saveRDS(sales_per_client, file = "sales_per_client.rds")
sales_per_client <- readRDS(file = "sales_per_client.rds")
Canal ID Summary
Most categories include many “no identificados”, we’ll have to see what to do with them
Canal ID 1: we can see that this canal includes mostly markets (such as Juchiluca) and supermarkets (Chedraui, Bodega Aurrera) and quite a few non-identified customers:
head(sales_per_client %>%
filter(Canal_ID == 1) %>%
arrange(desc(demanda)), n = 10L)
## Cliente_ID Canal_ID demanda NombreCliente
## 1 587268 1 80477 JUCHILUCA
## 2 406878 1 52727 CHEDRAUI TUXPAM 1
## 3 7448474 1 49135 BODEGA AURRERA JUCHITAN
## 4 158633 1 46516 LA SUPREMA 2
## 5 133289 1 45410 TLAPANGO
## 6 4667634 1 44234 NO IDENTIFICADO
## 7 134764 1 44137 CENTRO COMERCIAL DON NICO
## 8 4242474 1 43411 NO IDENTIFICADO
## 9 4667610 1 41960 NO IDENTIFICADO
## 10 152082 1 41705 COMERCIALIZADORA GUANZA SA DE CV
Canal ID 2: this canal includes more supermarkets, including Chedraui again and others like Wal-Mart and Mega Comercial Mexicana
head(sales_per_client %>%
filter(Canal_ID == 2) %>%
arrange(desc(demanda)), n = 10L)
## Cliente_ID Canal_ID demanda NombreCliente
## 1 1200400 2 163591 CHEDRAUI POLANCO
## 2 5903732 2 162633 WAL MART SUPER CENTER DOMINGO DIEZ CUERNAVACA
## 3 19260 2 154782 WAL MART TEPEYAC
## 4 19174 2 125018 WAL MART MIRAMONTES
## 5 20959 2 114458 WAL MART LOMAS
## 6 23849 2 109021 WAL MART CLOUTHIER
## 7 26201 2 108070 WAL MART VERACRUZ
## 8 17815 2 106258 WAL MART PLAZA ORIENTE
## 9 16063 2 105437 WAL MART TLALPAN
## 10 22534 2 104248 MEGA COMERCIAL MEXICANA MIXCOAC
Canal ID 4: this canal includes again more supermarkets, including mainly Oxxo, more Bodega Aurrera and others:
head(sales_per_client %>%
filter(Canal_ID == 4) %>%
arrange(desc(demanda)))
## Cliente_ID Canal_ID demanda NombreCliente
## 1 1216931 4 232472 OXXO TECNOLOGICO
## 2 2502084 4 181407 CEDIS CADENA COMERCIAL OXXO
## 3 36161 4 100873 OXXO PUEBLA FINSA
## 4 2387266 4 74489 OXXO ALEMANIA
## 5 22905 4 56245 OXXO MADERO
## 6 8356583 4 55412 OXXO SINALOA
Canal ID 5: the top clients are Remisión from different states. On Kaggle users agree that these must be internal clients
head(sales_per_client %>%
filter(Canal_ID == 5) %>%
arrange(desc(demanda)))
## Cliente_ID Canal_ID demanda NombreCliente
## 1 653378 5 17866224 PUEBLA REMISION
## 2 653039 5 893756 QUERETARO DE ARTEAGA REMISION
## 3 652850 5 440039 JALISCO REMISION
## 4 653124 5 207547 TAMAULIPAS REMISION
## 5 653037 5 163985 OAXACA REMISION
## 6 2082738 5 143785 CLIENTE 2274 TUXTLA GUTIERREZ AV
Canal ID 6: mainly “no identificado”, vendor machines, schools and cafes
head(sales_per_client %>%
filter(Canal_ID == 6) %>%
arrange(desc(demanda)))
## Cliente_ID Canal_ID demanda NombreCliente
## 1 4430691 6 27934 NO IDENTIFICADO
## 2 4350858 6 24496 NO IDENTIFICADO
## 3 4494362 6 21249 NO IDENTIFICADO
## 4 1373210 6 16709 VENDI MAX
## 5 20008 6 15666 CAFETERIA CECIT 10
## 6 4433208 6 15570 NO IDENTIFICADO
Canal ID 7:
head(sales_per_client %>%
filter(Canal_ID == 7) %>%
arrange(desc(demanda)))
## Cliente_ID Canal_ID demanda NombreCliente
## 1 4414498 7 91450 NO IDENTIFICADO
## 2 1491016 7 85418 EMBOTELLADORA DE CULIACAN
## 3 155559 7 55030 PAN ROL
## 4 430503 7 52133 PLAZA COMERCIAL ORIENTE S A DE C V
## 5 1598228 7 43593 LA COSMOPOLITANA
## 6 1013817 7 41490 OTBT S A DE C V
Canal ID 8: mainly “no identificado”, bus stations, restaurants
head(sales_per_client %>%
filter(Canal_ID == 8) %>%
arrange(desc(demanda)))
## Cliente_ID Canal_ID demanda NombreCliente
## 1 933857 8 130424 PRIMERA PLUS
## 2 4593591 8 74271 NO IDENTIFICADO
## 3 2384654 8 71586 VENTA DE EMPLEADO FABRICA
## 4 827594 8 69264 MC DONALDS ANTONIO CUAUTITLAN
## 5 2096890 8 58945 MARIANO OTERO VENTA EMPLEADO
## 6 1050905 8 57092 LOS PATOS
Canal ID 9: Carls Jr & Burger King
head(sales_per_client %>%
filter(Canal_ID == 9) %>%
arrange(desc(demanda)))
## Cliente_ID Canal_ID demanda NombreCliente
## 1 312258 9 3295 CARLS JR INSURGENTES
## 2 168621 9 2485 CARLS JR OTAY
## 3 168510 9 2453 CARLS JR AGUASCALIENTES
## 4 4452840 9 2125 NO IDENTIFICADO
## 5 4249188 9 1990 NO IDENTIFICADO
## 6 167355 9 1905 CARLS JR OTAY UNIVERSIDAD
Canal ID 11: mainly bottling companies –> see which products are sold to them
head(sales_per_client %>%
filter(Canal_ID == 11) %>%
arrange(desc(demanda)))
## Cliente_ID Canal_ID demanda NombreCliente
## 1 7727092 11 73653 BEBIDAS MUNDIALES S A DE C V
## 2 7727092 11 73653 BEBIDAS MUNDIALES S A DE C V
## 3 1383948 11 71112 BEBIDAS ENVASADAS DEL PACIFICO S A DE C V
## 4 1383948 11 71112 BEBIDAS ENVASADAS DEL PACIFICO S A DE C V
## 5 872839 11 69572 AUTOBUSES DE LA PIEDAD
## 6 9847617 11 55100 COCA COLA PUENTE ALTO
Many classifications can be done, for which we’d need further exploration. For now we’ll classify them as:
Supermarkets
Restaurants & Cafes
Intermediaries
per_client_type_grouped$customer_type <- ifelse(
per_client_type_grouped$Canal_ID < 5, "supermarket", ifelse(
per_client_type_grouped$Canal_ID == 5, "intermediary", ifelse(
per_client_type_grouped$Canal_ID == 7, "intermediary", ifelse(
per_client_type_grouped$Canal_ID == 11, "intermediary", "restaurant"
))))
supermarket_perweek <- per_client_type_grouped %>%
select(Semana, ventas, devoluciones, demanda, customer_type) %>%
filter(customer_type == "supermarket") %>%
summarise(
ventas = sum(ventas),
devoluciones = sum(devoluciones),
demanda = sum(demanda)
)
intermediary_perweek <- per_client_type_grouped %>%
select(Semana, ventas, devoluciones, demanda, customer_type) %>%
filter(customer_type == "intermediary") %>%
summarise(
ventas = sum(ventas),
devoluciones = sum(devoluciones),
demanda = sum(demanda)
)
restaurant_perweek <- per_client_type_grouped %>%
select(Semana, ventas, devoluciones, demanda, customer_type) %>%
filter(customer_type == "restaurant") %>%
summarise(
ventas = sum(ventas),
devoluciones = sum(devoluciones),
demanda = sum(demanda)
)
Interestingly enough we can see that demand in supermarket has very similar trends, seasonality and random component as Takis:
supermarket_ts <- ts(supermarket_perweek$demanda, frequency = 2)
supermarket_components <- decompose(supermarket_ts)
plot(supermarket_components)
Intermediary channel behaves differently in terms of trend and seasonality, but also has the same random fall at the 2nd week of February, we should have a look at what happened.
intermediary_ts <- ts(intermediary_perweek$demanda, frequency = 2)
intermediary_components <- decompose(intermediary_ts)
plot(intermediary_components)
Quite the opposite way as supermarkets and Takis, demand in restaurants is growing, and has the same two-week seasonality and the same random drop:
restaurant_ts <- ts(restaurant_perweek$demanda, frequency = 2)
restaurant_components <- decompose(restaurant_ts)
plot(restaurant_components)