Easy Manipulation and Visualization


Reference: https://www.kaggle.com/fabienvs/grupo-bimbo-data-analysis

Step 1:

Load all the libraries you need.
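
The original list of libraries is not shown, so here is a minimal sketch of the packages the code below relies on:

library(data.table)  # fread, fwrite, :=, setorderv
library(dplyr)       # %>%, group_by, summarise, joins, bind_cols
library(ggplot2)     # bar charts and the map polygons
library(treemap)     # treemap()
library(maptools)    # readShapePoly() for the Mexico shapefile
library(stringr)     # str_match() for regex extraction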


Step 2:

Read the data.

train <- fread('raw/adjusted_data.csv')
town <- fread('raw/town_state.csv')
client <- fread('raw/cliente_tabla.csv')
Product <- fread('raw/producto_tabla.csv')
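
As a quick sanity check, we can glance at the dimensions and the first few rows of the training table:

dim(train)
head(train)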

Step 3:

Start Exploratory Data Analysis

Part 1: The Existing Variables

Week

First, we check whether demand differs between weeks.

Week <- train %>%
  group_by(Semana) %>%
  summarise(Net = sum(Demanda_uni_equil))
ggplot(Week) +
  geom_bar(aes(x = as.factor(Semana), y = Net), stat = 'identity',
           col = '#317778', fill = '#317778') +
  labs(title = 'Demand by Week', x = 'Week', y = 'Demand')

We can see there is no obvious difference among weeks.

Agencias

According to the introduction on Kaggle, Agencias are the depots where inventory is stored before delivery. We use a treemap to see how demand is distributed across Agencias.

Agencias <- train %>%
  group_by(Agencia_ID) %>%
  summarise(Sales = sum(Demanda_uni_equil))
treemap(Agencias, index = colnames(Agencias)[1],
        vSize = colnames(Agencias)[2],
        vColor = '#317778',
        title = 'Demand by Agencias')

However, the color carries no information here, which means we can encode something more on this treemap. We use Venta_hoy (the price) as the color, and we keep only the 60 Agencias with the highest demand.

Agencias <- train %>%
  group_by(Agencia_ID) %>%
  summarise(Sales = sum(Demanda_uni_equil),
            Price = mean(Venta_hoy)) %>%
  arrange(desc(Sales))
treemap(Agencias[1:60,],index = colnames(Agencias)[1],
        vSize = colnames(Agencias)[2],
        vColor = colnames(Agencias)[3],
        palette=c('#FFFFFF','#FFFFFF','#025055'),
        type='value',
        title.legend = 'Average Price',
        title = 'Demand by Agencias')

State

We can merge state information from the town table into the Agencias summary.


Agencias <- left_join(Agencias, town, by='Agencia_ID')
AgenciasState <- Agencias %>%
  group_by(State) %>%
  summarise(Sales = sum(Sales))
# map the raw state names in town_state.csv to clean names matching the shapefile
state_map = c("Tabasco", "Tlaxcala", "Baja California Sur", "Yucatan", "Campeche", "Baja California", "Queretaro", "Tamaulipas", "Sinaloa", "San Luis Potosi", "Chihuahua", "None", "None", "Quintana Roo", "Veracruz", "Colima", "Coahuila", "Mexico", "Guerrero", "Michoacan", "Chiapas", "Guanajuato", "Zacatecas", "Durango", "Puebla", "Sonora", "Oaxaca", "Aguascalientes", "Jalisco", "Morelos", "Hidalgo", "Nayarit", "Nuevo Leon")
names(state_map) = c("TABASCO", "TLAXCALA", "BAJA CALIFORNIA SUR", "YUCATÁN", "CAMPECHE", "BAJA CALIFORNIA NORTE", "QUERETARO", "TAMAULIPAS", "SINALOA", "SAN LUIS POTOSÍ", "CHIHUAHUA", "ESTADO DE MÉXICO", "Queretaro de Arteaga", "QUINTANA ROO", "VERACRUZ", "COLIMA", "COAHUILA", "MÉXICO, D.F.", "GUERRERO", "MICHOACÁN", "CHIAPAS", "GUANAJUATO", "ZACATECAS", "DURANGO", "PUEBLA", "SONORA", "OAXACA", "AGUASCALIENTES", "JALISCO", "MORELOS", "HIDALGO", "NAYARIT", "NUEVO LEÓN")
AgenciasState$State <- sapply(AgenciasState$State, function(x) state_map[x])
setorderv(AgenciasState, 'Sales', order=-1)
# shade states by demand rank: darkest blue for the top 10, then the next 10, then the rest
colors = c(rep("#3366FF", 10), rep("#6699FF", 10), rep("#66CCFF", 20))
plot = ggplot()
area = readShapePoly("mexstates/mexstates.shp")
## Warning: use rgdal::readOGR or sf::st_read
# draw each of the 32 state polygons, filled according to its demand rank
for (i in 1:32) {
  fill = colors[match(area$ADMIN_NAME[i], AgenciasState$State)]
  plot = plot + geom_polygon(data=area[i, 3], aes(long, lat, group = group),
                             colour = alpha("darkred", 1/2), size = 0.7, fill = fill, alpha = .5)
}
print(plot)
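
As the warning above suggests, readShapePoly has been retired; the same choropleth can be sketched with sf::st_read and geom_sf instead. This is only a sketch, assuming the shapefile exposes the same ADMIN_NAME attribute:

library(sf)
mex <- st_read("mexstates/mexstates.shp")
# attach state-level demand; states not present in AgenciasState stay NA
mex$Sales <- AgenciasState$Sales[match(mex$ADMIN_NAME, AgenciasState$State)]
ggplot(mex) +
  geom_sf(aes(fill = Sales), colour = alpha("darkred", 1/2)) +
  scale_fill_gradient(low = "#66CCFF", high = "#3366FF", na.value = "grey90") +
  labs(title = "Demand by State")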

Product

We can also look at how each product contributes to demand. Before we start, we should keep the 80/20 rule in mind.

ProductTrain <- train %>%
  group_by(Producto_ID) %>%
  summarise( Sales = sum(Demanda_uni_equil)) %>%
  mutate( P_Character = factor(Producto_ID,
                                  levels =    
                              as.character(Producto_ID)))%>%
  arrange(desc(Sales))
ggplot(ProductTrain) +
  geom_bar(aes(x = P_Character, y = Sales), stat = 'identity',
           col = '#317778', fill = '#317778') +
  scale_x_discrete(limits = ProductTrain$P_Character) +
  labs(title = 'Demand by Product', x = 'Product', y = 'Demand')
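
To make the 80/20 point concrete, we can check how many products account for 80% of total demand, using the ProductTrain table built above:

CumShare <- cumsum(ProductTrain$Sales) / sum(ProductTrain$Sales)
which(CumShare >= 0.8)[1]   # number of top products covering 80% of demand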

So we keep only the top 250 products, about 25% of them, as the target for our treemap.

ProductTrain <- train %>%
  group_by(Producto_ID) %>%
  summarise( Sales = sum(Demanda_uni_equil),
             Price = mean(Venta_hoy)) %>%
  arrange(desc(Sales))
treemap(ProductTrain[1:250,],index = colnames(ProductTrain)[1],
        vSize = colnames(ProductTrain)[2],
        vColor = colnames(ProductTrain)[3],
        palette=c('#FFFFFF','#FFFFFF','#025055'),
        type='value',
        title.legend = 'Average Price',
        title = 'Demand by Product')

Part 2: The Generated Variables

Grupo Bimbo has also provided further information about the products. By manipulating producto_tabla.csv and extracting features from it, we can find something interesting. First, we join the two tables by Producto_ID.

ProductInfo <- train %>%
  inner_join(Product,by='Producto_ID')

We can easily see that NombreProducto consists of three parts:

* The full name of the product
* The weight, in either g or Kg
* The packing style, i.e. how many pieces the product contains

In order to extract all of this information, we use the stringr package and some regular expressions.

Extraction of Flavor

# regex patterns for a few flavors / product types of interest
FlavorName <- c('Choco','Va(i)?nilla','Multigrano','Sandwich','Clas(s)?ic')
# flag each train row with 1 if its product name matches the first pattern, else 0
FlavorTable <- data.table(
  ifelse(ProductInfo$NombreProducto %in%
  Product$NombreProducto[grep(FlavorName[1],Product$NombreProducto)],1,0)
)
colnames(FlavorTable) <- FlavorName[1]

# repeat for the remaining flavor patterns, adding one indicator column each
for(i in 2:length(FlavorName)){
FlavorTable <- FlavorTable[,FlavorName[i] := 
                             ifelse(ProductInfo$NombreProducto %in%
                                    Product$NombreProducto[grep(FlavorName[i],Product$NombreProducto)],1,0)
                          ]
}
FlavorTable[1:20,]
##     Choco Va(i)?nilla Multigrano Sandwich Clas(s)?ic
##  1:     0           0          0        0          0
##  2:     0           0          0        0          0
##  3:     0           0          0        0          0
##  4:     0           1          0        0          0
##  5:     0           0          0        0          0
##  6:     0           0          0        0          0
##  7:     0           0          0        0          0
##  8:     0           0          0        0          0
##  9:     0           0          0        0          0
## 10:     0           0          0        0          0
## 11:     0           0          0        0          0
## 12:     1           0          0        0          0
## 13:     0           0          0        0          0
## 14:     0           0          0        0          0
## 15:     0           0          0        0          0
## 16:     0           0          0        0          0
## 17:     0           0          0        0          0
## 18:     0           0          0        0          0
## 19:     0           0          0        0          0
## 20:     0           0          0        0          0
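
As a quick sanity check, we can count how many rows were flagged for each flavor pattern:

colSums(FlavorTable)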

Extraction of Weight and Packing

Weight

The unit of weight is either g or Kg. We can extract the exact number by matching this pattern. Since the data.table is fairly large, this may take a while to finish.

library(stringr)
weight <- function(x){
  StringList <- strsplit(x, " ")[[1]]
  # find the token that looks like a weight, e.g. "125g" or "5Kg"
  Index <- grep("\\d+(Kg|g)", StringList)
  weight <- str_match(StringList[Index], "(\\d+)(Kg|g)")
  WeightFinal <- as.numeric(weight[2])
  # express everything in grams
  if (isTRUE(weight[3] == 'Kg')) WeightFinal <- WeightFinal * 1000
  return(WeightFinal)
}
w <- unlist(lapply(ProductInfo$NombreProducto, function(x) weight(x)))
w[1:100]
##   [1]  120  135  140  125  105  105   66  120   69   50  106   84   67   90
##  [15]  113   55   98   92   62   93   80  123   34  255  680  640  680 5000
##  [29]  640  675  640  577  480 1200  450  130  125  105  105   66   69   50
##  [43]  106   67   90  113   92   80  125  105   34  120  130  125  105   50
##  [57]   67   67   90  113   93   34  120  135  125  105   50   67  114  114
##  [71] 4000  255  675  210 5000  640  360  100  630  625  500  170  340  450
##  [85]  255  675  125  105   66  640  114 5000  120  135  130  140  125  105
##  [99]  105   66

Packing

The unit of packing is p, which refers to pieces. Again, we can extract the exact number by matching the pattern, and again it may take a while on a table this size.

packing <- function(x){
  StringList <- strsplit(x, " ")[[1]]
  # find the token that looks like a piece count, e.g. "2p" or "10p"
  Index <- grep("\\d+p", StringList)
  Packing <- str_match(StringList[Index], "(\\d+)(p)")[2]
  # products with no piece count in the name get 0
  if (is.na(Packing)) Packing <- 0
  PackingFinal <- as.numeric(Packing)
  return(PackingFinal)
}
p <- unlist(lapply(ProductInfo$NombreProducto, function(x) packing(x)))
p[1:100]
##   [1]   2   2   2   4   6   4   4   8   8   0  10   0   0   0   6   2  10
##  [18]  10   1   3   2   0   0  10   0   0   0   0   0   0   0  24   6  15
##  [35]   8   6   4   6   4   4   8   0  10   0   0   6  10   2   4   4   0
##  [52]   2   6   4   4   0   0   0   0   6   3   0   2   2   4   6   0   0
##  [69]   3   3 200  10   0   0   0   0  30   0  12  10  16   0   8   8  10
##  [86]   0   4   4   4   0   3   0   2   2   6   2   4   6   4   4
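
It may also be worth checking what share of rows had no piece count in the product name (these were set to 0 above):

mean(p == 0)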

Step 4:

Put them all together.

# append the flavor flags, weight, and piece count to the training data
NewData <- train %>%
  bind_cols(FlavorTable) %>%
  bind_cols(data.table(w)) %>%
  bind_cols(data.table(p))
fwrite(NewData, file="train_merged.csv")
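
As a final check, we can read the file back and confirm the dimensions match:

check <- fread('train_merged.csv')
identical(dim(check), dim(NewData))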