Contexto

Para Arca Continental, es vital que las tienditas de la esquina sigan existiendo ya que los mayores márgenes de ganancia en la venta de sus productos los obteneien a través de este canal.

Instalar paquetes y llamar librerias

#install.packages("tidyverse")
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#install.packages("dplyr")
library(dplyr)
#install.packages("lubridate")
library(lubridate)
#install.packages("plyr")
library(plyr)

## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Adjuntando el paquete: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:purrr':
## 
##     compact

#install.packages("readxl")
library(readxl)
#install.packages("ggplot2")
library(ggplot2)
#install.packages("forecast")
library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

Importar la base de datos

# file.choose()
df <- read_excel("C:\\Users\\cesar\\Downloads\\Datos Arca Continental Original.xlsx")

## Warning: Expecting numeric in N184066 / R184066C14: got a date

## Warning: Expecting numeric in O184066 / R184066C15: got a date

## Warning: Expecting numeric in P184066 / R184066C16: got a date

## Warning: Expecting numeric in Q184066 / R184066C17: got a date

## Warning: Expecting numeric in R184066 / R184066C18: got a date

## Warning: Expecting numeric in S184066 / R184066C19: got a date

## Warning: Expecting numeric in T184066 / R184066C20: got a date

## Warning: Expecting numeric in U184066 / R184066C21: got a date

## Warning: Expecting numeric in V184066 / R184066C22: got a date

## Warning: Expecting numeric in W184066 / R184066C23: got a date

## Warning: Expecting numeric in X184066 / R184066C24: got a date

## Warning: Expecting numeric in Y184066 / R184066C25: got a date

Entender la base de datos

summary(df)

##        ID              Año        Territorio        Sub Territorio    
##  Min.   :     1   Min.   :2016   Length:466509      Length:466509     
##  1st Qu.:116628   1st Qu.:2017   Class :character   Class :character  
##  Median :233255   Median :2018   Mode  :character   Mode  :character  
##  Mean   :233255   Mean   :2018                                        
##  3rd Qu.:349882   3rd Qu.:2019                                        
##  Max.   :466509   Max.   :2019                                        
##                                                                       
##      CEDI             Cliente             Nombre          Tamaño Cte Industria
##  Length:466509      Length:466509      Length:466509      Length:466509       
##  Class :character   Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character    
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##  Segmento Det          Marca           Presentacion          Tamaño         
##  Length:466509      Length:466509      Length:466509      Length:466509     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Retornable_NR          Enero            Febrero             Marzo         
##  Length:466509      Min.   :  -19.0   Min.   :  -11.00   Min.   :  -32.00  
##  Class :character   1st Qu.:    1.0   1st Qu.:    1.00   1st Qu.:    1.00  
##  Mode  :character   Median :    2.0   Median :    2.00   Median :    3.00  
##                     Mean   :   10.1   Mean   :    9.76   Mean   :   11.36  
##                     3rd Qu.:    6.0   3rd Qu.:    6.00   3rd Qu.:    6.00  
##                     Max.   :42736.0   Max.   :42767.00   Max.   :42795.00  
##                     NA's   :233480    NA's   :231213     NA's   :227420    
##      Abril               Mayo              Junio             Julio         
##  Min.   :  -70.00   Min.   : -106.00   Min.   : -211.0   Min.   :  -60.00  
##  1st Qu.:    1.00   1st Qu.:    1.00   1st Qu.:    1.0   1st Qu.:    1.00  
##  Median :    3.00   Median :    3.00   Median :    3.0   Median :    2.00  
##  Mean   :   11.71   Mean   :   12.75   Mean   :   12.2   Mean   :   11.75  
##  3rd Qu.:    6.00   3rd Qu.:    7.00   3rd Qu.:    6.0   3rd Qu.:    6.00  
##  Max.   :42826.00   Max.   :42856.00   Max.   :42887.0   Max.   :42917.00  
##  NA's   :224057     NA's   :216910     NA's   :215753    NA's   :223411    
##      Agosto           Septiembre         Octubre          Noviembre      
##  Min.   : -211.00   Min.   : -527.0   Min.   :  -38.0   Min.   :  -25.0  
##  1st Qu.:    1.00   1st Qu.:    1.0   1st Qu.:    1.0   1st Qu.:    1.0  
##  Median :    3.00   Median :    3.0   Median :    3.0   Median :    3.0  
##  Mean   :   11.98   Mean   :   13.4   Mean   :   13.7   Mean   :   13.3  
##  3rd Qu.:    6.00   3rd Qu.:    7.0   3rd Qu.:    7.0   3rd Qu.:    6.0  
##  Max.   :42948.00   Max.   :42979.0   Max.   :43009.0   Max.   :43040.0  
##  NA's   :220242     NA's   :337314    NA's   :338386    NA's   :338460   
##    Diciembre      
##  Min.   :  -28.0  
##  1st Qu.:    1.0  
##  Median :    3.0  
##  Mean   :   14.8  
##  3rd Qu.:    7.0  
##  Max.   :43070.0  
##  NA's   :341855

str(df)

## tibble [466,509 × 25] (S3: tbl_df/tbl/data.frame)
##  $ ID                  : num [1:466509] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Año                 : num [1:466509] 2016 2016 2016 2016 2016 ...
##  $ Territorio          : chr [1:466509] "Guadalajara" "Guadalajara" "Guadalajara" "Guadalajara" ...
##  $ Sub Territorio      : chr [1:466509] "Belenes" "Belenes" "Belenes" "Belenes" ...
##  $ CEDI                : chr [1:466509] "Suc. Belenes" "Suc. Belenes" "Suc. Belenes" "Suc. Belenes" ...
##  $ Cliente             : chr [1:466509] "77737" "77737" "77737" "77737" ...
##  $ Nombre              : chr [1:466509] "ABARR" "ABARR" "ABARR" "ABARR" ...
##  $ Tamaño Cte Industria: chr [1:466509] "Extra Grande" "Extra Grande" "Extra Grande" "Extra Grande" ...
##  $ Segmento Det        : chr [1:466509] "Agua Mineral" "Agua Purificada" "Agua Purificada" "Agua Saborizada" ...
##  $ Marca               : chr [1:466509] "Topo Chico A.M." "Ciel Agua Purificada" "Ciel Agua Purificada" "Ciel Exprim" ...
##  $ Presentacion        : chr [1:466509] "600 ml NR" "1 Ltro. N.R." "1.5 Lts. NR" "600 ml NR" ...
##  $ Tamaño              : chr [1:466509] "Individual" "Individual" "Individual" "Individual" ...
##  $ Retornable_NR       : chr [1:466509] "No Retornable" "No Retornable" "No Retornable" "No Retornable" ...
##  $ Enero               : num [1:466509] NA NA NA NA NA NA 1 NA 3 NA ...
##  $ Febrero             : num [1:466509] NA 2 NA NA NA NA NA 1 3 NA ...
##  $ Marzo               : num [1:466509] NA 8 3 NA NA 1 NA NA 4 NA ...
##  $ Abril               : num [1:466509] NA 4 6 NA NA NA NA 1 4 NA ...
##  $ Mayo                : num [1:466509] NA 4 3 NA NA NA 0 NA 4 NA ...
##  $ Junio               : num [1:466509] NA 2 3 NA NA NA NA 1 4 0 ...
##  $ Julio               : num [1:466509] NA 2 3 NA NA NA 0 NA 4 NA ...
##  $ Agosto              : num [1:466509] NA 2 3 NA NA NA NA 1 7 NA ...
##  $ Septiembre          : num [1:466509] NA 2 3 NA NA NA NA 1 4 NA ...
##  $ Octubre             : num [1:466509] NA 2 3 NA NA NA 0 NA 3 NA ...
##  $ Noviembre           : num [1:466509] NA 4 3 NA 0 NA NA NA 1 NA ...
##  $ Diciembre           : num [1:466509] 1 2 3 1 NA NA NA NA 3 NA ...

head(df, n=10)

## # A tibble: 10 × 25
##       ID   Año Territorio  `Sub Territorio` CEDI         Cliente Nombre
##    <dbl> <dbl> <chr>       <chr>            <chr>        <chr>   <chr> 
##  1     1  2016 Guadalajara Belenes          Suc. Belenes 77737   ABARR 
##  2     2  2016 Guadalajara Belenes          Suc. Belenes 77737   ABARR 
##  3     3  2016 Guadalajara Belenes          Suc. Belenes 77737   ABARR 
##  4     4  2016 Guadalajara Belenes          Suc. Belenes 77737   ABARR 
##  5     5  2016 Guadalajara Belenes          Suc. Belenes 77737   ABARR 
##  6     6  2016 Guadalajara Belenes          Suc. Belenes 77737   ABARR 
##  7     7  2016 Guadalajara Belenes          Suc. Belenes 77737   ABARR 
##  8     8  2016 Guadalajara Belenes          Suc. Belenes 77737   ABARR 
##  9     9  2016 Guadalajara Belenes          Suc. Belenes 77737   ABARR 
## 10    10  2016 Guadalajara Belenes          Suc. Belenes 77737   ABARR 
## # ℹ 18 more variables: `Tamaño Cte Industria` <chr>, `Segmento Det` <chr>,
## #   Marca <chr>, Presentacion <chr>, Tamaño <chr>, Retornable_NR <chr>,
## #   Enero <dbl>, Febrero <dbl>, Marzo <dbl>, Abril <dbl>, Mayo <dbl>,
## #   Junio <dbl>, Julio <dbl>, Agosto <dbl>, Septiembre <dbl>, Octubre <dbl>,
## #   Noviembre <dbl>, Diciembre <dbl>

tail(df, n=10)

## # A tibble: 10 × 25
##        ID   Año Territorio  `Sub Territorio` CEDI           Cliente Nombre
##     <dbl> <dbl> <chr>       <chr>            <chr>          <chr>   <chr> 
##  1 466500  2019 Guadalajara Huentitán        Suc. Huentitán 4531    HECTO 
##  2 466501  2019 Guadalajara Huentitán        Suc. Huentitán 4531    HECTO 
##  3 466502  2019 Guadalajara Huentitán        Suc. Huentitán 4531    HECTO 
##  4 466503  2019 Guadalajara Huentitán        Suc. Huentitán 4531    HECTO 
##  5 466504  2019 Guadalajara Huentitán        Suc. Huentitán 4531    HECTO 
##  6 466505  2019 Guadalajara Huentitán        Suc. Huentitán 4531    HECTO 
##  7 466506  2019 Guadalajara Huentitán        Suc. Huentitán 4531    HECTO 
##  8 466507  2019 Guadalajara Huentitán        Suc. Huentitán 4531    HECTO 
##  9 466508  2019 Guadalajara Huentitán        Suc. Huentitán 4531    HECTO 
## 10 466509  2019 Guadalajara Huentitán        Suc. Huentitán 4531    HECTO 
## # ℹ 18 more variables: `Tamaño Cte Industria` <chr>, `Segmento Det` <chr>,
## #   Marca <chr>, Presentacion <chr>, Tamaño <chr>, Retornable_NR <chr>,
## #   Enero <dbl>, Febrero <dbl>, Marzo <dbl>, Abril <dbl>, Mayo <dbl>,
## #   Junio <dbl>, Julio <dbl>, Agosto <dbl>, Septiembre <dbl>, Octubre <dbl>,
## #   Noviembre <dbl>, Diciembre <dbl>

colnames(df) <- gsub(" ", "_", colnames(df))
dplyr::count(df,Territorio, sort=TRUE)

## # A tibble: 2 × 2
##   Territorio       n
##   <chr>        <int>
## 1 Guadalajara 466508
## 2 Territorio       1

dplyr::count(df,Sub_Territorio, sort=TRUE)

## # A tibble: 4 × 2
##   Sub_Territorio      n
##   <chr>           <int>
## 1 Belenes        208982
## 2 Huentitán      144196
## 3 Toluquilla     113330
## 4 Sub Territorio      1

dplyr::count(df,CEDI, sort=TRUE)

## # A tibble: 4 × 2
##   CEDI                 n
##   <chr>            <int>
## 1 Suc. Belenes    208982
## 2 Suc. Huentitán  144196
## 3 Suc. Toluquilla 113330
## 4 CEDI                 1

dplyr::count(df,Cliente, sort=TRUE)

## # A tibble: 5,249 × 2
##    Cliente     n
##    <chr>   <int>
##  1 0286      647
##  2 2912      586
##  3 2661      537
##  4 7821      531
##  5 1859      525
##  6 5583      516
##  7 9998      508
##  8 3601      506
##  9 5879      499
## 10 0335      496
## # ℹ 5,239 more rows

dplyr::count(df,Nombre, sort=TRUE)

## # A tibble: 1,090 × 2
##    Nombre     n
##    <chr>  <int>
##  1 ABARR  71186
##  2 MARIA  39816
##  3 JOSE   17479
##  4 JUAN    7580
##  5 MARTH   5759
##  6 MISCE   5700
##  7 LUIS    5585
##  8 SUPER   4565
##  9 CARLO   3991
## 10 ROSA    3890
## # ℹ 1,080 more rows

dplyr::count(df,Tamaño_Cte_Industria, sort=TRUE)

## # A tibble: 5 × 2
##   Tamaño_Cte_Industria      n
##   <chr>                 <int>
## 1 Extra Grande         230190
## 2 Micro                117110
## 3 Pequeño               77875
## 4 Grande                41333
## 5 Tamaño Cte Industria      1

dplyr::count(df,Segmento_Det, sort=TRUE)

## # A tibble: 21 × 2
##    Segmento_Det            n
##    <chr>               <int>
##  1 Sabores Regular    156242
##  2 Colas Regular       95720
##  3 Colas Light         43807
##  4 Jugos y Néctares    33362
##  5 Bebidas de Fruta    30641
##  6 Agua Purificada     20766
##  7 Agua Mineral        12590
##  8 Isotónicos Regular  11905
##  9 Té Regular          10062
## 10 Agua Saborizada     10056
## # ℹ 11 more rows

dplyr::count(df,Marca, sort=TRUE)

## # A tibble: 56 × 2
##    Marca                    n
##    <chr>                <int>
##  1 Coca-Cola            95720
##  2 Sprite               37925
##  3 Fanta                35728
##  4 Fresca               26435
##  5 Manzana Lift         25598
##  6 Coca-Cola Light      21926
##  7 Del Valle            21325
##  8 Ciel Agua Purificada 20766
##  9 Sidral Mundet        17150
## 10 Valle Frut           15808
## # ℹ 46 more rows

dplyr::count(df,Presentacion, sort=TRUE)

## # A tibble: 57 × 2
##    Presentacion         n
##    <chr>            <int>
##  1 600 ml NR        74008
##  2 1 Ltro. N.R.     36930
##  3 2 Lts. NR        36415
##  4 500 ml Ret       35165
##  5 1.5 Lts. NR      30637
##  6 Lata 235 ml      24551
##  7 400 ml NR        22877
##  8 250 ml. NR PET   21735
##  9 500 ml NR Vidrio 18758
## 10 2.5 Lts. NR      13235
## # ℹ 47 more rows

dplyr::count(df,Tamaño, sort=TRUE)

## # A tibble: 3 × 2
##   Tamaño          n
##   <chr>       <int>
## 1 Individual 328513
## 2 Familiar   137995
## 3 Tamaño          1

dplyr::count(df,Retornable_NR, sort=TRUE)

## # A tibble: 3 × 2
##   Retornable_NR      n
##   <chr>          <int>
## 1 No Retornable 403226
## 2 Retornable     63282
## 3 Retornable_NR      1

Limpiar la base de datos

Técnica 1. Remover valores irrelevantes

df1 <- df
df1 <- df1[-184085, ]
summary(df1)

##        ID              Año        Territorio        Sub_Territorio    
##  Min.   :     1   Min.   :2016   Length:466508      Length:466508     
##  1st Qu.:116628   1st Qu.:2017   Class :character   Class :character  
##  Median :233256   Median :2018   Mode  :character   Mode  :character  
##  Mean   :233255   Mean   :2018                                        
##  3rd Qu.:349882   3rd Qu.:2019                                        
##  Max.   :466509   Max.   :2019                                        
##                                                                       
##      CEDI             Cliente             Nombre          Tamaño_Cte_Industria
##  Length:466508      Length:466508      Length:466508      Length:466508       
##  Class :character   Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character    
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##  Segmento_Det          Marca           Presentacion          Tamaño         
##  Length:466508      Length:466508      Length:466508      Length:466508     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Retornable_NR          Enero            Febrero             Marzo         
##  Length:466508      Min.   :  -19.0   Min.   :  -11.00   Min.   :  -32.00  
##  Class :character   1st Qu.:    1.0   1st Qu.:    1.00   1st Qu.:    1.00  
##  Mode  :character   Median :    2.0   Median :    2.00   Median :    3.00  
##                     Mean   :   10.1   Mean   :    9.76   Mean   :   11.36  
##                     3rd Qu.:    6.0   3rd Qu.:    6.00   3rd Qu.:    6.00  
##                     Max.   :42736.0   Max.   :42767.00   Max.   :42795.00  
##                     NA's   :233480    NA's   :231213     NA's   :227420    
##      Abril               Mayo              Junio             Julio         
##  Min.   :  -70.00   Min.   : -106.00   Min.   : -211.0   Min.   :  -60.00  
##  1st Qu.:    1.00   1st Qu.:    1.00   1st Qu.:    1.0   1st Qu.:    1.00  
##  Median :    3.00   Median :    3.00   Median :    3.0   Median :    2.00  
##  Mean   :   11.71   Mean   :   12.75   Mean   :   12.2   Mean   :   11.75  
##  3rd Qu.:    6.00   3rd Qu.:    7.00   3rd Qu.:    6.0   3rd Qu.:    6.00  
##  Max.   :42826.00   Max.   :42856.00   Max.   :42887.0   Max.   :42917.00  
##  NA's   :224057     NA's   :216910     NA's   :215753    NA's   :223411    
##      Agosto           Septiembre         Octubre          Noviembre      
##  Min.   : -211.00   Min.   : -527.0   Min.   :  -38.0   Min.   :  -25.0  
##  1st Qu.:    1.00   1st Qu.:    1.0   1st Qu.:    1.0   1st Qu.:    1.0  
##  Median :    3.00   Median :    3.0   Median :    3.0   Median :    3.0  
##  Mean   :   11.98   Mean   :   13.4   Mean   :   13.7   Mean   :   13.3  
##  3rd Qu.:    6.00   3rd Qu.:    7.0   3rd Qu.:    7.0   3rd Qu.:    6.0  
##  Max.   :42948.00   Max.   :42979.0   Max.   :43009.0   Max.   :43040.0  
##  NA's   :220242     NA's   :337314    NA's   :338385    NA's   :338460   
##    Diciembre      
##  Min.   :  -28.0  
##  1st Qu.:    1.0  
##  Median :    3.0  
##  Mean   :   14.8  
##  3rd Qu.:    7.0  
##  Max.   :43070.0  
##  NA's   :341855

Técnica 2. Remover valores duplicados

# Cuantos renglones duplicados tenemos
sum(duplicated(df1))

## [1] 0

# Cuales son los renglones duplicados 
df1[duplicated(df1),]

## # A tibble: 0 × 25
## # ℹ 25 variables: ID <dbl>, Año <dbl>, Territorio <chr>, Sub_Territorio <chr>,
## #   CEDI <chr>, Cliente <chr>, Nombre <chr>, Tamaño_Cte_Industria <chr>,
## #   Segmento_Det <chr>, Marca <chr>, Presentacion <chr>, Tamaño <chr>,
## #   Retornable_NR <chr>, Enero <dbl>, Febrero <dbl>, Marzo <dbl>, Abril <dbl>,
## #   Mayo <dbl>, Junio <dbl>, Julio <dbl>, Agosto <dbl>, Septiembre <dbl>,
## #   Octubre <dbl>, Noviembre <dbl>, Diciembre <dbl>

Técnica 3. Eliminar errores tipograficos renglones duplicados

# Cantidades absolutas
df2 <- df1
df2$Enero <- abs(df2$Enero)
df2$Febrero <- abs(df2$Febrero)
df2$Marzo <- abs(df2$Marzo)
df2$Abril <- abs(df2$Abril)
df2$Mayo <- abs(df2$Mayo)
df2$Junio <- abs(df2$Junio)
df2$Julio <- abs(df2$Julio)
df2$Agosto <- abs(df2$Agosto)
df2$Septiembre <- abs(df2$Septiembre)
df2$Octubre <- abs(df2$Octubre)
df2$Noviembre <- abs(df2$Noviembre)
df2$Diciembre <- abs(df2$Diciembre)
summary(df2)

##        ID              Año        Territorio        Sub_Territorio    
##  Min.   :     1   Min.   :2016   Length:466508      Length:466508     
##  1st Qu.:116628   1st Qu.:2017   Class :character   Class :character  
##  Median :233256   Median :2018   Mode  :character   Mode  :character  
##  Mean   :233255   Mean   :2018                                        
##  3rd Qu.:349882   3rd Qu.:2019                                        
##  Max.   :466509   Max.   :2019                                        
##                                                                       
##      CEDI             Cliente             Nombre          Tamaño_Cte_Industria
##  Length:466508      Length:466508      Length:466508      Length:466508       
##  Class :character   Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character    
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##  Segmento_Det          Marca           Presentacion          Tamaño         
##  Length:466508      Length:466508      Length:466508      Length:466508     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Retornable_NR          Enero            Febrero             Marzo         
##  Length:466508      Min.   :    0.0   Min.   :    0.00   Min.   :    0.00  
##  Class :character   1st Qu.:    1.0   1st Qu.:    1.00   1st Qu.:    1.00  
##  Mode  :character   Median :    2.0   Median :    2.00   Median :    3.00  
##                     Mean   :   10.1   Mean   :    9.76   Mean   :   11.36  
##                     3rd Qu.:    6.0   3rd Qu.:    6.00   3rd Qu.:    6.00  
##                     Max.   :42736.0   Max.   :42767.00   Max.   :42795.00  
##                     NA's   :233480    NA's   :231213     NA's   :227420    
##      Abril               Mayo              Junio              Julio         
##  Min.   :    0.00   Min.   :    0.00   Min.   :    0.00   Min.   :    0.00  
##  1st Qu.:    1.00   1st Qu.:    1.00   1st Qu.:    1.00   1st Qu.:    1.00  
##  Median :    3.00   Median :    3.00   Median :    3.00   Median :    2.00  
##  Mean   :   11.71   Mean   :   12.76   Mean   :   12.21   Mean   :   11.76  
##  3rd Qu.:    6.00   3rd Qu.:    7.00   3rd Qu.:    6.00   3rd Qu.:    6.00  
##  Max.   :42826.00   Max.   :42856.00   Max.   :42887.00   Max.   :42917.00  
##  NA's   :224057     NA's   :216910     NA's   :215753     NA's   :223411    
##      Agosto           Septiembre         Octubre          Noviembre      
##  Min.   :    0.00   Min.   :    0.0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    1.00   1st Qu.:    1.0   1st Qu.:    1.0   1st Qu.:    1.0  
##  Median :    3.00   Median :    3.0   Median :    3.0   Median :    3.0  
##  Mean   :   11.99   Mean   :   13.4   Mean   :   13.7   Mean   :   13.4  
##  3rd Qu.:    6.00   3rd Qu.:    7.0   3rd Qu.:    7.0   3rd Qu.:    6.0  
##  Max.   :42948.00   Max.   :42979.0   Max.   :43009.0   Max.   :43040.0  
##  NA's   :220242     NA's   :337314    NA's   :338385    NA's   :338460   
##    Diciembre      
##  Min.   :    0.0  
##  1st Qu.:    1.0  
##  Median :    3.0  
##  Mean   :   14.8  
##  3rd Qu.:    7.0  
##  Max.   :43070.0  
##  NA's   :341855

Técnica 4. Convertir tipos de datos

# No aplica

Técnica 5. Reemplazar valores faltantes

# ¿Cuántos NAs tengo en la base de datos?
sum(is.na(df2))

## [1] 3148500

# ¿Cuántos NAs tengo por variable?
sapply(df2, function(x) sum(is.na(x)))

##                   ID                  Año           Territorio 
##                    0                    0                    0 
##       Sub_Territorio                 CEDI              Cliente 
##                    0                    0                    0 
##               Nombre Tamaño_Cte_Industria         Segmento_Det 
##                    0                    0                    0 
##                Marca         Presentacion               Tamaño 
##                    0                    0                    0 
##        Retornable_NR                Enero              Febrero 
##                    0               233480               231213 
##                Marzo                Abril                 Mayo 
##               227420               224057               216910 
##                Junio                Julio               Agosto 
##               215753               223411               220242 
##           Septiembre              Octubre            Noviembre 
##               337314               338385               338460 
##            Diciembre 
##               341855

# Opción 1. Borrrar todos los NAs de una tabla

# bd100 <- na.omit(bd)

# Opción 2. Remplazar los NAs con CEROS. 

df3 <- df2
df3[is.na(df3)] <- 0

# Opción 3. Reemplazar los NAs con el PROMEDIO

# bd102 < bd
# bd102$PLU[is.na(bd102$PLU)]<- mean(bd102$PLU, na.rm=TRUE)

Técnica 6. Correciones por métodos estadisticos

boxplot(df3$Enero, horizontal=TRUE)

boxplot(df3$Febrero, horizontal=TRUE)

boxplot(df3$Marzo, horizontal=TRUE)

boxplot(df3$Abril, horizontal=TRUE)

boxplot(df3$Mayo, horizontal=TRUE)

boxplot(df3$Junio, horizontal=TRUE)

LS0tDQp0aXRsZTogIkFyY2EgQ29udGluZW50YWwiDQphdXRob3I6ICJKdWxpbyBDw6lzYXIgU2lsdmEgZGUgbGEgUm9zYSBBMDA4MzkxNDciDQpkYXRlOiAiMjAyNS0wMy0yMCINCm91dHB1dDogDQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdG9jOiBUUlVFDQogICAgdG9jX2Zsb2F0OiBUUlVFDQogICAgY29kZV9kb3dubG9hZDogVFJVRQ0KICAgIHRoZW1lOiAicGFwZXIiIA0KLS0tDQoNCiFbXShodHRwczovL2kuZ2lmZXIuY29tLzc3ZUcuZ2lmKQ0KDQojICoqQ29udGV4dG8qKg0KUGFyYSAqKkFyY2EgQ29udGluZW50YWwqKiwgIGVzIHZpdGFsIHF1ZSBsYXMgdGllbmRpdGFzIGRlIGxhIGVzcXVpbmEgc2lnYW4gZXhpc3RpZW5kbyB5YSBxdWUgbG9zIG1heW9yZXMgbcOhcmdlbmVzIGRlIGdhbmFuY2lhIGVuIGxhIHZlbnRhIGRlIHN1cyBwcm9kdWN0b3MgbG9zIG9idGVuZWllbiBhIHRyYXbDqXMgZGUgZXN0ZSBjYW5hbC4gDQoNCiMgKipJbnN0YWxhciBwYXF1ZXRlcyB5IGxsYW1hciBsaWJyZXJpYXMqKg0KYGBge3IgbWVzc2FnZT1UUlVFLCB3YXJuaW5nPVRSVUV9DQojaW5zdGFsbC5wYWNrYWdlcygidGlkeXZlcnNlIikNCmxpYnJhcnkodGlkeXZlcnNlKQ0KI2luc3RhbGwucGFja2FnZXMoImRwbHlyIikNCmxpYnJhcnkoZHBseXIpDQojaW5zdGFsbC5wYWNrYWdlcygibHVicmlkYXRlIikNCmxpYnJhcnkobHVicmlkYXRlKQ0KI2luc3RhbGwucGFja2FnZXMoInBseXIiKQ0KbGlicmFyeShwbHlyKQ0KI2luc3RhbGwucGFja2FnZXMoInJlYWR4bCIpDQpsaWJyYXJ5KHJlYWR4bCkNCiNpbnN0YWxsLnBhY2thZ2VzKCJnZ3Bsb3QyIikNCmxpYnJhcnkoZ2dwbG90MikNCiNpbnN0YWxsLnBhY2thZ2VzKCJmb3JlY2FzdCIpDQpsaWJyYXJ5KGZvcmVjYXN0KQ0KYGBgDQojICoqSW1wb3J0YXIgbGEgYmFzZSBkZSBkYXRvcyoqDQpgYGB7ciBtZXNzYWdlPVRSVUUsIHdhcm5pbmc9VFJVRX0NCiMgZmlsZS5jaG9vc2UoKQ0KZGYgPC0gcmVhZF9leGNlbCgiQzpcXFVzZXJzXFxjZXNhclxcRG93bmxvYWRzXFxEYXRvcyBBcmNhIENvbnRpbmVudGFsIE9yaWdpbmFsLnhsc3giKQ0KYGBgDQojICoqRW50ZW5kZXIgbGEgYmFzZSBkZSBkYXRvcyoqDQpgYGB7cn0NCnN1bW1hcnkoZGYpDQpzdHIoZGYpDQpoZWFkKGRmLCBuPTEwKQ0KdGFpbChkZiwgbj0xMCkNCmNvbG5hbWVzKGRmKSA8LSBnc3ViKCIgIiwgIl8iLCBjb2xuYW1lcyhkZikpDQpkcGx5cjo6Y291bnQoZGYsVGVycml0b3Jpbywgc29ydD1UUlVFKQ0KZHBseXI6OmNvdW50KGRmLFN1Yl9UZXJyaXRvcmlvLCBzb3J0PVRSVUUpDQpkcGx5cjo6Y291bnQoZGYsQ0VESSwgc29ydD1UUlVFKQ0KZHBseXI6OmNvdW50KGRmLENsaWVudGUsIHNvcnQ9VFJVRSkNCmRwbHlyOjpjb3VudChkZixOb21icmUsIHNvcnQ9VFJVRSkNCmRwbHlyOjpjb3VudChkZixUYW1hw7FvX0N0ZV9JbmR1c3RyaWEsIHNvcnQ9VFJVRSkNCmRwbHlyOjpjb3VudChkZixTZWdtZW50b19EZXQsIHNvcnQ9VFJVRSkNCmRwbHlyOjpjb3VudChkZixNYXJjYSwgc29ydD1UUlVFKQ0KZHBseXI6OmNvdW50KGRmLFByZXNlbnRhY2lvbiwgc29ydD1UUlVFKQ0KZHBseXI6OmNvdW50KGRmLFRhbWHDsW8sIHNvcnQ9VFJVRSkNCmRwbHlyOjpjb3VudChkZixSZXRvcm5hYmxlX05SLCBzb3J0PVRSVUUpDQpgYGANCg0KIyAqKkxpbXBpYXIgbGEgYmFzZSBkZSBkYXRvcyoqDQojIyAqVMOpY25pY2EgMS4gUmVtb3ZlciB2YWxvcmVzIGlycmVsZXZhbnRlcyoNCmBgYHtyfQ0KZGYxIDwtIGRmDQpkZjEgPC0gZGYxWy0xODQwODUsIF0NCnN1bW1hcnkoZGYxKQ0KYGBgDQojIyAqVMOpY25pY2EgMi4gUmVtb3ZlciB2YWxvcmVzIGR1cGxpY2Fkb3MqDQpgYGB7cn0NCiMgQ3VhbnRvcyByZW5nbG9uZXMgZHVwbGljYWRvcyB0ZW5lbW9zDQpzdW0oZHVwbGljYXRlZChkZjEpKQ0KDQojIEN1YWxlcyBzb24gbG9zIHJlbmdsb25lcyBkdXBsaWNhZG9zIA0KZGYxW2R1cGxpY2F0ZWQoZGYxKSxdDQpgYGANCg0KIyMgKlTDqWNuaWNhIDMuIEVsaW1pbmFyIGVycm9yZXMgdGlwb2dyYWZpY29zIHJlbmdsb25lcyBkdXBsaWNhZG9zKg0KYGBge3J9DQojIENhbnRpZGFkZXMgYWJzb2x1dGFzDQpkZjIgPC0gZGYxDQpkZjIkRW5lcm8gPC0gYWJzKGRmMiRFbmVybykNCmRmMiRGZWJyZXJvIDwtIGFicyhkZjIkRmVicmVybykNCmRmMiRNYXJ6byA8LSBhYnMoZGYyJE1hcnpvKQ0KZGYyJEFicmlsIDwtIGFicyhkZjIkQWJyaWwpDQpkZjIkTWF5byA8LSBhYnMoZGYyJE1heW8pDQpkZjIkSnVuaW8gPC0gYWJzKGRmMiRKdW5pbykNCmRmMiRKdWxpbyA8LSBhYnMoZGYyJEp1bGlvKQ0KZGYyJEFnb3N0byA8LSBhYnMoZGYyJEFnb3N0bykNCmRmMiRTZXB0aWVtYnJlIDwtIGFicyhkZjIkU2VwdGllbWJyZSkNCmRmMiRPY3R1YnJlIDwtIGFicyhkZjIkT2N0dWJyZSkNCmRmMiROb3ZpZW1icmUgPC0gYWJzKGRmMiROb3ZpZW1icmUpDQpkZjIkRGljaWVtYnJlIDwtIGFicyhkZjIkRGljaWVtYnJlKQ0Kc3VtbWFyeShkZjIpDQpgYGANCg0KIyMgKlTDqWNuaWNhIDQuIENvbnZlcnRpciB0aXBvcyBkZSBkYXRvcyoNCmBgYHtyfQ0KIyBObyBhcGxpY2ENCmBgYA0KDQojIyAqVMOpY25pY2EgNS4gUmVlbXBsYXphciB2YWxvcmVzIGZhbHRhbnRlcyoNCmBgYHtyfQ0KIyDCv0N1w6FudG9zIE5BcyB0ZW5nbyBlbiBsYSBiYXNlIGRlIGRhdG9zPw0Kc3VtKGlzLm5hKGRmMikpDQoNCiMgwr9DdcOhbnRvcyBOQXMgdGVuZ28gcG9yIHZhcmlhYmxlPw0Kc2FwcGx5KGRmMiwgZnVuY3Rpb24oeCkgc3VtKGlzLm5hKHgpKSkNCg0KIyBPcGNpw7NuIDEuIEJvcnJyYXIgdG9kb3MgbG9zIE5BcyBkZSB1bmEgdGFibGENCg0KIyBiZDEwMCA8LSBuYS5vbWl0KGJkKQ0KDQojIE9wY2nDs24gMi4gUmVtcGxhemFyIGxvcyBOQXMgY29uIENFUk9TLiANCg0KZGYzIDwtIGRmMg0KZGYzW2lzLm5hKGRmMyldIDwtIDANCg0KIyBPcGNpw7NuIDMuIFJlZW1wbGF6YXIgbG9zIE5BcyBjb24gZWwgUFJPTUVESU8NCg0KIyBiZDEwMiA8IGJkDQojIGJkMTAyJFBMVVtpcy5uYShiZDEwMiRQTFUpXTwtIG1lYW4oYmQxMDIkUExVLCBuYS5ybT1UUlVFKQ0KYGBgDQojIyAqVMOpY25pY2EgNi4gQ29ycmVjaW9uZXMgcG9yIG3DqXRvZG9zIGVzdGFkaXN0aWNvcyoNCmBgYHtyfQ0KYm94cGxvdChkZjMkRW5lcm8sIGhvcml6b250YWw9VFJVRSkNCmJveHBsb3QoZGYzJEZlYnJlcm8sIGhvcml6b250YWw9VFJVRSkNCmJveHBsb3QoZGYzJE1hcnpvLCBob3Jpem9udGFsPVRSVUUpDQpib3hwbG90KGRmMyRBYnJpbCwgaG9yaXpvbnRhbD1UUlVFKQ0KYm94cGxvdChkZjMkTWF5bywgaG9yaXpvbnRhbD1UUlVFKQ0KYm94cGxvdChkZjMkSnVuaW8sIGhvcml6b250YWw9VFJVRSkNCg0KYGBgDQo=

Arca Continental

Julio César Silva de la Rosa A00839147

2025-03-20