Contexto

Una empresa con 5 tiendas en el país solicita un análisis de sus ventas de abarrotes entre mayo y noviembre del 2020.

###Importar la base de datos

bd <- read.csv("/Users/pedrovillanueva/Downloads/abarrotes.csv")
# View(bd)
summary(bd)
##  vcClaveTienda        DescGiro         Codigo.Barras            PLU        
##  Length:200625      Length:200625      Min.   :8.347e+05   Min.   : 1.00   
##  Class :character   Class :character   1st Qu.:7.501e+12   1st Qu.: 1.00   
##  Mode  :character   Mode  :character   Median :7.501e+12   Median : 1.00   
##                                        Mean   :5.950e+12   Mean   : 2.11   
##                                        3rd Qu.:7.501e+12   3rd Qu.: 1.00   
##                                        Max.   :1.750e+13   Max.   :30.00   
##                                                            NA's   :199188  
##     Fecha               Hora              Marca            Fabricante       
##  Length:200625      Length:200625      Length:200625      Length:200625     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    Producto             Precio          Ult.Costo         Unidades     
##  Length:200625      Min.   :-147.00   Min.   :  0.38   Min.   : 0.200  
##  Class :character   1st Qu.:  11.00   1st Qu.:  8.46   1st Qu.: 1.000  
##  Mode  :character   Median :  16.00   Median : 12.31   Median : 1.000  
##                     Mean   :  19.42   Mean   : 15.31   Mean   : 1.262  
##                     3rd Qu.:  25.00   3rd Qu.: 19.23   3rd Qu.: 1.000  
##                     Max.   :1000.00   Max.   :769.23   Max.   :96.000  
##                                                                        
##     F.Ticket      NombreDepartamento NombreFamilia      NombreCategoria   
##  Min.   :     1   Length:200625      Length:200625      Length:200625     
##  1st Qu.: 33964   Class :character   Class :character   Class :character  
##  Median :105993   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :193990                                                           
##  3rd Qu.:383005                                                           
##  Max.   :450040                                                           
##                                                                           
##     Estado              Mts.2      Tipo.ubicación         Giro          
##  Length:200625      Min.   :47.0   Length:200625      Length:200625     
##  Class :character   1st Qu.:53.0   Class :character   Class :character  
##  Mode  :character   Median :60.0   Mode  :character   Mode  :character  
##                     Mean   :56.6                                        
##                     3rd Qu.:60.0                                        
##                     Max.   :62.0                                        
##                                                                         
##  Hora.inicio        Hora.cierre       
##  Length:200625      Length:200625     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 

###Explorar la base de datos

#  tibble(bd)
str(bd)
## 'data.frame':    200625 obs. of  22 variables:
##  $ vcClaveTienda     : chr  "MX001" "MX001" "MX001" "MX001" ...
##  $ DescGiro          : chr  "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ Codigo.Barras     : num  7.5e+12 7.5e+12 7.5e+12 7.5e+12 7.5e+12 ...
##  $ PLU               : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Fecha             : chr  "19/06/2020" "19/06/2020" "19/06/2020" "19/06/2020" ...
##  $ Hora              : chr  "08:16:21" "08:23:33" "08:24:33" "08:24:33" ...
##  $ Marca             : chr  "NUTRI LECHE" "DAN UP" "BIMBO" "PEPSI" ...
##  $ Fabricante        : chr  "MEXILAC" "DANONE DE MEXICO" "GRUPO BIMBO" "PEPSI-COLA MEXICANA" ...
##  $ Producto          : chr  "Nutri Leche 1 Litro" "DANUP STRAWBERRY P/BEBER 350GR NAL" "Rebanadas Bimbo 2Pz" "Pepsi N.R. 400Ml" ...
##  $ Precio            : num  16 14 5 8 19.5 16 14 5 8 19.5 ...
##  $ Ult.Costo         : num  12.3 14 5 8 15 ...
##  $ Unidades          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ F.Ticket          : int  1 2 3 3 4 1 2 3 3 4 ...
##  $ NombreDepartamento: chr  "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ NombreFamilia     : chr  "Lacteos y Refrigerados" "Lacteos y Refrigerados" "Pan y Tortilla" "Bebidas" ...
##  $ NombreCategoria   : chr  "Leche" "Yogurt" "Pan Dulce Empaquetado" "Refrescos Plástico (N.R.)" ...
##  $ Estado            : chr  "Nuevo León" "Nuevo León" "Nuevo León" "Nuevo León" ...
##  $ Mts.2             : int  60 60 60 60 60 60 60 60 60 60 ...
##  $ Tipo.ubicación    : chr  "Esquina" "Esquina" "Esquina" "Esquina" ...
##  $ Giro              : chr  "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ Hora.inicio       : chr  "08:00" "08:00" "08:00" "08:00" ...
##  $ Hora.cierre       : chr  "22:00" "22:00" "22:00" "22:00" ...
head(bd)
##   vcClaveTienda  DescGiro Codigo.Barras PLU      Fecha     Hora
## 1         MX001 Abarrotes  7.501021e+12  NA 19/06/2020 08:16:21
## 2         MX001 Abarrotes  7.501032e+12  NA 19/06/2020 08:23:33
## 3         MX001 Abarrotes  7.501000e+12  NA 19/06/2020 08:24:33
## 4         MX001 Abarrotes  7.501031e+12  NA 19/06/2020 08:24:33
## 5         MX001 Abarrotes  7.501026e+12  NA 19/06/2020 08:26:28
## 6         MX001 Abarrotes  7.501021e+12  NA 19/06/2020 08:16:21
##                        Marca                 Fabricante
## 1                NUTRI LECHE                    MEXILAC
## 2                     DAN UP           DANONE DE MEXICO
## 3                      BIMBO                GRUPO BIMBO
## 4                      PEPSI        PEPSI-COLA MEXICANA
## 5 BLANCA NIEVES (DETERGENTE) FABRICA DE JABON LA CORONA
## 6                NUTRI LECHE                    MEXILAC
##                             Producto Precio Ult.Costo Unidades F.Ticket
## 1                Nutri Leche 1 Litro   16.0     12.31        1        1
## 2 DANUP STRAWBERRY P/BEBER 350GR NAL   14.0     14.00        1        2
## 3                Rebanadas Bimbo 2Pz    5.0      5.00        1        3
## 4                   Pepsi N.R. 400Ml    8.0      8.00        1        3
## 5      Detergente Blanca Nieves 500G   19.5     15.00        1        4
## 6                Nutri Leche 1 Litro   16.0     12.31        1        1
##   NombreDepartamento          NombreFamilia           NombreCategoria
## 1          Abarrotes Lacteos y Refrigerados                     Leche
## 2          Abarrotes Lacteos y Refrigerados                    Yogurt
## 3          Abarrotes         Pan y Tortilla     Pan Dulce Empaquetado
## 4          Abarrotes                Bebidas Refrescos Plástico (N.R.)
## 5          Abarrotes     Limpieza del Hogar                Lavandería
## 6          Abarrotes Lacteos y Refrigerados                     Leche
##       Estado Mts.2 Tipo.ubicación      Giro Hora.inicio Hora.cierre
## 1 Nuevo León    60        Esquina Abarrotes       08:00       22:00
## 2 Nuevo León    60        Esquina Abarrotes       08:00       22:00
## 3 Nuevo León    60        Esquina Abarrotes       08:00       22:00
## 4 Nuevo León    60        Esquina Abarrotes       08:00       22:00
## 5 Nuevo León    60        Esquina Abarrotes       08:00       22:00
## 6 Nuevo León    60        Esquina Abarrotes       08:00       22:00
tail(bd)
##        vcClaveTienda DescGiro Codigo.Barras PLU      Fecha     Hora
## 200620         MX005 Depósito   7.62221e+12  NA 12/07/2020 01:08:25
## 200621         MX005 Depósito   7.62221e+12  NA 23/10/2020 22:17:37
## 200622         MX005 Depósito   7.62221e+12  NA 10/10/2020 20:30:20
## 200623         MX005 Depósito   7.62221e+12  NA 10/10/2020 22:40:43
## 200624         MX005 Depósito   7.62221e+12  NA 27/06/2020 22:30:19
## 200625         MX005 Depósito   7.62221e+12  NA 26/06/2020 23:43:34
##                    Marca    Fabricante                          Producto Precio
## 200620 TRIDENT XTRA CARE CADBURY ADAMS Trident Xtracare Freshmint 16.32G      9
## 200621 TRIDENT XTRA CARE CADBURY ADAMS Trident Xtracare Freshmint 16.32G      9
## 200622 TRIDENT XTRA CARE CADBURY ADAMS Trident Xtracare Freshmint 16.32G      9
## 200623 TRIDENT XTRA CARE CADBURY ADAMS Trident Xtracare Freshmint 16.32G      9
## 200624 TRIDENT XTRA CARE CADBURY ADAMS Trident Xtracare Freshmint 16.32G      9
## 200625 TRIDENT XTRA CARE CADBURY ADAMS Trident Xtracare Freshmint 16.32G      9
##        Ult.Costo Unidades F.Ticket NombreDepartamento NombreFamilia
## 200620      6.92        1   103100          Abarrotes      Dulcería
## 200621      6.92        1   116598          Abarrotes      Dulcería
## 200622      6.92        1   114886          Abarrotes      Dulcería
## 200623      6.92        1   114955          Abarrotes      Dulcería
## 200624      6.92        1   101121          Abarrotes      Dulcería
## 200625      6.92        1   100879          Abarrotes      Dulcería
##        NombreCategoria       Estado Mts.2 Tipo.ubicación       Giro Hora.inicio
## 200620 Gomas de Mazcar Quintana Roo    58        Esquina Mini súper       08:00
## 200621 Gomas de Mazcar Quintana Roo    58        Esquina Mini súper       08:00
## 200622 Gomas de Mazcar Quintana Roo    58        Esquina Mini súper       08:00
## 200623 Gomas de Mazcar Quintana Roo    58        Esquina Mini súper       08:00
## 200624 Gomas de Mazcar Quintana Roo    58        Esquina Mini súper       08:00
## 200625 Gomas de Mazcar Quintana Roo    58        Esquina Mini súper       08:00
##        Hora.cierre
## 200620       21:00
## 200621       21:00
## 200622       21:00
## 200623       21:00
## 200624       21:00
## 200625       21:00

Importar librerias

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(dplyr)

# count(bd,vcClaveTienda, sort= TRUE)
# count(bd,DescGiro, sort= TRUE)
# count(bd,Fecha, sort= TRUE)
# count(bd,Hora, sort= TRUE)
# count(bd,Marca, sort= TRUE)
# count(bd,Fabricante, sort= TRUE)
# count(bd,Producto, sort= TRUE)
# count(bd,NombreDepartamento, sort= TRUE)
# count(bd,NombreFamilia, sort= TRUE)
# count(bd,NombreCategoria, sort= TRUE)
# count(bd,Estado, sort= TRUE)
# count(bd,Hora.inicio, sort= TRUE)
# count(bd,Tipo.ubicación, sort= TRUE)
# count(bd,Giro, sort= TRUE)
# count(bd,Hora.cierre, sort= TRUE)

tabyl(bd, vcClaveTienda, NombreDepartamento)
##  vcClaveTienda Abarrotes Bebes e Infantiles Carnes Farmacia Ferretería Mercería
##          MX001     95415                515      1      147        245       28
##          MX002      6590                 21      0        4         10        0
##          MX003      4026                 15      0        2          8        0
##          MX004     82234                932      0      102        114       16
##          MX005     10014                  0      0        0          0        0
##  Papelería Productos a Eliminar Vinos y Licores
##         35                    3              80
##          0                    0               4
##          0                    0               0
##         32                    5              20
##          7                    0               0
tabyl(bd, NombreFamilia, vcClaveTienda)
##              NombreFamilia MX001 MX002 MX003 MX004 MX005
##                 Accesorios    88     0     0    58     0
##                     Aceite   346    29    18  1088     2
##                   Aderezos   544    21    30   909     3
##                    Alcohol     6     2     0     8     0
##                 Alimentos    256     9    15   530     0
##         Alimentos a Granel     1     0     0     0     0
##    Alimentos para Mascotas   300     9    36   533     0
##                Analgésicos     0     0     1     0     0
##                  Antiácido     0     0     1     0     0
##                 Antigripal    17     0     0    40     0
##     Artículos de Escritura     0     0     0     6     0
##              Azúcar y Miel   349     0     0    38     4
##                    Bebidas 38511  3416  1460 21504    27
##       Bebidas Premezcladas     0     4     0    19     0
##                    Botanas 13051  1194   498  5724  1116
##  C. Frías y Salchichonería   451     1   143  1528     0
##                   Cereales   533     7    10   210     0
##                    Cerveza  4644   196    26  1041  8110
##                   Cigarros  3775   451    75  2237   279
##                  Cuadernos     7     0     0     8     0
##           Cuidado Personal  1940   117    40  3319    17
##             Dermatológicos    33     1     0    20     0
##                Desechables   809    38    25  2588     0
##                   Dulcería  1725    45   108   486   307
##                   Especias  1596    28    22  3249    26
##                   Galletas  3259   218   256  3754     0
##          Granos y Semillas  1138    18    19  1488     0
##     Harinas y Complementos   460    20    43  1237     0
##     Lacteos y Refrigerados  6795   139   503 10221     1
##                    Latería  1540    90   108  3365     4
##         Limpieza del Hogar  3771   295   172  4470    16
##                   Mantecas   203     7     6   581     0
##       Material de Curación    46     0     0    11     0
##    Materiales y Accesorios    28     0     0    18     7
##             Pan y Tortilla  5782    39   294  4387     0
##                    Pañales   114     8     0   215     0
##                 Pegamentos   104     8     6   102     0
##   Pilas para uso Doméstico   141     2     2    12     0
##                      Pollo     1     0     0     0     0
##                    Postres    29     0     2    56     0
##       Productos Higiénicos    57     4     0   129     0
##      Productos sin Familia     3     0     0     5     0
##                        Ron     1     0     0     0     0
##       Salsas y Sazonadores  1550    94    59  3527    90
##                    Sangría    13     0     0     0     0
##             Sopas y Pastas  1280    65    37  2749    10
##       Te, Chocolate y Café   454    42    27   906     2
##                    Tequila    62     0     0     1     0
##                     Varios    73     1     0    39     0
##          Velas y Veladoras   579    11     9  1039     0
##                     Whisky     4     0     0     0     0

Con lo anterior, se encontraron las siguientes observaciones: 1. La variable PLU tine más de 199,000 NA 2. Las variables fecha, hora, hora.inicio y hora.cierre no tienen formato correcto. 3. Existen precios negativo s 4. No se observa variable Ventas o Subtotal

Técnicas de limpieza de datos

# Técnica 1. Remover valores irrelevantes 
  # Eliminar columnas 
  bd1 <- bd 
  bd1 <- subset (bd1, select = -c( PLU, Codigo.Barras))
  summary(bd1)
##  vcClaveTienda        DescGiro            Fecha               Hora          
##  Length:200625      Length:200625      Length:200625      Length:200625     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     Marca            Fabricante          Producto             Precio       
##  Length:200625      Length:200625      Length:200625      Min.   :-147.00  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.42  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200625     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33964   Class :character  
##  Median : 12.31   Median : 1.000   Median :105993   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193990                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383005                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts.2     
##  Length:200625      Length:200625      Length:200625      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo.ubicación         Giro           Hora.inicio        Hora.cierre       
##  Length:200625      Length:200625      Length:200625      Length:200625     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
## 
  #Eliminar renglones
  bd2 <- bd1
  bd2 <- bd2[bd2$Precio>0,]
  summary(bd2)
##  vcClaveTienda        DescGiro            Fecha               Hora          
##  Length:200478      Length:200478      Length:200478      Length:200478     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     Marca            Fabricante          Producto             Precio       
##  Length:200478      Length:200478      Length:200478      Min.   :   0.50  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.45  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200478     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33977   Class :character  
##  Median : 12.31   Median : 1.000   Median :106034   Mode  :character  
##  Mean   : 15.31   Mean   : 1.261   Mean   :194096                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383062                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts.2     
##  Length:200478      Length:200478      Length:200478      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo.ubicación         Giro           Hora.inicio        Hora.cierre       
##  Length:200478      Length:200478      Length:200478      Length:200478     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
## 
# Tecnica 2 Remover valores duplicados
  #¿Cuántos renglones duplicados hay?
  bd2[duplicated(bd2),]
##    vcClaveTienda  DescGiro      Fecha     Hora                      Marca
## 6          MX001 Abarrotes 19/06/2020 08:16:21                NUTRI LECHE
## 7          MX001 Abarrotes 19/06/2020 08:23:33                     DAN UP
## 8          MX001 Abarrotes 19/06/2020 08:24:33                      BIMBO
## 9          MX001 Abarrotes 19/06/2020 08:24:33                      PEPSI
## 10         MX001 Abarrotes 19/06/2020 08:26:28 BLANCA NIEVES (DETERGENTE)
##                    Fabricante                           Producto Precio
## 6                     MEXILAC                Nutri Leche 1 Litro   16.0
## 7            DANONE DE MEXICO DANUP STRAWBERRY P/BEBER 350GR NAL   14.0
## 8                 GRUPO BIMBO                Rebanadas Bimbo 2Pz    5.0
## 9         PEPSI-COLA MEXICANA                   Pepsi N.R. 400Ml    8.0
## 10 FABRICA DE JABON LA CORONA      Detergente Blanca Nieves 500G   19.5
##    Ult.Costo Unidades F.Ticket NombreDepartamento          NombreFamilia
## 6      12.31        1        1          Abarrotes Lacteos y Refrigerados
## 7      14.00        1        2          Abarrotes Lacteos y Refrigerados
## 8       5.00        1        3          Abarrotes         Pan y Tortilla
## 9       8.00        1        3          Abarrotes                Bebidas
## 10     15.00        1        4          Abarrotes     Limpieza del Hogar
##              NombreCategoria     Estado Mts.2 Tipo.ubicación      Giro
## 6                      Leche Nuevo León    60        Esquina Abarrotes
## 7                     Yogurt Nuevo León    60        Esquina Abarrotes
## 8      Pan Dulce Empaquetado Nuevo León    60        Esquina Abarrotes
## 9  Refrescos Plástico (N.R.) Nuevo León    60        Esquina Abarrotes
## 10                Lavandería Nuevo León    60        Esquina Abarrotes
##    Hora.inicio Hora.cierre
## 6        08:00       22:00
## 7        08:00       22:00
## 8        08:00       22:00
## 9        08:00       22:00
## 10       08:00       22:00
  sum(duplicated(bd2))
## [1] 5
  #eliminar renglones duplicados 
  bd3 <- bd2
  bd3 <- distinct(bd3)
  
# Tecnica 3. resolver errores tipográficos y similares 
  # Precios en absoluto
  bd4 <- bd1 
  bd4$Precio <- abs(bd4$Precio)
  summary(bd4)
##  vcClaveTienda        DescGiro            Fecha               Hora          
##  Length:200625      Length:200625      Length:200625      Length:200625     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     Marca            Fabricante          Producto             Precio       
##  Length:200625      Length:200625      Length:200625      Min.   :   0.50  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.45  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200625     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33964   Class :character  
##  Median : 12.31   Median : 1.000   Median :105993   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193990                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383005                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts.2     
##  Length:200625      Length:200625      Length:200625      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo.ubicación         Giro           Hora.inicio        Hora.cierre       
##  Length:200625      Length:200625      Length:200625      Length:200625     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
## 
  #signo de $ te dice la columna
  
  #unidades en entero
  bd5 <- bd4
  bd5$Unidades <- ceiling(bd5$Unidades)
  summary(bd5)
##  vcClaveTienda        DescGiro            Fecha               Hora          
##  Length:200625      Length:200625      Length:200625      Length:200625     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     Marca            Fabricante          Producto             Precio       
##  Length:200625      Length:200625      Length:200625      Min.   :   0.50  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.45  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 1.000   Min.   :     1   Length:200625     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33964   Class :character  
##  Median : 12.31   Median : 1.000   Median :105993   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193990                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383005                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts.2     
##  Length:200625      Length:200625      Length:200625      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo.ubicación         Giro           Hora.inicio        Hora.cierre       
##  Length:200625      Length:200625      Length:200625      Length:200625     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
## 
# Tecnica 4 Convertir tipos de datos 
  # Convertir de caracter a fecha
  bd6 <- bd3 
  bd6$Fecha <- as.Date(bd6$Fecha, format="%d/%m/%Y")
  tibble(bd6)
## # A tibble: 200,473 × 20
##    vcClaveTienda DescGiro  Fecha      Hora     Marca  Fabricante Producto Precio
##    <chr>         <chr>     <date>     <chr>    <chr>  <chr>      <chr>     <dbl>
##  1 MX001         Abarrotes 2020-06-19 08:16:21 NUTRI… MEXILAC    Nutri L…   16  
##  2 MX001         Abarrotes 2020-06-19 08:23:33 DAN UP DANONE DE… DANUP S…   14  
##  3 MX001         Abarrotes 2020-06-19 08:24:33 BIMBO  GRUPO BIM… Rebanad…    5  
##  4 MX001         Abarrotes 2020-06-19 08:24:33 PEPSI  PEPSI-COL… Pepsi N…    8  
##  5 MX001         Abarrotes 2020-06-19 08:26:28 BLANC… FABRICA D… Deterge…   19.5
##  6 MX001         Abarrotes 2020-06-19 08:26:28 FLASH  ALEN       Flash X…    9.5
##  7 MX001         Abarrotes 2020-06-19 08:26:28 VARIO… DANONE DE… Danone …   11  
##  8 MX001         Abarrotes 2020-06-19 08:26:28 ZOTE   FABRICA D… Jabon Z…    9.5
##  9 MX001         Abarrotes 2020-06-19 08:26:28 ALWAYS PROCTER &… T Femen…   23.5
## 10 MX001         Abarrotes 2020-06-19 15:24:02 JUMEX  JUMEX      Jugo De…   12  
## # ℹ 200,463 more rows
## # ℹ 12 more variables: Ult.Costo <dbl>, Unidades <dbl>, F.Ticket <int>,
## #   NombreDepartamento <chr>, NombreFamilia <chr>, NombreCategoria <chr>,
## #   Estado <chr>, Mts.2 <int>, Tipo.ubicación <chr>, Giro <chr>,
## #   Hora.inicio <chr>, Hora.cierre <chr>
  #Convertir de caracter a entero 
  bd7 <- bd6 
  bd7$Hora <- substr(bd7$Hora, star =1, stop=2)
  tibble(bd7)
## # A tibble: 200,473 × 20
##    vcClaveTienda DescGiro  Fecha      Hora  Marca     Fabricante Producto Precio
##    <chr>         <chr>     <date>     <chr> <chr>     <chr>      <chr>     <dbl>
##  1 MX001         Abarrotes 2020-06-19 08    NUTRI LE… MEXILAC    Nutri L…   16  
##  2 MX001         Abarrotes 2020-06-19 08    DAN UP    DANONE DE… DANUP S…   14  
##  3 MX001         Abarrotes 2020-06-19 08    BIMBO     GRUPO BIM… Rebanad…    5  
##  4 MX001         Abarrotes 2020-06-19 08    PEPSI     PEPSI-COL… Pepsi N…    8  
##  5 MX001         Abarrotes 2020-06-19 08    BLANCA N… FABRICA D… Deterge…   19.5
##  6 MX001         Abarrotes 2020-06-19 08    FLASH     ALEN       Flash X…    9.5
##  7 MX001         Abarrotes 2020-06-19 08    VARIOS D… DANONE DE… Danone …   11  
##  8 MX001         Abarrotes 2020-06-19 08    ZOTE      FABRICA D… Jabon Z…    9.5
##  9 MX001         Abarrotes 2020-06-19 08    ALWAYS    PROCTER &… T Femen…   23.5
## 10 MX001         Abarrotes 2020-06-19 15    JUMEX     JUMEX      Jugo De…   12  
## # ℹ 200,463 more rows
## # ℹ 12 more variables: Ult.Costo <dbl>, Unidades <dbl>, F.Ticket <int>,
## #   NombreDepartamento <chr>, NombreFamilia <chr>, NombreCategoria <chr>,
## #   Estado <chr>, Mts.2 <int>, Tipo.ubicación <chr>, Giro <chr>,
## #   Hora.inicio <chr>, Hora.cierre <chr>
  bd7$Hora <- as.integer(bd7$Hora)
  str(bd7)
## 'data.frame':    200473 obs. of  20 variables:
##  $ vcClaveTienda     : chr  "MX001" "MX001" "MX001" "MX001" ...
##  $ DescGiro          : chr  "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ Fecha             : Date, format: "2020-06-19" "2020-06-19" ...
##  $ Hora              : int  8 8 8 8 8 8 8 8 8 15 ...
##  $ Marca             : chr  "NUTRI LECHE" "DAN UP" "BIMBO" "PEPSI" ...
##  $ Fabricante        : chr  "MEXILAC" "DANONE DE MEXICO" "GRUPO BIMBO" "PEPSI-COLA MEXICANA" ...
##  $ Producto          : chr  "Nutri Leche 1 Litro" "DANUP STRAWBERRY P/BEBER 350GR NAL" "Rebanadas Bimbo 2Pz" "Pepsi N.R. 400Ml" ...
##  $ Precio            : num  16 14 5 8 19.5 9.5 11 9.5 23.5 12 ...
##  $ Ult.Costo         : num  12.3 14 5 8 15 ...
##  $ Unidades          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ F.Ticket          : int  1 2 3 3 4 4 4 4 4 5 ...
##  $ NombreDepartamento: chr  "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ NombreFamilia     : chr  "Lacteos y Refrigerados" "Lacteos y Refrigerados" "Pan y Tortilla" "Bebidas" ...
##  $ NombreCategoria   : chr  "Leche" "Yogurt" "Pan Dulce Empaquetado" "Refrescos Plástico (N.R.)" ...
##  $ Estado            : chr  "Nuevo León" "Nuevo León" "Nuevo León" "Nuevo León" ...
##  $ Mts.2             : int  60 60 60 60 60 60 60 60 60 60 ...
##  $ Tipo.ubicación    : chr  "Esquina" "Esquina" "Esquina" "Esquina" ...
##  $ Giro              : chr  "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ Hora.inicio       : chr  "08:00" "08:00" "08:00" "08:00" ...
##  $ Hora.cierre       : chr  "22:00" "22:00" "22:00" "22:00" ...
# Tecnica 5. Tratar valores faltantes (NA)
  #¿Cuantos NA tengo?
  sum(is.na(bd7))
## [1] 0
  sum(is.na(bd))
## [1] 199188
  # ¿Cuantos NA tengo por variable?
  sapply(bd, function(x) sum(is.na(x)))
##      vcClaveTienda           DescGiro      Codigo.Barras                PLU 
##                  0                  0                  0             199188 
##              Fecha               Hora              Marca         Fabricante 
##                  0                  0                  0                  0 
##           Producto             Precio          Ult.Costo           Unidades 
##                  0                  0                  0                  0 
##           F.Ticket NombreDepartamento      NombreFamilia    NombreCategoria 
##                  0                  0                  0                  0 
##             Estado              Mts.2     Tipo.ubicación               Giro 
##                  0                  0                  0                  0 
##        Hora.inicio        Hora.cierre 
##                  0                  0
  #Borrar todos los registros NA de una tabla 
  bd8 <- bd 
  bd8 <- na.omit(bd8)
  summary(bd8)
##  vcClaveTienda        DescGiro         Codigo.Barras            PLU        
##  Length:1437        Length:1437        Min.   :6.750e+08   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.:6.750e+08   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median :6.750e+08   Median : 1.000  
##                                        Mean   :2.616e+11   Mean   : 2.112  
##                                        3rd Qu.:6.750e+08   3rd Qu.: 1.000  
##                                        Max.   :7.501e+12   Max.   :30.000  
##     Fecha               Hora              Marca            Fabricante       
##  Length:1437        Length:1437        Length:1437        Length:1437       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Producto             Precio        Ult.Costo        Unidades    
##  Length:1437        Min.   :30.00   Min.   : 1.00   Min.   :1.000  
##  Class :character   1st Qu.:90.00   1st Qu.:64.62   1st Qu.:1.000  
##  Mode  :character   Median :90.00   Median :64.62   Median :1.000  
##                     Mean   :87.94   Mean   :56.65   Mean   :1.124  
##                     3rd Qu.:90.00   3rd Qu.:64.62   3rd Qu.:1.000  
##                     Max.   :90.00   Max.   :64.62   Max.   :7.000  
##     F.Ticket      NombreDepartamento NombreFamilia      NombreCategoria   
##  Min.   :   772   Length:1437        Length:1437        Length:1437       
##  1st Qu.: 99955   Class :character   Class :character   Class :character  
##  Median :102493   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :100595                                                           
##  3rd Qu.:106546                                                           
##  Max.   :118356                                                           
##     Estado              Mts.2       Tipo.ubicación         Giro          
##  Length:1437        Min.   :58.00   Length:1437        Length:1437       
##  Class :character   1st Qu.:58.00   Class :character   Class :character  
##  Mode  :character   Median :58.00   Mode  :character   Mode  :character  
##                     Mean   :58.07                                        
##                     3rd Qu.:58.00                                        
##                     Max.   :60.00                                        
##  Hora.inicio        Hora.cierre       
##  Length:1437        Length:1437       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
  #remplazar na por ceros
  bd9 <- bd
  bd9[is.na(bd9)] <- 0
  summary(bd9)
##  vcClaveTienda        DescGiro         Codigo.Barras            PLU          
##  Length:200625      Length:200625      Min.   :8.347e+05   Min.   : 0.00000  
##  Class :character   Class :character   1st Qu.:7.501e+12   1st Qu.: 0.00000  
##  Mode  :character   Mode  :character   Median :7.501e+12   Median : 0.00000  
##                                        Mean   :5.950e+12   Mean   : 0.01513  
##                                        3rd Qu.:7.501e+12   3rd Qu.: 0.00000  
##                                        Max.   :1.750e+13   Max.   :30.00000  
##     Fecha               Hora              Marca            Fabricante       
##  Length:200625      Length:200625      Length:200625      Length:200625     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Producto             Precio          Ult.Costo         Unidades     
##  Length:200625      Min.   :-147.00   Min.   :  0.38   Min.   : 0.200  
##  Class :character   1st Qu.:  11.00   1st Qu.:  8.46   1st Qu.: 1.000  
##  Mode  :character   Median :  16.00   Median : 12.31   Median : 1.000  
##                     Mean   :  19.42   Mean   : 15.31   Mean   : 1.262  
##                     3rd Qu.:  25.00   3rd Qu.: 19.23   3rd Qu.: 1.000  
##                     Max.   :1000.00   Max.   :769.23   Max.   :96.000  
##     F.Ticket      NombreDepartamento NombreFamilia      NombreCategoria   
##  Min.   :     1   Length:200625      Length:200625      Length:200625     
##  1st Qu.: 33964   Class :character   Class :character   Class :character  
##  Median :105993   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :193990                                                           
##  3rd Qu.:383005                                                           
##  Max.   :450040                                                           
##     Estado              Mts.2      Tipo.ubicación         Giro          
##  Length:200625      Min.   :47.0   Length:200625      Length:200625     
##  Class :character   1st Qu.:53.0   Class :character   Class :character  
##  Mode  :character   Median :60.0   Mode  :character   Mode  :character  
##                     Mean   :56.6                                        
##                     3rd Qu.:60.0                                        
##                     Max.   :62.0                                        
##  Hora.inicio        Hora.cierre       
##  Length:200625      Length:200625     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
  #remplazar los na con el promedio
  bd10 <- bd
  bd10$PLU[is.na(bd10$PLU)] <- mean(bd10$PLU,na.rm=TRUE)
  summary(bd10)
##  vcClaveTienda        DescGiro         Codigo.Barras            PLU        
##  Length:200625      Length:200625      Min.   :8.347e+05   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.:7.501e+12   1st Qu.: 2.112  
##  Mode  :character   Mode  :character   Median :7.501e+12   Median : 2.112  
##                                        Mean   :5.950e+12   Mean   : 2.112  
##                                        3rd Qu.:7.501e+12   3rd Qu.: 2.112  
##                                        Max.   :1.750e+13   Max.   :30.000  
##     Fecha               Hora              Marca            Fabricante       
##  Length:200625      Length:200625      Length:200625      Length:200625     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Producto             Precio          Ult.Costo         Unidades     
##  Length:200625      Min.   :-147.00   Min.   :  0.38   Min.   : 0.200  
##  Class :character   1st Qu.:  11.00   1st Qu.:  8.46   1st Qu.: 1.000  
##  Mode  :character   Median :  16.00   Median : 12.31   Median : 1.000  
##                     Mean   :  19.42   Mean   : 15.31   Mean   : 1.262  
##                     3rd Qu.:  25.00   3rd Qu.: 19.23   3rd Qu.: 1.000  
##                     Max.   :1000.00   Max.   :769.23   Max.   :96.000  
##     F.Ticket      NombreDepartamento NombreFamilia      NombreCategoria   
##  Min.   :     1   Length:200625      Length:200625      Length:200625     
##  1st Qu.: 33964   Class :character   Class :character   Class :character  
##  Median :105993   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :193990                                                           
##  3rd Qu.:383005                                                           
##  Max.   :450040                                                           
##     Estado              Mts.2      Tipo.ubicación         Giro          
##  Length:200625      Min.   :47.0   Length:200625      Length:200625     
##  Class :character   1st Qu.:53.0   Class :character   Class :character  
##  Mode  :character   Median :60.0   Mode  :character   Mode  :character  
##                     Mean   :56.6                                        
##                     3rd Qu.:60.0                                        
##                     Max.   :62.0                                        
##  Hora.inicio        Hora.cierre       
##  Length:200625      Length:200625     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# Tecnica 6 verificar datos con métodos estadísticos
  bd11 <- bd7
  boxplot(bd11$Precio, horizontal=TRUE)

  boxplot(bd11$Unidades, horizontal=TRUE)

# Paso 4. Manipular base de datos 

#Agregar columnas 
bd11$diadelasemana <- wday(bd11$Fecha)
summary(bd11)
##  vcClaveTienda        DescGiro             Fecha                 Hora      
##  Length:200473      Length:200473      Min.   :2020-05-01   Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:2020-06-06   1st Qu.:13.00  
##  Mode  :character   Mode  :character   Median :2020-07-11   Median :17.00  
##                                        Mean   :2020-07-18   Mean   :16.23  
##                                        3rd Qu.:2020-08-29   3rd Qu.:20.00  
##                                        Max.   :2020-11-11   Max.   :23.00  
##     Marca            Fabricante          Producto             Precio       
##  Length:200473      Length:200473      Length:200473      Min.   :   0.50  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.45  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200473     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33978   Class :character  
##  Median : 12.31   Median : 1.000   Median :106035   Mode  :character  
##  Mean   : 15.31   Mean   : 1.261   Mean   :194101                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383065                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts.2     
##  Length:200473      Length:200473      Length:200473      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo.ubicación         Giro           Hora.inicio        Hora.cierre       
##  Length:200473      Length:200473      Length:200473      Length:200473     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  diadelasemana  
##  Min.   :1.000  
##  1st Qu.:2.000  
##  Median :4.000  
##  Mean   :3.911  
##  3rd Qu.:6.000  
##  Max.   :7.000
bd11$subtotal <- bd11$Precio * bd11$Unidades
summary(bd11)
##  vcClaveTienda        DescGiro             Fecha                 Hora      
##  Length:200473      Length:200473      Min.   :2020-05-01   Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:2020-06-06   1st Qu.:13.00  
##  Mode  :character   Mode  :character   Median :2020-07-11   Median :17.00  
##                                        Mean   :2020-07-18   Mean   :16.23  
##                                        3rd Qu.:2020-08-29   3rd Qu.:20.00  
##                                        Max.   :2020-11-11   Max.   :23.00  
##     Marca            Fabricante          Producto             Precio       
##  Length:200473      Length:200473      Length:200473      Min.   :   0.50  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.45  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200473     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33978   Class :character  
##  Median : 12.31   Median : 1.000   Median :106035   Mode  :character  
##  Mean   : 15.31   Mean   : 1.261   Mean   :194101                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383065                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts.2     
##  Length:200473      Length:200473      Length:200473      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo.ubicación         Giro           Hora.inicio        Hora.cierre       
##  Length:200473      Length:200473      Length:200473      Length:200473     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  diadelasemana      subtotal     
##  Min.   :1.000   Min.   :   1.0  
##  1st Qu.:2.000   1st Qu.:  12.0  
##  Median :4.000   Median :  18.0  
##  Mean   :3.911   Mean   :  24.3  
##  3rd Qu.:6.000   3rd Qu.:  27.0  
##  Max.   :7.000   Max.   :2496.0
# Exportar la base de datos 
write.csv(bd11, file="abarrotes11_limpia.csv", row.names = FALSE)