library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ stringr 1.4.0
## ✔ tidyr   1.2.0     ✔ forcats 0.5.1
## ✔ readr   2.1.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(ggplot2)

getwd()

Leer base de datos

#file.choose()
bd <- read_csv("/Users/danieltrevino/Documents/Quinto Semestre TEC/Bootcamp de Programación/abarrotes.csv")
## Rows: 200625 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (12): vcClaveTienda, DescGiro, Fecha, Marca, Fabricante, Producto, Nomb...
## dbl   (7): Codigo.Barras, PLU, Precio, Ult.Costo, Unidades, F.Ticket, Mts.2
## time  (3): Hora, Hora.inicio, Hora.cierre
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Entender base de datos

summary(bd)
##  vcClaveTienda        DescGiro         Codigo.Barras            PLU        
##  Length:200625      Length:200625      Min.   :8.347e+05   Min.   : 1.00   
##  Class :character   Class :character   1st Qu.:7.500e+12   1st Qu.: 1.00   
##  Mode  :character   Mode  :character   Median :7.500e+12   Median : 1.00   
##                                        Mean   :5.949e+12   Mean   : 2.11   
##                                        3rd Qu.:7.500e+12   3rd Qu.: 1.00   
##                                        Max.   :1.750e+13   Max.   :30.00   
##                                                            NA's   :199188  
##     Fecha               Hora             Marca            Fabricante       
##  Length:200625      Length:200625     Length:200625      Length:200625     
##  Class :character   Class1:hms        Class :character   Class :character  
##  Mode  :character   Class2:difftime   Mode  :character   Mode  :character  
##                     Mode  :numeric                                         
##                                                                            
##                                                                            
##                                                                            
##    Producto             Precio          Ult.Costo         Unidades     
##  Length:200625      Min.   :-147.00   Min.   :  0.38   Min.   : 0.200  
##  Class :character   1st Qu.:  11.00   1st Qu.:  8.46   1st Qu.: 1.000  
##  Mode  :character   Median :  16.00   Median : 12.31   Median : 1.000  
##                     Mean   :  19.42   Mean   : 15.31   Mean   : 1.262  
##                     3rd Qu.:  25.00   3rd Qu.: 19.23   3rd Qu.: 1.000  
##                     Max.   :1000.00   Max.   :769.23   Max.   :96.000  
##                                                                        
##     F.Ticket      NombreDepartamento NombreFamilia      NombreCategoria   
##  Min.   :     1   Length:200625      Length:200625      Length:200625     
##  1st Qu.: 33964   Class :character   Class :character   Class :character  
##  Median :105993   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :193990                                                           
##  3rd Qu.:383005                                                           
##  Max.   :450040                                                           
##                                                                           
##     Estado              Mts.2      Tipo.ubicación         Giro          
##  Length:200625      Min.   :47.0   Length:200625      Length:200625     
##  Class :character   1st Qu.:53.0   Class :character   Class :character  
##  Mode  :character   Median :60.0   Mode  :character   Mode  :character  
##                     Mean   :56.6                                        
##                     3rd Qu.:60.0                                        
##                     Max.   :62.0                                        
##                                                                         
##  Hora.inicio       Hora.cierre      
##  Length:200625     Length:200625    
##  Class1:hms        Class1:hms       
##  Class2:difftime   Class2:difftime  
##  Mode  :numeric    Mode  :numeric   
##                                     
##                                     
## 
library(dplyr)
count(bd, vcClaveTienda, sort = TRUE)
## # A tibble: 5 × 2
##   vcClaveTienda     n
##   <chr>         <int>
## 1 MX001         96469
## 2 MX004         83455
## 3 MX005         10021
## 4 MX002          6629
## 5 MX003          4051
count(bd, DescGiro, sort = TRUE)
## # A tibble: 3 × 2
##   DescGiro        n
##   <chr>       <int>
## 1 Abarrotes  100520
## 2 Carnicería  83455
## 3 Depósito    16650
count(bd, Marca, sort = TRUE)
## # A tibble: 540 × 2
##    Marca           n
##    <chr>       <int>
##  1 COCA COLA   18686
##  2 PEPSI       15967
##  3 TECATE      11674
##  4 BIMBO        8317
##  5 LALA         5866
##  6 MARINELA     3696
##  7 DORITOS      3142
##  8 CHEETOS      3130
##  9 NUTRI LECHE  3128
## 10 MARLBORO     2579
## # … with 530 more rows
count(bd, Fabricante, sort = TRUE)
## # A tibble: 241 × 2
##    Fabricante                          n
##    <chr>                           <int>
##  1 COCA COLA                       27519
##  2 PEPSI-COLA MEXICANA             22416
##  3 SABRITAS                        14296
##  4 CERVECERIA CUAUHTEMOC MOCTEZUMA 13681
##  5 GRUPO BIMBO                     13078
##  6 SIGMA ALIMENTOS                  8014
##  7 GRUPO INDUSTRIAL LALA            5868
##  8 GRUPO GAMESA                     5527
##  9 NESTLE                           3698
## 10 JUGOS DEL VALLE S.A. DE C.V.     3581
## # … with 231 more rows
count(bd, Producto, sort = TRUE)
## # A tibble: 3,404 × 2
##    Producto                        n
##    <chr>                       <int>
##  1 Pepsi N.R. 1.5L              5108
##  2 Coca Cola Retornable 2.5L    3771
##  3 Caguamon Tecate Light 1.2Lt  3471
##  4 Pepsi N. R. 2.5L             2899
##  5 Cerveza Tecate Light 340Ml   2619
##  6 Cerveza Tecate Light 16Oz    2315
##  7 Coca Cola Retornable 1.5L    2124
##  8 Pepsi N.R. 3L                1832
##  9 Coca Cola Retornable 500Ml   1659
## 10 PEPSI N.R. 1.5L              1631
## # … with 3,394 more rows
count(bd, NombreDepartamento, sort = TRUE)
## # A tibble: 9 × 2
##   NombreDepartamento        n
##   <chr>                 <int>
## 1 Abarrotes            198279
## 2 Bebes e Infantiles     1483
## 3 Ferretería              377
## 4 Farmacia                255
## 5 Vinos y Licores         104
## 6 Papelería                74
## 7 Mercería                 44
## 8 Productos a Eliminar      8
## 9 Carnes                    1
count(bd, NombreFamilia, sort = TRUE)
## # A tibble: 51 × 2
##    NombreFamilia              n
##    <chr>                  <int>
##  1 Bebidas                64918
##  2 Botanas                21583
##  3 Lacteos y Refrigerados 17659
##  4 Cerveza                14017
##  5 Pan y Tortilla         10502
##  6 Limpieza del Hogar      8724
##  7 Galletas                7487
##  8 Cigarros                6817
##  9 Cuidado Personal        5433
## 10 Salsas y Sazonadores    5320
## # … with 41 more rows
count(bd, NombreCategoria, sort = TRUE)
## # A tibble: 174 × 2
##    NombreCategoria               n
##    <chr>                     <int>
##  1 Refrescos Plástico (N.R.) 32862
##  2 Refrescos Retornables     13880
##  3 Frituras                  11082
##  4 Lata                       8150
##  5 Leche                      7054
##  6 Cajetilla                  6329
##  7 Botella                    5867
##  8 Productos sin Categoria    5455
##  9 Papas Fritas               5344
## 10 Jugos y Néctares           5295
## # … with 164 more rows
count(bd, Estado, sort = TRUE)
## # A tibble: 5 × 2
##   Estado           n
##   <chr>        <int>
## 1 Nuevo León   96469
## 2 Sinaloa      83455
## 3 Quintana Roo 10021
## 4 Jalisco       6629
## 5 Chiapas       4051
count(bd, Mts.2, sort = TRUE)
## # A tibble: 5 × 2
##   Mts.2     n
##   <dbl> <int>
## 1    60 96469
## 2    53 83455
## 3    58 10021
## 4    47  6629
## 5    62  4051
count(bd, Tipo.ubicación , sort = TRUE)
## # A tibble: 3 × 2
##   Tipo.ubicación      n
##   <chr>           <int>
## 1 Esquina        189945
## 2 Rotonda          6629
## 3 Entre calles     4051
count(bd,  Giro , sort = TRUE)
## # A tibble: 2 × 2
##   Giro            n
##   <chr>       <int>
## 1 Abarrotes  183975
## 2 Mini súper  16650
count(bd,  Hora.inicio  , sort = TRUE)
## # A tibble: 3 × 2
##   Hora.inicio      n
##   <time>       <int>
## 1 08:00       106490
## 2 07:00        87506
## 3 09:00         6629
count(bd,  Hora.cierre  , sort = TRUE)
## # A tibble: 3 × 2
##   Hora.cierre      n
##   <time>       <int>
## 1 22:00       103098
## 2 23:00        87506
## 3 21:00        10021
library(tidyverse)
tibble(bd)
## # A tibble: 200,625 × 22
##    vcClaveTienda DescGiro  Codigo.Barras   PLU Fecha   Hora     Marca Fabricante
##    <chr>         <chr>             <dbl> <dbl> <chr>   <time>   <chr> <chr>     
##  1 MX001         Abarrotes 7500000000000    NA 19/06/… 08:16:21 NUTR… MEXILAC   
##  2 MX001         Abarrotes 7500000000000    NA 19/06/… 08:23:33 DAN … DANONE DE…
##  3 MX001         Abarrotes 7500000000000    NA 19/06/… 08:24:33 BIMBO GRUPO BIM…
##  4 MX001         Abarrotes 7500000000000    NA 19/06/… 08:24:33 PEPSI PEPSI-COL…
##  5 MX001         Abarrotes 7500000000000    NA 19/06/… 08:26:28 BLAN… FABRICA D…
##  6 MX001         Abarrotes 7500000000000    NA 19/06/… 08:16:21 NUTR… MEXILAC   
##  7 MX001         Abarrotes 7500000000000    NA 19/06/… 08:23:33 DAN … DANONE DE…
##  8 MX001         Abarrotes 7500000000000    NA 19/06/… 08:24:33 BIMBO GRUPO BIM…
##  9 MX001         Abarrotes 7500000000000    NA 19/06/… 08:24:33 PEPSI PEPSI-COL…
## 10 MX001         Abarrotes 7500000000000    NA 19/06/… 08:26:28 BLAN… FABRICA D…
## # … with 200,615 more rows, and 14 more variables: Producto <chr>,
## #   Precio <dbl>, Ult.Costo <dbl>, Unidades <dbl>, F.Ticket <dbl>,
## #   NombreDepartamento <chr>, NombreFamilia <chr>, NombreCategoria <chr>,
## #   Estado <chr>, Mts.2 <dbl>, Tipo.ubicación <chr>, Giro <chr>,
## #   Hora.inicio <time>, Hora.cierre <time>
str(bd)
## spec_tbl_df [200,625 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ vcClaveTienda     : chr [1:200625] "MX001" "MX001" "MX001" "MX001" ...
##  $ DescGiro          : chr [1:200625] "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ Codigo.Barras     : num [1:200625] 7.5e+12 7.5e+12 7.5e+12 7.5e+12 7.5e+12 ...
##  $ PLU               : num [1:200625] NA NA NA NA NA NA NA NA NA NA ...
##  $ Fecha             : chr [1:200625] "19/06/20" "19/06/20" "19/06/20" "19/06/20" ...
##  $ Hora              : 'hms' num [1:200625] 08:16:21 08:23:33 08:24:33 08:24:33 ...
##   ..- attr(*, "units")= chr "secs"
##  $ Marca             : chr [1:200625] "NUTRI LECHE" "DAN UP" "BIMBO" "PEPSI" ...
##  $ Fabricante        : chr [1:200625] "MEXILAC" "DANONE DE MEXICO" "GRUPO BIMBO" "PEPSI-COLA MEXICANA" ...
##  $ Producto          : chr [1:200625] "Nutri Leche 1 Litro" "DANUP STRAWBERRY P/BEBER 350GR NAL" "Rebanadas Bimbo 2Pz" "Pepsi N.R. 400Ml" ...
##  $ Precio            : num [1:200625] 16 14 5 8 19.5 16 14 5 8 19.5 ...
##  $ Ult.Costo         : num [1:200625] 12.3 14 5 8 15 ...
##  $ Unidades          : num [1:200625] 1 1 1 1 1 1 1 1 1 1 ...
##  $ F.Ticket          : num [1:200625] 1 2 3 3 4 1 2 3 3 4 ...
##  $ NombreDepartamento: chr [1:200625] "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ NombreFamilia     : chr [1:200625] "Lacteos y Refrigerados" "Lacteos y Refrigerados" "Pan y Tortilla" "Bebidas" ...
##  $ NombreCategoria   : chr [1:200625] "Leche" "Yogurt" "Pan Dulce Empaquetado" "Refrescos Plástico (N.R.)" ...
##  $ Estado            : chr [1:200625] "Nuevo León" "Nuevo León" "Nuevo León" "Nuevo León" ...
##  $ Mts.2             : num [1:200625] 60 60 60 60 60 60 60 60 60 60 ...
##  $ Tipo.ubicación    : chr [1:200625] "Esquina" "Esquina" "Esquina" "Esquina" ...
##  $ Giro              : chr [1:200625] "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ Hora.inicio       : 'hms' num [1:200625] 08:00:00 08:00:00 08:00:00 08:00:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ Hora.cierre       : 'hms' num [1:200625] 22:00:00 22:00:00 22:00:00 22:00:00 ...
##   ..- attr(*, "units")= chr "secs"
##  - attr(*, "spec")=
##   .. cols(
##   ..   vcClaveTienda = col_character(),
##   ..   DescGiro = col_character(),
##   ..   Codigo.Barras = col_double(),
##   ..   PLU = col_double(),
##   ..   Fecha = col_character(),
##   ..   Hora = col_time(format = ""),
##   ..   Marca = col_character(),
##   ..   Fabricante = col_character(),
##   ..   Producto = col_character(),
##   ..   Precio = col_double(),
##   ..   Ult.Costo = col_double(),
##   ..   Unidades = col_double(),
##   ..   F.Ticket = col_double(),
##   ..   NombreDepartamento = col_character(),
##   ..   NombreFamilia = col_character(),
##   ..   NombreCategoria = col_character(),
##   ..   Estado = col_character(),
##   ..   Mts.2 = col_double(),
##   ..   Tipo.ubicación = col_character(),
##   ..   Giro = col_character(),
##   ..   Hora.inicio = col_time(format = ""),
##   ..   Hora.cierre = col_time(format = "")
##   .. )
##  - attr(*, "problems")=<externalptr>
head(bd, 7)
## # A tibble: 7 × 22
##   vcClaveTienda DescGiro  Codigo.Barras   PLU Fecha    Hora     Marca Fabricante
##   <chr>         <chr>             <dbl> <dbl> <chr>    <time>   <chr> <chr>     
## 1 MX001         Abarrotes 7500000000000    NA 19/06/20 08:16:21 NUTR… MEXILAC   
## 2 MX001         Abarrotes 7500000000000    NA 19/06/20 08:23:33 DAN … DANONE DE…
## 3 MX001         Abarrotes 7500000000000    NA 19/06/20 08:24:33 BIMBO GRUPO BIM…
## 4 MX001         Abarrotes 7500000000000    NA 19/06/20 08:24:33 PEPSI PEPSI-COL…
## 5 MX001         Abarrotes 7500000000000    NA 19/06/20 08:26:28 BLAN… FABRICA D…
## 6 MX001         Abarrotes 7500000000000    NA 19/06/20 08:16:21 NUTR… MEXILAC   
## 7 MX001         Abarrotes 7500000000000    NA 19/06/20 08:23:33 DAN … DANONE DE…
## # … with 14 more variables: Producto <chr>, Precio <dbl>, Ult.Costo <dbl>,
## #   Unidades <dbl>, F.Ticket <dbl>, NombreDepartamento <chr>,
## #   NombreFamilia <chr>, NombreCategoria <chr>, Estado <chr>, Mts.2 <dbl>,
## #   Tipo.ubicación <chr>, Giro <chr>, Hora.inicio <time>, Hora.cierre <time>
tail(bd)
## # A tibble: 6 × 22
##   vcClaveTienda DescGiro Codigo.Barras   PLU Fecha    Hora     Marca  Fabricante
##   <chr>         <chr>            <dbl> <dbl> <chr>    <time>   <chr>  <chr>     
## 1 MX005         Depósito 7620000000000    NA 12/07/20 01:08:25 TRIDE… CADBURY A…
## 2 MX005         Depósito 7620000000000    NA 23/10/20 22:17:37 TRIDE… CADBURY A…
## 3 MX005         Depósito 7620000000000    NA 10/10/20 20:30:20 TRIDE… CADBURY A…
## 4 MX005         Depósito 7620000000000    NA 10/10/20 22:40:43 TRIDE… CADBURY A…
## 5 MX005         Depósito 7620000000000    NA 27/06/20 22:30:19 TRIDE… CADBURY A…
## 6 MX005         Depósito 7620000000000    NA 26/06/20 23:43:34 TRIDE… CADBURY A…
## # … with 14 more variables: Producto <chr>, Precio <dbl>, Ult.Costo <dbl>,
## #   Unidades <dbl>, F.Ticket <dbl>, NombreDepartamento <chr>,
## #   NombreFamilia <chr>, NombreCategoria <chr>, Estado <chr>, Mts.2 <dbl>,
## #   Tipo.ubicación <chr>, Giro <chr>, Hora.inicio <time>, Hora.cierre <time>
library(janitor)
tabyl(bd, vcClaveTienda, NombreDepartamento)
##  vcClaveTienda Abarrotes Bebes e Infantiles Carnes Farmacia Ferretería Mercería
##          MX001     95415                515      1      147        245       28
##          MX002      6590                 21      0        4         10        0
##          MX003      4026                 15      0        2          8        0
##          MX004     82234                932      0      102        114       16
##          MX005     10014                  0      0        0          0        0
##  Papelería Productos a Eliminar Vinos y Licores
##         35                    3              80
##          0                    0               4
##          0                    0               0
##         32                    5              20
##          7                    0               0

Observaciones

1 - En producto, no hay un catalogo especifico. Ejemplo: Pepsi N.R. 1.5L y PEPSI N.R. 1.5L 2 - Casi ningun registro cuenta con PLU 3 - Cambiar formato de fecha 4 - Cambiar formato de hora 5 - Hay precios negativos 6 - Hay unidades menores a 1

Tecnicas para la limpiza de datos

Tecnica 1 - Remover valores irrelevantes

# Eliminar columnas PLU y  codigo de barras
bd1 <- bd
bd1 <- subset(bd1, select = -c (PLU, Codigo.Barras))

#Eliminar renglones
bd2 <- bd1
bd2 <- bd2[bd2$Precio > 0,]
summary(bd2)
##  vcClaveTienda        DescGiro            Fecha               Hora         
##  Length:200478      Length:200478      Length:200478      Length:200478    
##  Class :character   Class :character   Class :character   Class1:hms       
##  Mode  :character   Mode  :character   Mode  :character   Class2:difftime  
##                                                           Mode  :numeric   
##                                                                            
##                                                                            
##     Marca            Fabricante          Producto             Precio       
##  Length:200478      Length:200478      Length:200478      Min.   :   0.50  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.45  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200478     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33977   Class :character  
##  Median : 12.31   Median : 1.000   Median :106034   Mode  :character  
##  Mean   : 15.31   Mean   : 1.261   Mean   :194096                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383062                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts.2     
##  Length:200478      Length:200478      Length:200478      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo.ubicación         Giro           Hora.inicio       Hora.cierre      
##  Length:200478      Length:200478      Length:200478     Length:200478    
##  Class :character   Class :character   Class1:hms        Class1:hms       
##  Mode  :character   Mode  :character   Class2:difftime   Class2:difftime  
##                                        Mode  :numeric    Mode  :numeric   
##                                                                           
## 
# Eliminar renglones de acurdo a alguna variable
bd2 <- bd2[bd2$NombreDepartamento != "Productos a Eliminar", ]
count(bd, NombreDepartamento, sort = TRUE)
## # A tibble: 9 × 2
##   NombreDepartamento        n
##   <chr>                 <int>
## 1 Abarrotes            198279
## 2 Bebes e Infantiles     1483
## 3 Ferretería              377
## 4 Farmacia                255
## 5 Vinos y Licores         104
## 6 Papelería                74
## 7 Mercería                 44
## 8 Productos a Eliminar      8
## 9 Carnes                    1

Tecnica 2 - Remover valores duplicados

# Conocer cuantos renglones hay duplicados
bd1[duplicated(bd1),]
## # A tibble: 5 × 20
##   vcClaveTienda DescGiro  Fecha    Hora     Marca     Fabricante Producto Precio
##   <chr>         <chr>     <chr>    <time>   <chr>     <chr>      <chr>     <dbl>
## 1 MX001         Abarrotes 19/06/20 08:16:21 NUTRI LE… MEXILAC    Nutri L…   16  
## 2 MX001         Abarrotes 19/06/20 08:23:33 DAN UP    DANONE DE… DANUP S…   14  
## 3 MX001         Abarrotes 19/06/20 08:24:33 BIMBO     GRUPO BIM… Rebanad…    5  
## 4 MX001         Abarrotes 19/06/20 08:24:33 PEPSI     PEPSI-COL… Pepsi N…    8  
## 5 MX001         Abarrotes 19/06/20 08:26:28 BLANCA N… FABRICA D… Deterge…   19.5
## # … with 12 more variables: Ult.Costo <dbl>, Unidades <dbl>, F.Ticket <dbl>,
## #   NombreDepartamento <chr>, NombreFamilia <chr>, NombreCategoria <chr>,
## #   Estado <chr>, Mts.2 <dbl>, Tipo.ubicación <chr>, Giro <chr>,
## #   Hora.inicio <time>, Hora.cierre <time>
sum(duplicated(bd1))
## [1] 5
# Eliminar renglones duplicados
bd3 <- bd1 
bd3 <- distinct(bd3)

Tecnica 3 - Eliminar valores tipograficos y errores similares

# Precios en absoluto
bd4 <- bd3
bd4$Precio <- abs(bd4$Precio)
summary(bd4)
##  vcClaveTienda        DescGiro            Fecha               Hora         
##  Length:200620      Length:200620      Length:200620      Length:200620    
##  Class :character   Class :character   Class :character   Class1:hms       
##  Mode  :character   Mode  :character   Mode  :character   Class2:difftime  
##                                                           Mode  :numeric   
##                                                                            
##                                                                            
##     Marca            Fabricante          Producto             Precio       
##  Length:200620      Length:200620      Length:200620      Min.   :   0.50  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.45  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200620     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967   Class :character  
##  Median : 12.31   Median : 1.000   Median :105996   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193994                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383008                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts.2     
##  Length:200620      Length:200620      Length:200620      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo.ubicación         Giro           Hora.inicio       Hora.cierre      
##  Length:200620      Length:200620      Length:200620     Length:200620    
##  Class :character   Class :character   Class1:hms        Class1:hms       
##  Mode  :character   Mode  :character   Class2:difftime   Class2:difftime  
##                                        Mode  :numeric    Mode  :numeric   
##                                                                           
## 
# Cantidades en enteros
bd5 <- bd4
bd5$Unidades <- ceiling(bd5$Unidades)
tibble(bd5)
## # A tibble: 200,620 × 20
##    vcClaveTienda DescGiro  Fecha    Hora     Marca    Fabricante Producto Precio
##    <chr>         <chr>     <chr>    <time>   <chr>    <chr>      <chr>     <dbl>
##  1 MX001         Abarrotes 19/06/20 08:16:21 NUTRI L… MEXILAC    Nutri L…   16  
##  2 MX001         Abarrotes 19/06/20 08:23:33 DAN UP   DANONE DE… DANUP S…   14  
##  3 MX001         Abarrotes 19/06/20 08:24:33 BIMBO    GRUPO BIM… Rebanad…    5  
##  4 MX001         Abarrotes 19/06/20 08:24:33 PEPSI    PEPSI-COL… Pepsi N…    8  
##  5 MX001         Abarrotes 19/06/20 08:26:28 BLANCA … FABRICA D… Deterge…   19.5
##  6 MX001         Abarrotes 19/06/20 08:26:28 FLASH    ALEN       Flash X…    9.5
##  7 MX001         Abarrotes 19/06/20 08:26:28 VARIOS … DANONE DE… Danone …   11  
##  8 MX001         Abarrotes 19/06/20 08:26:28 ZOTE     FABRICA D… Jabon Z…    9.5
##  9 MX001         Abarrotes 19/06/20 08:26:28 ALWAYS   PROCTER &… T Femen…   23.5
## 10 MX001         Abarrotes 19/06/20 15:24:02 JUMEX    JUMEX      Jugo De…   12  
## # … with 200,610 more rows, and 12 more variables: Ult.Costo <dbl>,
## #   Unidades <dbl>, F.Ticket <dbl>, NombreDepartamento <chr>,
## #   NombreFamilia <chr>, NombreCategoria <chr>, Estado <chr>, Mts.2 <dbl>,
## #   Tipo.ubicación <chr>, Giro <chr>, Hora.inicio <time>, Hora.cierre <time>

Tecnica 4 - Convertir tipos de datos

# Convertir de caracter a fecha
bd6 <- bd5
bd6$Fecha <- as.Date(bd6$Fecha, format = "%d/%m/%y")
tibble(bd6)
## # A tibble: 200,620 × 20
##    vcClaveTienda DescGiro  Fecha      Hora     Marca  Fabricante Producto Precio
##    <chr>         <chr>     <date>     <time>   <chr>  <chr>      <chr>     <dbl>
##  1 MX001         Abarrotes 2020-06-19 08:16:21 NUTRI… MEXILAC    Nutri L…   16  
##  2 MX001         Abarrotes 2020-06-19 08:23:33 DAN UP DANONE DE… DANUP S…   14  
##  3 MX001         Abarrotes 2020-06-19 08:24:33 BIMBO  GRUPO BIM… Rebanad…    5  
##  4 MX001         Abarrotes 2020-06-19 08:24:33 PEPSI  PEPSI-COL… Pepsi N…    8  
##  5 MX001         Abarrotes 2020-06-19 08:26:28 BLANC… FABRICA D… Deterge…   19.5
##  6 MX001         Abarrotes 2020-06-19 08:26:28 FLASH  ALEN       Flash X…    9.5
##  7 MX001         Abarrotes 2020-06-19 08:26:28 VARIO… DANONE DE… Danone …   11  
##  8 MX001         Abarrotes 2020-06-19 08:26:28 ZOTE   FABRICA D… Jabon Z…    9.5
##  9 MX001         Abarrotes 2020-06-19 08:26:28 ALWAYS PROCTER &… T Femen…   23.5
## 10 MX001         Abarrotes 2020-06-19 15:24:02 JUMEX  JUMEX      Jugo De…   12  
## # … with 200,610 more rows, and 12 more variables: Ult.Costo <dbl>,
## #   Unidades <dbl>, F.Ticket <dbl>, NombreDepartamento <chr>,
## #   NombreFamilia <chr>, NombreCategoria <chr>, Estado <chr>, Mts.2 <dbl>,
## #   Tipo.ubicación <chr>, Giro <chr>, Hora.inicio <time>, Hora.cierre <time>
# Convertir de caracter a entero
bd7 <- bd6
bd7$Hora <- substr(bd7$Hora, start = 1, stop = 2)
tibble(bd7)
## # A tibble: 200,620 × 20
##    vcClaveTienda DescGiro  Fecha      Hora  Marca     Fabricante Producto Precio
##    <chr>         <chr>     <date>     <chr> <chr>     <chr>      <chr>     <dbl>
##  1 MX001         Abarrotes 2020-06-19 08    NUTRI LE… MEXILAC    Nutri L…   16  
##  2 MX001         Abarrotes 2020-06-19 08    DAN UP    DANONE DE… DANUP S…   14  
##  3 MX001         Abarrotes 2020-06-19 08    BIMBO     GRUPO BIM… Rebanad…    5  
##  4 MX001         Abarrotes 2020-06-19 08    PEPSI     PEPSI-COL… Pepsi N…    8  
##  5 MX001         Abarrotes 2020-06-19 08    BLANCA N… FABRICA D… Deterge…   19.5
##  6 MX001         Abarrotes 2020-06-19 08    FLASH     ALEN       Flash X…    9.5
##  7 MX001         Abarrotes 2020-06-19 08    VARIOS D… DANONE DE… Danone …   11  
##  8 MX001         Abarrotes 2020-06-19 08    ZOTE      FABRICA D… Jabon Z…    9.5
##  9 MX001         Abarrotes 2020-06-19 08    ALWAYS    PROCTER &… T Femen…   23.5
## 10 MX001         Abarrotes 2020-06-19 15    JUMEX     JUMEX      Jugo De…   12  
## # … with 200,610 more rows, and 12 more variables: Ult.Costo <dbl>,
## #   Unidades <dbl>, F.Ticket <dbl>, NombreDepartamento <chr>,
## #   NombreFamilia <chr>, NombreCategoria <chr>, Estado <chr>, Mts.2 <dbl>,
## #   Tipo.ubicación <chr>, Giro <chr>, Hora.inicio <time>, Hora.cierre <time>
bd7$Hora <- as.integer(bd7$Hora)
tibble(bd7)
## # A tibble: 200,620 × 20
##    vcClaveTienda DescGiro  Fecha       Hora Marca     Fabricante Producto Precio
##    <chr>         <chr>     <date>     <int> <chr>     <chr>      <chr>     <dbl>
##  1 MX001         Abarrotes 2020-06-19     8 NUTRI LE… MEXILAC    Nutri L…   16  
##  2 MX001         Abarrotes 2020-06-19     8 DAN UP    DANONE DE… DANUP S…   14  
##  3 MX001         Abarrotes 2020-06-19     8 BIMBO     GRUPO BIM… Rebanad…    5  
##  4 MX001         Abarrotes 2020-06-19     8 PEPSI     PEPSI-COL… Pepsi N…    8  
##  5 MX001         Abarrotes 2020-06-19     8 BLANCA N… FABRICA D… Deterge…   19.5
##  6 MX001         Abarrotes 2020-06-19     8 FLASH     ALEN       Flash X…    9.5
##  7 MX001         Abarrotes 2020-06-19     8 VARIOS D… DANONE DE… Danone …   11  
##  8 MX001         Abarrotes 2020-06-19     8 ZOTE      FABRICA D… Jabon Z…    9.5
##  9 MX001         Abarrotes 2020-06-19     8 ALWAYS    PROCTER &… T Femen…   23.5
## 10 MX001         Abarrotes 2020-06-19    15 JUMEX     JUMEX      Jugo De…   12  
## # … with 200,610 more rows, and 12 more variables: Ult.Costo <dbl>,
## #   Unidades <dbl>, F.Ticket <dbl>, NombreDepartamento <chr>,
## #   NombreFamilia <chr>, NombreCategoria <chr>, Estado <chr>, Mts.2 <dbl>,
## #   Tipo.ubicación <chr>, Giro <chr>, Hora.inicio <time>, Hora.cierre <time>

Tecnica 5 - Valores Faltantes

#Cuantos NA hay en total
sum(is.na(bd7))
## [1] 0
sum(is.na(bd))
## [1] 199188
# Cuanto NA hay por columna
colSums(is.na(bd7)) 
##      vcClaveTienda           DescGiro              Fecha               Hora 
##                  0                  0                  0                  0 
##              Marca         Fabricante           Producto             Precio 
##                  0                  0                  0                  0 
##          Ult.Costo           Unidades           F.Ticket NombreDepartamento 
##                  0                  0                  0                  0 
##      NombreFamilia    NombreCategoria             Estado              Mts.2 
##                  0                  0                  0                  0 
##     Tipo.ubicación               Giro        Hora.inicio        Hora.cierre 
##                  0                  0                  0                  0
colSums(is.na(bd)) 
##      vcClaveTienda           DescGiro      Codigo.Barras                PLU 
##                  0                  0                  0             199188 
##              Fecha               Hora              Marca         Fabricante 
##                  0                  0                  0                  0 
##           Producto             Precio          Ult.Costo           Unidades 
##                  0                  0                  0                  0 
##           F.Ticket NombreDepartamento      NombreFamilia    NombreCategoria 
##                  0                  0                  0                  0 
##             Estado              Mts.2     Tipo.ubicación               Giro 
##                  0                  0                  0                  0 
##        Hora.inicio        Hora.cierre 
##                  0                  0
# Borrar todos los registros NA de una tabla - borra toda la fila donde hay NA
bd8 <- bd
bd8 <- na.omit(bd8)
summary(bd8)
##  vcClaveTienda        DescGiro         Codigo.Barras            PLU        
##  Length:1437        Length:1437        Min.   :6.750e+08   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.:6.750e+08   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median :6.750e+08   Median : 1.000  
##                                        Mean   :2.616e+11   Mean   : 2.112  
##                                        3rd Qu.:6.750e+08   3rd Qu.: 1.000  
##                                        Max.   :7.500e+12   Max.   :30.000  
##     Fecha               Hora             Marca            Fabricante       
##  Length:1437        Length:1437       Length:1437        Length:1437       
##  Class :character   Class1:hms        Class :character   Class :character  
##  Mode  :character   Class2:difftime   Mode  :character   Mode  :character  
##                     Mode  :numeric                                         
##                                                                            
##                                                                            
##    Producto             Precio        Ult.Costo        Unidades    
##  Length:1437        Min.   :30.00   Min.   : 1.00   Min.   :1.000  
##  Class :character   1st Qu.:90.00   1st Qu.:64.62   1st Qu.:1.000  
##  Mode  :character   Median :90.00   Median :64.62   Median :1.000  
##                     Mean   :87.94   Mean   :56.65   Mean   :1.124  
##                     3rd Qu.:90.00   3rd Qu.:64.62   3rd Qu.:1.000  
##                     Max.   :90.00   Max.   :64.62   Max.   :7.000  
##     F.Ticket      NombreDepartamento NombreFamilia      NombreCategoria   
##  Min.   :   772   Length:1437        Length:1437        Length:1437       
##  1st Qu.: 99955   Class :character   Class :character   Class :character  
##  Median :102493   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :100595                                                           
##  3rd Qu.:106546                                                           
##  Max.   :118356                                                           
##     Estado              Mts.2       Tipo.ubicación         Giro          
##  Length:1437        Min.   :58.00   Length:1437        Length:1437       
##  Class :character   1st Qu.:58.00   Class :character   Class :character  
##  Mode  :character   Median :58.00   Mode  :character   Mode  :character  
##                     Mean   :58.07                                        
##                     3rd Qu.:58.00                                        
##                     Max.   :60.00                                        
##  Hora.inicio       Hora.cierre      
##  Length:1437       Length:1437      
##  Class1:hms        Class1:hms       
##  Class2:difftime   Class2:difftime  
##  Mode  :numeric    Mode  :numeric   
##                                     
## 
# Reemplazar NA con ceros 0
bd9 <- bd
bd9[is.na(bd9)] <- 0
summary(bd9)
##  vcClaveTienda        DescGiro         Codigo.Barras            PLU          
##  Length:200625      Length:200625      Min.   :8.347e+05   Min.   : 0.00000  
##  Class :character   Class :character   1st Qu.:7.500e+12   1st Qu.: 0.00000  
##  Mode  :character   Mode  :character   Median :7.500e+12   Median : 0.00000  
##                                        Mean   :5.949e+12   Mean   : 0.01513  
##                                        3rd Qu.:7.500e+12   3rd Qu.: 0.00000  
##                                        Max.   :1.750e+13   Max.   :30.00000  
##     Fecha               Hora             Marca            Fabricante       
##  Length:200625      Length:200625     Length:200625      Length:200625     
##  Class :character   Class1:hms        Class :character   Class :character  
##  Mode  :character   Class2:difftime   Mode  :character   Mode  :character  
##                     Mode  :numeric                                         
##                                                                            
##                                                                            
##    Producto             Precio          Ult.Costo         Unidades     
##  Length:200625      Min.   :-147.00   Min.   :  0.38   Min.   : 0.200  
##  Class :character   1st Qu.:  11.00   1st Qu.:  8.46   1st Qu.: 1.000  
##  Mode  :character   Median :  16.00   Median : 12.31   Median : 1.000  
##                     Mean   :  19.42   Mean   : 15.31   Mean   : 1.262  
##                     3rd Qu.:  25.00   3rd Qu.: 19.23   3rd Qu.: 1.000  
##                     Max.   :1000.00   Max.   :769.23   Max.   :96.000  
##     F.Ticket      NombreDepartamento NombreFamilia      NombreCategoria   
##  Min.   :     1   Length:200625      Length:200625      Length:200625     
##  1st Qu.: 33964   Class :character   Class :character   Class :character  
##  Median :105993   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :193990                                                           
##  3rd Qu.:383005                                                           
##  Max.   :450040                                                           
##     Estado              Mts.2      Tipo.ubicación         Giro          
##  Length:200625      Min.   :47.0   Length:200625      Length:200625     
##  Class :character   1st Qu.:53.0   Class :character   Class :character  
##  Mode  :character   Median :60.0   Mode  :character   Mode  :character  
##                     Mean   :56.6                                        
##                     3rd Qu.:60.0                                        
##                     Max.   :62.0                                        
##  Hora.inicio       Hora.cierre      
##  Length:200625     Length:200625    
##  Class1:hms        Class1:hms       
##  Class2:difftime   Class2:difftime  
##  Mode  :numeric    Mode  :numeric   
##                                     
## 
# Reemplazar NA con el Promedio
bd10 <- bd
bd10$PLU[is.na(bd10$PLU)] <- mean(bd10$PLU, na.rm = TRUE)
summary(bd10)
##  vcClaveTienda        DescGiro         Codigo.Barras            PLU        
##  Length:200625      Length:200625      Min.   :8.347e+05   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.:7.500e+12   1st Qu.: 2.112  
##  Mode  :character   Mode  :character   Median :7.500e+12   Median : 2.112  
##                                        Mean   :5.949e+12   Mean   : 2.112  
##                                        3rd Qu.:7.500e+12   3rd Qu.: 2.112  
##                                        Max.   :1.750e+13   Max.   :30.000  
##     Fecha               Hora             Marca            Fabricante       
##  Length:200625      Length:200625     Length:200625      Length:200625     
##  Class :character   Class1:hms        Class :character   Class :character  
##  Mode  :character   Class2:difftime   Mode  :character   Mode  :character  
##                     Mode  :numeric                                         
##                                                                            
##                                                                            
##    Producto             Precio          Ult.Costo         Unidades     
##  Length:200625      Min.   :-147.00   Min.   :  0.38   Min.   : 0.200  
##  Class :character   1st Qu.:  11.00   1st Qu.:  8.46   1st Qu.: 1.000  
##  Mode  :character   Median :  16.00   Median : 12.31   Median : 1.000  
##                     Mean   :  19.42   Mean   : 15.31   Mean   : 1.262  
##                     3rd Qu.:  25.00   3rd Qu.: 19.23   3rd Qu.: 1.000  
##                     Max.   :1000.00   Max.   :769.23   Max.   :96.000  
##     F.Ticket      NombreDepartamento NombreFamilia      NombreCategoria   
##  Min.   :     1   Length:200625      Length:200625      Length:200625     
##  1st Qu.: 33964   Class :character   Class :character   Class :character  
##  Median :105993   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :193990                                                           
##  3rd Qu.:383005                                                           
##  Max.   :450040                                                           
##     Estado              Mts.2      Tipo.ubicación         Giro          
##  Length:200625      Min.   :47.0   Length:200625      Length:200625     
##  Class :character   1st Qu.:53.0   Class :character   Class :character  
##  Mode  :character   Median :60.0   Mode  :character   Mode  :character  
##                     Mean   :56.6                                        
##                     3rd Qu.:60.0                                        
##                     Max.   :62.0                                        
##  Hora.inicio       Hora.cierre      
##  Length:200625     Length:200625    
##  Class1:hms        Class1:hms       
##  Class2:difftime   Class2:difftime  
##  Mode  :numeric    Mode  :numeric   
##                                     
## 
# Reemplazar negativos con 0
bd11 <- bd
bd11[bd11 < 0] <- 0
summary(bd11)
##  vcClaveTienda        DescGiro         Codigo.Barras            PLU        
##  Length:200625      Length:200625      Min.   :8.347e+05   Min.   : 1.00   
##  Class :character   Class :character   1st Qu.:7.500e+12   1st Qu.: 1.00   
##  Mode  :character   Mode  :character   Median :7.500e+12   Median : 1.00   
##                                        Mean   :5.949e+12   Mean   : 2.11   
##                                        3rd Qu.:7.500e+12   3rd Qu.: 1.00   
##                                        Max.   :1.750e+13   Max.   :30.00   
##                                                            NA's   :199188  
##     Fecha               Hora             Marca            Fabricante       
##  Length:200625      Length:200625     Length:200625      Length:200625     
##  Class :character   Class1:hms        Class :character   Class :character  
##  Mode  :character   Class2:difftime   Mode  :character   Mode  :character  
##                     Mode  :numeric                                         
##                                                                            
##                                                                            
##                                                                            
##    Producto             Precio          Ult.Costo         Unidades     
##  Length:200625      Min.   :   0.00   Min.   :  0.38   Min.   : 0.200  
##  Class :character   1st Qu.:  11.00   1st Qu.:  8.46   1st Qu.: 1.000  
##  Mode  :character   Median :  16.00   Median : 12.31   Median : 1.000  
##                     Mean   :  19.44   Mean   : 15.31   Mean   : 1.262  
##                     3rd Qu.:  25.00   3rd Qu.: 19.23   3rd Qu.: 1.000  
##                     Max.   :1000.00   Max.   :769.23   Max.   :96.000  
##                                                                        
##     F.Ticket      NombreDepartamento NombreFamilia      NombreCategoria   
##  Min.   :     1   Length:200625      Length:200625      Length:200625     
##  1st Qu.: 33964   Class :character   Class :character   Class :character  
##  Median :105993   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :193990                                                           
##  3rd Qu.:383005                                                           
##  Max.   :450040                                                           
##                                                                           
##     Estado              Mts.2      Tipo.ubicación         Giro          
##  Length:200625      Min.   :47.0   Length:200625      Length:200625     
##  Class :character   1st Qu.:53.0   Class :character   Class :character  
##  Mode  :character   Median :60.0   Mode  :character   Mode  :character  
##                     Mean   :56.6                                        
##                     3rd Qu.:60.0                                        
##                     Max.   :62.0                                        
##                                                                         
##  Hora.inicio       Hora.cierre      
##  Length:200625     Length:200625    
##  Class1:hms        Class1:hms       
##  Class2:difftime   Class2:difftime  
##  Mode  :numeric    Mode  :numeric   
##                                     
##                                     
## 

Tecnica 6 - Metodo estadistico

bd12 <- bd7
boxplot(bd12$Precio, horizontal = TRUE)

boxplot(bd12$Unidades, horizontal = TRUE)

Agregar columnas

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Dia de la semana
bd12$Dia_de_la_Semana <- wday(bd12$Fecha)
summary(bd12)
##  vcClaveTienda        DescGiro             Fecha                 Hora      
##  Length:200620      Length:200620      Min.   :2020-05-01   Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:2020-06-06   1st Qu.:13.00  
##  Mode  :character   Mode  :character   Median :2020-07-11   Median :17.00  
##                                        Mean   :2020-07-18   Mean   :16.23  
##                                        3rd Qu.:2020-08-29   3rd Qu.:20.00  
##                                        Max.   :2020-11-11   Max.   :23.00  
##     Marca            Fabricante          Producto             Precio       
##  Length:200620      Length:200620      Length:200620      Min.   :   0.50  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.45  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 1.000   Min.   :     1   Length:200620     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967   Class :character  
##  Median : 12.31   Median : 1.000   Median :105996   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193994                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383008                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts.2     
##  Length:200620      Length:200620      Length:200620      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo.ubicación         Giro           Hora.inicio       Hora.cierre      
##  Length:200620      Length:200620      Length:200620     Length:200620    
##  Class :character   Class :character   Class1:hms        Class1:hms       
##  Mode  :character   Mode  :character   Class2:difftime   Class2:difftime  
##                                        Mode  :numeric    Mode  :numeric   
##                                                                           
##                                                                           
##  Dia_de_la_Semana
##  Min.   :1.000   
##  1st Qu.:2.000   
##  Median :4.000   
##  Mean   :3.912   
##  3rd Qu.:6.000   
##  Max.   :7.000
# Subtotal de la compra 
bd12$Subtotal <- bd12$Precio * bd12$Unidades
summary(bd12)
##  vcClaveTienda        DescGiro             Fecha                 Hora      
##  Length:200620      Length:200620      Min.   :2020-05-01   Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:2020-06-06   1st Qu.:13.00  
##  Mode  :character   Mode  :character   Median :2020-07-11   Median :17.00  
##                                        Mean   :2020-07-18   Mean   :16.23  
##                                        3rd Qu.:2020-08-29   3rd Qu.:20.00  
##                                        Max.   :2020-11-11   Max.   :23.00  
##     Marca            Fabricante          Producto             Precio       
##  Length:200620      Length:200620      Length:200620      Min.   :   0.50  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.45  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 1.000   Min.   :     1   Length:200620     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967   Class :character  
##  Median : 12.31   Median : 1.000   Median :105996   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193994                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383008                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts.2     
##  Length:200620      Length:200620      Length:200620      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo.ubicación         Giro           Hora.inicio       Hora.cierre      
##  Length:200620      Length:200620      Length:200620     Length:200620    
##  Class :character   Class :character   Class1:hms        Class1:hms       
##  Mode  :character   Mode  :character   Class2:difftime   Class2:difftime  
##                                        Mode  :numeric    Mode  :numeric   
##                                                                           
##                                                                           
##  Dia_de_la_Semana    Subtotal      
##  Min.   :1.000    Min.   :   1.00  
##  1st Qu.:2.000    1st Qu.:  12.00  
##  Median :4.000    Median :  18.00  
##  Mean   :3.912    Mean   :  24.33  
##  3rd Qu.:6.000    3rd Qu.:  27.00  
##  Max.   :7.000    Max.   :2496.00
# Utilidad por producto
bd12$Utilidad <- bd12$Precio - bd12$Ult.Costo
summary(bd12)
##  vcClaveTienda        DescGiro             Fecha                 Hora      
##  Length:200620      Length:200620      Min.   :2020-05-01   Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:2020-06-06   1st Qu.:13.00  
##  Mode  :character   Mode  :character   Median :2020-07-11   Median :17.00  
##                                        Mean   :2020-07-18   Mean   :16.23  
##                                        3rd Qu.:2020-08-29   3rd Qu.:20.00  
##                                        Max.   :2020-11-11   Max.   :23.00  
##     Marca            Fabricante          Producto             Precio       
##  Length:200620      Length:200620      Length:200620      Min.   :   0.50  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.45  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 1.000   Min.   :     1   Length:200620     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967   Class :character  
##  Median : 12.31   Median : 1.000   Median :105996   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193994                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383008                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts.2     
##  Length:200620      Length:200620      Length:200620      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo.ubicación         Giro           Hora.inicio       Hora.cierre      
##  Length:200620      Length:200620      Length:200620     Length:200620    
##  Class :character   Class :character   Class1:hms        Class1:hms       
##  Mode  :character   Mode  :character   Class2:difftime   Class2:difftime  
##                                        Mode  :numeric    Mode  :numeric   
##                                                                           
##                                                                           
##  Dia_de_la_Semana    Subtotal          Utilidad      
##  Min.   :1.000    Min.   :   1.00   Min.   :  0.000  
##  1st Qu.:2.000    1st Qu.:  12.00   1st Qu.:  2.310  
##  Median :4.000    Median :  18.00   Median :  3.230  
##  Mean   :3.912    Mean   :  24.33   Mean   :  4.142  
##  3rd Qu.:6.000    3rd Qu.:  27.00   3rd Qu.:  5.420  
##  Max.   :7.000    Max.   :2496.00   Max.   :230.770

Exportar Base de Datos

#bd_limpia <- bd12
#write.csv(bd_limpia, file = "abarrote bd limpia.csv", row.names = FALSE)
#install.packages("Matrix")
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(arules)
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
library(datasets)
#file.choose()
abarrotes_limpia <- read.csv("/Users/danieltrevino/Documents/Quinto Semestre TEC/Bootcamp de Programación/abarrote bd limpia.csv")

Ordenar de menor a mayor los tickets

abarrotes_limpia <- abarrotes_limpia[order(abarrotes_limpia$F.Ticket),]
head(abarrotes_limpia)
##   vcClaveTienda  DescGiro      Fecha Hora                      Marca
## 1         MX001 Abarrotes 2020-06-19    8                NUTRI LECHE
## 2         MX001 Abarrotes 2020-06-19    8                     DAN UP
## 3         MX001 Abarrotes 2020-06-19    8                      BIMBO
## 4         MX001 Abarrotes 2020-06-19    8                      PEPSI
## 5         MX001 Abarrotes 2020-06-19    8 BLANCA NIEVES (DETERGENTE)
## 6         MX001 Abarrotes 2020-06-19    8                      FLASH
##                   Fabricante                           Producto Precio
## 1                    MEXILAC                Nutri Leche 1 Litro   16.0
## 2           DANONE DE MEXICO DANUP STRAWBERRY P/BEBER 350GR NAL   14.0
## 3                GRUPO BIMBO                Rebanadas Bimbo 2Pz    5.0
## 4        PEPSI-COLA MEXICANA                   Pepsi N.R. 400Ml    8.0
## 5 FABRICA DE JABON LA CORONA      Detergente Blanca Nieves 500G   19.5
## 6                       ALEN      Flash Xtra Brisa Marina 500Ml    9.5
##   Ult.Costo Unidades F.Ticket NombreDepartamento          NombreFamilia
## 1     12.31        1        1          Abarrotes Lacteos y Refrigerados
## 2     14.00        1        2          Abarrotes Lacteos y Refrigerados
## 3      5.00        1        3          Abarrotes         Pan y Tortilla
## 4      8.00        1        3          Abarrotes                Bebidas
## 5     15.00        1        4          Abarrotes     Limpieza del Hogar
## 6      7.31        1        4          Abarrotes     Limpieza del Hogar
##             NombreCategoria     Estado Mts.2 Tipo.ubicación      Giro
## 1                     Leche Nuevo León    60        Esquina Abarrotes
## 2                    Yogurt Nuevo León    60        Esquina Abarrotes
## 3     Pan Dulce Empaquetado Nuevo León    60        Esquina Abarrotes
## 4 Refrescos Plástico (N.R.) Nuevo León    60        Esquina Abarrotes
## 5                Lavandería Nuevo León    60        Esquina Abarrotes
## 6      Limpiadores Líquidos Nuevo León    60        Esquina Abarrotes
##   Hora.inicio Hora.cierre Dia_de_la_Semana Subtotal Utilidad
## 1    08:00:00    22:00:00                6     16.0     3.69
## 2    08:00:00    22:00:00                6     14.0     0.00
## 3    08:00:00    22:00:00                6      5.0     0.00
## 4    08:00:00    22:00:00                6      8.0     0.00
## 5    08:00:00    22:00:00                6     19.5     4.50
## 6    08:00:00    22:00:00                6      9.5     2.19
tail(abarrotes_limpia)
##        vcClaveTienda   DescGiro      Fecha Hora          Marca
## 107394         MX004 Carnicería 2020-10-15   11         YEMINA
## 167771         MX004 Carnicería 2020-10-15   11     DEL FUERTE
## 149429         MX004 Carnicería 2020-10-15   11 COCA COLA ZERO
## 168750         MX004 Carnicería 2020-10-15   11       DIAMANTE
## 161193         MX004 Carnicería 2020-10-15   12          PEPSI
## 112970         MX004 Carnicería 2020-10-15   12      COCA COLA
##                  Fabricante                       Producto Precio Ult.Costo
## 107394               HERDEZ    PASTA SPAGHETTI YEMINA 200G      7      5.38
## 167771 ALIMENTOS DEL FUERTE PURE DE TOMATE DEL FUERTE 345G     12      9.23
## 149429            COCA COLA           COCA COLA ZERO 600ML     15     11.54
## 168750            EMPACADOS             ARROZ DIAMANTE225G     11      8.46
## 161193  PEPSI-COLA MEXICANA              PEPSI N. R. 500ML     10      7.69
## 112970            COCA COLA     COCA COLA RETORNABLE 500ML     10      7.69
##        Unidades F.Ticket NombreDepartamento        NombreFamilia
## 107394        2   450032          Abarrotes       Sopas y Pastas
## 167771        1   450032          Abarrotes Salsas y Sazonadores
## 149429        2   450034          Abarrotes              Bebidas
## 168750        1   450037          Abarrotes    Granos y Semillas
## 161193        1   450039          Abarrotes              Bebidas
## 112970        8   450040          Abarrotes              Bebidas
##                      NombreCategoria  Estado Mts.2 Tipo.ubicación      Giro
## 107394 Fideos, Spaguetti, Tallarines Sinaloa    53        Esquina Abarrotes
## 167771          Salsa para Spaguetti Sinaloa    53        Esquina Abarrotes
## 149429         Refrescos Retornables Sinaloa    53        Esquina Abarrotes
## 168750                         Arroz Sinaloa    53        Esquina Abarrotes
## 161193     Refrescos Plástico (N.R.) Sinaloa    53        Esquina Abarrotes
## 112970         Refrescos Retornables Sinaloa    53        Esquina Abarrotes
##        Hora.inicio Hora.cierre Dia_de_la_Semana Subtotal Utilidad
## 107394    07:00:00    23:00:00                5       14     1.62
## 167771    07:00:00    23:00:00                5       12     2.77
## 149429    07:00:00    23:00:00                5       30     3.46
## 168750    07:00:00    23:00:00                5       11     2.54
## 161193    07:00:00    23:00:00                5       10     2.31
## 112970    07:00:00    23:00:00                5       80     2.31

Generar basket

library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
## 
##     compact
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
basket <- ddply(abarrotes_limpia, c("F.Ticket"), function(abarrotes_limpia)paste(abarrotes_limpia$Marca, collapse = ","))

Eliminar numero de ticket

basket$F.Ticket <- NULL

Renombrar el nombre de columna

colnames(basket) <- c("Marca")

Exportar basket

write.csv(basket, "basket.csv", quote = FALSE, row.names = FALSE)

Importar transacciones

#file.choose()
tr <-  read.transactions("basket.csv", format = c("basket", "single"), sep = "",
                  cols = NULL, rm.duplicates = FALSE, 
                  quote = "\"'", skip = 0, 
                  encoding = "unknown")
## Warning in asMethod(object): removing duplicated items in transactions
reglas.asociacion <- apriori(tr, parameter = list(supp= 0.001, conf=0.02, maxlen= 10))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.02    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 115 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[27332 item(s), 115111 transaction(s)] done [0.16s].
## sorting and recoding items ... [191 item(s)] done [0.00s].
## creating transaction tree ... done [0.04s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [172 rule(s)] done [0.00s].
## creating S4 object  ... done [0.01s].
summary(reglas.asociacion)
## set of 172 rules
## 
## rule length distribution (lhs + rhs):sizes
##   1   2   3   4 
##   6 126  36   4 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   2.000   2.221   2.000   4.000 
## 
## summary of quality measures:
##     support           confidence         coverage             lift         
##  Min.   :0.001008   Min.   :0.02059   Min.   :0.001008   Min.   :  0.3269  
##  1st Qu.:0.001446   1st Qu.:0.37681   1st Qu.:0.002128   1st Qu.: 14.2036  
##  Median :0.002042   Median :0.61845   Median :0.004148   Median : 86.2174  
##  Mean   :0.005772   Mean   :0.58457   Mean   :0.047977   Mean   :143.7319  
##  3rd Qu.:0.004020   3rd Qu.:0.86837   3rd Qu.:0.010312   3rd Qu.:184.2080  
##  Max.   :0.119172   Max.   :1.00000   Max.   :1.000000   Max.   :828.1367  
##      count        
##  Min.   :  116.0  
##  1st Qu.:  166.5  
##  Median :  235.0  
##  Mean   :  664.4  
##  3rd Qu.:  462.8  
##  Max.   :13718.0  
## 
## mining info:
##  data ntransactions support confidence
##    tr        115111   0.001       0.02
##                                                                          call
##  apriori(data = tr, parameter = list(supp = 0.001, conf = 0.02, maxlen = 10))
inspect(reglas.asociacion)
##       lhs                       rhs            support     confidence
## [1]   {}                     => {TECATE}       0.043670892 0.04367089
## [2]   {}                     => {DEL}          0.023777050 0.02377705
## [3]   {}                     => {PEPSI}        0.060333070 0.06033307
## [4]   {}                     => {LA}           0.035452737 0.03545274
## [5]   {}                     => {COCA}         0.113003970 0.11300397
## [6]   {}                     => {COLA}         0.119171930 0.11917193
## [7]   {ABSOR}                => {SEC}          0.001138032 0.90972222
## [8]   {SEC}                  => {ABSOR}        0.001138032 0.68947368
## [9]   {COLA,JOYA}            => {COCA}         0.001033785 0.72121212
## [10]  {%}                    => {100}          0.001007723 1.00000000
## [11]  {100}                  => {%}            0.001007723 0.83453237
## [12]  {ZONA}                 => {SABOR}        0.001077221 0.96875000
## [13]  {SABOR}                => {ZONA}         0.001077221 0.53218884
## [14]  {ZONA}                 => {DEL}          0.001111970 1.00000000
## [15]  {DEL}                  => {ZONA}         0.001111970 0.04676653
## [16]  {SABORES}              => {PEÑAFIEL}     0.001033785 0.65384615
## [17]  {PEÑAFIEL}             => {SABORES}      0.001033785 0.46484375
## [18]  {DOS}                  => {EQUIS}        0.001181468 1.00000000
## [19]  {EQUIS}                => {DOS}          0.001181468 0.46101695
## [20]  {ROSA}                 => {TIA}          0.001059847 0.63874346
## [21]  {TIA}                  => {ROSA}         0.001059847 0.42214533
## [22]  {MARIA}                => {DOÑA}         0.001450774 0.81862745
## [23]  {DOÑA}                 => {MARIA}        0.001450774 0.41959799
## [24]  {FIOR}                 => {DI}           0.001085908 1.00000000
## [25]  {DI}                   => {FIOR}         0.001085908 0.56818182
## [26]  {DART}                 => {(PLAST}       0.001181468 1.00000000
## [27]  {(PLAST}               => {DART}         0.001181468 0.71957672
## [28]  {COLA,PEPSI}           => {COCA}         0.001250966 0.66666667
## [29]  {MODELO)}              => {(CERV.}       0.001372588 1.00000000
## [30]  {(CERV.}               => {MODELO)}      0.001372588 0.80612245
## [31]  {CREMAX}               => {DE}           0.001372588 1.00000000
## [32]  {DE}                   => {CREMAX}       0.001372588 0.13310868
## [33]  {CHICO,COCA}           => {TOPO}         0.001320465 0.91017964
## [34]  {TOPO}                 => {CHICO,COCA}   0.001320465 0.10764873
## [35]  {FORTILECHE}           => {LECHE}        0.001598457 0.69696970
## [36]  {LECHE}                => {FORTILECHE}   0.001598457 0.09913793
## [37]  {BIMBO,COCA}           => {COLA}         0.001633206 0.71482890
## [38]  {SIERRA}               => {LA}           0.001268341 0.54275093
## [39]  {LA}                   => {SIERRA}       0.001268341 0.03577555
## [40]  {LIGHT,COCA}           => {COLA}         0.001416025 0.98787879
## [41]  {SEVEN}                => {UP}           0.001450774 0.60507246
## [42]  {UP}                   => {SEVEN}        0.001450774 0.41750000
## [43]  {FINA}                 => {LA}           0.002223940 0.83116883
## [44]  {LA}                   => {FINA}         0.002223940 0.06272972
## [45]  {CARTA}                => {BLANCA}       0.002223940 0.80757098
## [46]  {BLANCA}               => {CARTA}        0.002223940 0.25728643
## [47]  {FRUTO)}               => {BARRILITOS}   0.001494210 0.76785714
## [48]  {BARRILITOS}           => {FRUTO)}       0.001494210 0.60139860
## [49]  {FRUTO)}               => {(DEL}         0.001945948 1.00000000
## [50]  {(DEL}                 => {FRUTO)}       0.001945948 0.60704607
## [51]  {BLUE}                 => {HOUSE}        0.001433399 0.55369128
## [52]  {HOUSE}                => {BLUE}         0.001433399 0.47687861
## [53]  {PEPSI,COCA}           => {COLA}         0.002024133 0.76143791
## [54]  {LAGER}                => {EQUIS}        0.002058882 1.00000000
## [55]  {EQUIS}                => {LAGER}        0.002058882 0.80338983
## [56]  {BENSON}               => {HEDGES}       0.001476835 0.69672131
## [57]  {HEDGES}               => {BENSON}       0.001476835 0.67460317
## [58]  {BENSON}               => {&}            0.002119693 1.00000000
## [59]  {&}                    => {BENSON}       0.002119693 0.60847880
## [60]  {SABOR}                => {DEL}          0.002024133 1.00000000
## [61]  {DEL}                  => {SABOR}        0.002024133 0.08512970
## [62]  {HEDGES}               => {&}            0.002189191 1.00000000
## [63]  {&}                    => {HEDGES}       0.002189191 0.62842893
## [64]  {MANZANITA}            => {SOL}          0.002362937 0.67830424
## [65]  {SOL}                  => {MANZANITA}    0.002362937 0.40840841
## [66]  {BARRILITOS}           => {(DEL}         0.002484558 1.00000000
## [67]  {(DEL}                 => {BARRILITOS}   0.002484558 0.77506775
## [68]  {(DETERGENTE)}         => {BLANCA}       0.001945948 0.68292683
## [69]  {BLANCA}               => {(DETERGENTE)} 0.001945948 0.22512563
## [70]  {(DETERGENTE)}         => {NIEVES}       0.002849424 1.00000000
## [71]  {NIEVES}               => {(DETERGENTE)} 0.002849424 0.39375750
## [72]  {ARDILLA,LA}           => {ARDILLA}      0.002319500 0.87828947
## [73]  {ARDILLA}              => {ARDILLA,LA}   0.002319500 0.17717319
## [74]  {ARDILLA,LA}           => {LA}           0.001320465 0.50000000
## [75]  {LA}                   => {ARDILLA,LA}   0.001320465 0.03724577
## [76]  {COLA,TECATE}          => {COCA}         0.003605216 0.76851852
## [77]  {COCA}                 => {COLA,TECATE}  0.003605216 0.03190344
## [78]  {MEXICO}               => {VELADORA}     0.004552128 0.95795247
## [79]  {VELADORA}             => {MEXICO}       0.004552128 0.46412755
## [80]  {EL}                   => {ORO}          0.001859075 0.65045593
## [81]  {ORO}                  => {EL}           0.001859075 0.38214286
## [82]  {EL}                   => {GALLO}        0.002128380 0.74468085
## [83]  {GALLO}                => {EL}           0.002128380 0.36082474
## [84]  {EL}                   => {DE}           0.002128380 0.74468085
## [85]  {DE}                   => {EL}           0.002128380 0.20640270
## [86]  {NUESTRA}              => {LA}           0.002310813 0.50474383
## [87]  {LA}                   => {NUESTRA}      0.002310813 0.06518010
## [88]  {CHICO}                => {TOPO}         0.004751935 0.84283513
## [89]  {TOPO}                 => {CHICO}        0.004751935 0.38739377
## [90]  {ZERO}                 => {COCA}         0.001824326 0.51980198
## [91]  {ZERO}                 => {COLA}         0.003509656 1.00000000
## [92]  {COLA}                 => {ZERO}         0.003509656 0.02945036
## [93]  {(GAMESA)}             => {SALADITAS}    0.004143826 0.68338109
## [94]  {SALADITAS}            => {(GAMESA)}     0.004143826 0.34970674
## [95]  {PALL}                 => {MALL}         0.005759658 0.86666667
## [96]  {MALL}                 => {PALL}         0.005759658 0.63688761
## [97]  {MONTE}                => {DEL}          0.003978768 0.52342857
## [98]  {DEL}                  => {MONTE}        0.003978768 0.16733650
## [99]  {FRUT}                 => {VALLE}        0.004526066 0.51380671
## [100] {VALLE}                => {FRUT}         0.004526066 0.31826512
## [101] {ORO}                  => {GALLO}        0.004864870 1.00000000
## [102] {GALLO}                => {ORO}          0.004864870 0.82474227
## [103] {ORO}                  => {DE}           0.004864870 1.00000000
## [104] {DE}                   => {ORO}          0.004864870 0.47177759
## [105] {COLA,COCA}            => {COCA}         0.004725873 0.74215553
## [106] {COCA}                 => {COLA,COCA}    0.004725873 0.04182042
## [107] {COLA,COCA}            => {COLA}         0.005151549 0.80900409
## [108] {COLA}                 => {COLA,COCA}    0.005151549 0.04322788
## [109] {BLANCA}               => {NIEVES}       0.004343634 0.50251256
## [110] {NIEVES}               => {BLANCA}       0.004343634 0.60024010
## [111] {FUERTE}               => {DEL}          0.004890931 0.47510549
## [112] {DEL}                  => {FUERTE}       0.004890931 0.20569967
## [113] {COSTEÑA}              => {LA}           0.005811782 0.56743003
## [114] {LA}                   => {COSTEÑA}      0.005811782 0.16393041
## [115] {GALLO}                => {DE}           0.005898654 1.00000000
## [116] {DE}                   => {GALLO}        0.005898654 0.57203033
## [117] {LIGHT}                => {PEPSI}        0.001242279 0.13414634
## [118] {PEPSI}                => {LIGHT}        0.001242279 0.02059035
## [119] {LIGHT}                => {COCA}         0.003952707 0.42682927
## [120] {COCA}                 => {LIGHT}        0.003952707 0.03497847
## [121] {LIGHT}                => {COLA}         0.006385141 0.68949343
## [122] {COLA}                 => {LIGHT}        0.006385141 0.05357924
## [123] {TOPO}                 => {COLA}         0.001303090 0.10623229
## [124] {VALLE}                => {DEL}          0.002936296 0.20647526
## [125] {DEL}                  => {VALLE}        0.002936296 0.12349288
## [126] {ARDILLA}              => {LA}           0.007210432 0.55076311
## [127] {LA}                   => {ARDILLA}      0.007210432 0.20338152
## [128] {LECHE}                => {NUTRI}        0.009521245 0.59051724
## [129] {NUTRI}                => {LECHE}        0.009521245 0.53203883
## [130] {LA}                   => {COLA}         0.001381275 0.03896104
## [131] {COCA}                 => {COLA}         0.080574402 0.71302276
## [132] {COLA}                 => {COCA}         0.080574402 0.67611897
## [133] {SABOR, ZONA}          => {DEL}          0.001077221 1.00000000
## [134] {DEL, ZONA}            => {SABOR}        0.001077221 0.96875000
## [135] {DEL, SABOR}           => {ZONA}         0.001077221 0.53218884
## [136] {BARRILITOS, FRUTO)}   => {(DEL}         0.001494210 1.00000000
## [137] {(DEL, FRUTO)}         => {BARRILITOS}   0.001494210 0.76785714
## [138] {(DEL, BARRILITOS}     => {FRUTO)}       0.001494210 0.60139860
## [139] {BENSON, HEDGES}       => {&}            0.001476835 1.00000000
## [140] {&, BENSON}            => {HEDGES}       0.001476835 0.69672131
## [141] {&, HEDGES}            => {BENSON}       0.001476835 0.67460317
## [142] {(DETERGENTE), BLANCA} => {NIEVES}       0.001945948 1.00000000
## [143] {(DETERGENTE), NIEVES} => {BLANCA}       0.001945948 0.68292683
## [144] {BLANCA, NIEVES}       => {(DETERGENTE)} 0.001945948 0.44800000
## [145] {ARDILLA, ARDILLA,LA}  => {LA}           0.001216217 0.52434457
## [146] {ARDILLA,LA, LA}       => {ARDILLA}      0.001216217 0.92105263
## [147] {ARDILLA, LA}          => {ARDILLA,LA}   0.001216217 0.16867470
## [148] {EL, ORO}              => {GALLO}        0.001859075 1.00000000
## [149] {EL, GALLO}            => {ORO}          0.001859075 0.87346939
## [150] {GALLO, ORO}           => {EL}           0.001859075 0.38214286
## [151] {EL, ORO}              => {DE}           0.001859075 1.00000000
## [152] {DE, EL}               => {ORO}          0.001859075 0.87346939
## [153] {DE, ORO}              => {EL}           0.001859075 0.38214286
## [154] {EL, GALLO}            => {DE}           0.002128380 1.00000000
## [155] {DE, EL}               => {GALLO}        0.002128380 1.00000000
## [156] {DE, GALLO}            => {EL}           0.002128380 0.36082474
## [157] {COCA, ZERO}           => {COLA}         0.001824326 1.00000000
## [158] {COLA, ZERO}           => {COCA}         0.001824326 0.51980198
## [159] {COCA, COLA}           => {ZERO}         0.001824326 0.02264151
## [160] {GALLO, ORO}           => {DE}           0.004864870 1.00000000
## [161] {DE, ORO}              => {GALLO}        0.004864870 1.00000000
## [162] {DE, GALLO}            => {ORO}          0.004864870 0.82474227
## [163] {COCA, COLA,COCA}      => {COLA}         0.003900583 0.82536765
## [164] {COLA, COLA,COCA}      => {COCA}         0.003900583 0.75716695
## [165] {COCA, COLA}           => {COLA,COCA}    0.003900583 0.04840970
## [166] {COCA, LIGHT}          => {COLA}         0.003848459 0.97362637
## [167] {COLA, LIGHT}          => {COCA}         0.003848459 0.60272109
## [168] {COCA, COLA}           => {LIGHT}        0.003848459 0.04776280
## [169] {EL, GALLO, ORO}       => {DE}           0.001859075 1.00000000
## [170] {DE, EL, ORO}          => {GALLO}        0.001859075 1.00000000
## [171] {DE, EL, GALLO}        => {ORO}          0.001859075 0.87346939
## [172] {DE, GALLO, ORO}       => {EL}           0.001859075 0.38214286
##       coverage    lift        count
## [1]   1.000000000   1.0000000  5027
## [2]   1.000000000   1.0000000  2737
## [3]   1.000000000   1.0000000  6945
## [4]   1.000000000   1.0000000  4081
## [5]   1.000000000   1.0000000 13008
## [6]   1.000000000   1.0000000 13718
## [7]   0.001250966 551.1528143   131
## [8]   0.001650581 551.1528143   131
## [9]   0.001433399   6.3821839   119
## [10]  0.001007723 828.1366906   116
## [11]  0.001207530 828.1366906   116
## [12]  0.001111970 478.5999195   124
## [13]  0.002024133 478.5999195   124
## [14]  0.001111970  42.0573621   128
## [15]  0.023777050  42.0573621   128
## [16]  0.001581083 294.0034555   119
## [17]  0.002223940 294.0034555   119
## [18]  0.001181468 390.2067797   136
## [19]  0.002562744 390.2067797   136
## [20]  0.001659268 254.4166018   122
## [21]  0.002510620 254.4166018   122
## [22]  0.001772202 236.7663932   167
## [23]  0.003457532 236.7663932   167
## [24]  0.001085908 523.2318182   125
## [25]  0.001911199 523.2318182   125
## [26]  0.001181468 609.0529101   136
## [27]  0.001641893 609.0529101   136
## [28]  0.001876450   5.8994977   144
## [29]  0.001372588 587.3010204   158
## [30]  0.001702704 587.3010204   158
## [31]  0.001372588  96.9764111   158
## [32]  0.010311786  96.9764111   158
## [33]  0.001450774  74.2009126   152
## [34]  0.012266421  74.2009126   152
## [35]  0.002293439  43.2267666   184
## [36]  0.016123568  43.2267666   184
## [37]  0.002284751   5.9982993   188
## [38]  0.002336875  15.3091405   146
## [39]  0.035452737  15.3091405   146
## [40]  0.001433399   8.2895258   163
## [41]  0.002397686 174.1262409   167
## [42]  0.003474907 174.1262409   167
## [43]  0.002675678  23.4444193   256
## [44]  0.035452737  23.4444193   256
## [45]  0.002753864  93.4274400   256
## [46]  0.008643831  93.4274400   256
## [47]  0.001945948 309.0517607   172
## [48]  0.002484558 309.0517607   172
## [49]  0.001945948 311.9539295   224
## [50]  0.003205602 311.9539295   224
## [51]  0.002588806 184.2079664   165
## [52]  0.003005794 184.2079664   165
## [53]  0.002658304   6.3894066   233
## [54]  0.002058882 390.2067797   237
## [55]  0.002562744 390.2067797   237
## [56]  0.002119693 318.2551067   170
## [57]  0.002189191 318.2551067   170
## [58]  0.002119693 287.0598504   244
## [59]  0.003483594 287.0598504   244
## [60]  0.002024133  42.0573621   233
## [61]  0.023777050  42.0573621   233
## [62]  0.002189191 287.0598504   252
## [63]  0.003483594 287.0598504   252
## [64]  0.003483594 117.2376566   272
## [65]  0.005785720 117.2376566   272
## [66]  0.002484558 311.9539295   286
## [67]  0.003205602 311.9539295   286
## [68]  0.002849424  79.0074274   224
## [69]  0.008643831  79.0074274   224
## [70]  0.002849424 138.1884754   328
## [71]  0.007236493 138.1884754   328
## [72]  0.002640929  67.0874450   267
## [73]  0.013091711  67.0874450   267
## [74]  0.002640929  14.1032835   152
## [75]  0.035452737  14.1032835   152
## [76]  0.004691124   6.8008099   415
## [77]  0.113003970   6.8008099   415
## [78]  0.004751935  97.6712724   524
## [79]  0.009807925  97.6712724   524
## [80]  0.002858111 133.7047004   214
## [81]  0.004864870 133.7047004   214
## [82]  0.002858111 126.2458873   245
## [83]  0.005898654 126.2458873   245
## [84]  0.002858111  72.2164764   245
## [85]  0.010311786  72.2164764   245
## [86]  0.004578190  14.2370908   266
## [87]  0.035452737  14.2370908   266
## [88]  0.005638036  68.7107612   547
## [89]  0.012266421  68.7107612   547
## [90]  0.003509656   4.5998559   210
## [91]  0.003509656   8.3912378   404
## [92]  0.119171930   8.3912378   404
## [93]  0.006063712  57.6720532   477
## [94]  0.011849432  57.6720532   477
## [95]  0.006645759  95.8336856   663
## [96]  0.009043445  95.8336856   663
## [97]  0.007601359  22.0140249   458
## [98]  0.023777050  22.0140249   458
## [99]  0.008808889  36.1299962   521
## [100] 0.014221056  36.1299962   521
## [101] 0.004864870 169.5301915   560
## [102] 0.005898654 169.5301915   560
## [103] 0.004864870  96.9764111   560
## [104] 0.010311786  96.9764111   560
## [105] 0.006367767   6.5675173   544
## [106] 0.113003970   6.5675173   544
## [107] 0.006367767   6.7885457   593
## [108] 0.119171930   6.7885457   593
## [109] 0.008643831  69.4414449   500
## [110] 0.007236493  69.4414449   500
## [111] 0.010294411  19.9816834   563
## [112] 0.023777050  19.9816834   563
## [113] 0.010242288  16.0052530   669
## [114] 0.035452737  16.0052530   669
## [115] 0.005898654  96.9764111   679
## [116] 0.010311786  96.9764111   679
## [117] 0.009260627   2.2234297   143
## [118] 0.060333070   2.2234297   143
## [119] 0.009260627   3.7771175   455
## [120] 0.113003970   3.7771175   455
## [121] 0.009260627   5.7857034   735
## [122] 0.119171930   5.7857034   735
## [123] 0.012266421   0.8914204   150
## [124] 0.014221056   8.6838048   338
## [125] 0.023777050   8.6838048   338
## [126] 0.013091711  15.5351364   830
## [127] 0.035452737  15.5351364   830
## [128] 0.016123568  32.9975875  1096
## [129] 0.017895770  32.9975875  1096
## [130] 0.035452737   0.3269313   159
## [131] 0.113003970   5.9831435  9275
## [132] 0.119171930   5.9831435  9275
## [133] 0.001077221  42.0573621   124
## [134] 0.001111970 478.5999195   124
## [135] 0.002024133 478.5999195   124
## [136] 0.001494210 311.9539295   172
## [137] 0.001945948 309.0517607   172
## [138] 0.002484558 309.0517607   172
## [139] 0.001476835 287.0598504   170
## [140] 0.002119693 318.2551067   170
## [141] 0.002189191 318.2551067   170
## [142] 0.001945948 138.1884754   224
## [143] 0.002849424  79.0074274   224
## [144] 0.004343634 157.2247805   224
## [145] 0.002319500  14.7899602   140
## [146] 0.001320465  70.3538749   140
## [147] 0.007210432  63.8694515   140
## [148] 0.001859075 169.5301915   214
## [149] 0.002128380 179.5463120   214
## [150] 0.004864870 133.7047004   214
## [151] 0.001859075  96.9764111   214
## [152] 0.002128380 179.5463120   214
## [153] 0.004864870 133.7047004   214
## [154] 0.002128380  96.9764111   245
## [155] 0.002128380 169.5301915   245
## [156] 0.005898654 126.2458873   245
## [157] 0.001824326   8.3912378   210
## [158] 0.003509656   4.5998559   210
## [159] 0.080574402   6.4512049   210
## [160] 0.004864870  96.9764111   560
## [161] 0.004864870 169.5301915   560
## [162] 0.005898654 169.5301915   560
## [163] 0.004725873   6.9258562   449
## [164] 0.005151549   6.7003571   449
## [165] 0.080574402   7.6023047   449
## [166] 0.003952707   8.1699304   443
## [167] 0.006385141   5.3336276   443
## [168] 0.080574402   5.1576211   443
## [169] 0.001859075  96.9764111   214
## [170] 0.001859075 169.5301915   214
## [171] 0.002128380 179.5463120   214
## [172] 0.004864870 133.7047004   214
top10reglas <- head(reglas.asociacion, n = 10, by = "confidence")
plot(top10reglas, method = "graph", engine = "htmlwidget")

Conclusiones

En esta actividad, observamos 6 tecnicas diferentes para hacer una limpieza de datos. Como eliminar datos innecesarios, convertir variables, borrar repetidos, etc. También, se programo un basket, el cual nos permite conocer que variables provocan o influyen en otras variables de nuestra base de datos. En este caso, se filtro una base de datos con información sobre tiendas de abarrotes y se consiguio una grafica que nos muestra que productos provocan la compra de otros productos de la tienda.