Actividad 2.1: Maneja una base de datos - “Abarrotes”

Importar la base de datos

library(readxl)
bd <- read_excel("C:/Users/Asus ZenBook/Downloads/Abarrotes_Ventas-2.xlsx")
summary(bd)
##  vcClaveTienda        DescGiro         Codigo Barras         PLU         
##  Length:200620      Length:200620      Min.   :8.347e+05   Mode:logical  
##  Class :character   Class :character   1st Qu.:7.501e+12   TRUE:1437     
##  Mode  :character   Mode  :character   Median :7.501e+12   NA's:199183   
##                                        Mean   :5.950e+12                 
##                                        3rd Qu.:7.501e+12                 
##                                        Max.   :1.750e+13                 
##      Fecha                             Hora                       
##  Min.   :2020-05-01 00:00:31.08   Min.   :1899-12-31 00:00:00.00  
##  1st Qu.:2020-06-06 13:24:49.08   1st Qu.:1899-12-31 13:12:42.75  
##  Median :2020-07-11 14:10:21.46   Median :1899-12-31 17:35:59.00  
##  Mean   :2020-07-19 15:19:40.65   Mean   :1899-12-31 16:43:52.05  
##  3rd Qu.:2020-08-29 22:07:47.33   3rd Qu.:1899-12-31 20:47:06.00  
##  Max.   :2020-11-11 23:53:47.73   Max.   :1899-12-31 23:59:59.00  
##     Marca            Fabricante          Producto             Precio       
##  Length:200620      Length:200620      Length:200620      Min.   :-147.00  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.42  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200620     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967   Class :character  
##  Median : 12.31   Median : 1.000   Median :105996   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193994                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383009                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts 2     
##  Length:200620      Length:200620      Length:200620      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo ubicación         Giro            Hora inicio                    
##  Length:200620      Length:200620      Min.   :1899-12-31 07:00:00.00  
##  Class :character   Class :character   1st Qu.:1899-12-31 07:00:00.00  
##  Mode  :character   Mode  :character   Median :1899-12-31 08:00:00.00  
##                                        Mean   :1899-12-31 07:35:49.71  
##                                        3rd Qu.:1899-12-31 08:00:00.00  
##                                        Max.   :1899-12-31 09:00:00.00  
##   Hora cierre                    
##  Min.   :1899-12-31 21:00:00.00  
##  1st Qu.:1899-12-31 22:00:00.00  
##  Median :1899-12-31 22:00:00.00  
##  Mean   :1899-12-31 22:23:11.42  
##  3rd Qu.:1899-12-31 23:00:00.00  
##  Max.   :1899-12-31 23:00:00.00

Observaciones

1. PLU tiene 199188 NA’s

2. La variable Fecha está como caracter.

3. Las variables Hora, Hora.inicio y Hora.cierre están como caracter.

4. La variable Precio tiene negativos.

5. La variable Unidades tiene decimales.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
count(bd, vcClaveTienda, sort=TRUE)
## # A tibble: 5 × 2
##   vcClaveTienda     n
##   <chr>         <int>
## 1 MX001         96464
## 2 MX004         83455
## 3 MX005         10021
## 4 MX002          6629
## 5 MX003          4051
count(bd, DescGiro, sort=TRUE)
## # A tibble: 3 × 2
##   DescGiro        n
##   <chr>       <int>
## 1 Abarrotes  100515
## 2 Carnicería  83455
## 3 Depósito    16650
count(bd, Marca, sort=TRUE)
## # A tibble: 540 × 2
##    Marca           n
##    <chr>       <int>
##  1 COCA COLA   18686
##  2 PEPSI       15966
##  3 TECATE      11674
##  4 BIMBO        8316
##  5 LALA         5866
##  6 MARINELA     3696
##  7 DORITOS      3142
##  8 CHEETOS      3130
##  9 NUTRI LECHE  3127
## 10 MARLBORO     2579
## # ℹ 530 more rows
count(bd, Fabricante, sort=TRUE)
## # A tibble: 241 × 2
##    Fabricante                          n
##    <chr>                           <int>
##  1 COCA COLA                       27519
##  2 PEPSI-COLA MEXICANA             22415
##  3 SABRITAS                        14296
##  4 CERVECERIA CUAUHTEMOC MOCTEZUMA 13681
##  5 GRUPO BIMBO                     13077
##  6 SIGMA ALIMENTOS                  8014
##  7 GRUPO INDUSTRIAL LALA            5868
##  8 GRUPO GAMESA                     5527
##  9 NESTLE                           3698
## 10 JUGOS DEL VALLE S.A. DE C.V.     3581
## # ℹ 231 more rows
count(bd, Producto, sort=TRUE)
## # A tibble: 3,404 × 2
##    Producto                        n
##    <chr>                       <int>
##  1 Pepsi N.R. 1.5L              5108
##  2 Coca Cola Retornable 2.5L    3771
##  3 Caguamon Tecate Light 1.2Lt  3471
##  4 Pepsi N. R. 2.5L             2899
##  5 Cerveza Tecate Light 340Ml   2619
##  6 Cerveza Tecate Light 16Oz    2315
##  7 Coca Cola Retornable 1.5L    2124
##  8 Pepsi N.R. 3L                1832
##  9 Coca Cola Retornable 500Ml   1659
## 10 PEPSI N.R. 1.5L              1631
## # ℹ 3,394 more rows
count(bd, NombreDepartamento, sort=TRUE)
## # A tibble: 9 × 2
##   NombreDepartamento        n
##   <chr>                 <int>
## 1 Abarrotes            198274
## 2 Bebes e Infantiles     1483
## 3 Ferretería              377
## 4 Farmacia                255
## 5 Vinos y Licores         104
## 6 Papelería                74
## 7 Mercería                 44
## 8 Productos a Eliminar      8
## 9 Carnes                    1
count(bd, NombreFamilia, sort=TRUE)
## # A tibble: 51 × 2
##    NombreFamilia              n
##    <chr>                  <int>
##  1 Bebidas                64917
##  2 Botanas                21583
##  3 Lacteos y Refrigerados 17657
##  4 Cerveza                14017
##  5 Pan y Tortilla         10501
##  6 Limpieza del Hogar      8723
##  7 Galletas                7487
##  8 Cigarros                6817
##  9 Cuidado Personal        5433
## 10 Salsas y Sazonadores    5320
## # ℹ 41 more rows
count(bd, NombreCategoria, sort=TRUE)
## # A tibble: 174 × 2
##    NombreCategoria               n
##    <chr>                     <int>
##  1 Refrescos Plástico (N.R.) 32861
##  2 Refrescos Retornables     13880
##  3 Frituras                  11082
##  4 Lata                       8150
##  5 Leche                      7053
##  6 Cajetilla                  6329
##  7 Botella                    5867
##  8 Productos sin Categoria    5455
##  9 Papas Fritas               5344
## 10 Jugos y Néctares           5295
## # ℹ 164 more rows
count(bd, Estado, sort=TRUE)
## # A tibble: 5 × 2
##   Estado           n
##   <chr>        <int>
## 1 Nuevo León   96464
## 2 Sinaloa      83455
## 3 Quintana Roo 10021
## 4 Jalisco       6629
## 5 Chiapas       4051
count(bd, Giro, sort=TRUE)
## # A tibble: 2 × 2
##   Giro            n
##   <chr>       <int>
## 1 Abarrotes  183970
## 2 Mini súper  16650
count(bd, "Tipo ubicación", sort=TRUE)
## # A tibble: 1 × 2
##   `"Tipo ubicación"`      n
##   <chr>               <int>
## 1 Tipo ubicación     200620
count(bd, "Hora inicio", sort=TRUE)
## # A tibble: 1 × 2
##   `"Hora inicio"`      n
##   <chr>            <int>
## 1 Hora inicio     200620
count(bd, "Hora cierre", sort=TRUE)
## # A tibble: 1 × 2
##   `"Hora cierre"`      n
##   <chr>            <int>
## 1 Hora cierre     200620
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.3     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
tibble(bd)
## # A tibble: 200,620 × 22
##    vcClaveTienda DescGiro  `Codigo Barras` PLU   Fecha              
##    <chr>         <chr>               <dbl> <lgl> <dttm>             
##  1 MX001         Abarrotes   7501020540666 NA    2020-06-19 08:16:20
##  2 MX001         Abarrotes   7501032397906 NA    2020-06-19 08:23:32
##  3 MX001         Abarrotes   7501000112845 NA    2020-06-19 08:24:33
##  4 MX001         Abarrotes   7501031302741 NA    2020-06-19 08:24:33
##  5 MX001         Abarrotes   7501026027543 NA    2020-06-19 08:26:28
##  6 MX001         Abarrotes   7501025433024 NA    2020-06-19 08:26:28
##  7 MX001         Abarrotes   7501032332013 NA    2020-06-19 08:26:28
##  8 MX001         Abarrotes   7501026005688 NA    2020-06-19 08:26:28
##  9 MX001         Abarrotes   7506195178188 NA    2020-06-19 08:26:28
## 10 MX001         Abarrotes     32239052017 NA    2020-06-19 15:24:02
## # ℹ 200,610 more rows
## # ℹ 17 more variables: Hora <dttm>, Marca <chr>, Fabricante <chr>,
## #   Producto <chr>, Precio <dbl>, Ult.Costo <dbl>, Unidades <dbl>,
## #   F.Ticket <dbl>, NombreDepartamento <chr>, NombreFamilia <chr>,
## #   NombreCategoria <chr>, Estado <chr>, `Mts 2` <dbl>, `Tipo ubicación` <chr>,
## #   Giro <chr>, `Hora inicio` <dttm>, `Hora cierre` <dttm>
str(bd)
## tibble [200,620 × 22] (S3: tbl_df/tbl/data.frame)
##  $ vcClaveTienda     : chr [1:200620] "MX001" "MX001" "MX001" "MX001" ...
##  $ DescGiro          : chr [1:200620] "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ Codigo Barras     : num [1:200620] 7.5e+12 7.5e+12 7.5e+12 7.5e+12 7.5e+12 ...
##  $ PLU               : logi [1:200620] NA NA NA NA NA NA ...
##  $ Fecha             : POSIXct[1:200620], format: "2020-06-19 08:16:20" "2020-06-19 08:23:32" ...
##  $ Hora              : POSIXct[1:200620], format: "1899-12-31 08:16:21" "1899-12-31 08:23:33" ...
##  $ Marca             : chr [1:200620] "NUTRI LECHE" "DAN UP" "BIMBO" "PEPSI" ...
##  $ Fabricante        : chr [1:200620] "MEXILAC" "DANONE DE MEXICO" "GRUPO BIMBO" "PEPSI-COLA MEXICANA" ...
##  $ Producto          : chr [1:200620] "Nutri Leche 1 Litro" "DANUP STRAWBERRY P/BEBER 350GR NAL" "Rebanadas Bimbo 2Pz" "Pepsi N.R. 400Ml" ...
##  $ Precio            : num [1:200620] 16 14 5 8 19.5 9.5 11 9.5 23.5 12 ...
##  $ Ult.Costo         : num [1:200620] 12.3 14 5 8 15 ...
##  $ Unidades          : num [1:200620] 1 1 1 1 1 1 1 1 1 1 ...
##  $ F.Ticket          : num [1:200620] 1 2 3 3 4 4 4 4 4 5 ...
##  $ NombreDepartamento: chr [1:200620] "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ NombreFamilia     : chr [1:200620] "Lacteos y Refrigerados" "Lacteos y Refrigerados" "Pan y Tortilla" "Bebidas" ...
##  $ NombreCategoria   : chr [1:200620] "Leche" "Yogurt" "Pan Dulce Empaquetado" "Refrescos Plástico (N.R.)" ...
##  $ Estado            : chr [1:200620] "Nuevo León" "Nuevo León" "Nuevo León" "Nuevo León" ...
##  $ Mts 2             : num [1:200620] 60 60 60 60 60 60 60 60 60 60 ...
##  $ Tipo ubicación    : chr [1:200620] "Esquina" "Esquina" "Esquina" "Esquina" ...
##  $ Giro              : chr [1:200620] "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ Hora inicio       : POSIXct[1:200620], format: "1899-12-31 08:00:00" "1899-12-31 08:00:00" ...
##  $ Hora cierre       : POSIXct[1:200620], format: "1899-12-31 22:00:00" "1899-12-31 22:00:00" ...
head(bd, n=7)
## # A tibble: 7 × 22
##   vcClaveTienda DescGiro  `Codigo Barras` PLU   Fecha              
##   <chr>         <chr>               <dbl> <lgl> <dttm>             
## 1 MX001         Abarrotes   7501020540666 NA    2020-06-19 08:16:20
## 2 MX001         Abarrotes   7501032397906 NA    2020-06-19 08:23:32
## 3 MX001         Abarrotes   7501000112845 NA    2020-06-19 08:24:33
## 4 MX001         Abarrotes   7501031302741 NA    2020-06-19 08:24:33
## 5 MX001         Abarrotes   7501026027543 NA    2020-06-19 08:26:28
## 6 MX001         Abarrotes   7501025433024 NA    2020-06-19 08:26:28
## 7 MX001         Abarrotes   7501032332013 NA    2020-06-19 08:26:28
## # ℹ 17 more variables: Hora <dttm>, Marca <chr>, Fabricante <chr>,
## #   Producto <chr>, Precio <dbl>, Ult.Costo <dbl>, Unidades <dbl>,
## #   F.Ticket <dbl>, NombreDepartamento <chr>, NombreFamilia <chr>,
## #   NombreCategoria <chr>, Estado <chr>, `Mts 2` <dbl>, `Tipo ubicación` <chr>,
## #   Giro <chr>, `Hora inicio` <dttm>, `Hora cierre` <dttm>
tail(bd)
## # A tibble: 6 × 22
##   vcClaveTienda DescGiro `Codigo Barras` PLU   Fecha              
##   <chr>         <chr>              <dbl> <lgl> <dttm>             
## 1 MX005         Depósito   7622210464811 NA    2020-07-12 01:08:24
## 2 MX005         Depósito   7622210464811 NA    2020-10-23 22:17:37
## 3 MX005         Depósito   7622210464811 NA    2020-10-10 20:30:20
## 4 MX005         Depósito   7622210464811 NA    2020-10-10 22:40:42
## 5 MX005         Depósito   7622210464811 NA    2020-06-27 22:30:19
## 6 MX005         Depósito   7622210464811 NA    2020-06-26 23:43:33
## # ℹ 17 more variables: Hora <dttm>, Marca <chr>, Fabricante <chr>,
## #   Producto <chr>, Precio <dbl>, Ult.Costo <dbl>, Unidades <dbl>,
## #   F.Ticket <dbl>, NombreDepartamento <chr>, NombreFamilia <chr>,
## #   NombreCategoria <chr>, Estado <chr>, `Mts 2` <dbl>, `Tipo ubicación` <chr>,
## #   Giro <chr>, `Hora inicio` <dttm>, `Hora cierre` <dttm>
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
tabyl(bd, vcClaveTienda, NombreDepartamento)
##  vcClaveTienda Abarrotes Bebes e Infantiles Carnes Farmacia Ferretería Mercería
##          MX001     95410                515      1      147        245       28
##          MX002      6590                 21      0        4         10        0
##          MX003      4026                 15      0        2          8        0
##          MX004     82234                932      0      102        114       16
##          MX005     10014                  0      0        0          0        0
##  Papelería Productos a Eliminar Vinos y Licores
##         35                    3              80
##          0                    0               4
##          0                    0               0
##         32                    5              20
##          7                    0               0

Técnicas para limpieza de datos

Tecnica 1. Remover valores irrelevantes

# Eliminar columnas
bd1 <- bd
bd1 <- subset(bd1, select = -c(PLU))
# Eliminar renglones
bd2 <- bd1
bd2 <- bd2[bd2$Precio>0,]
summary(bd1$Precio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -147.00   11.00   16.00   19.42   25.00 1000.00
summary(bd2$Precio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.50   11.00   16.00   19.45   25.00 1000.00

Técnica 2. Remover valores duplicados

# ¿Cuántos registros duplicados tenemos?
bd2[duplicated(bd2),]
## # A tibble: 0 × 21
## # ℹ 21 variables: vcClaveTienda <chr>, DescGiro <chr>, Codigo Barras <dbl>,
## #   Fecha <dttm>, Hora <dttm>, Marca <chr>, Fabricante <chr>, Producto <chr>,
## #   Precio <dbl>, Ult.Costo <dbl>, Unidades <dbl>, F.Ticket <dbl>,
## #   NombreDepartamento <chr>, NombreFamilia <chr>, NombreCategoria <chr>,
## #   Estado <chr>, Mts 2 <dbl>, Tipo ubicación <chr>, Giro <chr>,
## #   Hora inicio <dttm>, Hora cierre <dttm>
sum(duplicated(bd1))
## [1] 0
# Eliminar registros duplicados
bd3 <- bd2
library(dplyr)
bd3 <- distinct(bd3)

Técnica 3. Errores tipográficos y errores similares

# Precios en absoluto
bd4 <- bd1
bd4$Precio <- abs(bd4$Precio)
summary(bd4$Precio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.50   11.00   16.00   19.45   25.00 1000.00
# Cantidades en enteros 
bd5 <- bd4
bd5$Unidades <- ceiling(bd5$Unidades)
summary(bd5$Unidades)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.262   1.000  96.000
summary(bd$Unidades)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.200   1.000   1.000   1.262   1.000  96.000

Técnica 4. Convertir tipos de datos

# Convertir de caracter a fecha
bd6 <- bd5
bd6$Fecha <- as.Date(bd6$Fecha, "%d/%m/%Y")
## Warning in as.POSIXlt.POSIXct(x, tz = tz): unknown timezone '%d/%m/%Y'
tibble(bd6)
## # A tibble: 200,620 × 21
##    vcClaveTienda DescGiro  `Codigo Barras` Fecha      Hora                Marca 
##    <chr>         <chr>               <dbl> <date>     <dttm>              <chr> 
##  1 MX001         Abarrotes   7501020540666 2020-06-19 1899-12-31 08:16:21 NUTRI…
##  2 MX001         Abarrotes   7501032397906 2020-06-19 1899-12-31 08:23:33 DAN UP
##  3 MX001         Abarrotes   7501000112845 2020-06-19 1899-12-31 08:24:33 BIMBO 
##  4 MX001         Abarrotes   7501031302741 2020-06-19 1899-12-31 08:24:33 PEPSI 
##  5 MX001         Abarrotes   7501026027543 2020-06-19 1899-12-31 08:26:28 BLANC…
##  6 MX001         Abarrotes   7501025433024 2020-06-19 1899-12-31 08:26:28 FLASH 
##  7 MX001         Abarrotes   7501032332013 2020-06-19 1899-12-31 08:26:28 VARIO…
##  8 MX001         Abarrotes   7501026005688 2020-06-19 1899-12-31 08:26:28 ZOTE  
##  9 MX001         Abarrotes   7506195178188 2020-06-19 1899-12-31 08:26:28 ALWAYS
## 10 MX001         Abarrotes     32239052017 2020-06-19 1899-12-31 15:24:02 JUMEX 
## # ℹ 200,610 more rows
## # ℹ 15 more variables: Fabricante <chr>, Producto <chr>, Precio <dbl>,
## #   Ult.Costo <dbl>, Unidades <dbl>, F.Ticket <dbl>, NombreDepartamento <chr>,
## #   NombreFamilia <chr>, NombreCategoria <chr>, Estado <chr>, `Mts 2` <dbl>,
## #   `Tipo ubicación` <chr>, Giro <chr>, `Hora inicio` <dttm>,
## #   `Hora cierre` <dttm>
# Convertir de caracter a entero
bd7 <- bd6
bd7$Hora <- substr(bd7$Hora, start =1, stop =2)
tibble(bd7)
## # A tibble: 200,620 × 21
##    vcClaveTienda DescGiro  `Codigo Barras` Fecha      Hora  Marca     Fabricante
##    <chr>         <chr>               <dbl> <date>     <chr> <chr>     <chr>     
##  1 MX001         Abarrotes   7501020540666 2020-06-19 18    NUTRI LE… MEXILAC   
##  2 MX001         Abarrotes   7501032397906 2020-06-19 18    DAN UP    DANONE DE…
##  3 MX001         Abarrotes   7501000112845 2020-06-19 18    BIMBO     GRUPO BIM…
##  4 MX001         Abarrotes   7501031302741 2020-06-19 18    PEPSI     PEPSI-COL…
##  5 MX001         Abarrotes   7501026027543 2020-06-19 18    BLANCA N… FABRICA D…
##  6 MX001         Abarrotes   7501025433024 2020-06-19 18    FLASH     ALEN      
##  7 MX001         Abarrotes   7501032332013 2020-06-19 18    VARIOS D… DANONE DE…
##  8 MX001         Abarrotes   7501026005688 2020-06-19 18    ZOTE      FABRICA D…
##  9 MX001         Abarrotes   7506195178188 2020-06-19 18    ALWAYS    PROCTER &…
## 10 MX001         Abarrotes     32239052017 2020-06-19 18    JUMEX     JUMEX     
## # ℹ 200,610 more rows
## # ℹ 14 more variables: Producto <chr>, Precio <dbl>, Ult.Costo <dbl>,
## #   Unidades <dbl>, F.Ticket <dbl>, NombreDepartamento <chr>,
## #   NombreFamilia <chr>, NombreCategoria <chr>, Estado <chr>, `Mts 2` <dbl>,
## #   `Tipo ubicación` <chr>, Giro <chr>, `Hora inicio` <dttm>,
## #   `Hora cierre` <dttm>
bd7$Hora <- as.integer(bd7$Hora)
str(bd7)
## tibble [200,620 × 21] (S3: tbl_df/tbl/data.frame)
##  $ vcClaveTienda     : chr [1:200620] "MX001" "MX001" "MX001" "MX001" ...
##  $ DescGiro          : chr [1:200620] "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ Codigo Barras     : num [1:200620] 7.5e+12 7.5e+12 7.5e+12 7.5e+12 7.5e+12 ...
##  $ Fecha             : Date[1:200620], format: "2020-06-19" "2020-06-19" ...
##  $ Hora              : int [1:200620] 18 18 18 18 18 18 18 18 18 18 ...
##  $ Marca             : chr [1:200620] "NUTRI LECHE" "DAN UP" "BIMBO" "PEPSI" ...
##  $ Fabricante        : chr [1:200620] "MEXILAC" "DANONE DE MEXICO" "GRUPO BIMBO" "PEPSI-COLA MEXICANA" ...
##  $ Producto          : chr [1:200620] "Nutri Leche 1 Litro" "DANUP STRAWBERRY P/BEBER 350GR NAL" "Rebanadas Bimbo 2Pz" "Pepsi N.R. 400Ml" ...
##  $ Precio            : num [1:200620] 16 14 5 8 19.5 9.5 11 9.5 23.5 12 ...
##  $ Ult.Costo         : num [1:200620] 12.3 14 5 8 15 ...
##  $ Unidades          : num [1:200620] 1 1 1 1 1 1 1 1 1 1 ...
##  $ F.Ticket          : num [1:200620] 1 2 3 3 4 4 4 4 4 5 ...
##  $ NombreDepartamento: chr [1:200620] "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ NombreFamilia     : chr [1:200620] "Lacteos y Refrigerados" "Lacteos y Refrigerados" "Pan y Tortilla" "Bebidas" ...
##  $ NombreCategoria   : chr [1:200620] "Leche" "Yogurt" "Pan Dulce Empaquetado" "Refrescos Plástico (N.R.)" ...
##  $ Estado            : chr [1:200620] "Nuevo León" "Nuevo León" "Nuevo León" "Nuevo León" ...
##  $ Mts 2             : num [1:200620] 60 60 60 60 60 60 60 60 60 60 ...
##  $ Tipo ubicación    : chr [1:200620] "Esquina" "Esquina" "Esquina" "Esquina" ...
##  $ Giro              : chr [1:200620] "Abarrotes" "Abarrotes" "Abarrotes" "Abarrotes" ...
##  $ Hora inicio       : POSIXct[1:200620], format: "1899-12-31 08:00:00" "1899-12-31 08:00:00" ...
##  $ Hora cierre       : POSIXct[1:200620], format: "1899-12-31 22:00:00" "1899-12-31 22:00:00" ...

Técnica 5. Valores faltantes

# ¿Cuántos NA tengo en la base de datos?
sum(is.na(bd7))
## [1] 0
sum(is.na(bd))
## [1] 199183
# ¿Cuántos NA tengo por variable?
sapply(bd7,function(x)sum(is.na(x)))
##      vcClaveTienda           DescGiro      Codigo Barras              Fecha 
##                  0                  0                  0                  0 
##               Hora              Marca         Fabricante           Producto 
##                  0                  0                  0                  0 
##             Precio          Ult.Costo           Unidades           F.Ticket 
##                  0                  0                  0                  0 
## NombreDepartamento      NombreFamilia    NombreCategoria             Estado 
##                  0                  0                  0                  0 
##              Mts 2     Tipo ubicación               Giro        Hora inicio 
##                  0                  0                  0                  0 
##        Hora cierre 
##                  0
# Borrar todos los registros NA de una tabla
bd8 <- bd
bd8 <- na.omit(bd8)
summary(bd8)
##  vcClaveTienda        DescGiro         Codigo Barras         PLU         
##  Length:1437        Length:1437        Min.   :6.750e+08   Mode:logical  
##  Class :character   Class :character   1st Qu.:6.750e+08   TRUE:1437     
##  Mode  :character   Mode  :character   Median :6.750e+08                 
##                                        Mean   :2.616e+11                 
##                                        3rd Qu.:6.750e+08                 
##                                        Max.   :7.501e+12                 
##      Fecha                             Hora                       
##  Min.   :2020-06-06 14:36:14.11   Min.   :1899-12-31 00:01:22.00  
##  1st Qu.:2020-06-20 21:48:46.75   1st Qu.:1899-12-31 15:57:22.00  
##  Median :2020-07-10 22:17:46.90   Median :1899-12-31 18:49:20.00  
##  Mean   :2020-07-16 11:50:19.00   Mean   :1899-12-31 17:46:04.46  
##  3rd Qu.:2020-08-08 21:42:17.13   3rd Qu.:1899-12-31 21:09:03.00  
##  Max.   :2020-11-11 20:37:03.69   Max.   :1899-12-31 23:58:14.00  
##     Marca            Fabricante          Producto             Precio     
##  Length:1437        Length:1437        Length:1437        Min.   :30.00  
##  Class :character   Class :character   Class :character   1st Qu.:90.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :90.00  
##                                                           Mean   :87.94  
##                                                           3rd Qu.:90.00  
##                                                           Max.   :90.00  
##    Ult.Costo        Unidades        F.Ticket      NombreDepartamento
##  Min.   : 1.00   Min.   :1.000   Min.   :   772   Length:1437       
##  1st Qu.:64.62   1st Qu.:1.000   1st Qu.: 99955   Class :character  
##  Median :64.62   Median :1.000   Median :102493   Mode  :character  
##  Mean   :56.65   Mean   :1.124   Mean   :100595                     
##  3rd Qu.:64.62   3rd Qu.:1.000   3rd Qu.:106546                     
##  Max.   :64.62   Max.   :7.000   Max.   :118356                     
##  NombreFamilia      NombreCategoria       Estado              Mts 2      
##  Length:1437        Length:1437        Length:1437        Min.   :58.00  
##  Class :character   Class :character   Class :character   1st Qu.:58.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :58.00  
##                                                           Mean   :58.07  
##                                                           3rd Qu.:58.00  
##                                                           Max.   :60.00  
##  Tipo ubicación         Giro            Hora inicio                 
##  Length:1437        Length:1437        Min.   :1899-12-31 08:00:00  
##  Class :character   Class :character   1st Qu.:1899-12-31 08:00:00  
##  Mode  :character   Mode  :character   Median :1899-12-31 08:00:00  
##                                        Mean   :1899-12-31 08:00:00  
##                                        3rd Qu.:1899-12-31 08:00:00  
##                                        Max.   :1899-12-31 08:00:00  
##   Hora cierre                    
##  Min.   :1899-12-31 21:00:00.00  
##  1st Qu.:1899-12-31 21:00:00.00  
##  Median :1899-12-31 21:00:00.00  
##  Mean   :1899-12-31 21:02:06.26  
##  3rd Qu.:1899-12-31 21:00:00.00  
##  Max.   :1899-12-31 22:00:00.00
# Reemplazar NA con ceros
bd9 <- bd
bd9[is.na(bd9)] <- 0
summary(bd9)
##  vcClaveTienda        DescGiro         Codigo Barras          PLU         
##  Length:200620      Length:200620      Min.   :8.347e+05   Mode :logical  
##  Class :character   Class :character   1st Qu.:7.501e+12   FALSE:199183   
##  Mode  :character   Mode  :character   Median :7.501e+12   TRUE :1437     
##                                        Mean   :5.950e+12                  
##                                        3rd Qu.:7.501e+12                  
##                                        Max.   :1.750e+13                  
##      Fecha                             Hora                       
##  Min.   :2020-05-01 00:00:31.08   Min.   :1899-12-31 00:00:00.00  
##  1st Qu.:2020-06-06 13:24:49.08   1st Qu.:1899-12-31 13:12:42.75  
##  Median :2020-07-11 14:10:21.46   Median :1899-12-31 17:35:59.00  
##  Mean   :2020-07-19 15:19:40.65   Mean   :1899-12-31 16:43:52.05  
##  3rd Qu.:2020-08-29 22:07:47.33   3rd Qu.:1899-12-31 20:47:06.00  
##  Max.   :2020-11-11 23:53:47.73   Max.   :1899-12-31 23:59:59.00  
##     Marca            Fabricante          Producto             Precio       
##  Length:200620      Length:200620      Length:200620      Min.   :-147.00  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.42  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200620     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967   Class :character  
##  Median : 12.31   Median : 1.000   Median :105996   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193994                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383009                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts 2     
##  Length:200620      Length:200620      Length:200620      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo ubicación         Giro            Hora inicio                    
##  Length:200620      Length:200620      Min.   :1899-12-31 07:00:00.00  
##  Class :character   Class :character   1st Qu.:1899-12-31 07:00:00.00  
##  Mode  :character   Mode  :character   Median :1899-12-31 08:00:00.00  
##                                        Mean   :1899-12-31 07:35:49.71  
##                                        3rd Qu.:1899-12-31 08:00:00.00  
##                                        Max.   :1899-12-31 09:00:00.00  
##   Hora cierre                    
##  Min.   :1899-12-31 21:00:00.00  
##  1st Qu.:1899-12-31 22:00:00.00  
##  Median :1899-12-31 22:00:00.00  
##  Mean   :1899-12-31 22:23:11.42  
##  3rd Qu.:1899-12-31 23:00:00.00  
##  Max.   :1899-12-31 23:00:00.00
# Reemplazar los NA con el promedio
bd10 <- bd
bd10$PLU[is.na(bd10$PLU)] <- mean(bd10$PLU, na.rm =TRUE)
summary(bd10)
##  vcClaveTienda        DescGiro         Codigo Barras            PLU   
##  Length:200620      Length:200620      Min.   :8.347e+05   Min.   :1  
##  Class :character   Class :character   1st Qu.:7.501e+12   1st Qu.:1  
##  Mode  :character   Mode  :character   Median :7.501e+12   Median :1  
##                                        Mean   :5.950e+12   Mean   :1  
##                                        3rd Qu.:7.501e+12   3rd Qu.:1  
##                                        Max.   :1.750e+13   Max.   :1  
##      Fecha                             Hora                       
##  Min.   :2020-05-01 00:00:31.08   Min.   :1899-12-31 00:00:00.00  
##  1st Qu.:2020-06-06 13:24:49.08   1st Qu.:1899-12-31 13:12:42.75  
##  Median :2020-07-11 14:10:21.46   Median :1899-12-31 17:35:59.00  
##  Mean   :2020-07-19 15:19:40.65   Mean   :1899-12-31 16:43:52.05  
##  3rd Qu.:2020-08-29 22:07:47.33   3rd Qu.:1899-12-31 20:47:06.00  
##  Max.   :2020-11-11 23:53:47.73   Max.   :1899-12-31 23:59:59.00  
##     Marca            Fabricante          Producto             Precio       
##  Length:200620      Length:200620      Length:200620      Min.   :-147.00  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.42  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200620     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967   Class :character  
##  Median : 12.31   Median : 1.000   Median :105996   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193994                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383009                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts 2     
##  Length:200620      Length:200620      Length:200620      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo ubicación         Giro            Hora inicio                    
##  Length:200620      Length:200620      Min.   :1899-12-31 07:00:00.00  
##  Class :character   Class :character   1st Qu.:1899-12-31 07:00:00.00  
##  Mode  :character   Mode  :character   Median :1899-12-31 08:00:00.00  
##                                        Mean   :1899-12-31 07:35:49.71  
##                                        3rd Qu.:1899-12-31 08:00:00.00  
##                                        Max.   :1899-12-31 09:00:00.00  
##   Hora cierre                    
##  Min.   :1899-12-31 21:00:00.00  
##  1st Qu.:1899-12-31 22:00:00.00  
##  Median :1899-12-31 22:00:00.00  
##  Mean   :1899-12-31 22:23:11.42  
##  3rd Qu.:1899-12-31 23:00:00.00  
##  Max.   :1899-12-31 23:00:00.00

Técnica 6. Método estadístico

# Gráfica de caja y bigotes
bd12 <- bd7
boxplot(bd12$Precio, horizontal = TRUE)

boxplot(bd12$Unidades, horizontal = TRUE)

# Agregar columnas
library(lubridate)
bd12$Dia_de_la_semana <- wday(bd12$Fecha)
summary(bd12)
##  vcClaveTienda        DescGiro         Codigo Barras           Fecha           
##  Length:200620      Length:200620      Min.   :8.347e+05   Min.   :2020-05-01  
##  Class :character   Class :character   1st Qu.:7.501e+12   1st Qu.:2020-06-06  
##  Mode  :character   Mode  :character   Median :7.501e+12   Median :2020-07-11  
##                                        Mean   :5.950e+12   Mean   :2020-07-18  
##                                        3rd Qu.:7.501e+12   3rd Qu.:2020-08-29  
##                                        Max.   :1.750e+13   Max.   :2020-11-11  
##       Hora       Marca            Fabricante          Producto        
##  Min.   :18   Length:200620      Length:200620      Length:200620     
##  1st Qu.:18   Class :character   Class :character   Class :character  
##  Median :18   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :18                                                           
##  3rd Qu.:18                                                           
##  Max.   :18                                                           
##      Precio          Ult.Costo         Unidades         F.Ticket     
##  Min.   :   0.50   Min.   :  0.38   Min.   : 1.000   Min.   :     1  
##  1st Qu.:  11.00   1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967  
##  Median :  16.00   Median : 12.31   Median : 1.000   Median :105996  
##  Mean   :  19.45   Mean   : 15.31   Mean   : 1.262   Mean   :193994  
##  3rd Qu.:  25.00   3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383009  
##  Max.   :1000.00   Max.   :769.23   Max.   :96.000   Max.   :450040  
##  NombreDepartamento NombreFamilia      NombreCategoria       Estado         
##  Length:200620      Length:200620      Length:200620      Length:200620     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      Mts 2      Tipo ubicación         Giro          
##  Min.   :47.0   Length:200620      Length:200620     
##  1st Qu.:53.0   Class :character   Class :character  
##  Median :60.0   Mode  :character   Mode  :character  
##  Mean   :56.6                                        
##  3rd Qu.:60.0                                        
##  Max.   :62.0                                        
##   Hora inicio                      Hora cierre                    
##  Min.   :1899-12-31 07:00:00.00   Min.   :1899-12-31 21:00:00.00  
##  1st Qu.:1899-12-31 07:00:00.00   1st Qu.:1899-12-31 22:00:00.00  
##  Median :1899-12-31 08:00:00.00   Median :1899-12-31 22:00:00.00  
##  Mean   :1899-12-31 07:35:49.71   Mean   :1899-12-31 22:23:11.42  
##  3rd Qu.:1899-12-31 08:00:00.00   3rd Qu.:1899-12-31 23:00:00.00  
##  Max.   :1899-12-31 09:00:00.00   Max.   :1899-12-31 23:00:00.00  
##  Dia_de_la_semana
##  Min.   :1.000   
##  1st Qu.:2.000   
##  Median :4.000   
##  Mean   :3.912   
##  3rd Qu.:6.000   
##  Max.   :7.000
bd12$Subtotal <- bd12$Precio * bd12$Unidades
summary(bd12)
##  vcClaveTienda        DescGiro         Codigo Barras           Fecha           
##  Length:200620      Length:200620      Min.   :8.347e+05   Min.   :2020-05-01  
##  Class :character   Class :character   1st Qu.:7.501e+12   1st Qu.:2020-06-06  
##  Mode  :character   Mode  :character   Median :7.501e+12   Median :2020-07-11  
##                                        Mean   :5.950e+12   Mean   :2020-07-18  
##                                        3rd Qu.:7.501e+12   3rd Qu.:2020-08-29  
##                                        Max.   :1.750e+13   Max.   :2020-11-11  
##       Hora       Marca            Fabricante          Producto        
##  Min.   :18   Length:200620      Length:200620      Length:200620     
##  1st Qu.:18   Class :character   Class :character   Class :character  
##  Median :18   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :18                                                           
##  3rd Qu.:18                                                           
##  Max.   :18                                                           
##      Precio          Ult.Costo         Unidades         F.Ticket     
##  Min.   :   0.50   Min.   :  0.38   Min.   : 1.000   Min.   :     1  
##  1st Qu.:  11.00   1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967  
##  Median :  16.00   Median : 12.31   Median : 1.000   Median :105996  
##  Mean   :  19.45   Mean   : 15.31   Mean   : 1.262   Mean   :193994  
##  3rd Qu.:  25.00   3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383009  
##  Max.   :1000.00   Max.   :769.23   Max.   :96.000   Max.   :450040  
##  NombreDepartamento NombreFamilia      NombreCategoria       Estado         
##  Length:200620      Length:200620      Length:200620      Length:200620     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      Mts 2      Tipo ubicación         Giro          
##  Min.   :47.0   Length:200620      Length:200620     
##  1st Qu.:53.0   Class :character   Class :character  
##  Median :60.0   Mode  :character   Mode  :character  
##  Mean   :56.6                                        
##  3rd Qu.:60.0                                        
##  Max.   :62.0                                        
##   Hora inicio                      Hora cierre                    
##  Min.   :1899-12-31 07:00:00.00   Min.   :1899-12-31 21:00:00.00  
##  1st Qu.:1899-12-31 07:00:00.00   1st Qu.:1899-12-31 22:00:00.00  
##  Median :1899-12-31 08:00:00.00   Median :1899-12-31 22:00:00.00  
##  Mean   :1899-12-31 07:35:49.71   Mean   :1899-12-31 22:23:11.42  
##  3rd Qu.:1899-12-31 08:00:00.00   3rd Qu.:1899-12-31 23:00:00.00  
##  Max.   :1899-12-31 09:00:00.00   Max.   :1899-12-31 23:00:00.00  
##  Dia_de_la_semana    Subtotal      
##  Min.   :1.000    Min.   :   1.00  
##  1st Qu.:2.000    1st Qu.:  12.00  
##  Median :4.000    Median :  18.00  
##  Mean   :3.912    Mean   :  24.33  
##  3rd Qu.:6.000    3rd Qu.:  27.00  
##  Max.   :7.000    Max.   :2496.00
bd12$Utilidad <- bd12$Precio - bd12$Ult.Costo
summary(bd12)
##  vcClaveTienda        DescGiro         Codigo Barras           Fecha           
##  Length:200620      Length:200620      Min.   :8.347e+05   Min.   :2020-05-01  
##  Class :character   Class :character   1st Qu.:7.501e+12   1st Qu.:2020-06-06  
##  Mode  :character   Mode  :character   Median :7.501e+12   Median :2020-07-11  
##                                        Mean   :5.950e+12   Mean   :2020-07-18  
##                                        3rd Qu.:7.501e+12   3rd Qu.:2020-08-29  
##                                        Max.   :1.750e+13   Max.   :2020-11-11  
##       Hora       Marca            Fabricante          Producto        
##  Min.   :18   Length:200620      Length:200620      Length:200620     
##  1st Qu.:18   Class :character   Class :character   Class :character  
##  Median :18   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :18                                                           
##  3rd Qu.:18                                                           
##  Max.   :18                                                           
##      Precio          Ult.Costo         Unidades         F.Ticket     
##  Min.   :   0.50   Min.   :  0.38   Min.   : 1.000   Min.   :     1  
##  1st Qu.:  11.00   1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967  
##  Median :  16.00   Median : 12.31   Median : 1.000   Median :105996  
##  Mean   :  19.45   Mean   : 15.31   Mean   : 1.262   Mean   :193994  
##  3rd Qu.:  25.00   3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383009  
##  Max.   :1000.00   Max.   :769.23   Max.   :96.000   Max.   :450040  
##  NombreDepartamento NombreFamilia      NombreCategoria       Estado         
##  Length:200620      Length:200620      Length:200620      Length:200620     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      Mts 2      Tipo ubicación         Giro          
##  Min.   :47.0   Length:200620      Length:200620     
##  1st Qu.:53.0   Class :character   Class :character  
##  Median :60.0   Mode  :character   Mode  :character  
##  Mean   :56.6                                        
##  3rd Qu.:60.0                                        
##  Max.   :62.0                                        
##   Hora inicio                      Hora cierre                    
##  Min.   :1899-12-31 07:00:00.00   Min.   :1899-12-31 21:00:00.00  
##  1st Qu.:1899-12-31 07:00:00.00   1st Qu.:1899-12-31 22:00:00.00  
##  Median :1899-12-31 08:00:00.00   Median :1899-12-31 22:00:00.00  
##  Mean   :1899-12-31 07:35:49.71   Mean   :1899-12-31 22:23:11.42  
##  3rd Qu.:1899-12-31 08:00:00.00   3rd Qu.:1899-12-31 23:00:00.00  
##  Max.   :1899-12-31 09:00:00.00   Max.   :1899-12-31 23:00:00.00  
##  Dia_de_la_semana    Subtotal          Utilidad      
##  Min.   :1.000    Min.   :   1.00   Min.   :  0.000  
##  1st Qu.:2.000    1st Qu.:  12.00   1st Qu.:  2.310  
##  Median :4.000    Median :  18.00   Median :  3.230  
##  Mean   :3.912    Mean   :  24.33   Mean   :  4.142  
##  3rd Qu.:6.000    3rd Qu.:  27.00   3rd Qu.:  5.420  
##  Max.   :7.000    Max.   :2496.00   Max.   :230.770
# Exportar la base de datos limpia
bd_limpia <- bd12
write.csv(bd_limpia, file="abarrotes_bd_limpia.csv", row.names = FALSE)
# Market basket analysis
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
## 
##     compact
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(arules)
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
library(datasets)
# Ordenar de menor a mayor los tickets
bd_limpia <- bd_limpia[order(bd_limpia$F.Ticket),]
head(bd_limpia)
## # A tibble: 6 × 24
##   vcClaveTienda DescGiro  `Codigo Barras` Fecha       Hora Marca      Fabricante
##   <chr>         <chr>               <dbl> <date>     <int> <chr>      <chr>     
## 1 MX001         Abarrotes   7501020540666 2020-06-19    18 NUTRI LEC… MEXILAC   
## 2 MX001         Abarrotes   7501032397906 2020-06-19    18 DAN UP     DANONE DE…
## 3 MX001         Abarrotes   7501000112845 2020-06-19    18 BIMBO      GRUPO BIM…
## 4 MX001         Abarrotes   7501031302741 2020-06-19    18 PEPSI      PEPSI-COL…
## 5 MX001         Abarrotes   7501026027543 2020-06-19    18 BLANCA NI… FABRICA D…
## 6 MX001         Abarrotes   7501025433024 2020-06-19    18 FLASH      ALEN      
## # ℹ 17 more variables: Producto <chr>, Precio <dbl>, Ult.Costo <dbl>,
## #   Unidades <dbl>, F.Ticket <dbl>, NombreDepartamento <chr>,
## #   NombreFamilia <chr>, NombreCategoria <chr>, Estado <chr>, `Mts 2` <dbl>,
## #   `Tipo ubicación` <chr>, Giro <chr>, `Hora inicio` <dttm>,
## #   `Hora cierre` <dttm>, Dia_de_la_semana <dbl>, Subtotal <dbl>,
## #   Utilidad <dbl>
tail(bd_limpia)
## # A tibble: 6 × 24
##   vcClaveTienda DescGiro   `Codigo Barras` Fecha       Hora Marca     Fabricante
##   <chr>         <chr>                <dbl> <date>     <int> <chr>     <chr>     
## 1 MX004         Carnicería     10248765241 2020-10-15    18 YEMINA    HERDEZ    
## 2 MX004         Carnicería   7501079702855 2020-10-15    18 DEL FUER… ALIMENTOS…
## 3 MX004         Carnicería   7501055320639 2020-10-15    18 COCA COL… COCA COLA 
## 4 MX004         Carnicería   7501214100256 2020-10-15    18 DIAMANTE  EMPACADOS 
## 5 MX004         Carnicería   7501031311620 2020-10-15    18 PEPSI     PEPSI-COL…
## 6 MX004         Carnicería        75004699 2020-10-15    18 COCA COLA COCA COLA 
## # ℹ 17 more variables: Producto <chr>, Precio <dbl>, Ult.Costo <dbl>,
## #   Unidades <dbl>, F.Ticket <dbl>, NombreDepartamento <chr>,
## #   NombreFamilia <chr>, NombreCategoria <chr>, Estado <chr>, `Mts 2` <dbl>,
## #   `Tipo ubicación` <chr>, Giro <chr>, `Hora inicio` <dttm>,
## #   `Hora cierre` <dttm>, Dia_de_la_semana <dbl>, Subtotal <dbl>,
## #   Utilidad <dbl>
# Generar basket
basket <- ddply(bd_limpia, c("F.Ticket"), function(bd_limpia)paste(bd_limpia$Marca, collapse = ","))
# Eliminar número de ticket
basket$F.Ticket <- NULL
# Renombrar el nombre de la columna
colnames(basket) <- c("Marca")
# Exportar basket
write.csv(basket, "basket.csv", quote = FALSE, row.names = FALSE)
library(arules)
library(arulesViz)
# Importar transacciones
tr <- read.transactions( "C:\\Users\\Asus ZenBook\\Documents\\basket.csv", format = "basket", sep = ",")
## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string

## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string
## Warning in asMethod(object): removing duplicated items in transactions
# Generar reglas de asociación
reglas.asociacion <- apriori(tr,parameter = list(supp=0.001, conf=0.2, maxlen=10))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.2    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 115 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[604 item(s), 115111 transaction(s)] done [0.02s].
## sorting and recoding items ... [207 item(s)] done [0.00s].
## creating transaction tree ... done [0.02s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [11 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
summary(reglas.asociacion)
## set of 11 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2 
## 11 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2 
## 
## summary of quality measures:
##     support           confidence        coverage             lift       
##  Min.   :0.001016   Min.   :0.2069   Min.   :0.003562   Min.   : 1.325  
##  1st Qu.:0.001103   1st Qu.:0.2356   1st Qu.:0.004504   1st Qu.: 1.787  
##  Median :0.001416   Median :0.2442   Median :0.005803   Median : 3.972  
##  Mean   :0.001519   Mean   :0.2536   Mean   :0.006054   Mean   :17.563  
##  3rd Qu.:0.001651   3rd Qu.:0.2685   3rd Qu.:0.006893   3rd Qu.:21.798  
##  Max.   :0.002745   Max.   :0.3098   Max.   :0.010503   Max.   :65.908  
##      count      
##  Min.   :117.0  
##  1st Qu.:127.0  
##  Median :163.0  
##  Mean   :174.9  
##  3rd Qu.:190.0  
##  Max.   :316.0  
## 
## mining info:
##  data ntransactions support confidence
##    tr        115111   0.001        0.2
##                                                                         call
##  apriori(data = tr, parameter = list(supp = 0.001, conf = 0.2, maxlen = 10))
inspect(reglas.asociacion)
##      lhs                  rhs         support     confidence coverage   
## [1]  {FANTA}           => {COCA COLA} 0.001051159 0.2439516  0.004308884
## [2]  {SALVO}           => {FABULOSO}  0.001103283 0.3097561  0.003561779
## [3]  {FABULOSO}        => {SALVO}     0.001103283 0.2347505  0.004699811
## [4]  {COCA COLA ZERO}  => {COCA COLA} 0.001416025 0.2969035  0.004769310
## [5]  {SPRITE}          => {COCA COLA} 0.001346526 0.2069426  0.006506763
## [6]  {PINOL}           => {CLORALEX}  0.001016410 0.2363636  0.004300197
## [7]  {BLUE HOUSE}      => {BIMBO}     0.001711392 0.2720994  0.006289581
## [8]  {HELLMANN´S}      => {BIMBO}     0.001537646 0.2649701  0.005803094
## [9]  {REYMA}           => {CONVERMEX} 0.002093631 0.2441743  0.008574333
## [10] {FUD}             => {BIMBO}     0.001589770 0.2183771  0.007279930
## [11] {COCA COLA LIGHT} => {COCA COLA} 0.002745176 0.2613730  0.010502906
##      lift      count
## [1]   1.561906 121  
## [2]  65.908196 127  
## [3]  65.908196 127  
## [4]   1.900932 163  
## [5]   1.324955 155  
## [6]  25.030409 117  
## [7]   4.078870 197  
## [8]   3.971997 177  
## [9]  18.564824 241  
## [10]  3.273552 183  
## [11]  1.673447 316
# Ordenar reglas de asociación
reglas.asociacion <- sort(reglas.asociacion, by = "confidence", decreasing = TRUE)
summary(reglas.asociacion)
## set of 11 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2 
## 11 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2 
## 
## summary of quality measures:
##     support           confidence        coverage             lift       
##  Min.   :0.001016   Min.   :0.2069   Min.   :0.003562   Min.   : 1.325  
##  1st Qu.:0.001103   1st Qu.:0.2356   1st Qu.:0.004504   1st Qu.: 1.787  
##  Median :0.001416   Median :0.2442   Median :0.005803   Median : 3.972  
##  Mean   :0.001519   Mean   :0.2536   Mean   :0.006054   Mean   :17.563  
##  3rd Qu.:0.001651   3rd Qu.:0.2685   3rd Qu.:0.006893   3rd Qu.:21.798  
##  Max.   :0.002745   Max.   :0.3098   Max.   :0.010503   Max.   :65.908  
##      count      
##  Min.   :117.0  
##  1st Qu.:127.0  
##  Median :163.0  
##  Mean   :174.9  
##  3rd Qu.:190.0  
##  Max.   :316.0  
## 
## mining info:
##  data ntransactions support confidence
##    tr        115111   0.001        0.2
##                                                                         call
##  apriori(data = tr, parameter = list(supp = 0.001, conf = 0.2, maxlen = 10))
inspect(reglas.asociacion)
##      lhs                  rhs         support     confidence coverage   
## [1]  {SALVO}           => {FABULOSO}  0.001103283 0.3097561  0.003561779
## [2]  {COCA COLA ZERO}  => {COCA COLA} 0.001416025 0.2969035  0.004769310
## [3]  {BLUE HOUSE}      => {BIMBO}     0.001711392 0.2720994  0.006289581
## [4]  {HELLMANN´S}      => {BIMBO}     0.001537646 0.2649701  0.005803094
## [5]  {COCA COLA LIGHT} => {COCA COLA} 0.002745176 0.2613730  0.010502906
## [6]  {REYMA}           => {CONVERMEX} 0.002093631 0.2441743  0.008574333
## [7]  {FANTA}           => {COCA COLA} 0.001051159 0.2439516  0.004308884
## [8]  {PINOL}           => {CLORALEX}  0.001016410 0.2363636  0.004300197
## [9]  {FABULOSO}        => {SALVO}     0.001103283 0.2347505  0.004699811
## [10] {FUD}             => {BIMBO}     0.001589770 0.2183771  0.007279930
## [11] {SPRITE}          => {COCA COLA} 0.001346526 0.2069426  0.006506763
##      lift      count
## [1]  65.908196 127  
## [2]   1.900932 163  
## [3]   4.078870 197  
## [4]   3.971997 177  
## [5]   1.673447 316  
## [6]  18.564824 241  
## [7]   1.561906 121  
## [8]  25.030409 117  
## [9]  65.908196 127  
## [10]  3.273552 183  
## [11]  1.324955 155
# Visualizar reglas de asociación
top10reglas <- head(reglas.asociacion, n=10, by= "confidence")
plot(top10reglas, method = "graph", engine = "htmlwidget")

Conclusión

Esta actividad es esencial para el reto, ya que trabajeremos con bases de datos extensas que pueden contener errores, valores nulos o datos que no utilizaremos, por lo tanto es importante aprender a limpiar y organizar los datos para facilitar el proceso de análisis y tener información relevante para el caso.

LS0tDQp0aXRsZTogIkFjdGl2aWRhZCAyLjEgRGFuaWVsYSBHYXJ6YSINCmF1dGhvcjogIkRhbmllbGEgR2FyemEiDQpkYXRlOiAiMjAyMy0wOC0yMSINCm91dHB1dDogDQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdG9jOiBUUlVFDQogICAgdG9jX2Zsb2F0OiBUUlVFDQogICAgY29kZV9kb3dubG9hZDogVFJVRQ0KICANCi0tLQ0KDQojIEFjdGl2aWRhZCAyLjE6IE1hbmVqYSB1bmEgYmFzZSBkZSBkYXRvcyAtICJBYmFycm90ZXMiDQoNCiMgSW1wb3J0YXIgbGEgYmFzZSBkZSBkYXRvcw0KYGBge3J9DQpsaWJyYXJ5KHJlYWR4bCkNCmJkIDwtIHJlYWRfZXhjZWwoIkM6L1VzZXJzL0FzdXMgWmVuQm9vay9Eb3dubG9hZHMvQWJhcnJvdGVzX1ZlbnRhcy0yLnhsc3giKQ0KYGBgDQoNCg0KYGBge3J9DQpzdW1tYXJ5KGJkKQ0KYGBgDQoNCg0KIyBPYnNlcnZhY2lvbmVzDQojIyMgMS4gUExVIHRpZW5lIDE5OTE4OCBOQSdzDQojIyMgMi4gTGEgdmFyaWFibGUgRmVjaGEgZXN0w6EgY29tbyBjYXJhY3Rlci4NCiMjIyAzLiBMYXMgdmFyaWFibGVzIEhvcmEsIEhvcmEuaW5pY2lvIHkgSG9yYS5jaWVycmUgZXN0w6FuIGNvbW8gY2FyYWN0ZXIuDQojIyMgNC4gTGEgdmFyaWFibGUgUHJlY2lvIHRpZW5lIG5lZ2F0aXZvcy4NCiMjIyA1LiBMYSB2YXJpYWJsZSBVbmlkYWRlcyB0aWVuZSBkZWNpbWFsZXMuDQoNCg0KYGBge3J9DQpsaWJyYXJ5KGRwbHlyKQ0KDQpjb3VudChiZCwgdmNDbGF2ZVRpZW5kYSwgc29ydD1UUlVFKQ0KY291bnQoYmQsIERlc2NHaXJvLCBzb3J0PVRSVUUpDQpjb3VudChiZCwgTWFyY2EsIHNvcnQ9VFJVRSkNCmNvdW50KGJkLCBGYWJyaWNhbnRlLCBzb3J0PVRSVUUpDQpjb3VudChiZCwgUHJvZHVjdG8sIHNvcnQ9VFJVRSkNCmNvdW50KGJkLCBOb21icmVEZXBhcnRhbWVudG8sIHNvcnQ9VFJVRSkNCmNvdW50KGJkLCBOb21icmVGYW1pbGlhLCBzb3J0PVRSVUUpDQpjb3VudChiZCwgTm9tYnJlQ2F0ZWdvcmlhLCBzb3J0PVRSVUUpDQpjb3VudChiZCwgRXN0YWRvLCBzb3J0PVRSVUUpDQpjb3VudChiZCwgR2lybywgc29ydD1UUlVFKQ0KY291bnQoYmQsICJUaXBvIHViaWNhY2nDs24iLCBzb3J0PVRSVUUpDQpjb3VudChiZCwgIkhvcmEgaW5pY2lvIiwgc29ydD1UUlVFKQ0KY291bnQoYmQsICJIb3JhIGNpZXJyZSIsIHNvcnQ9VFJVRSkNCg0KYGBgDQoNCmBgYHtyfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQoNCnRpYmJsZShiZCkNCg0Kc3RyKGJkKQ0KDQpoZWFkKGJkLCBuPTcpDQoNCnRhaWwoYmQpDQoNCmBgYA0KDQpgYGB7cn0NCmxpYnJhcnkoamFuaXRvcikNCg0KdGFieWwoYmQsIHZjQ2xhdmVUaWVuZGEsIE5vbWJyZURlcGFydGFtZW50bykNCg0KYGBgDQoNCiMgVMOpY25pY2FzIHBhcmEgbGltcGllemEgZGUgZGF0b3MNCg0KIyBUZWNuaWNhIDEuIFJlbW92ZXIgdmFsb3JlcyBpcnJlbGV2YW50ZXMgDQoNCmBgYHtyfQ0KIyBFbGltaW5hciBjb2x1bW5hcw0KYmQxIDwtIGJkDQpiZDEgPC0gc3Vic2V0KGJkMSwgc2VsZWN0ID0gLWMoUExVKSkNCmBgYA0KDQpgYGB7cn0NCiMgRWxpbWluYXIgcmVuZ2xvbmVzDQpiZDIgPC0gYmQxDQpiZDIgPC0gYmQyW2JkMiRQcmVjaW8+MCxdDQpzdW1tYXJ5KGJkMSRQcmVjaW8pDQpzdW1tYXJ5KGJkMiRQcmVjaW8pDQoNCmBgYA0KDQojIFTDqWNuaWNhIDIuIFJlbW92ZXIgdmFsb3JlcyBkdXBsaWNhZG9zDQoNCmBgYHtyfQ0KIyDCv0N1w6FudG9zIHJlZ2lzdHJvcyBkdXBsaWNhZG9zIHRlbmVtb3M/DQpiZDJbZHVwbGljYXRlZChiZDIpLF0NCnN1bShkdXBsaWNhdGVkKGJkMSkpDQpgYGANCg0KYGBge3J9DQojIEVsaW1pbmFyIHJlZ2lzdHJvcyBkdXBsaWNhZG9zDQpiZDMgPC0gYmQyDQpsaWJyYXJ5KGRwbHlyKQ0KYmQzIDwtIGRpc3RpbmN0KGJkMykNCmBgYA0KDQojIFTDqWNuaWNhIDMuIEVycm9yZXMgdGlwb2dyw6FmaWNvcyB5IGVycm9yZXMgc2ltaWxhcmVzDQpgYGB7cn0NCiMgUHJlY2lvcyBlbiBhYnNvbHV0bw0KYmQ0IDwtIGJkMQ0KYmQ0JFByZWNpbyA8LSBhYnMoYmQ0JFByZWNpbykNCnN1bW1hcnkoYmQ0JFByZWNpbykNCmBgYA0KDQpgYGB7cn0NCiMgQ2FudGlkYWRlcyBlbiBlbnRlcm9zIA0KYmQ1IDwtIGJkNA0KYmQ1JFVuaWRhZGVzIDwtIGNlaWxpbmcoYmQ1JFVuaWRhZGVzKQ0Kc3VtbWFyeShiZDUkVW5pZGFkZXMpDQpzdW1tYXJ5KGJkJFVuaWRhZGVzKQ0KYGBgDQojIFTDqWNuaWNhIDQuIENvbnZlcnRpciB0aXBvcyBkZSBkYXRvcw0KDQpgYGB7cn0NCiMgQ29udmVydGlyIGRlIGNhcmFjdGVyIGEgZmVjaGENCmJkNiA8LSBiZDUNCmJkNiRGZWNoYSA8LSBhcy5EYXRlKGJkNiRGZWNoYSwgIiVkLyVtLyVZIikNCnRpYmJsZShiZDYpDQpgYGANCg0KYGBge3J9DQojIENvbnZlcnRpciBkZSBjYXJhY3RlciBhIGVudGVybw0KYmQ3IDwtIGJkNg0KYmQ3JEhvcmEgPC0gc3Vic3RyKGJkNyRIb3JhLCBzdGFydCA9MSwgc3RvcCA9MikNCnRpYmJsZShiZDcpDQpiZDckSG9yYSA8LSBhcy5pbnRlZ2VyKGJkNyRIb3JhKQ0Kc3RyKGJkNykNCmBgYA0KDQojIFTDqWNuaWNhIDUuIFZhbG9yZXMgZmFsdGFudGVzDQoNCmBgYHtyfQ0KIyDCv0N1w6FudG9zIE5BIHRlbmdvIGVuIGxhIGJhc2UgZGUgZGF0b3M/DQpzdW0oaXMubmEoYmQ3KSkNCnN1bShpcy5uYShiZCkpDQpgYGANCg0KYGBge3J9DQojIMK/Q3XDoW50b3MgTkEgdGVuZ28gcG9yIHZhcmlhYmxlPw0Kc2FwcGx5KGJkNyxmdW5jdGlvbih4KXN1bShpcy5uYSh4KSkpDQpgYGANCg0KDQpgYGB7cn0NCiMgQm9ycmFyIHRvZG9zIGxvcyByZWdpc3Ryb3MgTkEgZGUgdW5hIHRhYmxhDQpiZDggPC0gYmQNCmJkOCA8LSBuYS5vbWl0KGJkOCkNCnN1bW1hcnkoYmQ4KQ0KYGBgDQoNCg0KYGBge3J9DQojIFJlZW1wbGF6YXIgTkEgY29uIGNlcm9zDQpiZDkgPC0gYmQNCmJkOVtpcy5uYShiZDkpXSA8LSAwDQpzdW1tYXJ5KGJkOSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyBSZWVtcGxhemFyIGxvcyBOQSBjb24gZWwgcHJvbWVkaW8NCmJkMTAgPC0gYmQNCmJkMTAkUExVW2lzLm5hKGJkMTAkUExVKV0gPC0gbWVhbihiZDEwJFBMVSwgbmEucm0gPVRSVUUpDQpzdW1tYXJ5KGJkMTApDQpgYGANCg0KIyBUw6ljbmljYSA2LiBNw6l0b2RvIGVzdGFkw61zdGljbw0KDQpgYGB7cn0NCiMgR3LDoWZpY2EgZGUgY2FqYSB5IGJpZ290ZXMNCmJkMTIgPC0gYmQ3DQpib3hwbG90KGJkMTIkUHJlY2lvLCBob3Jpem9udGFsID0gVFJVRSkNCmJveHBsb3QoYmQxMiRVbmlkYWRlcywgaG9yaXpvbnRhbCA9IFRSVUUpDQpgYGANCg0KDQpgYGB7cn0NCiMgQWdyZWdhciBjb2x1bW5hcw0KbGlicmFyeShsdWJyaWRhdGUpDQpiZDEyJERpYV9kZV9sYV9zZW1hbmEgPC0gd2RheShiZDEyJEZlY2hhKQ0Kc3VtbWFyeShiZDEyKQ0KDQpiZDEyJFN1YnRvdGFsIDwtIGJkMTIkUHJlY2lvICogYmQxMiRVbmlkYWRlcw0Kc3VtbWFyeShiZDEyKQ0KDQpiZDEyJFV0aWxpZGFkIDwtIGJkMTIkUHJlY2lvIC0gYmQxMiRVbHQuQ29zdG8NCnN1bW1hcnkoYmQxMikNCmBgYA0KDQpgYGB7cn0NCiMgRXhwb3J0YXIgbGEgYmFzZSBkZSBkYXRvcyBsaW1waWENCmJkX2xpbXBpYSA8LSBiZDEyDQp3cml0ZS5jc3YoYmRfbGltcGlhLCBmaWxlPSJhYmFycm90ZXNfYmRfbGltcGlhLmNzdiIsIHJvdy5uYW1lcyA9IEZBTFNFKQ0KYGBgDQoNCg0KYGBge3J9DQojIE1hcmtldCBiYXNrZXQgYW5hbHlzaXMNCmxpYnJhcnkocGx5cikNCmxpYnJhcnkoTWF0cml4KQ0KbGlicmFyeShhcnVsZXMpDQpsaWJyYXJ5KGFydWxlc1ZpeikNCmxpYnJhcnkoZGF0YXNldHMpDQpgYGANCg0KYGBge3J9DQojIE9yZGVuYXIgZGUgbWVub3IgYSBtYXlvciBsb3MgdGlja2V0cw0KYmRfbGltcGlhIDwtIGJkX2xpbXBpYVtvcmRlcihiZF9saW1waWEkRi5UaWNrZXQpLF0NCmhlYWQoYmRfbGltcGlhKQ0KdGFpbChiZF9saW1waWEpDQpgYGANCg0KYGBge3J9DQojIEdlbmVyYXIgYmFza2V0DQpiYXNrZXQgPC0gZGRwbHkoYmRfbGltcGlhLCBjKCJGLlRpY2tldCIpLCBmdW5jdGlvbihiZF9saW1waWEpcGFzdGUoYmRfbGltcGlhJE1hcmNhLCBjb2xsYXBzZSA9ICIsIikpDQpgYGANCg0KYGBge3J9DQojIEVsaW1pbmFyIG7Dum1lcm8gZGUgdGlja2V0DQpiYXNrZXQkRi5UaWNrZXQgPC0gTlVMTA0KYGBgDQoNCmBgYHtyfQ0KIyBSZW5vbWJyYXIgZWwgbm9tYnJlIGRlIGxhIGNvbHVtbmENCmNvbG5hbWVzKGJhc2tldCkgPC0gYygiTWFyY2EiKQ0KYGBgDQoNCg0KYGBge3J9DQojIEV4cG9ydGFyIGJhc2tldA0Kd3JpdGUuY3N2KGJhc2tldCwgImJhc2tldC5jc3YiLCBxdW90ZSA9IEZBTFNFLCByb3cubmFtZXMgPSBGQUxTRSkNCmBgYA0KDQpgYGB7cn0NCmxpYnJhcnkoYXJ1bGVzKQ0KbGlicmFyeShhcnVsZXNWaXopDQpgYGANCg0KDQpgYGB7cn0NCiMgSW1wb3J0YXIgdHJhbnNhY2Npb25lcw0KdHIgPC0gcmVhZC50cmFuc2FjdGlvbnMoICJDOlxcVXNlcnNcXEFzdXMgWmVuQm9va1xcRG9jdW1lbnRzXFxiYXNrZXQuY3N2IiwgZm9ybWF0ID0gImJhc2tldCIsIHNlcCA9ICIsIikNCmBgYA0KDQpgYGB7cn0NCiMgR2VuZXJhciByZWdsYXMgZGUgYXNvY2lhY2nDs24NCnJlZ2xhcy5hc29jaWFjaW9uIDwtIGFwcmlvcmkodHIscGFyYW1ldGVyID0gbGlzdChzdXBwPTAuMDAxLCBjb25mPTAuMiwgbWF4bGVuPTEwKSkNCnN1bW1hcnkocmVnbGFzLmFzb2NpYWNpb24pDQppbnNwZWN0KHJlZ2xhcy5hc29jaWFjaW9uKQ0KYGBgDQoNCmBgYHtyfQ0KIyBPcmRlbmFyIHJlZ2xhcyBkZSBhc29jaWFjacOzbg0KcmVnbGFzLmFzb2NpYWNpb24gPC0gc29ydChyZWdsYXMuYXNvY2lhY2lvbiwgYnkgPSAiY29uZmlkZW5jZSIsIGRlY3JlYXNpbmcgPSBUUlVFKQ0Kc3VtbWFyeShyZWdsYXMuYXNvY2lhY2lvbikNCmluc3BlY3QocmVnbGFzLmFzb2NpYWNpb24pDQpgYGANCg0KYGBge3J9DQojIFZpc3VhbGl6YXIgcmVnbGFzIGRlIGFzb2NpYWNpw7NuDQp0b3AxMHJlZ2xhcyA8LSBoZWFkKHJlZ2xhcy5hc29jaWFjaW9uLCBuPTEwLCBieT0gImNvbmZpZGVuY2UiKQ0KcGxvdCh0b3AxMHJlZ2xhcywgbWV0aG9kID0gImdyYXBoIiwgZW5naW5lID0gImh0bWx3aWRnZXQiKQ0KYGBgDQoNCiMgQ29uY2x1c2nDs24NCiMjIyBFc3RhIGFjdGl2aWRhZCBlcyBlc2VuY2lhbCBwYXJhIGVsIHJldG8sIHlhIHF1ZSB0cmFiYWplcmVtb3MgY29uIGJhc2VzIGRlIGRhdG9zIGV4dGVuc2FzIHF1ZSBwdWVkZW4gY29udGVuZXIgZXJyb3JlcywgdmFsb3JlcyBudWxvcyBvIGRhdG9zIHF1ZSBubyB1dGlsaXphcmVtb3MsIHBvciBsbyB0YW50byBlcyBpbXBvcnRhbnRlIGFwcmVuZGVyIGEgbGltcGlhciB5IG9yZ2FuaXphciBsb3MgZGF0b3MgcGFyYSBmYWNpbGl0YXIgZWwgcHJvY2VzbyBkZSBhbsOhbGlzaXMgeSB0ZW5lciBpbmZvcm1hY2nDs24gcmVsZXZhbnRlIHBhcmEgZWwgY2Fzby4gDQoNCg==