Se cambió el formato a Formato a fecha corta.
Se cambió el formato a Hora (Español México).
Se guardó como CSV UTF-8 (Delimitado por comas).
Se cambio las comas por espacios.
Se cambio los puntos por espacios.
bd <- read.csv("/Users/georginamartinez/Documents/Tec/Séptimo Semestre/Analítica para negocios, de los datos a decisiones/ventas_corregido.csv")
summary(bd)
## BillNo Itemname Quantity Date
## Length:522064 Length:522064 Min. :-9600.00 Length:522064
## Class :character Class :character 1st Qu.: 1.00 Class :character
## Mode :character Mode :character Median : 3.00 Mode :character
## Mean : 10.09
## 3rd Qu.: 10.00
## Max. :80995.00
##
## Hora Price CustomerID Country
## Length:522064 Min. :-11062.060 Min. :12346 Length:522064
## Class :character 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Mode :character Median : 2.080 Median :15265 Mode :character
## Mean : 3.827 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. : 13541.330 Max. :18287
## NA's :134041
# install.packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# count(bd, BillNo, sort = TRUE)
# count(bd, Itemname, sort = TRUE)
# count(bd, Quantity, sort = TRUE)
# count(bd, Date, sort = TRUE)
# count(bd, Hora, sort = TRUE)
# count(bd, Price, sort = TRUE)
# count(bd, CustomerID, sort = TRUE)
# count(bd, Country, sort = TRUE)
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.0 ✔ forcats 0.5.2
## ✔ readr 2.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# tibble(bd)
# str(bd)
# head(bd)
# head(bd, n=7)
# tail(bd)
# install.packages("janitor")
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# tabyl(bd, CustomerID, Itemname)
# ¿Cuántos NA tengo en la base de datos?
sum(is.na(bd))
## [1] 134041
# ¿Cuántos NA tengo por variable?
sapply(bd, function(x) sum(is.na(x)))
## BillNo Itemname Quantity Date Hora Price CustomerID
## 0 0 0 0 0 0 134041
## Country
## 0
# Eliminar renglones
bd1 <- bd[bd$Price > 0.001, ]
# summary(bd1)
# Convertir de caracter a fecha
library(dplyr)
bd2 <- bd1
bd2$Date <- as.Date(bd2$Date, format= "%d/%m/%Y")
# tibble(bd2)
# Convertir de caracter a entero hora
bd3 <- bd2
bd3$Hora <- substr(bd3$Hora, start = 1, stop = 2)
bd3$Hora <- as.integer(bd3$Hora)
## Warning: NAs introduced by coercion
tibble(bd3)
## # A tibble: 519,547 × 8
## BillNo Itemname Quant…¹ Date Hora Price Custo…² Country
## <chr> <chr> <int> <date> <int> <dbl> <int> <chr>
## 1 536365 "WHITE HANGING HEART T… 6 2010-12-01 NA 2.55 17850 United…
## 2 536365 "WHITE METAL LANTERN" 6 2010-12-01 NA 3.39 17850 United…
## 3 536365 "CREAM CUPID HEARTS CO… 8 2010-12-01 NA 2.75 17850 United…
## 4 536365 "KNITTED UNION FLAG HO… 6 2010-12-01 NA 3.39 17850 United…
## 5 536365 "RED WOOLLY HOTTIE WHI… 6 2010-12-01 NA 3.39 17850 United…
## 6 536365 "SET 7 BABUSHKA NESTIN… 2 2010-12-01 NA 7.65 17850 United…
## 7 536365 "GLASS STAR FROSTED T-… 6 2010-12-01 NA 4.25 17850 United…
## 8 536366 "HAND WARMER UNION JAC… 6 2010-12-01 NA 1.85 17850 United…
## 9 536366 "HAND WARMER RED POLKA… 6 2010-12-01 NA 1.85 17850 United…
## 10 536367 "ASSORTED COLOUR BIRD … 32 2010-12-01 NA 1.69 13047 United…
## # … with 519,537 more rows, and abbreviated variable names ¹Quantity,
## # ²CustomerID
# Ordenar de menor a mayor los Tickets
bd4 <- bd3[order(bd3$BillNo),]
tail(bd4)
## BillNo Itemname Quantity Date Hora
## 522060 581587 PACK OF 20 SPACEBOY NAPKINS 12 2011-12-09 12
## 522061 581587 CHILDREN'S APRON DOLLY GIRL 6 2011-12-09 12
## 522062 581587 CHILDRENS CUTLERY DOLLY GIRL 4 2011-12-09 12
## 522063 581587 CHILDRENS CUTLERY CIRCUS PARADE 4 2011-12-09 12
## 522064 581587 BAKING SET 9 PIECE RETROSPOT 3 2011-12-09 12
## 288773 A563185 Adjust bad debt 1 2011-08-12 14
## Price CustomerID Country
## 522060 0.85 12680 France
## 522061 2.10 12680 France
## 522062 4.15 12680 France
## 522063 4.15 12680 France
## 522064 4.95 12680 France
## 288773 11062.06 NA United Kingdom
# Eliminar renglones
bd5 <- bd4
bd5 <- bd5[bd5$BillNo < 581588, ]
summary (bd1)
## BillNo Itemname Quantity Date
## Length:519547 Length:519547 Min. : 1.0 Length:519547
## Class :character Class :character 1st Qu.: 1.0 Class :character
## Mode :character Mode :character Median : 3.0 Mode :character
## Mean : 10.4
## 3rd Qu.: 10.0
## Max. :80995.0
##
## Hora Price CustomerID Country
## Length:519547 Min. : 0.040 Min. :12346 Length:519547
## Class :character 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Mode :character Median : 2.080 Median :15265 Mode :character
## Mean : 3.888 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. :13541.330 Max. :18287
## NA's :131566
tail(bd5)
## BillNo Itemname Quantity Date Hora Price
## 522059 581587 CHILDRENS CUTLERY SPACEBOY 4 2011-12-09 12 4.15
## 522060 581587 PACK OF 20 SPACEBOY NAPKINS 12 2011-12-09 12 0.85
## 522061 581587 CHILDREN'S APRON DOLLY GIRL 6 2011-12-09 12 2.10
## 522062 581587 CHILDRENS CUTLERY DOLLY GIRL 4 2011-12-09 12 4.15
## 522063 581587 CHILDRENS CUTLERY CIRCUS PARADE 4 2011-12-09 12 4.15
## 522064 581587 BAKING SET 9 PIECE RETROSPOT 3 2011-12-09 12 4.95
## CustomerID Country
## 522059 12680 France
## 522060 12680 France
## 522061 12680 France
## 522062 12680 France
## 522063 12680 France
## 522064 12680 France
# Convertir de caracter a entero
bd6 <- bd5
bd6$BillNo <- as.integer(bd6$BillNo)
tibble(bd6)
## # A tibble: 519,546 × 8
## BillNo Itemname Quant…¹ Date Hora Price Custo…² Country
## <int> <chr> <int> <date> <int> <dbl> <int> <chr>
## 1 536365 "WHITE HANGING HEART T… 6 2010-12-01 NA 2.55 17850 United…
## 2 536365 "WHITE METAL LANTERN" 6 2010-12-01 NA 3.39 17850 United…
## 3 536365 "CREAM CUPID HEARTS CO… 8 2010-12-01 NA 2.75 17850 United…
## 4 536365 "KNITTED UNION FLAG HO… 6 2010-12-01 NA 3.39 17850 United…
## 5 536365 "RED WOOLLY HOTTIE WHI… 6 2010-12-01 NA 3.39 17850 United…
## 6 536365 "SET 7 BABUSHKA NESTIN… 2 2010-12-01 NA 7.65 17850 United…
## 7 536365 "GLASS STAR FROSTED T-… 6 2010-12-01 NA 4.25 17850 United…
## 8 536366 "HAND WARMER UNION JAC… 6 2010-12-01 NA 1.85 17850 United…
## 9 536366 "HAND WARMER RED POLKA… 6 2010-12-01 NA 1.85 17850 United…
## 10 536367 "ASSORTED COLOUR BIRD … 32 2010-12-01 NA 1.69 13047 United…
## # … with 519,536 more rows, and abbreviated variable names ¹Quantity,
## # ²CustomerID
str(bd6)
## 'data.frame': 519546 obs. of 8 variables:
## $ BillNo : int 536365 536365 536365 536365 536365 536365 536365 536366 536366 536367 ...
## $ Itemname : chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ Date : Date, format: "2010-12-01" "2010-12-01" ...
## $ Hora : int NA NA NA NA NA NA NA NA NA NA ...
## $ Price : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID: int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
# Eliminar comas y puntos de los artículos
# bd7 <- bd6
# bd7$Itemname <- sub(",", " ", bd8$Itemname, fixed=TRUE)
# bd7$Itemname <- sub(",", " ", bd8$Itemname, fixed=TRUE)
# No se usó porque los artículos tienen comas entre texto, y esas no se eliminaron con esta función.
bd_limpia <- bd6
write.csv(bd_limpia, file="ventas_bd_limpia.csv", row.names = FALSE)
bd6$Total <- bd6$Quantity * bd6$Price
str(bd6)
## 'data.frame': 519546 obs. of 9 variables:
## $ BillNo : int 536365 536365 536365 536365 536365 536365 536365 536366 536366 536367 ...
## $ Itemname : chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ Date : Date, format: "2010-12-01" "2010-12-01" ...
## $ Hora : int NA NA NA NA NA NA NA NA NA NA ...
## $ Price : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID: int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## $ Total : num 15.3 20.3 22 20.3 20.3 ...
summary(bd6)
## BillNo Itemname Quantity Date
## Min. :536365 Length:519546 Min. : 1.0 Min. :0201-10-10
## 1st Qu.:547895 Class :character 1st Qu.: 1.0 1st Qu.:2011-03-03
## Median :560689 Mode :character Median : 3.0 Median :2011-06-07
## Mean :559967 Mean : 10.4 Mean :1930-12-23
## 3rd Qu.:571909 3rd Qu.: 10.0 3rd Qu.:2011-09-05
## Max. :581587 Max. :80995.0 Max. :2011-12-09
## NA's :103991
## Hora Price CustomerID Country
## Min. : 0.00 Min. : 0.040 Min. :12346 Length:519546
## 1st Qu.: 1.00 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Median :12.00 Median : 2.080 Median :15265 Mode :character
## Mean :10.28 Mean : 3.867 Mean :15317
## 3rd Qu.:15.00 3rd Qu.: 4.130 3rd Qu.:16837
## Max. :20.00 Max. :13541.330 Max. :18287
## NA's :32479 NA's :131565
## Total
## Min. : 0.06
## 1st Qu.: 3.75
## Median : 9.87
## Mean : 19.81
## 3rd Qu.: 17.40
## Max. :168469.60
##
# Obtener el Total por Ticket, agregando el cliente
bd7 <- bd6
bd7 <- aggregate(Total ~ CustomerID + BillNo, data = bd7, sum)
# Obtener el Ticket Promedio por cliente
bd8 <- bd7
bd8 <- aggregate(Total ~ CustomerID, data =bd7, mean)
# install.packages("plyr")
library(Matrix)
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
# install.packages("arules")
library(arules)
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
# install.packages("arulesViz")
library(arulesViz)
library(datasets)
# install.packages("plyr")
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
##
## compact
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
bd_limpia <- bd6
basket <- ddply(bd_limpia,c("BillNo"), function(bd_limpia)paste(bd_limpia$Itemname, collapse=","))
basket$BillNo <- NULL
colnames(basket) <- c("Artículo")
write.csv(basket,"basketV.csv", quote = FALSE, row.names = FALSE)
tr <- read.transactions("/Users/georginamartinez/basketV.csv", format = "basket" , sep= ",")
reglas.asociacion <- apriori(tr, parameter = list(supp= 0.001, conf=0.8, maxlen=10))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 19
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[8578 item(s), 19559 transaction(s)] done [0.23s].
## sorting and recoding items ... [2644 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10 done [4.13s].
## writing ... [6156148 rule(s)] done [1.52s].
## creating S4 object ... done [3.88s].
reglas.asociacion <- sort(reglas.asociacion, by = "confidence", decreasing = TRUE)
top10reglas <- head(reglas.asociacion, n = 10, by = "confidence")
# inspect(top10reglas)
plot(top10reglas, method = "graph", engine = "htmlwidget")
El Market Basket Analysis nos sirve para conocer patrones sobre que productos se llevan en conjunto o si compran uno que otros compran y más a través de un conjunto de datos, lo utilizan muchas empresas porque les permite crear estrategias para aumentar sus ventas e incluso ofrecer una experiencia personalizada a los clientes.
Para esto tuvimos que realizar una limpieza de datos a la base de datos brindada y después conocer las reglas que nos diran que productos tienen relación en las tiendas de abarrotes.
La gráfica muestra las top 10 reglas que se encontraron dentro de la base de datos sobre las tiendas de Abarrotes y podemos interpretar que el producto principal que hace que compren en conjunto con otros es ‘Silver Mini Tape Measure’ comprando Charlotte bags o lunch bags, lo que nos quiere decir que compran herramientas y el lugar en donde guardarlas.