# file.choose()
# Load the raw sales data set from disk.
bd <- read.csv(
  file = "/Users/joseramonvazquezguzman/Documents/Tecnológico de Monterrey/Septimo semestre/Analítica de datos /M3/ventasM3.csv"
)
# Notes:
# - The date and the time were split out of the original database.
# - All commas and periods were replaced with a space.
# Column-by-column overview of the raw data.
summary(object = bd)
## BillNo Itemname Quantity Date
## Length:522064 Length:522064 Min. :-9600.00 Length:522064
## Class :character Class :character 1st Qu.: 1.00 Class :character
## Mode :character Mode :character Median : 3.00 Mode :character
## Mean : 10.09
## 3rd Qu.: 10.00
## Max. :80995.00
##
## Hora Price CustomerID Country
## Length:522064 Min. :-11062.060 Min. :12346 Length:522064
## Class :character 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Mode :character Median : 2.080 Median :15265 Mode :character
## Mean : 3.827 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. : 13541.330 Max. :18287
## NA's :134041
# No se eliminan columnas, ya que todas son relevantes.
## Eliminar renglones
# Row cleaning, step 1: keep only the rows whose price is strictly positive.
bd1 <- bd[bd$Price > 0, ]
summary(object = bd1)
## BillNo Itemname Quantity Date
## Length:519551 Length:519551 Min. : 1.0 Length:519551
## Class :character Class :character 1st Qu.: 1.0 Class :character
## Mode :character Mode :character Median : 3.0 Mode :character
## Mean : 10.4
## 3rd Qu.: 10.0
## Max. :80995.0
##
## Hora Price CustomerID Country
## Length:519551 Min. : 0.001 Min. :12346 Length:519551
## Class :character 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Mode :character Median : 2.080 Median :15265 Mode :character
## Mean : 3.888 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. :13541.330 Max. :18287
## NA's :131566
# Row cleaning, step 2: keep only the rows whose price is above 0.3.
bd2 <- bd1[bd1$Price > 0.3, ]
summary(object = bd2)
## BillNo Itemname Quantity Date
## Length:511103 Length:511103 Min. : 1.00 Length:511103
## Class :character Class :character 1st Qu.: 1.00 Class :character
## Mode :character Mode :character Median : 3.00 Mode :character
## Mean : 9.85
## 3rd Qu.: 10.00
## Max. :80995.00
##
## Hora Price CustomerID Country
## Length:511103 Min. : 0.310 Min. :12346 Length:511103
## Class :character 1st Qu.: 1.250 1st Qu.:13949 Class :character
## Mode :character Median : 2.100 Median :15271 Mode :character
## Mean : 3.948 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. :13541.330 Max. :18287
## NA's :131482
#No consideraremos los precios menores o iguales a 0.30 (30 centavos), debido a que eran cargos del banco y eran datos irrelevantes
# Total number of missing cells: filtered data first, then the raw data.
sum(is.na(x = bd2))
## [1] 131482
sum(is.na(x = bd))
## [1] 134041
# Missing cells per column of the filtered data (one integer per column).
vapply(bd2, function(columna) sum(is.na(columna)), integer(1))
## BillNo Itemname Quantity Date Hora Price CustomerID
## 0 0 0 0 0 0 131482
## Country
## 0
# Missing cells per column of the raw data, for comparison.
vapply(bd, function(columna) sum(is.na(columna)), integer(1))
## BillNo Itemname Quantity Date Hora Price CustomerID
## 0 0 0 0 0 0 134041
## Country
## 0
# Work on a copy of the filtered data from here on.
bd3 <- bd2
# Horizontal boxplots to inspect the spread of prices and of quantities.
boxplot(x = bd3[["Price"]], horizontal = TRUE)
boxplot(x = bd3[["Quantity"]], horizontal = TRUE)
#install.packages("lubridate")
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Derive the weekday of each sale (lubridate default: 1 = Sunday ... 7 = Saturday).
# Date holds day/month/year text such as "01/12/2010", so it is parsed with dmy()
# first. Passing the raw string to wday() misreads the date and yields wrong
# weekdays (e.g. 01/12/2010, a Wednesday, was reported as 5 instead of 4).
bd3$Dia_de_la_Semana <- wday(dmy(bd3$Date))
summary(bd3)
## BillNo Itemname Quantity Date
## Length:511103 Length:511103 Min. : 1.00 Length:511103
## Class :character Class :character 1st Qu.: 1.00 Class :character
## Mode :character Mode :character Median : 3.00 Mode :character
## Mean : 9.85
## 3rd Qu.: 10.00
## Max. :80995.00
##
## Hora Price CustomerID Country
## Length:511103 Min. : 0.310 Min. :12346 Length:511103
## Class :character 1st Qu.: 1.250 1st Qu.:13949 Class :character
## Mode :character Median : 2.100 Median :15271 Mode :character
## Mean : 3.948 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. :13541.330 Max. :18287
## NA's :131482
## Dia_de_la_Semana
## Min. :1.000
## 1st Qu.:2.000
## Median :4.000
## Mean :4.011
## 3rd Qu.:6.000
## Max. :7.000
##
# Name the cleaned data set and export it as CSV without the row-name column.
bd_limpia <- bd3
write.csv(
  x = bd_limpia,
  file = "Actividad_Ventas_Limpia.csv",
  row.names = FALSE
)
#install.packages("plyr")
library(Matrix)
#install.packages("arules")
library(arules)
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
#install.packages("arulesViz")
library(arulesViz)
#install.packages("datasets")
library(datasets)
# Sort the rows by ticket number in ascending order, then preview the first rows.
bd_limpia <- bd_limpia[order(bd_limpia[["BillNo"]]), ]
head(bd_limpia, n = 6L)
## BillNo Itemname Quantity Date Hora Price
## 1 536365 WHITE HANGING HEART T-LIGHT HOLDER 6 01/12/2010 08:26:00 2.55
## 2 536365 WHITE METAL LANTERN 6 01/12/2010 08:26:00 3.39
## 3 536365 CREAM CUPID HEARTS COAT HANGER 8 01/12/2010 08:26:00 2.75
## 4 536365 KNITTED UNION FLAG HOT WATER BOTTLE 6 01/12/2010 08:26:00 3.39
## 5 536365 RED WOOLLY HOTTIE WHITE HEART. 6 01/12/2010 08:26:00 3.39
## 6 536365 SET 7 BABUSHKA NESTING BOXES 2 01/12/2010 08:26:00 7.65
## CustomerID Country Dia_de_la_Semana
## 1 17850 United Kingdom 5
## 2 17850 United Kingdom 5
## 3 17850 United Kingdom 5
## 4 17850 United Kingdom 5
## 5 17850 United Kingdom 5
## 6 17850 United Kingdom 5
# Preview the last rows of the sorted data.
tail(bd_limpia, n = 6L)
## BillNo Itemname Quantity Date Hora
## 522060 581587 PACK OF 20 SPACEBOY NAPKINS 12 09/12/2011 12:50:00
## 522061 581587 CHILDREN'S APRON DOLLY GIRL 6 09/12/2011 12:50:00
## 522062 581587 CHILDRENS CUTLERY DOLLY GIRL 4 09/12/2011 12:50:00
## 522063 581587 CHILDRENS CUTLERY CIRCUS PARADE 4 09/12/2011 12:50:00
## 522064 581587 BAKING SET 9 PIECE RETROSPOT 3 09/12/2011 12:50:00
## 288773 A563185 Adjust bad debt 1 12/08/2011 14:50:00
## Price CustomerID Country Dia_de_la_Semana
## 522060 0.85 12680 France 1
## 522061 2.10 12680 France 1
## 522062 4.15 12680 France 1
## 522063 4.15 12680 France 1
## 522064 4.95 12680 France 1
## 288773 11062.06 NA United Kingdom 2
#install.packages("plyr")
library(plyr)
# Collapse every ticket (BillNo) into one comma-separated string of its item names.
basket <- ddply(
  bd_limpia,
  "BillNo",
  function(ticket) paste(ticket$Itemname, collapse = ",")
)
# Keep only the item lists, under the single header "Itemname".
basket$BillNo <- NULL
colnames(basket) <- "Itemname"
# quote = FALSE writes the item lists unquoted, so the commas act as separators.
write.csv(basket, file = "basketventas.csv", quote = FALSE, row.names = FALSE)
#file.choose()
library(Matrix)
library(arules)
library(arulesViz)
# Read the basket file as transactions (one comma-separated basket per line).
# NOTE(review): this reads "basketrrr.csv", not the "basketventas.csv" written
# earlier in this script -- confirm that this is the intended input.
tr <- read.transactions(
  file = "/Users/joseramonvazquezguzman/Documents/Tecnológico de Monterrey/Septimo semestre/Analítica de datos /M3/basketrrr.csv",
  format = "basket",
  sep = ","
)
# Mine rules with minimum support 0.001, minimum confidence 0.8 and at most
# 10 items per rule.
reglas.asociacion <- apriori(
  data = tr,
  parameter = list(support = 0.001, confidence = 0.8, maxlen = 10)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 19
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[8578 item(s), 19559 transaction(s)] done [0.11s].
## sorting and recoding items ... [2644 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10 done [3.00s].
## writing ... [6156148 rule(s)] done [1.08s].
## creating S4 object ... done [2.78s].
#summary(reglas.asociacion)
#inspect(reglas.asociacion)
# Take the ten rules ranked highest by confidence and print them.
top10reglas <- head(reglas.asociacion, n = 10L, by = "confidence")
inspect(x = top10reglas)
## lhs rhs support confidence coverage lift count
## [1] {SILVER MINI TAPE MEASURE} => {JUMBO BAG PINK VINTAGE PAISLEY} 0.001124802 1 0.001124802 27.24095 22
## [2] {SILVER MINI TAPE MEASURE} => {STRAWBERRY CHARLOTTE BAG} 0.001124802 1 0.001124802 32.32893 22
## [3] {SILVER MINI TAPE MEASURE} => {LUNCH BAG CARS BLUE} 0.001124802 1 0.001124802 20.24741 22
## [4] {SILVER MINI TAPE MEASURE} => {WOODLAND CHARLOTTE BAG} 0.001124802 1 0.001124802 28.84808 22
## [5] {SILVER MINI TAPE MEASURE} => {RED RETROSPOT CHARLOTTE BAG} 0.001124802 1 0.001124802 22.98355 22
## [6] {PINK POLKADOT BOWL,
## SET/20 FRUIT SALAD PAPER NAPKINS} => {STRAWBERRY CHARLOTTE BAG} 0.001022547 1 0.001022547 32.32893 20
## [7] {PINK POLKADOT BOWL,
## SET/20 FRUIT SALAD PAPER NAPKINS} => {LUNCH BAG CARS BLUE} 0.001022547 1 0.001022547 20.24741 20
## [8] {PINK POLKADOT BOWL,
## SET/20 FRUIT SALAD PAPER NAPKINS} => {WOODLAND CHARLOTTE BAG} 0.001022547 1 0.001022547 28.84808 20
## [9] {PINK POLKADOT BOWL,
## SET/20 FRUIT SALAD PAPER NAPKINS} => {RED RETROSPOT CHARLOTTE BAG} 0.001022547 1 0.001022547 22.98355 20
## [10] {SET/20 FRUIT SALAD PAPER NAPKINS,
## STRAWBERRY CHARLOTTE BAG} => {LUNCH BAG CARS BLUE} 0.001073675 1 0.001073675 20.24741 21
# Draw the ten selected rules as an interactive graph (htmlwidget engine).
plot(
  x = top10reglas,
  method = "graph",
  engine = "htmlwidget"
)
# Como estrategia, se podría elaborar el MBA para distintos productos y ver la correlación que
# se genera a partir de un producto. En el análisis se puede observar que hay una asociación
# entre la regla 2 y la regla 6, donde el producto “Strawberry Charlotte Bag” es el
# correlacionado; esto quiere decir que se tiene una venta mayor hacia este producto. Analizar
# así el comportamiento de los productos dentro de la tienda puede ser muy interesante y
# también puede ayudar al departamento de marketing a interpretar y generar ideas con base en
# esta información.