#install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("tidyverse")
library(tidyverse)
## āā Attaching core tidyverse packages āāāāāāāāāāāāāāāāāāāāāāāā tidyverse 2.0.0 āā
## ā forcats 1.0.0 ā readr 2.1.4
## ā ggplot2 3.4.1 ā stringr 1.5.0
## ā lubridate 1.9.2 ā tibble 3.1.8
## ā purrr 1.0.1 ā tidyr 1.3.0
## āā Conflicts āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā tidyverse_conflicts() āā
## ā dplyr::filter() masks stats::filter()
## ā dplyr::lag() masks stats::lag()
## ā¹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
#install.packages("janitor")
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
#install.packages("lubridate")
library(lubridate)
file.choose()
bd <- read.csv("/Users/crisflorespalacios/Desktop/Manipulacion de datos/supermercado.csv")
summary(bd)
## BillNo Itemname Quantity Date
## Length:522064 Length:522064 Min. :-9600.00 Length:522064
## Class :character Class :character 1st Qu.: 1.00 Class :character
## Mode :character Mode :character Median : 3.00 Mode :character
## Mean : 10.09
## 3rd Qu.: 10.00
## Max. :80995.00
##
## Price CustomerID Country
## Min. :-11062.060 Min. :12346 Length:522064
## 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Median : 2.080 Median :15265 Mode :character
## Mean : 3.827 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. : 13541.330 Max. :18287
## NA's :134041
Entender las variables de caracter La fecha estĆ” en formato de caracter y hay que cambiarla Precios y cantidades negativos EL BILL NO. estĆ” como caracter Crear unacolumna de ventas cant*precio
#count(bd, BillNo, sort= TRUE)
#count(bd, Itemname, sort= TRUE)
#count(bd, Date, sort=TRUE)
#count(bd, Country, sort= TRUE)
tibble(bd)
## # A tibble: 522,064 Ć 7
## BillNo Itemname Quantā¦Ā¹ Date Price Custoā¦Ā² Country
## <chr> <chr> <int> <chr> <dbl> <int> <chr>
## 1 581483 "PAPER CRAFT , LITTLE BIRDIE" 80995 09/1⦠2.08 16446 Unitedā¦
## 2 541431 "MEDIUM CERAMIC TOP STORAGE JAR" 74215 18/0⦠1.04 12346 Unitedā¦
## 3 578841 "ASSTD DESIGN 3D PAPER STICKERS" 12540 25/1⦠0 13256 Unitedā¦
## 4 542504 "" 5568 28/0⦠0 NA Unitedā¦
## 5 573008 "WORLD WAR 2 GLIDERS ASSTD DESIGN⦠4800 27/1⦠0.21 12901 Unitedā¦
## 6 554868 "SMALL POPCORN HOLDER" 4300 27/0⦠0.72 13135 Unitedā¦
## 7 556231 "?" 4000 09/0⦠0 NA Unitedā¦
## 8 544612 "EMPIRE DESIGN ROSETTE" 3906 22/0⦠0.82 18087 Unitedā¦
## 9 560599 "ESSENTIAL BALM 3.5g TIN IN ENVEL⦠3186 19/0⦠0.06 14609 Unitedā¦
## 10 540815 "FAIRY CAKE FLANNEL ASSORTED COLO⦠3114 11/0⦠2.1 15749 Unitedā¦
## # ⦠with 522,054 more rows, and abbreviated variable names ¹āQuantity,
## # ²āCustomerID
bd1 <- bd
bd1 <- subset(bd1, select = -c(CustomerID))
bd2 <- bd1
bd2 <- bd2[bd2$Price >0,]
summary(bd2)
## BillNo Itemname Quantity Date
## Length:519551 Length:519551 Min. : 1.0 Length:519551
## Class :character Class :character 1st Qu.: 1.0 Class :character
## Mode :character Mode :character Median : 3.0 Mode :character
## Mean : 10.4
## 3rd Qu.: 10.0
## Max. :80995.0
## Price Country
## Min. : 0.001 Length:519551
## 1st Qu.: 1.250 Class :character
## Median : 2.080 Mode :character
## Mean : 3.888
## 3rd Qu.: 4.130
## Max. :13541.330
#bd2[duplicated(bd2),]
sum(duplicated(bd2))
## [1] 5282
bd3<- bd2
bd3 <- distinct(bd3)
bd4 <- bd3
bd4$Date <- as.Date(bd4$Date, format= "%d/%m/%Y")
summary(bd4)
## BillNo Itemname Quantity Date
## Length:514269 Length:514269 Min. : 1.00 Min. :2010-12-01
## Class :character Class :character 1st Qu.: 1.00 1st Qu.:2011-03-28
## Mode :character Mode :character Median : 3.00 Median :2011-07-19
## Mean : 10.47 Mean :2011-07-03
## 3rd Qu.: 10.00 3rd Qu.:2011-10-19
## Max. :80995.00 Max. :2011-12-09
## Price Country
## Min. : 0.001 Length:514269
## 1st Qu.: 1.250 Class :character
## Median : 2.080 Mode :character
## Mean : 3.903
## 3rd Qu.: 4.130
## Max. :13541.330
sum(is.na(bd4)) #en la base de datos 4 ya no hay
## [1] 0
sapply(bd4, function(x) sum (is.na(x)))
## BillNo Itemname Quantity Date Price Country
## 0 0 0 0 0 0
#sum(is.na(bd))
Buscar datos atĆpicos, fuera de lo normal
bd5 <- bd4
boxplot(bd5$Price, horizontal=TRUE) #hay 3 productos que cuestan mucho,
boxplot(bd5$Quantity, horizontal=TRUE) #se vendió en dos ocasiones una cantidad muy grande de POPCORNHOLDERS
Crear una columna de ventas = precio*cantidad
bd5$subtotal <- bd5$Price * bd5$Quantity
summary(bd5)
## BillNo Itemname Quantity Date
## Length:514269 Length:514269 Min. : 1.00 Min. :2010-12-01
## Class :character Class :character 1st Qu.: 1.00 1st Qu.:2011-03-28
## Mode :character Mode :character Median : 3.00 Median :2011-07-19
## Mean : 10.47 Mean :2011-07-03
## 3rd Qu.: 10.00 3rd Qu.:2011-10-19
## Max. :80995.00 Max. :2011-12-09
## Price Country subtotal
## Min. : 0.001 Length:514269 Min. : 0.00
## 1st Qu.: 1.250 Class :character 1st Qu.: 3.75
## Median : 2.080 Mode :character Median : 9.90
## Mean : 3.903 Mean : 19.98
## 3rd Qu.: 4.130 3rd Qu.: 17.40
## Max. :13541.330 Max. :168469.60
bd_limpia <- bd5
#write.csv(bd_limpia, file= "supermercado_bdlimpia.csv", row.names= FALSE)