instalar librerias

#install.packages("dplyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## āœ” forcats   1.0.0     āœ” readr     2.1.4
## āœ” ggplot2   3.4.1     āœ” stringr   1.5.0
## āœ” lubridate 1.9.2     āœ” tibble    3.1.8
## āœ” purrr     1.0.1     āœ” tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## āœ– dplyr::filter() masks stats::filter()
## āœ– dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
#install.packages("janitor")
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
#install.packages("lubridate")
library(lubridate)

importar el archivo

file.choose()

bd <- read.csv("/Users/crisflorespalacios/Desktop/Manipulacion de datos/supermercado.csv")

Entender la base de datos

summary(bd)
##     BillNo            Itemname            Quantity            Date          
##  Length:522064      Length:522064      Min.   :-9600.00   Length:522064     
##  Class :character   Class :character   1st Qu.:    1.00   Class :character  
##  Mode  :character   Mode  :character   Median :    3.00   Mode  :character  
##                                        Mean   :   10.09                     
##                                        3rd Qu.:   10.00                     
##                                        Max.   :80995.00                     
##                                                                             
##      Price              CustomerID       Country         
##  Min.   :-11062.060   Min.   :12346    Length:522064     
##  1st Qu.:     1.250   1st Qu.:13950    Class :character  
##  Median :     2.080   Median :15265    Mode  :character  
##  Mean   :     3.827   Mean   :15317                      
##  3rd Qu.:     4.130   3rd Qu.:16837                      
##  Max.   : 13541.330   Max.   :18287                      
##                       NA's   :134041

Entender las variables de caracter La fecha estĆ” en formato de caracter y hay que cambiarla Precios y cantidades negativos EL BILL NO. estĆ” como caracter Crear unacolumna de ventas cant*precio

#count(bd, BillNo, sort= TRUE)
#count(bd, Itemname, sort= TRUE)
#count(bd, Date, sort=TRUE)
#count(bd, Country, sort= TRUE)
tibble(bd)
## # A tibble: 522,064 Ɨ 7
##    BillNo Itemname                           Quant…¹ Date  Price Custo…² Country
##    <chr>  <chr>                                <int> <chr> <dbl>   <int> <chr>  
##  1 581483 "PAPER CRAFT , LITTLE BIRDIE"        80995 09/1…  2.08   16446 United…
##  2 541431 "MEDIUM CERAMIC TOP STORAGE JAR"     74215 18/0…  1.04   12346 United…
##  3 578841 "ASSTD DESIGN 3D PAPER STICKERS"     12540 25/1…  0      13256 United…
##  4 542504 ""                                    5568 28/0…  0         NA United…
##  5 573008 "WORLD WAR 2 GLIDERS ASSTD DESIGN…    4800 27/1…  0.21   12901 United…
##  6 554868 "SMALL POPCORN HOLDER"                4300 27/0…  0.72   13135 United…
##  7 556231 "?"                                   4000 09/0…  0         NA United…
##  8 544612 "EMPIRE DESIGN ROSETTE"               3906 22/0…  0.82   18087 United…
##  9 560599 "ESSENTIAL BALM 3.5g TIN IN ENVEL…    3186 19/0…  0.06   14609 United…
## 10 540815 "FAIRY CAKE FLANNEL ASSORTED COLO…    3114 11/0…  2.1    15749 United…
## # … with 522,054 more rows, and abbreviated variable names ¹​Quantity,
## #   ²​CustomerID

Remover datos irrelevantes

bd1 <- bd
bd1 <- subset(bd1, select = -c(CustomerID))

bd2 <- bd1
bd2 <- bd2[bd2$Price >0,]
summary(bd2)
##     BillNo            Itemname            Quantity           Date          
##  Length:519551      Length:519551      Min.   :    1.0   Length:519551     
##  Class :character   Class :character   1st Qu.:    1.0   Class :character  
##  Mode  :character   Mode  :character   Median :    3.0   Mode  :character  
##                                        Mean   :   10.4                     
##                                        3rd Qu.:   10.0                     
##                                        Max.   :80995.0                     
##      Price             Country         
##  Min.   :    0.001   Length:519551     
##  1st Qu.:    1.250   Class :character  
##  Median :    2.080   Mode  :character  
##  Mean   :    3.888                     
##  3rd Qu.:    4.130                     
##  Max.   :13541.330

ĀæCuantos valores duplicados tenemos?

#bd2[duplicated(bd2),]
sum(duplicated(bd2))
## [1] 5282
bd3<- bd2
bd3 <- distinct(bd3)

Convertir de caracter a fecha

bd4 <- bd3
bd4$Date <- as.Date(bd4$Date, format= "%d/%m/%Y")
summary(bd4)
##     BillNo            Itemname            Quantity             Date           
##  Length:514269      Length:514269      Min.   :    1.00   Min.   :2010-12-01  
##  Class :character   Class :character   1st Qu.:    1.00   1st Qu.:2011-03-28  
##  Mode  :character   Mode  :character   Median :    3.00   Median :2011-07-19  
##                                        Mean   :   10.47   Mean   :2011-07-03  
##                                        3rd Qu.:   10.00   3rd Qu.:2011-10-19  
##                                        Max.   :80995.00   Max.   :2011-12-09  
##      Price             Country         
##  Min.   :    0.001   Length:514269     
##  1st Qu.:    1.250   Class :character  
##  Median :    2.080   Mode  :character  
##  Mean   :    3.903                     
##  3rd Qu.:    4.130                     
##  Max.   :13541.330

CuƔntos valores NA hay

sum(is.na(bd4)) #en la base de datos 4 ya no hay 
## [1] 0
sapply(bd4, function(x) sum (is.na(x)))
##   BillNo Itemname Quantity     Date    Price  Country 
##        0        0        0        0        0        0
#sum(is.na(bd))

Verificar datos con mƩtodos estadƭsticos

Buscar datos atĆ­picos, fuera de lo normal

bd5 <- bd4
boxplot(bd5$Price, horizontal=TRUE) #hay 3 productos que cuestan mucho, 

boxplot(bd5$Quantity, horizontal=TRUE) #se vendió en dos ocasiones una cantidad muy grande de POPCORNHOLDERS

Manipular la base de datos

Crear una columna de ventas = precio*cantidad

bd5$subtotal <- bd5$Price * bd5$Quantity
summary(bd5)
##     BillNo            Itemname            Quantity             Date           
##  Length:514269      Length:514269      Min.   :    1.00   Min.   :2010-12-01  
##  Class :character   Class :character   1st Qu.:    1.00   1st Qu.:2011-03-28  
##  Mode  :character   Mode  :character   Median :    3.00   Median :2011-07-19  
##                                        Mean   :   10.47   Mean   :2011-07-03  
##                                        3rd Qu.:   10.00   3rd Qu.:2011-10-19  
##                                        Max.   :80995.00   Max.   :2011-12-09  
##      Price             Country             subtotal        
##  Min.   :    0.001   Length:514269      Min.   :     0.00  
##  1st Qu.:    1.250   Class :character   1st Qu.:     3.75  
##  Median :    2.080   Mode  :character   Median :     9.90  
##  Mean   :    3.903                      Mean   :    19.98  
##  3rd Qu.:    4.130                      3rd Qu.:    17.40  
##  Max.   :13541.330                      Max.   :168469.60

Exportar base de datos limpia

bd_limpia <- bd5
#write.csv(bd_limpia, file= "supermercado_bdlimpia.csv", row.names= FALSE)
LS0tCnRpdGxlOiAic3VwZXJtZXJjYWRvIGxpbXBpYSIKYXV0aG9yOiAiQ3Jpc3RpbmEgRlAgYTAxNTY2ODgwIgpkYXRlOiAiMjAyMy0wMy0xNyIKb3V0cHV0OgogICAgaHRtbF9kb2N1bWVudDoKICAgICAgdG9jOiB0cnVlCiAgICAgIHRvY19mbG9hdDogdHJ1ZSAKICAgICAgY29kZV9kb3dubG9hZDogdHJ1ZSAKLS0tCiMjIyBpbnN0YWxhciBsaWJyZXJpYXMgCmBgYHtyfQojaW5zdGFsbC5wYWNrYWdlcygiZHBseXIiKQpsaWJyYXJ5KGRwbHlyKQojaW5zdGFsbC5wYWNrYWdlcygidGlkeXZlcnNlIikKbGlicmFyeSh0aWR5dmVyc2UpCiNpbnN0YWxsLnBhY2thZ2VzKCJqYW5pdG9yIikKbGlicmFyeShqYW5pdG9yKQojaW5zdGFsbC5wYWNrYWdlcygibHVicmlkYXRlIikKbGlicmFyeShsdWJyaWRhdGUpCmBgYAoKIyMjIGltcG9ydGFyIGVsIGFyY2hpdm8gCmZpbGUuY2hvb3NlKCkKYGBge3J9CmJkIDwtIHJlYWQuY3N2KCIvVXNlcnMvY3Jpc2Zsb3Jlc3BhbGFjaW9zL0Rlc2t0b3AvTWFuaXB1bGFjaW9uIGRlIGRhdG9zL3N1cGVybWVyY2Fkby5jc3YiKQpgYGAKCiMjIyBFbnRlbmRlciBsYSBiYXNlIGRlIGRhdG9zIApgYGB7cn0Kc3VtbWFyeShiZCkKYGBgCgpFbnRlbmRlciBsYXMgdmFyaWFibGVzIGRlIGNhcmFjdGVyIApMYSBmZWNoYSBlc3TDoSBlbiBmb3JtYXRvIGRlIGNhcmFjdGVyIHkgaGF5IHF1ZSBjYW1iaWFybGEKUHJlY2lvcyB5IGNhbnRpZGFkZXMgbmVnYXRpdm9zIApFTCBCSUxMIE5PLiBlc3TDoSBjb21vIGNhcmFjdGVyIApDcmVhciB1bmFjb2x1bW5hIGRlIHZlbnRhcyBjYW50KnByZWNpbyAKCmBgYHtyfQojY291bnQoYmQsIEJpbGxObywgc29ydD0gVFJVRSkKI2NvdW50KGJkLCBJdGVtbmFtZSwgc29ydD0gVFJVRSkKI2NvdW50KGJkLCBEYXRlLCBzb3J0PVRSVUUpCiNjb3VudChiZCwgQ291bnRyeSwgc29ydD0gVFJVRSkKdGliYmxlKGJkKQpgYGAKCiMjIyBSZW1vdmVyIGRhdG9zIGlycmVsZXZhbnRlcyAKYGBge3J9CmJkMSA8LSBiZApiZDEgPC0gc3Vic2V0KGJkMSwgc2VsZWN0ID0gLWMoQ3VzdG9tZXJJRCkpCgpiZDIgPC0gYmQxCmJkMiA8LSBiZDJbYmQyJFByaWNlID4wLF0Kc3VtbWFyeShiZDIpCmBgYAoKIyMjIMK/Q3VhbnRvcyB2YWxvcmVzIGR1cGxpY2Fkb3MgdGVuZW1vcz8KYGBge3J9CiNiZDJbZHVwbGljYXRlZChiZDIpLF0Kc3VtKGR1cGxpY2F0ZWQoYmQyKSkKCmJkMzwtIGJkMgpiZDMgPC0gZGlzdGluY3QoYmQzKQpgYGAKCiMjIyBDb252ZXJ0aXIgZGUgY2FyYWN0ZXIgYSBmZWNoYSAKYGBge3J9CmJkNCA8LSBiZDMKYmQ0JERhdGUgPC0gYXMuRGF0ZShiZDQkRGF0ZSwgZm9ybWF0PSAiJWQvJW0vJVkiKQpzdW1tYXJ5KGJkNCkKYGBgCgojIyMgQ3XDoW50b3MgdmFsb3JlcyBOQSBoYXkKYGBge3J9CnN1bShpcy5uYShiZDQpKSAjZW4gbGEgYmFzZSBkZSBkYXRvcyA0IHlhIG5vIGhheSAKc2FwcGx5KGJkNCwgZnVuY3Rpb24oeCkgc3VtIChpcy5uYSh4KSkpCiNzdW0oaXMubmEoYmQpKQpgYGAKCiMjIyBWZXJpZmljYXIgZGF0b3MgY29uIG3DqXRvZG9zIGVzdGFkw61zdGljb3MgCkJ1c2NhciBkYXRvcyBhdMOtcGljb3MsIGZ1ZXJhIGRlIGxvIG5vcm1hbApgYGB7cn0KYmQ1IDwtIGJkNApib3hwbG90KGJkNSRQcmljZSwgaG9yaXpvbnRhbD1UUlVFKSAjaGF5IDMgcHJvZHVjdG9zIHF1ZSBjdWVzdGFuIG11Y2hvLCAKYm94cGxvdChiZDUkUXVhbnRpdHksIGhvcml6b250YWw9VFJVRSkgI3NlIHZlbmRpw7MgZW4gZG9zIG9jYXNpb25lcyB1bmEgY2FudGlkYWQgbXV5IGdyYW5kZSBkZSBQT1BDT1JOSE9MREVSUwpgYGAKCiMjIyBNYW5pcHVsYXIgbGEgYmFzZSBkZSBkYXRvcyAKQ3JlYXIgdW5hIGNvbHVtbmEgZGUgdmVudGFzID0gcHJlY2lvKmNhbnRpZGFkCmBgYHtyfQpiZDUkc3VidG90YWwgPC0gYmQ1JFByaWNlICogYmQ1JFF1YW50aXR5CnN1bW1hcnkoYmQ1KQpgYGAKCiMjIyBFeHBvcnRhciBiYXNlIGRlIGRhdG9zIGxpbXBpYSAKYGBge3J9CmJkX2xpbXBpYSA8LSBiZDUKI3dyaXRlLmNzdihiZF9saW1waWEsIGZpbGU9ICJzdXBlcm1lcmNhZG9fYmRsaW1waWEuY3N2Iiwgcm93Lm5hbWVzPSBGQUxTRSkKYGBgCgoK