#file.choose()
#install.packages("readxl")
library(readxl)
ruta_excel <-"/Users/genarorodriguezalcantara/Desktop/Tec/Business Analytics/Evidencia/Abarrotes-Ventas.xlsx"
excel_sheets(ruta_excel)
## [1] "Total"
bd <- read_excel(ruta_excel)
bd$Subtotal <- bd$Precio * bd$Unidades
summary(bd)
## CveTienda DescGiro Codigo Barras PLU
## Length:200620 Length:200620 Min. :8.347e+05 Mode:logical
## Class :character Class :character 1st Qu.:7.501e+12 TRUE:1437
## Mode :character Mode :character Median :7.501e+12 NA's:199183
## Mean :5.950e+12
## 3rd Qu.:7.501e+12
## Max. :1.750e+13
## Fecha Hora
## Min. :2020-05-01 00:00:31.08 Min. :1899-12-31 00:00:00.00
## 1st Qu.:2020-06-06 13:24:49.08 1st Qu.:1899-12-31 13:12:42.75
## Median :2020-07-11 14:10:21.46 Median :1899-12-31 17:35:59.00
## Mean :2020-07-19 15:19:40.65 Mean :1899-12-31 16:43:52.05
## 3rd Qu.:2020-08-29 22:07:47.33 3rd Qu.:1899-12-31 20:47:06.00
## Max. :2020-11-11 23:53:47.73 Max. :1899-12-31 23:59:59.00
## Marca Fabricante Producto Precio
## Length:200620 Length:200620 Length:200620 Min. :-147.00
## Class :character Class :character Class :character 1st Qu.: 11.00
## Mode :character Mode :character Mode :character Median : 16.00
## Mean : 19.42
## 3rd Qu.: 25.00
## Max. :1000.00
## Ult.Costo Unidades F.Ticket NombreDepartamento
## Min. : 0.38 Min. : 0.200 Min. : 1 Length:200620
## 1st Qu.: 8.46 1st Qu.: 1.000 1st Qu.: 33967 Class :character
## Median : 12.31 Median : 1.000 Median :105996 Mode :character
## Mean : 15.31 Mean : 1.262 Mean :193994
## 3rd Qu.: 19.23 3rd Qu.: 1.000 3rd Qu.:383008
## Max. :769.23 Max. :96.000 Max. :450040
## NombreFamilia NombreCategoria Estado Mts 2
## Length:200620 Length:200620 Length:200620 Min. :47.0
## Class :character Class :character Class :character 1st Qu.:53.0
## Mode :character Mode :character Mode :character Median :60.0
## Mean :56.6
## 3rd Qu.:60.0
## Max. :62.0
## Tipo ubicación Giro Hora inicio
## Length:200620 Length:200620 Min. :1899-12-31 07:00:00.00
## Class :character Class :character 1st Qu.:1899-12-31 07:00:00.00
## Mode :character Mode :character Median :1899-12-31 08:00:00.00
## Mean :1899-12-31 07:35:49.71
## 3rd Qu.:1899-12-31 08:00:00.00
## Max. :1899-12-31 09:00:00.00
## Hora cierre Subtotal
## Min. :1899-12-31 21:00:00.00 Min. :-1232.00
## 1st Qu.:1899-12-31 22:00:00.00 1st Qu.: 12.00
## Median :1899-12-31 22:00:00.00 Median : 18.00
## Mean :1899-12-31 22:23:11.42 Mean : 24.24
## 3rd Qu.:1899-12-31 23:00:00.00 3rd Qu.: 27.00
## Max. :1899-12-31 23:00:00.00 Max. : 2496.00
digits <- nchar(bd$'Codigo Barras')
no10digits <- sum(digits != 10)
print(no10digits)
## [1] 200620
bd$'Codigo Barras' <- as.character(bd$'Codigo Barras')
bd$'Codigo Barras' <- formatC(bd$'Codigo Barras', width = 10, format = "d", flag = "0")
digitos <- nchar(bd$'Codigo Barras')
all(digitos == 10)
## [1] FALSE
###Percentage of records that have PLU
with_plu <- sum(!is.na(bd$PLU))
total <- nrow(bd)
percentage_with_plu <- (with_plu / total) * 100
abs_percentage_with_plu <- abs(percentage_with_plu)
cat("percentage_with_plu:", percentage_with_plu, "\n")
## percentage_with_plu: 0.7162795
cat("Percentage of records with a PLU: ", round(abs_percentage_with_plu, 2), "%")
## Percentage of records with a PLU: 0.72 %
no_plu <- sum(is.na(bd$PLU))
total <- nrow(bd)
percentage_no_plu <- no_plu / total * 100
print(paste("The percentage of records that don't have a PLU is: ", round(percentage_no_plu, 2), "%"))
## [1] "The percentage of records that don't have a PLU is: 99.28 %"
bd$PLU <- ifelse(is.na(bd$PLU), 0, bd$PLU)
summary(bd)
## CveTienda DescGiro Codigo Barras PLU
## Length:200620 Length:200620 Length:200620 Min. :0.000000
## Class :character Class :character Class :character 1st Qu.:0.000000
## Mode :character Mode :character Mode :character Median :0.000000
## Mean :0.007163
## 3rd Qu.:0.000000
## Max. :1.000000
## Fecha Hora
## Min. :2020-05-01 00:00:31.08 Min. :1899-12-31 00:00:00.00
## 1st Qu.:2020-06-06 13:24:49.08 1st Qu.:1899-12-31 13:12:42.75
## Median :2020-07-11 14:10:21.46 Median :1899-12-31 17:35:59.00
## Mean :2020-07-19 15:19:40.65 Mean :1899-12-31 16:43:52.05
## 3rd Qu.:2020-08-29 22:07:47.33 3rd Qu.:1899-12-31 20:47:06.00
## Max. :2020-11-11 23:53:47.73 Max. :1899-12-31 23:59:59.00
## Marca Fabricante Producto Precio
## Length:200620 Length:200620 Length:200620 Min. :-147.00
## Class :character Class :character Class :character 1st Qu.: 11.00
## Mode :character Mode :character Mode :character Median : 16.00
## Mean : 19.42
## 3rd Qu.: 25.00
## Max. :1000.00
## Ult.Costo Unidades F.Ticket NombreDepartamento
## Min. : 0.38 Min. : 0.200 Min. : 1 Length:200620
## 1st Qu.: 8.46 1st Qu.: 1.000 1st Qu.: 33967 Class :character
## Median : 12.31 Median : 1.000 Median :105996 Mode :character
## Mean : 15.31 Mean : 1.262 Mean :193994
## 3rd Qu.: 19.23 3rd Qu.: 1.000 3rd Qu.:383008
## Max. :769.23 Max. :96.000 Max. :450040
## NombreFamilia NombreCategoria Estado Mts 2
## Length:200620 Length:200620 Length:200620 Min. :47.0
## Class :character Class :character Class :character 1st Qu.:53.0
## Mode :character Mode :character Mode :character Median :60.0
## Mean :56.6
## 3rd Qu.:60.0
## Max. :62.0
## Tipo ubicación Giro Hora inicio
## Length:200620 Length:200620 Min. :1899-12-31 07:00:00.00
## Class :character Class :character 1st Qu.:1899-12-31 07:00:00.00
## Mode :character Mode :character Median :1899-12-31 08:00:00.00
## Mean :1899-12-31 07:35:49.71
## 3rd Qu.:1899-12-31 08:00:00.00
## Max. :1899-12-31 09:00:00.00
## Hora cierre Subtotal
## Min. :1899-12-31 21:00:00.00 Min. :-1232.00
## 1st Qu.:1899-12-31 22:00:00.00 1st Qu.: 12.00
## Median :1899-12-31 22:00:00.00 Median : 18.00
## Mean :1899-12-31 22:23:11.42 Mean : 24.24
## 3rd Qu.:1899-12-31 23:00:00.00 3rd Qu.: 27.00
## Max. :1899-12-31 23:00:00.00 Max. : 2496.00
num_na <- sum(is.na(bd$Precio))
print(paste("The number of missing values in the Precio column is:", num_na))
## [1] "The number of missing values in the Precio column is: 0"
bd1 <- bd
bd1$Precio <- abs(bd1$Precio)
summary(bd1$Precio)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.50 11.00 16.00 19.45 25.00 1000.00
min_FTicket <- min(bd$'F.Ticket', na.rm = TRUE)
print(paste("The minimum value in the F. Ticket column is:", min_FTicket))
## [1] "The minimum value in the F. Ticket column is: 1"
max_FTicket <- max(bd$'F.Ticket', na.rm = TRUE)
print(paste("The maximum value in the F. Ticket column is:", max_FTicket))
## [1] "The maximum value in the F. Ticket column is: 450040"
mean_FTicket <- mean(bd$'F.Ticket', na.rm = TRUE)
print(paste("The mean value in the F. Ticket column is:", mean_FTicket))
## [1] "The mean value in the F. Ticket column is: 193994.385140861"
product_most <- bd[which.max(bd$Unidades), "Producto"]
abs_units <- abs(bd[which.max(bd$Unidades), "Unidades"])
product_str <- paste("The product that sells the most in units is:", product_most, "with", abs_units, "units.")
cat(product_str)
## The product that sells the most in units is: TECATE LIGHT LATA 340ML with 96 units.
product_lowest <- bd[which.min(bd$Precio), "Producto"]
abs_units2 <- abs(bd[which.min(bd$Precio), "Precio"])
product_str2 <- paste("The product sold with the lowest unit price is:", product_lowest, "with", abs_units2, "unit price.")
cat(product_str2)
## The product sold with the lowest unit price is: KLEEN BEBE ABSOR-SEC JUMBO 40PZ with 147 unit price.
product_highest <- bd[which.max(bd$Precio), "Producto"]
abs_units3 <- abs(bd[which.max(bd$Precio), "Precio"])
product_str3 <- paste("The product sold with the highest unit price is:", product_highest, "with", abs_units3, "unit price.")
cat(product_str3)
## The product sold with the highest unit price is: Whisky Buchanan´s 1L with 1000 unit price.
sales_monthly <- bd %>%
mutate(Sales_Value = Precio * Unidades) %>%
group_by(CveTienda, format(Fecha, "%Y-%m")) %>%
summarize(Monthly_Sales_Value = sum(Sales_Value))
## `summarise()` has grouped output by 'CveTienda'. You can override using the
## `.groups` argument.
print(sales_monthly)
## # A tibble: 33 × 3
## # Groups: CveTienda [5]
## CveTienda `format(Fecha, "%Y-%m")` Monthly_Sales_Value
## <chr> <chr> <dbl>
## 1 MX001 2020-05 818611
## 2 MX001 2020-06 436442
## 3 MX001 2020-07 292452.
## 4 MX001 2020-08 301406
## 5 MX001 2020-09 206530
## 6 MX001 2020-10 164850.
## 7 MX001 2020-11 66046
## 8 MX002 2020-05 9360
## 9 MX002 2020-06 50062.
## 10 MX002 2020-07 29708.
## # … with 23 more rows
library(ggplot2)
library(dplyr)
library(scales)
sales_monthly <- bd %>%
mutate(Sales_Value = Precio * Unidades) %>%
group_by(CveTienda, format(Fecha, "%Y-%m")) %>%
summarize(Monthly_Sales_Value = sum(Sales_Value), Fecha = unique(Fecha), .groups = "drop")
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(sales_monthly, aes(x = format(Fecha, "%Y-%m"), y = Monthly_Sales_Value/10000, color = CveTienda, group = CveTienda)) +
geom_line() +
scale_x_discrete(name = "Month", labels = function(x) substr(x, 6, 7)) +
scale_y_continuous(name = "Monthly Sales Value (in K)", labels = label_number(suffix = "K", accuracy = 1)) +
labs(title = "Monthly Sales by Store", color = "Store Code") +
theme_minimal()
library(scales)
ggplot(sales_monthly, aes(x = format(Fecha, "%Y-%m"), y = Monthly_Sales_Value/1000)) +
geom_col() +
scale_x_discrete(name = "Month", labels = function(x) substr(x, 6, 7)) +
scale_y_continuous(name = "Monthly Sales Value (in 1000s)", labels = comma_format()) +
labs(title = "Monthly Sales by Month") +
theme_minimal()
#ggplot(sales_monthly, aes(x = format(Fecha, "%Y-%m"), y = Monthly_Sales_Value)) +
#geom_col() +
#scale_x_discrete(name = "Month", labels = function(x) substr(x, 6, 7)) +
#scale_y_continuous(name = "Monthly Sales Value") +
#labs(title = "Monthly Sales by Month") +
#theme_minimal()