#file.choose()

#install.packages("readxl")
library(readxl)

ruta_excel <-"/Users/genarorodriguezalcantara/Desktop/Tec/Business Analytics/Evidencia/Abarrotes-Ventas.xlsx"
excel_sheets(ruta_excel)
## [1] "Total"

1. Import Excel

bd <- read_excel(ruta_excel)

2a. Add Total

bd$Subtotal <- bd$Precio * bd$Unidades
summary(bd)
##   CveTienda           DescGiro         Codigo Barras         PLU         
##  Length:200620      Length:200620      Min.   :8.347e+05   Mode:logical  
##  Class :character   Class :character   1st Qu.:7.501e+12   TRUE:1437     
##  Mode  :character   Mode  :character   Median :7.501e+12   NA's:199183   
##                                        Mean   :5.950e+12                 
##                                        3rd Qu.:7.501e+12                 
##                                        Max.   :1.750e+13                 
##      Fecha                             Hora                       
##  Min.   :2020-05-01 00:00:31.08   Min.   :1899-12-31 00:00:00.00  
##  1st Qu.:2020-06-06 13:24:49.08   1st Qu.:1899-12-31 13:12:42.75  
##  Median :2020-07-11 14:10:21.46   Median :1899-12-31 17:35:59.00  
##  Mean   :2020-07-19 15:19:40.65   Mean   :1899-12-31 16:43:52.05  
##  3rd Qu.:2020-08-29 22:07:47.33   3rd Qu.:1899-12-31 20:47:06.00  
##  Max.   :2020-11-11 23:53:47.73   Max.   :1899-12-31 23:59:59.00  
##     Marca            Fabricante          Producto             Precio       
##  Length:200620      Length:200620      Length:200620      Min.   :-147.00  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.42  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200620     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967   Class :character  
##  Median : 12.31   Median : 1.000   Median :105996   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193994                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383008                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts 2     
##  Length:200620      Length:200620      Length:200620      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo ubicación         Giro            Hora inicio                    
##  Length:200620      Length:200620      Min.   :1899-12-31 07:00:00.00  
##  Class :character   Class :character   1st Qu.:1899-12-31 07:00:00.00  
##  Mode  :character   Mode  :character   Median :1899-12-31 08:00:00.00  
##                                        Mean   :1899-12-31 07:35:49.71  
##                                        3rd Qu.:1899-12-31 08:00:00.00  
##                                        Max.   :1899-12-31 09:00:00.00  
##   Hora cierre                        Subtotal       
##  Min.   :1899-12-31 21:00:00.00   Min.   :-1232.00  
##  1st Qu.:1899-12-31 22:00:00.00   1st Qu.:   12.00  
##  Median :1899-12-31 22:00:00.00   Median :   18.00  
##  Mean   :1899-12-31 22:23:11.42   Mean   :   24.24  
##  3rd Qu.:1899-12-31 23:00:00.00   3rd Qu.:   27.00  
##  Max.   :1899-12-31 23:00:00.00   Max.   : 2496.00

3. Correction of barcode digits

3a. Number of data that doesn’t have 10 digits

digits <- nchar(bd$'Codigo Barras')
no10digits <- sum(digits != 10)
print(no10digits)
## [1] 200620

3b. Correction of those that don’t have 10 digits

Convert data

bd$'Codigo Barras' <- as.character(bd$'Codigo Barras')

Add zeros at the beginning so that each one has 10 digits

bd$'Codigo Barras' <- formatC(bd$'Codigo Barras', width = 10, format = "d", flag = "0")

Verification

digitos <- nchar(bd$'Codigo Barras')
all(digitos == 10)
## [1] FALSE

4. PLU

4a.

###Percentage of records that have PLU

with_plu <- sum(!is.na(bd$PLU))
total <- nrow(bd)

percentage_with_plu <- (with_plu / total) * 100

abs_percentage_with_plu <- abs(percentage_with_plu)

cat("percentage_with_plu:", percentage_with_plu, "\n")
## percentage_with_plu: 0.7162795
cat("Percentage of records with a PLU: ", round(abs_percentage_with_plu, 2), "%")
## Percentage of records with a PLU:  0.72 %

Percentage of records that don’t have PLU

no_plu <- sum(is.na(bd$PLU))
total <- nrow(bd)

percentage_no_plu <- no_plu / total * 100

print(paste("The percentage of records that don't have a PLU is: ", round(percentage_no_plu, 2), "%"))
## [1] "The percentage of records that don't have a PLU is:  99.28 %"

4b. Correction of the variable

bd$PLU <- ifelse(is.na(bd$PLU), 0, bd$PLU)
summary(bd)
##   CveTienda           DescGiro         Codigo Barras           PLU          
##  Length:200620      Length:200620      Length:200620      Min.   :0.000000  
##  Class :character   Class :character   Class :character   1st Qu.:0.000000  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.000000  
##                                                           Mean   :0.007163  
##                                                           3rd Qu.:0.000000  
##                                                           Max.   :1.000000  
##      Fecha                             Hora                       
##  Min.   :2020-05-01 00:00:31.08   Min.   :1899-12-31 00:00:00.00  
##  1st Qu.:2020-06-06 13:24:49.08   1st Qu.:1899-12-31 13:12:42.75  
##  Median :2020-07-11 14:10:21.46   Median :1899-12-31 17:35:59.00  
##  Mean   :2020-07-19 15:19:40.65   Mean   :1899-12-31 16:43:52.05  
##  3rd Qu.:2020-08-29 22:07:47.33   3rd Qu.:1899-12-31 20:47:06.00  
##  Max.   :2020-11-11 23:53:47.73   Max.   :1899-12-31 23:59:59.00  
##     Marca            Fabricante          Producto             Precio       
##  Length:200620      Length:200620      Length:200620      Min.   :-147.00  
##  Class :character   Class :character   Class :character   1st Qu.:  11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :  16.00  
##                                                           Mean   :  19.42  
##                                                           3rd Qu.:  25.00  
##                                                           Max.   :1000.00  
##    Ult.Costo         Unidades         F.Ticket      NombreDepartamento
##  Min.   :  0.38   Min.   : 0.200   Min.   :     1   Length:200620     
##  1st Qu.:  8.46   1st Qu.: 1.000   1st Qu.: 33967   Class :character  
##  Median : 12.31   Median : 1.000   Median :105996   Mode  :character  
##  Mean   : 15.31   Mean   : 1.262   Mean   :193994                     
##  3rd Qu.: 19.23   3rd Qu.: 1.000   3rd Qu.:383008                     
##  Max.   :769.23   Max.   :96.000   Max.   :450040                     
##  NombreFamilia      NombreCategoria       Estado              Mts 2     
##  Length:200620      Length:200620      Length:200620      Min.   :47.0  
##  Class :character   Class :character   Class :character   1st Qu.:53.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :60.0  
##                                                           Mean   :56.6  
##                                                           3rd Qu.:60.0  
##                                                           Max.   :62.0  
##  Tipo ubicación         Giro            Hora inicio                    
##  Length:200620      Length:200620      Min.   :1899-12-31 07:00:00.00  
##  Class :character   Class :character   1st Qu.:1899-12-31 07:00:00.00  
##  Mode  :character   Mode  :character   Median :1899-12-31 08:00:00.00  
##                                        Mean   :1899-12-31 07:35:49.71  
##                                        3rd Qu.:1899-12-31 08:00:00.00  
##                                        Max.   :1899-12-31 09:00:00.00  
##   Hora cierre                        Subtotal       
##  Min.   :1899-12-31 21:00:00.00   Min.   :-1232.00  
##  1st Qu.:1899-12-31 22:00:00.00   1st Qu.:   12.00  
##  Median :1899-12-31 22:00:00.00   Median :   18.00  
##  Mean   :1899-12-31 22:23:11.42   Mean   :   24.24  
##  3rd Qu.:1899-12-31 23:00:00.00   3rd Qu.:   27.00  
##  Max.   :1899-12-31 23:00:00.00   Max.   : 2496.00

5. Correction of prices

5a. Number of NA

num_na <- sum(is.na(bd$Precio))
print(paste("The number of missing values in the Precio column is:", num_na))
## [1] "The number of missing values in the Precio column is: 0"

5b. Convertion of prices to an absolute number

bd1 <- bd
bd1$Precio <- abs(bd1$Precio)
summary(bd1$Precio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.50   11.00   16.00   19.45   25.00 1000.00

6. FTicket

6a. Minimum number

min_FTicket <- min(bd$'F.Ticket', na.rm = TRUE)
print(paste("The minimum value in the F. Ticket column is:", min_FTicket))
## [1] "The minimum value in the F. Ticket column is: 1"

6b. Maximum number

max_FTicket <- max(bd$'F.Ticket', na.rm = TRUE)
print(paste("The maximum value in the F. Ticket column is:", max_FTicket))
## [1] "The maximum value in the F. Ticket column is: 450040"

6c. Mean

mean_FTicket <- mean(bd$'F.Ticket', na.rm = TRUE)
print(paste("The mean value in the F. Ticket column is:", mean_FTicket))
## [1] "The mean value in the F. Ticket column is: 193994.385140861"

7. For each of the stores find:

7a. The product that sells the most in units.

product_most <- bd[which.max(bd$Unidades), "Producto"]
abs_units <- abs(bd[which.max(bd$Unidades), "Unidades"])

product_str <- paste("The product that sells the most in units is:", product_most, "with", abs_units, "units.")
cat(product_str)
## The product that sells the most in units is: TECATE LIGHT LATA 340ML with 96 units.

7b. The product sold with the lowest unit price.

product_lowest <- bd[which.min(bd$Precio), "Producto"]
abs_units2 <- abs(bd[which.min(bd$Precio), "Precio"])

product_str2 <- paste("The product sold with the lowest unit price is:", product_lowest, "with", abs_units2, "unit price.")
cat(product_str2)
## The product sold with the lowest unit price is: KLEEN BEBE ABSOR-SEC  JUMBO 40PZ with 147 unit price.

7c. The product sold with the highest unit price.

product_highest <- bd[which.max(bd$Precio), "Producto"]
abs_units3 <- abs(bd[which.max(bd$Precio), "Precio"])

product_str3 <- paste("The product sold with the highest unit price is:", product_highest, "with", abs_units3, "unit price.")
cat(product_str3)
## The product sold with the highest unit price is: Whisky Buchanan´s 1L with 1000 unit price.

8. Make a table showing the monthly sales value in $ for each store.

sales_monthly <- bd %>%
  mutate(Sales_Value = Precio * Unidades) %>%
  group_by(CveTienda, format(Fecha, "%Y-%m")) %>%
  summarize(Monthly_Sales_Value = sum(Sales_Value))
## `summarise()` has grouped output by 'CveTienda'. You can override using the
## `.groups` argument.
print(sales_monthly)
## # A tibble: 33 × 3
## # Groups:   CveTienda [5]
##    CveTienda `format(Fecha, "%Y-%m")` Monthly_Sales_Value
##    <chr>     <chr>                                  <dbl>
##  1 MX001     2020-05                              818611 
##  2 MX001     2020-06                              436442 
##  3 MX001     2020-07                              292452.
##  4 MX001     2020-08                              301406 
##  5 MX001     2020-09                              206530 
##  6 MX001     2020-10                              164850.
##  7 MX001     2020-11                               66046 
##  8 MX002     2020-05                                9360 
##  9 MX002     2020-06                               50062.
## 10 MX002     2020-07                               29708.
## # … with 23 more rows

9. Make an annex to your report that includes graphs and other tools that make it easier to visualize your findings.

9a. Graph 1

library(ggplot2)
library(dplyr)
library(scales)

sales_monthly <- bd %>%
  mutate(Sales_Value = Precio * Unidades) %>%
  group_by(CveTienda, format(Fecha, "%Y-%m")) %>%
  summarize(Monthly_Sales_Value = sum(Sales_Value), Fecha = unique(Fecha), .groups = "drop")
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(sales_monthly, aes(x = format(Fecha, "%Y-%m"), y = Monthly_Sales_Value/10000, color = CveTienda, group = CveTienda)) +
  geom_line() +
  scale_x_discrete(name = "Month", labels = function(x) substr(x, 6, 7)) +
  scale_y_continuous(name = "Monthly Sales Value (in K)", labels = label_number(suffix = "K", accuracy = 1)) +
  labs(title = "Monthly Sales by Store", color = "Store Code") +
  theme_minimal()

9b. Graph 2

library(scales)

ggplot(sales_monthly, aes(x = format(Fecha, "%Y-%m"), y = Monthly_Sales_Value/1000)) +
  geom_col() +
  scale_x_discrete(name = "Month", labels = function(x) substr(x, 6, 7)) +
  scale_y_continuous(name = "Monthly Sales Value (in 1000s)", labels = comma_format()) +
  labs(title = "Monthly Sales by Month") +
  theme_minimal()

#ggplot(sales_monthly, aes(x = format(Fecha, "%Y-%m"), y = Monthly_Sales_Value)) +
  #geom_col() +
  #scale_x_discrete(name = "Month", labels = function(x) substr(x, 6, 7)) +
  #scale_y_continuous(name = "Monthly Sales Value") +
  #labs(title = "Monthly Sales by Month") +
  #theme_minimal()