library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.2.3
library(readxl)
library(tidyr)
library(ggplot2)
#1. Export the database to R
df <- read_excel("Abarrotes-Ventas-1.xlsx")
#2. In the database, only the units sold and the unit price appear for each transaction, but the amount corresponding to each transaction does not appear, that is, the multiplication of the price by the units sold.

#A) After the "Unidades" column, add another one called "Monto Total" that indicates the result of that operation (Price X Units).

df <- df %>%
  mutate('Monto_Total' = Precio * Unidades) %>%
  relocate('Monto_Total', .after = 'Unidades')
#3. The Barcode variable must be standardized to 13 digits:

#a) How many records do not meet that parameter?
n <- sum(nchar(df$`Codigo Barras`) != 13)
cat("There are", format(n, scientific = FALSE), "barcodes with less or more than 13 digits.")
## There are 42486 barcodes with less or more than 13 digits.
#b) What do you suggest to correct these erroneous data?
print("The erroneous data corresponds to 21% of the total data, therefore deleting all of these observations would mean losing valuable information. And since barcodes are only a lable, meaning that even though they are numbers no mathematical calculations can be done with them, we optted with adding 00 to the beggining of the numbers to uniform them into 13 digits.")
## [1] "The erroneous data corresponds to 21% of the total data, therefore deleting all of these observations would mean losing valuable information. And since barcodes are only a lable, meaning that even though they are numbers no mathematical calculations can be done with them, we optted with adding 00 to the beggining of the numbers to uniform them into 13 digits."
df$`Codigo Barras` <- as.character(df$`Codigo Barras`)
df$`Codigo Barras` <- sprintf("%013s", df$`Codigo Barras`)
"Price look-up codes, commonly called PLU codes, PLU numbers, PLUs, produce codes,  or produce labels, are a system of numbers that uniquely identify bulk produce sold in grocery stores and supermarkets. The codes have been in use since 1990, and over 1400 have been assigned.The codes are administered by the International Federation for Produce Standards (IFPS), a global coalition of fruit and vegetable associations that was formed in 2001 to introduce PLU numbers globally"
## [1] "Price look-up codes, commonly called PLU codes, PLU numbers, PLUs, produce codes,  or produce labels, are a system of numbers that uniquely identify bulk produce sold in grocery stores and supermarkets. The codes have been in use since 1990, and over 1400 have been assigned.The codes are administered by the International Federation for Produce Standards (IFPS), a global coalition of fruit and vegetable associations that was formed in 2001 to introduce PLU numbers globally"
"a) What percentage of records have a PLU and how many do not?"
## [1] "a) What percentage of records have a PLU and how many do not?"
PluDF <- sum(is.na(df$PLU))
print(PluDF)
## [1] 199183
(PluDF/200620)*100
## [1] 99.28372
PLUSNUM <- sum(!is.na(df$PLU))
print(PLUSNUM)
## [1] 1437
(PLUSNUM/200620)*100
## [1] 0.7162795
print("Based on the results of some quick calculations we can see that the amount of items that don't have PLUs is 99.29% of the dataframe. The one that do have a PLU are few and in total are 1437 or only 0.71% of the whole dataframe")  
## [1] "Based on the results of some quick calculations we can see that the amount of items that don't have PLUs is 99.29% of the dataframe. The one that do have a PLU are few and in total are 1437 or only 0.71% of the whole dataframe"
"b) What do you suggest doing about this variable?"
## [1] "b) What do you suggest doing about this variable?"
print("Since 99% of the products don't have a PLU we can assume that it is not as important as other variables in the database. Also since we have the bar code we can easly look up products based on the bar codes. However if the PLU's are important for our analysis then I think creating a specific code for products to group by would be requiered")
## [1] "Since 99% of the products don't have a PLU we can assume that it is not as important as other variables in the database. Also since we have the bar code we can easly look up products based on the bar codes. However if the PLU's are important for our analysis then I think creating a specific code for products to group by would be requiered"
"In the database some prices appear with negative values, which is obviously a mistake"
## [1] "In the database some prices appear with negative values, which is obviously a mistake"
"a) How many records have a negative price?"
## [1] "a) How many records have a negative price?"
NegativeNum <- sum(df$Precio <0)
print(NegativeNum)
## [1] 147
"b) Correct those negative values indicating the criteria used to do so."
## [1] "b) Correct those negative values indicating the criteria used to do so."
PrecioPositivo <- abs(NegativeNum)
print(PrecioPositivo)
## [1] 147
DfPositivo <- abs(df$Precio)
print("To correct the negative values in the collum of Precio, I used the abs function or the Absolute function which converts them to positive. I also did the abs function on the whole dataframe collum of Precio which is the same but just so it can be saved on the environmet")
## [1] "To correct the negative values in the collum of Precio, I used the abs function or the Absolute function which converts them to positive. I also did the abs function on the whole dataframe collum of Precio which is the same but just so it can be saved on the environmet"
#6. The "FTicket" variable indicates the folio number of each transaction. As you can see, the "FTicket" of each store starts with the number 1 and contains the products purchased in each transaction. For each store find

#a) The minimum number of items purchased perticket.
minimos <- df %>%
  group_by(CveTienda, F.Ticket) %>%
  summarize(num_articulos = sum(Unidades)) %>%
  group_by(CveTienda) %>%
  summarize(min_articulos_ticket = min(num_articulos))
## `summarise()` has grouped output by 'CveTienda'. You can override using the
## `.groups` argument.
#b) The maximum number of items purchased perticket.
maximos <- df %>%
  group_by(CveTienda, F.Ticket) %>%
  summarize(num_articulos = sum(Unidades)) %>%
  group_by(CveTienda) %>%
  summarize(max_articulos_ticket = max(num_articulos))
## `summarise()` has grouped output by 'CveTienda'. You can override using the
## `.groups` argument.
#c) The average number of items purchased perticket.
medios <- df %>%
  group_by(CveTienda, F.Ticket) %>%
  summarize(num_articulos = sum(Unidades)) %>%
  group_by(CveTienda) %>%
  summarize(media_articulos_ticket = mean(num_articulos))
## `summarise()` has grouped output by 'CveTienda'. You can override using the
## `.groups` argument.
# Unir los resultados en un solo objeto
resultado <- minimos %>%
  left_join(maximos, by = "CveTienda") %>%
  left_join(medios, by = "CveTienda")
resultado
## # A tibble: 5 × 4
##   CveTienda min_articulos_ticket max_articulos_ticket media_articulos_ticket
##   <chr>                    <dbl>                <dbl>                  <dbl>
## 1 MX001                        1                  104                   2.15
## 2 MX002                        1                    9                   1.45
## 3 MX003                        1                   24                   2.14
## 4 MX004                        1                   96                   1.94
## 5 MX005                        1                   60                   2.28
#7. For each of the stores find:
#a) The product that sells the most in units.
mas_vendido <- df %>% 
  group_by(CveTienda) %>% 
  summarize(ProductoMasVendido = Producto[which.max(Unidades)])
#b) The product sold with the lowest unit price.
precio_bajo <- df %>% 
  group_by(CveTienda) %>% 
  summarize(ProductoPrecioBajo = Producto[which.min(Precio)])
#c) The product sold with the highest unit price.
precio_alto <- df %>% 
  group_by(CveTienda) %>% 
  summarize(ProductoPrecioAlto = Producto[which.max(Precio)])
# Ver resultados
resultado2 <- left_join(left_join(mas_vendido, precio_bajo, by = "CveTienda"), precio_alto, by = "CveTienda")
resultado2
## # A tibble: 5 × 4
##   CveTienda ProductoMasVendido          ProductoPrecioBajo               Produ…¹
##   <chr>     <chr>                       <chr>                            <chr>  
## 1 MX001     Cerveza Tecate Light 340Ml  KLEEN BEBE ABSOR-SEC  JUMBO 40PZ Whisky…
## 2 MX002     Caribe Cooler Durazno 300ml Trident Yerbabuena 4´s 5.2gr     NIDO K…
## 3 MX003     Coca Cola Regular 355Ml     Canels Sabor Menta 5g            Tocino…
## 4 MX004     TECATE LIGHT LATA 340ML     bubalo                           TEQUIL…
## 5 MX005     Cerveza Tecate Light 340Ml  Clorets Cajita 2Pz               Six Te…
## # … with abbreviated variable name ¹​ProductoPrecioAlto
ggplot(df, aes(x=(CveTienda), group=("Monto_Total"), fill=("Monto_Total"))) + geom_bar() 

##Make a table showing the monthly sales value in $ for each store
sales <- df
  sales$date <- as.Date("Fecha", format="%m/%d/%Y")
  sales_by_store_month <- aggregate(Monto_Total ~ format(Fecha, "%Y-%m") + CveTienda, data = sales, sum)
sales_table <- pivot_wider(sales_by_store_month, names_from = CveTienda, values_from = Monto_Total)
sales_table
## # A tibble: 7 × 6
##   `format(Fecha, "%Y-%m")`   MX001  MX002  MX003   MX004   MX005
##   <chr>                      <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
## 1 2020-05                  818611   9360   2758  372940.     NA 
## 2 2020-06                  436442  50062.  4630  318352. 174768 
## 3 2020-07                  292452. 29708. 12029. 338807  136018.
## 4 2020-08                  301406  20906  19275. 328022  149851 
## 5 2020-09                  206530  21505  20140. 291469.  55854.
## 6 2020-10                  164850.  8268. 15560  139950.  37182.
## 7 2020-11                   66046   2820   5324.     NA   10814.
df$Fecha <- format(as.Date(df$Fecha, format="%Y-%m-%d"), "%m")

ggplot(df, aes(x=Fecha, group=(CveTienda), fill=(CveTienda))) + geom_bar() + xlab("Mes") + ylab("Ventas")