library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.2.3
library(readxl)
library(tidyr)
library(ggplot2)
#1. Export the database to R
df <- read_excel("Abarrotes-Ventas-1.xlsx")
#2. In the database, only the units sold and the unit price appear for each transaction, but the amount corresponding to each transaction does not appear, that is, the multiplication of the price by the units sold.
#A) After the "Unidades" column, add another one called "Monto Total" that indicates the result of that operation (Price X Units).
df <- df %>%
mutate('Monto_Total' = Precio * Unidades) %>%
relocate('Monto_Total', .after = 'Unidades')
#3. The Barcode variable must be standardized to 13 digits:
#a) How many records do not meet that parameter?
n <- sum(nchar(df$`Codigo Barras`) != 13)
cat("There are", format(n, scientific = FALSE), "barcodes with less or more than 13 digits.")
## There are 42486 barcodes with less or more than 13 digits.
#b) What do you suggest to correct these erroneous data?
print("The erroneous data corresponds to 21% of the total data, therefore deleting all of these observations would mean losing valuable information. And since barcodes are only a lable, meaning that even though they are numbers no mathematical calculations can be done with them, we optted with adding 00 to the beggining of the numbers to uniform them into 13 digits.")
## [1] "The erroneous data corresponds to 21% of the total data, therefore deleting all of these observations would mean losing valuable information. And since barcodes are only a lable, meaning that even though they are numbers no mathematical calculations can be done with them, we optted with adding 00 to the beggining of the numbers to uniform them into 13 digits."
df$`Codigo Barras` <- as.character(df$`Codigo Barras`)
df$`Codigo Barras` <- sprintf("%013s", df$`Codigo Barras`)
"Price look-up codes, commonly called PLU codes, PLU numbers, PLUs, produce codes, or produce labels, are a system of numbers that uniquely identify bulk produce sold in grocery stores and supermarkets. The codes have been in use since 1990, and over 1400 have been assigned.The codes are administered by the International Federation for Produce Standards (IFPS), a global coalition of fruit and vegetable associations that was formed in 2001 to introduce PLU numbers globally"
## [1] "Price look-up codes, commonly called PLU codes, PLU numbers, PLUs, produce codes, or produce labels, are a system of numbers that uniquely identify bulk produce sold in grocery stores and supermarkets. The codes have been in use since 1990, and over 1400 have been assigned.The codes are administered by the International Federation for Produce Standards (IFPS), a global coalition of fruit and vegetable associations that was formed in 2001 to introduce PLU numbers globally"
"a) What percentage of records have a PLU and how many do not?"
## [1] "a) What percentage of records have a PLU and how many do not?"
PluDF <- sum(is.na(df$PLU))
print(PluDF)
## [1] 199183
(PluDF/200620)*100
## [1] 99.28372
PLUSNUM <- sum(!is.na(df$PLU))
print(PLUSNUM)
## [1] 1437
(PLUSNUM/200620)*100
## [1] 0.7162795
print("Based on the results of some quick calculations we can see that the amount of items that don't have PLUs is 99.29% of the dataframe. The one that do have a PLU are few and in total are 1437 or only 0.71% of the whole dataframe")
## [1] "Based on the results of some quick calculations we can see that the amount of items that don't have PLUs is 99.29% of the dataframe. The one that do have a PLU are few and in total are 1437 or only 0.71% of the whole dataframe"
"b) What do you suggest doing about this variable?"
## [1] "b) What do you suggest doing about this variable?"
print("Since 99% of the products don't have a PLU we can assume that it is not as important as other variables in the database. Also since we have the bar code we can easly look up products based on the bar codes. However if the PLU's are important for our analysis then I think creating a specific code for products to group by would be requiered")
## [1] "Since 99% of the products don't have a PLU we can assume that it is not as important as other variables in the database. Also since we have the bar code we can easly look up products based on the bar codes. However if the PLU's are important for our analysis then I think creating a specific code for products to group by would be requiered"
"In the database some prices appear with negative values, which is obviously a mistake"
## [1] "In the database some prices appear with negative values, which is obviously a mistake"
"a) How many records have a negative price?"
## [1] "a) How many records have a negative price?"
NegativeNum <- sum(df$Precio <0)
print(NegativeNum)
## [1] 147
"b) Correct those negative values indicating the criteria used to do so."
## [1] "b) Correct those negative values indicating the criteria used to do so."
PrecioPositivo <- abs(NegativeNum)
print(PrecioPositivo)
## [1] 147
DfPositivo <- abs(df$Precio)
print("To correct the negative values in the collum of Precio, I used the abs function or the Absolute function which converts them to positive. I also did the abs function on the whole dataframe collum of Precio which is the same but just so it can be saved on the environmet")
## [1] "To correct the negative values in the collum of Precio, I used the abs function or the Absolute function which converts them to positive. I also did the abs function on the whole dataframe collum of Precio which is the same but just so it can be saved on the environmet"
#6. The "FTicket" variable indicates the folio number of each transaction. As you can see, the "FTicket" of each store starts with the number 1 and contains the products purchased in each transaction. For each store find
#a) The minimum number of items purchased perticket.
minimos <- df %>%
group_by(CveTienda, F.Ticket) %>%
summarize(num_articulos = sum(Unidades)) %>%
group_by(CveTienda) %>%
summarize(min_articulos_ticket = min(num_articulos))
## `summarise()` has grouped output by 'CveTienda'. You can override using the
## `.groups` argument.
#b) The maximum number of items purchased perticket.
maximos <- df %>%
group_by(CveTienda, F.Ticket) %>%
summarize(num_articulos = sum(Unidades)) %>%
group_by(CveTienda) %>%
summarize(max_articulos_ticket = max(num_articulos))
## `summarise()` has grouped output by 'CveTienda'. You can override using the
## `.groups` argument.
#c) The average number of items purchased perticket.
medios <- df %>%
group_by(CveTienda, F.Ticket) %>%
summarize(num_articulos = sum(Unidades)) %>%
group_by(CveTienda) %>%
summarize(media_articulos_ticket = mean(num_articulos))
## `summarise()` has grouped output by 'CveTienda'. You can override using the
## `.groups` argument.
# Unir los resultados en un solo objeto
resultado <- minimos %>%
left_join(maximos, by = "CveTienda") %>%
left_join(medios, by = "CveTienda")
resultado
## # A tibble: 5 × 4
## CveTienda min_articulos_ticket max_articulos_ticket media_articulos_ticket
## <chr> <dbl> <dbl> <dbl>
## 1 MX001 1 104 2.15
## 2 MX002 1 9 1.45
## 3 MX003 1 24 2.14
## 4 MX004 1 96 1.94
## 5 MX005 1 60 2.28
#7. For each of the stores find:
#a) The product that sells the most in units.
mas_vendido <- df %>%
group_by(CveTienda) %>%
summarize(ProductoMasVendido = Producto[which.max(Unidades)])
#b) The product sold with the lowest unit price.
precio_bajo <- df %>%
group_by(CveTienda) %>%
summarize(ProductoPrecioBajo = Producto[which.min(Precio)])
#c) The product sold with the highest unit price.
precio_alto <- df %>%
group_by(CveTienda) %>%
summarize(ProductoPrecioAlto = Producto[which.max(Precio)])
# Ver resultados
resultado2 <- left_join(left_join(mas_vendido, precio_bajo, by = "CveTienda"), precio_alto, by = "CveTienda")
resultado2
## # A tibble: 5 × 4
## CveTienda ProductoMasVendido ProductoPrecioBajo Produ…¹
## <chr> <chr> <chr> <chr>
## 1 MX001 Cerveza Tecate Light 340Ml KLEEN BEBE ABSOR-SEC JUMBO 40PZ Whisky…
## 2 MX002 Caribe Cooler Durazno 300ml Trident Yerbabuena 4´s 5.2gr NIDO K…
## 3 MX003 Coca Cola Regular 355Ml Canels Sabor Menta 5g Tocino…
## 4 MX004 TECATE LIGHT LATA 340ML bubalo TEQUIL…
## 5 MX005 Cerveza Tecate Light 340Ml Clorets Cajita 2Pz Six Te…
## # … with abbreviated variable name ¹ProductoPrecioAlto
ggplot(df, aes(x=(CveTienda), group=("Monto_Total"), fill=("Monto_Total"))) + geom_bar()

##Make a table showing the monthly sales value in $ for each store
sales <- df
sales$date <- as.Date("Fecha", format="%m/%d/%Y")
sales_by_store_month <- aggregate(Monto_Total ~ format(Fecha, "%Y-%m") + CveTienda, data = sales, sum)
sales_table <- pivot_wider(sales_by_store_month, names_from = CveTienda, values_from = Monto_Total)
sales_table
## # A tibble: 7 × 6
## `format(Fecha, "%Y-%m")` MX001 MX002 MX003 MX004 MX005
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-05 818611 9360 2758 372940. NA
## 2 2020-06 436442 50062. 4630 318352. 174768
## 3 2020-07 292452. 29708. 12029. 338807 136018.
## 4 2020-08 301406 20906 19275. 328022 149851
## 5 2020-09 206530 21505 20140. 291469. 55854.
## 6 2020-10 164850. 8268. 15560 139950. 37182.
## 7 2020-11 66046 2820 5324. NA 10814.
df$Fecha <- format(as.Date(df$Fecha, format="%Y-%m-%d"), "%m")
ggplot(df, aes(x=Fecha, group=(CveTienda), fill=(CveTienda))) + geom_bar() + xlab("Mes") + ylab("Ventas")
