Inicializamos importando la biblioteca tidyverse

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(skimr)

Cargamos los datos utilizados en la sesión anterior

clean <- read_csv("results/1_wrangling_consolidated.csv")
## Rows: 108643 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (6): order_id, product_id, seller_id, product_category_name, customer_...
## dbl  (10): order_item_id, price, freight_value, product_name_lenght, product...
## dttm  (6): shipping_limit_date, order_purchase_timestamp, order_approved_at,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Veamos que existen las siguientes variables:

glimpse(clean)
## Rows: 108,643
## Columns: 22
## $ order_id                      <chr> "00010242fe8c5a6d1ba2dd792cb16214", "000…
## $ order_item_id                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ product_id                    <chr> "4244733e06e7ecb4970a6e2683c13e61", "e5f…
## $ seller_id                     <chr> "48436dade18ac8b2bce089ec2a041202", "dd7…
## $ shipping_limit_date           <dttm> 2017-09-19 09:45:35, 2017-05-03 11:05:1…
## $ price                         <dbl> 58.90, 239.90, 199.00, 12.99, 199.90, 21…
## $ freight_value                 <dbl> 13.29, 19.93, 17.87, 12.79, 18.14, 12.69…
## $ product_category_name         <chr> "cool_stuff", "pet_shop", "moveis_decora…
## $ product_name_lenght           <dbl> 58, 56, 59, 42, 59, 36, 52, 39, 59, 52, …
## $ product_description_lenght    <dbl> 598, 239, 695, 480, 409, 558, 815, 1310,…
## $ product_photos_qty            <dbl> 4, 2, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2…
## $ product_weight_g              <dbl> 650, 30000, 3050, 200, 3750, 450, 200, 1…
## $ product_length_cm             <dbl> 28, 50, 33, 16, 35, 24, 27, 35, 30, 29, …
## $ product_height_cm             <dbl> 9, 30, 13, 10, 40, 8, 5, 75, 12, 3, 25, …
## $ product_width_cm              <dbl> 14, 40, 33, 15, 30, 15, 20, 45, 16, 21, …
## $ customer_id                   <chr> "3ce436f183e68e07877b285a838db11a", "f6d…
## $ order_status                  <chr> "delivered", "delivered", "delivered", "…
## $ order_purchase_timestamp      <dttm> 2017-09-13 08:59:02, 2017-04-26 10:53:0…
## $ order_approved_at             <dttm> 2017-09-13 09:45:35, 2017-04-26 11:05:1…
## $ order_delivered_carrier_date  <dttm> 2017-09-19 18:34:16, 2017-05-04 14:35:0…
## $ order_delivered_customer_date <dttm> 2017-09-20 23:43:48, 2017-05-12 16:04:2…
## $ order_estimated_delivery_date <dttm> 2017-09-29, 2017-05-15, 2018-02-05, 201…

Análisis univariado

Identificadores

ids <- clean %>%
  select(order_id, order_item_id, product_id, seller_id) 

Tras seleccionar estos hagamos un análisis de la frecuencia en cada una de estas columnas,

ids %>%
  group_by(order_id) %>%
  count() %>%
  filter(n > 1)
## # A tibble: 9,492 × 2
## # Groups:   order_id [9,492]
##    order_id                             n
##    <chr>                            <int>
##  1 0008288aa423d2a3f00fcb17cd7d8719     2
##  2 00143d0f86d6fbd9f9b38ab440ac16f5     3
##  3 001ab0a7578dd66cd4b0a71f5b6e1e41     3
##  4 001d8f0e34a38c37f7dba2a37d4eba8b     2
##  5 002c9def9c9b951b1bec6d50753c9891     2
##  6 002f98c0f7efd42638ed6100ca699b42     2
##  7 003324c70b19a16798817b2b3640e721     2
##  8 00337fe25a3780b3424d9ad7c5a4b35e     2
##  9 003822434f91204da0a51fe4cf2aba18     2
## 10 003f201cdd39cdd59b6447cff2195456     2
## # ℹ 9,482 more rows

Este análisis nos hace ver que los identificadores de hecho son únicos ya que si existiera alguna repetición, veríamos alguna fila después de aplicar el pipeline de dplyr. Replicando este mismo análisis para los otros identificadores

Contemos el total de order_item_id distintos

unique(ids$order_item_id)
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21

Ahora contaremos la frecuencia de cada uno de estos números

oi_id <- ids %>%
  group_by(order_item_id) %>%
  count() %>%
  # filter(n > 1) %>%
  arrange(-n)
head(oi_id)
## # A tibble: 6 × 2
## # Groups:   order_item_id [6]
##   order_item_id     n
##           <dbl> <int>
## 1             1 95102
## 2             2  9505
## 3             3  2214
## 4             4   927
## 5             5   436
## 6             6   247

Graficando,

ggplot(oi_id, aes(order_item_id, n))+
  scale_y_log10()+
  geom_point()

Utilizamos un eje logarítmico para hacer notar la profunda diferencia de frecuencias entre los diferentes identificadores de orden.

Product id

length(unique(ids$product_id))
## [1] 31625

De aquí obtenemos a

p_id <- ids %>%
  group_by(product_id) %>%
  count() %>%
  filter(n > 1) %>% 
  arrange(-n)
head(p_id)
## # A tibble: 6 × 2
## # Groups:   product_id [6]
##   product_id                           n
##   <chr>                            <int>
## 1 aca2eb7d00ea1a7b8ebd4e68314663af   520
## 2 422879e10f46682990de24d770e7f83d   484
## 3 99a4788cb24856965c36a24e339b6058   477
## 4 389d119b48cf3043d311335e499d9c6b   390
## 5 368c6c730842d78016ad823897a372db   388
## 6 53759a2ecddad2bb87a079a1f1519f73   373

Graficando

ggplot(p_id, aes(n))+
  geom_histogram(binwidth = 5)

s_id <- ids %>%
  group_by(seller_id) %>%
  count() %>%
  filter(n > 1) %>%
  arrange(-n)
head(s_id)
## # A tibble: 6 × 2
## # Groups:   seller_id [6]
##   seller_id                            n
##   <chr>                            <int>
## 1 6560211a19b47992c3666cc44a7e94c0  1996
## 2 4a3ca9315b744ce9f8e9374361493884  1949
## 3 1f50f920176fa81dab994f9023523100  1926
## 4 cc419e0650a3c5ba77189a1882b7556a  1719
## 5 da8622b14eb17ae2831f4ac5b9dab84a  1548
## 6 955fee9216a65b617aa5c0531780ce60  1472
ggplot(s_id, aes(n))+
  geom_histogram(binwidth = 5)

skim(clean)
Data summary
Name clean
Number of rows 108643
Number of columns 22
_______________________
Column type frequency:
character 6
numeric 10
POSIXct 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
order_id 0 1 32 32 0 95129 0
product_id 0 1 32 32 0 31625 0
seller_id 0 1 32 32 0 2914 0
product_category_name 0 1 3 46 0 73 0
customer_id 0 1 32 32 0 95129 0
order_status 0 1 8 9 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
order_item_id 0 1 1.20 0.71 1.00 1.00 1.00 1.00 21.00 ▇▁▁▁▁
price 0 1 120.09 182.04 0.85 39.90 74.90 134.90 6735.00 ▇▁▁▁▁
freight_value 0 1 19.98 15.73 0.00 13.08 16.29 21.16 409.68 ▇▁▁▁▁
product_name_lenght 0 1 48.81 10.01 5.00 42.00 52.00 57.00 76.00 ▁▁▃▇▁
product_description_lenght 0 1 787.32 651.35 4.00 348.00 603.00 987.00 3992.00 ▇▃▁▁▁
product_photos_qty 0 1 2.21 1.72 1.00 1.00 1.00 3.00 20.00 ▇▁▁▁▁
product_weight_g 0 1 2095.56 3744.02 0.00 300.00 700.00 1800.00 40425.00 ▇▁▁▁▁
product_length_cm 0 1 30.20 16.16 7.00 18.00 25.00 38.00 105.00 ▇▅▂▁▁
product_height_cm 0 1 16.59 13.43 2.00 8.00 13.00 20.00 105.00 ▇▂▁▁▁
product_width_cm 0 1 23.03 11.70 6.00 15.00 20.00 30.00 118.00 ▇▃▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
shipping_limit_date 0 1 2016-09-19 23:11:33 2020-04-09 22:35:08 2018-01-29 13:51:02 90128
order_purchase_timestamp 0 1 2016-09-15 12:16:38 2018-08-29 15:00:37 2018-01-22 11:52:53 94625
order_approved_at 0 1 2016-09-15 12:16:38 2018-08-29 15:10:26 2018-01-22 19:38:22 87125
order_delivered_carrier_date 0 1 2016-10-08 10:34:01 2018-09-11 19:48:28 2018-01-25 15:13:21 78921
order_delivered_customer_date 0 1 2016-10-11 13:46:32 2018-10-17 13:22:46 2018-02-05 14:13:22 94341
order_estimated_delivery_date 0 1 2016-10-04 00:00:00 2018-10-25 00:00:00 2018-02-19 00:00:00 445
clean %>%
  count(product_category_name, sort = TRUE) %>%
  head(20)
## # A tibble: 20 × 2
##    product_category_name           n
##    <chr>                       <int>
##  1 cama_mesa_banho             10952
##  2 beleza_saude                 9467
##  3 esporte_lazer                8429
##  4 moveis_decoracao             8156
##  5 informatica_acessorios       7643
##  6 utilidades_domesticas        6795
##  7 relogios_presentes           5857
##  8 telefonia                    4428
##  9 ferramentas_jardim           4267
## 10 automotivo                   4139
## 11 brinquedos                   4030
## 12 cool_stuff                   3715
## 13 perfumaria                   3342
## 14 bebes                        2981
## 15 eletronicos                  2729
## 16 papelaria                    2466
## 17 fashion_bolsas_e_acessorios  1986
## 18 pet_shop                     1924
## 19 moveis_escritorio            1668
## 20 consoles_games               1089
clean  |> 
  count(product_category_name, sort = TRUE) |> 
  slice_head(n = 20) |> 
  mutate(
    product_category_name = reorder(product_category_name, n)
  ) |> 
  ggplot(aes(x = product_category_name, y = n)) +
    geom_col() +
    coord_flip() +
    labs(
    title = "20 categorías de producto más frecuentes",
    x = "Categoría de producto",
    y = "Frecuencia"
    ) +
    theme_minimal()