Inicializamos importando la biblioteca tidyverse
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(skimr)
Cargamos los datos utilizados en la sesión anterior
clean <- read_csv("results/1_wrangling_consolidated.csv")
## Rows: 108643 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): order_id, product_id, seller_id, product_category_name, customer_...
## dbl (10): order_item_id, price, freight_value, product_name_lenght, product...
## dttm (6): shipping_limit_date, order_purchase_timestamp, order_approved_at,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Veamos que existen las siguientes variables:
glimpse(clean)
## Rows: 108,643
## Columns: 22
## $ order_id <chr> "00010242fe8c5a6d1ba2dd792cb16214", "000…
## $ order_item_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ product_id <chr> "4244733e06e7ecb4970a6e2683c13e61", "e5f…
## $ seller_id <chr> "48436dade18ac8b2bce089ec2a041202", "dd7…
## $ shipping_limit_date <dttm> 2017-09-19 09:45:35, 2017-05-03 11:05:1…
## $ price <dbl> 58.90, 239.90, 199.00, 12.99, 199.90, 21…
## $ freight_value <dbl> 13.29, 19.93, 17.87, 12.79, 18.14, 12.69…
## $ product_category_name <chr> "cool_stuff", "pet_shop", "moveis_decora…
## $ product_name_lenght <dbl> 58, 56, 59, 42, 59, 36, 52, 39, 59, 52, …
## $ product_description_lenght <dbl> 598, 239, 695, 480, 409, 558, 815, 1310,…
## $ product_photos_qty <dbl> 4, 2, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2…
## $ product_weight_g <dbl> 650, 30000, 3050, 200, 3750, 450, 200, 1…
## $ product_length_cm <dbl> 28, 50, 33, 16, 35, 24, 27, 35, 30, 29, …
## $ product_height_cm <dbl> 9, 30, 13, 10, 40, 8, 5, 75, 12, 3, 25, …
## $ product_width_cm <dbl> 14, 40, 33, 15, 30, 15, 20, 45, 16, 21, …
## $ customer_id <chr> "3ce436f183e68e07877b285a838db11a", "f6d…
## $ order_status <chr> "delivered", "delivered", "delivered", "…
## $ order_purchase_timestamp <dttm> 2017-09-13 08:59:02, 2017-04-26 10:53:0…
## $ order_approved_at <dttm> 2017-09-13 09:45:35, 2017-04-26 11:05:1…
## $ order_delivered_carrier_date <dttm> 2017-09-19 18:34:16, 2017-05-04 14:35:0…
## $ order_delivered_customer_date <dttm> 2017-09-20 23:43:48, 2017-05-12 16:04:2…
## $ order_estimated_delivery_date <dttm> 2017-09-29, 2017-05-15, 2018-02-05, 201…
ids <- clean %>%
select(order_id, order_item_id, product_id, seller_id)
Tras seleccionar estos hagamos un análisis de la frecuencia en cada una de estas columnas,
ids %>%
group_by(order_id) %>%
count() %>%
filter(n > 1)
## # A tibble: 9,492 × 2
## # Groups: order_id [9,492]
## order_id n
## <chr> <int>
## 1 0008288aa423d2a3f00fcb17cd7d8719 2
## 2 00143d0f86d6fbd9f9b38ab440ac16f5 3
## 3 001ab0a7578dd66cd4b0a71f5b6e1e41 3
## 4 001d8f0e34a38c37f7dba2a37d4eba8b 2
## 5 002c9def9c9b951b1bec6d50753c9891 2
## 6 002f98c0f7efd42638ed6100ca699b42 2
## 7 003324c70b19a16798817b2b3640e721 2
## 8 00337fe25a3780b3424d9ad7c5a4b35e 2
## 9 003822434f91204da0a51fe4cf2aba18 2
## 10 003f201cdd39cdd59b6447cff2195456 2
## # ℹ 9,482 more rows
Este análisis nos hace ver que los identificadores de hecho son únicos ya que si existiera alguna repetición, veríamos alguna fila después de aplicar el pipeline de dplyr. Replicando este mismo análisis para los otros identificadores
Contemos el total de order_item_id distintos
unique(ids$order_item_id)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
Ahora contaremos la frecuencia de cada uno de estos números
oi_id <- ids %>%
group_by(order_item_id) %>%
count() %>%
# filter(n > 1) %>%
arrange(-n)
head(oi_id)
## # A tibble: 6 × 2
## # Groups: order_item_id [6]
## order_item_id n
## <dbl> <int>
## 1 1 95102
## 2 2 9505
## 3 3 2214
## 4 4 927
## 5 5 436
## 6 6 247
Graficando,
ggplot(oi_id, aes(order_item_id, n))+
scale_y_log10()+
geom_point()
Utilizamos un eje logarítmico para hacer notar la profunda diferencia de
frecuencias entre los diferentes identificadores de orden.
length(unique(ids$product_id))
## [1] 31625
De aquí obtenemos a
p_id <- ids %>%
group_by(product_id) %>%
count() %>%
filter(n > 1) %>%
arrange(-n)
head(p_id)
## # A tibble: 6 × 2
## # Groups: product_id [6]
## product_id n
## <chr> <int>
## 1 aca2eb7d00ea1a7b8ebd4e68314663af 520
## 2 422879e10f46682990de24d770e7f83d 484
## 3 99a4788cb24856965c36a24e339b6058 477
## 4 389d119b48cf3043d311335e499d9c6b 390
## 5 368c6c730842d78016ad823897a372db 388
## 6 53759a2ecddad2bb87a079a1f1519f73 373
Graficando
ggplot(p_id, aes(n))+
geom_histogram(binwidth = 5)
s_id <- ids %>%
group_by(seller_id) %>%
count() %>%
filter(n > 1) %>%
arrange(-n)
head(s_id)
## # A tibble: 6 × 2
## # Groups: seller_id [6]
## seller_id n
## <chr> <int>
## 1 6560211a19b47992c3666cc44a7e94c0 1996
## 2 4a3ca9315b744ce9f8e9374361493884 1949
## 3 1f50f920176fa81dab994f9023523100 1926
## 4 cc419e0650a3c5ba77189a1882b7556a 1719
## 5 da8622b14eb17ae2831f4ac5b9dab84a 1548
## 6 955fee9216a65b617aa5c0531780ce60 1472
ggplot(s_id, aes(n))+
geom_histogram(binwidth = 5)
skim(clean)
| Name | clean |
| Number of rows | 108643 |
| Number of columns | 22 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| numeric | 10 |
| POSIXct | 6 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| order_id | 0 | 1 | 32 | 32 | 0 | 95129 | 0 |
| product_id | 0 | 1 | 32 | 32 | 0 | 31625 | 0 |
| seller_id | 0 | 1 | 32 | 32 | 0 | 2914 | 0 |
| product_category_name | 0 | 1 | 3 | 46 | 0 | 73 | 0 |
| customer_id | 0 | 1 | 32 | 32 | 0 | 95129 | 0 |
| order_status | 0 | 1 | 8 | 9 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| order_item_id | 0 | 1 | 1.20 | 0.71 | 1.00 | 1.00 | 1.00 | 1.00 | 21.00 | ▇▁▁▁▁ |
| price | 0 | 1 | 120.09 | 182.04 | 0.85 | 39.90 | 74.90 | 134.90 | 6735.00 | ▇▁▁▁▁ |
| freight_value | 0 | 1 | 19.98 | 15.73 | 0.00 | 13.08 | 16.29 | 21.16 | 409.68 | ▇▁▁▁▁ |
| product_name_lenght | 0 | 1 | 48.81 | 10.01 | 5.00 | 42.00 | 52.00 | 57.00 | 76.00 | ▁▁▃▇▁ |
| product_description_lenght | 0 | 1 | 787.32 | 651.35 | 4.00 | 348.00 | 603.00 | 987.00 | 3992.00 | ▇▃▁▁▁ |
| product_photos_qty | 0 | 1 | 2.21 | 1.72 | 1.00 | 1.00 | 1.00 | 3.00 | 20.00 | ▇▁▁▁▁ |
| product_weight_g | 0 | 1 | 2095.56 | 3744.02 | 0.00 | 300.00 | 700.00 | 1800.00 | 40425.00 | ▇▁▁▁▁ |
| product_length_cm | 0 | 1 | 30.20 | 16.16 | 7.00 | 18.00 | 25.00 | 38.00 | 105.00 | ▇▅▂▁▁ |
| product_height_cm | 0 | 1 | 16.59 | 13.43 | 2.00 | 8.00 | 13.00 | 20.00 | 105.00 | ▇▂▁▁▁ |
| product_width_cm | 0 | 1 | 23.03 | 11.70 | 6.00 | 15.00 | 20.00 | 30.00 | 118.00 | ▇▃▁▁▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| shipping_limit_date | 0 | 1 | 2016-09-19 23:11:33 | 2020-04-09 22:35:08 | 2018-01-29 13:51:02 | 90128 |
| order_purchase_timestamp | 0 | 1 | 2016-09-15 12:16:38 | 2018-08-29 15:00:37 | 2018-01-22 11:52:53 | 94625 |
| order_approved_at | 0 | 1 | 2016-09-15 12:16:38 | 2018-08-29 15:10:26 | 2018-01-22 19:38:22 | 87125 |
| order_delivered_carrier_date | 0 | 1 | 2016-10-08 10:34:01 | 2018-09-11 19:48:28 | 2018-01-25 15:13:21 | 78921 |
| order_delivered_customer_date | 0 | 1 | 2016-10-11 13:46:32 | 2018-10-17 13:22:46 | 2018-02-05 14:13:22 | 94341 |
| order_estimated_delivery_date | 0 | 1 | 2016-10-04 00:00:00 | 2018-10-25 00:00:00 | 2018-02-19 00:00:00 | 445 |
clean %>%
count(product_category_name, sort = TRUE) %>%
head(20)
## # A tibble: 20 × 2
## product_category_name n
## <chr> <int>
## 1 cama_mesa_banho 10952
## 2 beleza_saude 9467
## 3 esporte_lazer 8429
## 4 moveis_decoracao 8156
## 5 informatica_acessorios 7643
## 6 utilidades_domesticas 6795
## 7 relogios_presentes 5857
## 8 telefonia 4428
## 9 ferramentas_jardim 4267
## 10 automotivo 4139
## 11 brinquedos 4030
## 12 cool_stuff 3715
## 13 perfumaria 3342
## 14 bebes 2981
## 15 eletronicos 2729
## 16 papelaria 2466
## 17 fashion_bolsas_e_acessorios 1986
## 18 pet_shop 1924
## 19 moveis_escritorio 1668
## 20 consoles_games 1089
clean |>
count(product_category_name, sort = TRUE) |>
slice_head(n = 20) |>
mutate(
product_category_name = reorder(product_category_name, n)
) |>
ggplot(aes(x = product_category_name, y = n)) +
geom_col() +
coord_flip() +
labs(
title = "20 categorías de producto más frecuentes",
x = "Categoría de producto",
y = "Frecuencia"
) +
theme_minimal()