:: Libraries

if(!require("tidyverse")){install.packages("tidyverse")} 
library(lubridate)
if(!require("janitor")){install.packages("janitor")}

:: Read

#unzipped
df_sellers <- read_csv('data/olist_sellers_dataset.csv')

#zipped
df_customers <- readr::read_csv('data/olist_customers_dataset.csv.zip')
df_geolocation = read_csv('data/olist_geolocation_dataset.csv.zip')
df_order_items <- read_csv('data/olist_order_items_dataset.csv.zip')
df_order_payments = read_csv('data/olist_order_payments_dataset.csv.zip')
df_order_reviews = read_csv('data/olist_order_reviews_dataset.csv.zip')
df_orders <- read_csv('data/olist_orders_dataset.csv.zip')
df_products <- read_csv('data/olist_products_dataset.csv.zip')

:: Join

df_olist <- df_orders %>% 
    left_join(df_order_items, by = "order_id") %>% 
    full_join(df_order_payments, by = "order_id") %>% 
    full_join(df_order_reviews, by = "order_id") %>% 
    full_join(df_products, by = "product_id") %>% 
    full_join(df_customers, by = "customer_id") %>% 
    full_join(df_sellers, by = "seller_id")

df_olist %>% 
  select(order_purchase_timestamp) %>%
  mutate( order_purchase_timestamp_2 = 
            lubridate::floor_date( order_purchase_timestamp, unit = "day")) %>% 
  mutate()

## # A tibble: 119,151 x 2
##    order_purchase_timestamp order_purchase_timestamp_2
##    <dttm>                   <dttm>                    
##  1 2017-10-02 10:56:33      2017-10-02 00:00:00       
##  2 2017-10-02 10:56:33      2017-10-02 00:00:00       
##  3 2017-10-02 10:56:33      2017-10-02 00:00:00       
##  4 2018-07-24 20:41:37      2018-07-24 00:00:00       
##  5 2018-08-08 08:38:49      2018-08-08 00:00:00       
##  6 2017-11-18 19:28:06      2017-11-18 00:00:00       
##  7 2018-02-13 21:18:39      2018-02-13 00:00:00       
##  8 2017-07-09 21:57:05      2017-07-09 00:00:00       
##  9 2017-04-11 12:22:08      2017-04-11 00:00:00       
## 10 2017-05-16 13:10:30      2017-05-16 00:00:00       
## # ... with 119,141 more rows

  glimpse

## function (x, width = NULL, ...) 
## {
##     UseMethod("glimpse")
## }
## <bytecode: 0x0000000014ea7790>
## <environment: namespace:tibble>

:: View

df_olist %>% glimpse

## Rows: 119,151
## Columns: 39
## $ order_id                      <chr> "e481f51cbdc54678b7cc49136f2d6af7", "...
## $ customer_id                   <chr> "9ef432eb6251297304e76186b10a928d", "...
## $ order_status                  <chr> "delivered", "delivered", "delivered"...
## $ order_purchase_timestamp      <dttm> 2017-10-02 10:56:33, 2017-10-02 10:5...
## $ order_approved_at             <dttm> 2017-10-02 11:07:15, 2017-10-02 11:0...
## $ order_delivered_carrier_date  <dttm> 2017-10-04 19:55:00, 2017-10-04 19:5...
## $ order_delivered_customer_date <dttm> 2017-10-10 21:25:13, 2017-10-10 21:2...
## $ order_estimated_delivery_date <dttm> 2017-10-18, 2017-10-18, 2017-10-18, ...
## $ order_item_id                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ product_id                    <chr> "87285b34884572647811a353c7ac498a", "...
## $ seller_id                     <chr> "3504c0cb71d7fa48d967e0e4c94d59d9", "...
## $ shipping_limit_date           <dttm> 2017-10-06 11:07:15, 2017-10-06 11:0...
## $ price                         <dbl> 29.99, 29.99, 29.99, 118.70, 159.90, ...
## $ freight_value                 <dbl> 8.72, 8.72, 8.72, 22.76, 19.22, 27.20...
## $ payment_sequential            <dbl> 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1...
## $ payment_type                  <chr> "credit_card", "voucher", "voucher", ...
## $ payment_installments          <dbl> 1, 1, 1, 1, 3, 1, 1, 6, 1, 3, 1, 1, 1...
## $ payment_value                 <dbl> 18.12, 2.00, 18.59, 141.46, 179.12, 7...
## $ review_id                     <chr> "a54f0611adc9ed256b57ede6b6eb5114", "...
## $ review_score                  <dbl> 4, 4, 4, 4, 5, 5, 5, 4, 2, 5, 1, 5, 5...
## $ review_comment_title          <chr> NA, NA, NA, "Muito boa a loja", NA, N...
## $ review_comment_message        <chr> "Não testei o produto ainda, mas ele ...
## $ review_creation_date          <dttm> 2017-10-11, 2017-10-11, 2017-10-11, ...
## $ review_answer_timestamp       <dttm> 2017-10-12 03:43:48, 2017-10-12 03:4...
## $ product_category_name         <chr> "utilidades_domesticas", "utilidades_...
## $ product_name_lenght           <dbl> 40, 40, 40, 29, 46, 59, 38, 49, NA, 5...
## $ product_description_lenght    <dbl> 268, 268, 268, 178, 232, 468, 316, 60...
## $ product_photos_qty            <dbl> 4, 4, 4, 1, 1, 3, 4, 1, NA, 1, 2, 1, ...
## $ product_weight_g              <dbl> 500, 500, 500, 400, 420, 450, 250, 71...
## $ product_length_cm             <dbl> 19, 19, 19, 19, 24, 30, 51, 65, 35, 1...
## $ product_height_cm             <dbl> 8, 8, 8, 13, 19, 10, 15, 10, 35, 16, ...
## $ product_width_cm              <dbl> 13, 13, 13, 19, 21, 20, 15, 65, 15, 1...
## $ customer_unique_id            <chr> "7c396fd4830fd04220f754e42b4e5bff", "...
## $ customer_zip_code_prefix      <chr> "03149", "03149", "03149", "47813", "...
## $ customer_city                 <chr> "sao paulo", "sao paulo", "sao paulo"...
## $ customer_state                <chr> "SP", "SP", "SP", "BA", "GO", "RN", "...
## $ seller_zip_code_prefix        <chr> "09350", "09350", "09350", "31570", "...
## $ seller_city                   <chr> "maua", "maua", "maua", "belo horizon...
## $ seller_state                  <chr> "SP", "SP", "SP", "SP", "SP", "MG", "...

:: Explore

Seguem algumas explorações feitas

ids

A base possui 119151 linhas e 39 colunas, com 0.8345797 de clientes únicos, e um total de 32952 diferentes produtos vendidos. Considerando que houveram 96096 customer_unique_id temos que a imensa maioria dos pedidos foram realizadas por clientes que compraram uma única vez. Esta informação pode ser utilizada tanto para uma investigação na base de dados, considerando a possibilidade que os customer_unique_id não estejam sendo gerados adequadamente, quanto como uma oportunidade de negócio para a fidelização de tais clientes.

df_olist %>% 
    summarise(
        n_linhas = n(),
        n_distinct(order_id),
        n_distinct(customer_id),
        n_distinct(review_id),
        n_distinct(customer_unique_id),   
        n_distinct(product_id),   
        ) %>% 
    pivot_longer(cols = everything())

## # A tibble: 6 x 2
##   name                            value
##   <chr>                           <int>
## 1 n_linhas                       119151
## 2 n_distinct(order_id)            99441
## 3 n_distinct(customer_id)         99441
## 4 n_distinct(review_id)           99173
## 5 n_distinct(customer_unique_id)  96096
## 6 n_distinct(product_id)          32952

order_estimated_delivery_date

df_olist %>%
  select(order_estimated_delivery_date, order_delivered_customer_date) %>% 
    mutate(estimated_vs_delivered_date = 
             (order_delivered_customer_date - order_estimated_delivery_date) %/% ddays(1)) %>% 
    pull(estimated_vs_delivered_date) %>% summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -146.00  -16.00  -12.00  -11.13   -6.00  188.00    3421

df_olist %>%
    mutate( estimated_vs_delivered_date = 
              (order_delivered_customer_date - order_estimated_delivery_date) %/% ddays(1)) %>%
    ggplot(aes(product_weight_g, estimated_vs_delivered_date)) +
    geom_point(alpha = 0.2) +
    coord_cartesian(xlim= c(NA, 30000))

## Warning: Removed 3441 rows containing missing values (geom_point).

order_purchase_timestamp

A base possui dados entre 2016-09-04 e 2018-10-17

df_olist %>%
    mutate(order_purchase_timestamp_safra = 
               lubridate::floor_date((order_purchase_timestamp), "month")) %>% 
    group_by(order_purchase_timestamp_safra) %>% 
    summarise(n = n_distinct(order_id))

## # A tibble: 25 x 2
##    order_purchase_timestamp_safra     n
##  * <dttm>                         <int>
##  1 2016-09-01 00:00:00                4
##  2 2016-10-01 00:00:00              324
##  3 2016-12-01 00:00:00                1
##  4 2017-01-01 00:00:00              800
##  5 2017-02-01 00:00:00             1780
##  6 2017-03-01 00:00:00             2682
##  7 2017-04-01 00:00:00             2404
##  8 2017-05-01 00:00:00             3700
##  9 2017-06-01 00:00:00             3245
## 10 2017-07-01 00:00:00             4026
## # ... with 15 more rows

Tendo uma ascendencia na quantidade de pedidos

df_olist %>%
    mutate(order_purchase_timestamp_safra = 
               lubridate::floor_date((order_purchase_timestamp), "month")) %>% 
    group_by(order_purchase_timestamp_safra) %>% 
    summarise(n = n_distinct(order_id)) %>% 
    ggplot(aes(order_purchase_timestamp_safra, n)) +
    geom_line()

Tal crescimento foi concentrado nos pedidos com avaliação positiva

df_olist %>%
    mutate(order_purchase_timestamp_safra = 
               lubridate::floor_date((order_purchase_timestamp), "month")) %>% 
    mutate(review_score = as_factor(review_score)) %>% 
    group_by(order_purchase_timestamp_safra, review_score) %>% 
    summarise(n = n()) %>% 
    mutate(order_purchase_timestamp_safra = as.Date(order_purchase_timestamp_safra)) %>%  
    ggplot(aes(order_purchase_timestamp_safra, n)) +
    geom_area(aes(fill = review_score), alpha = 0.5) +
    scale_fill_brewer(palette = "RdYlGn") +
    scale_x_date(date_breaks = "2 months", labels = scales::date_format("%b/%y"),
                 guide = guide_axis(n.dodge = 2))

## `summarise()` has grouped output by 'order_purchase_timestamp_safra'. You can override using the `.groups` argument.

order_purchase_timestamp

df_olist %>% 
    mutate(prazo_efetivo = interval(
        order_purchase_timestamp, 
        order_delivered_customer_date) %/% 
            ddays(1)) %>% 
    filter(order_status == 'delivered') %>% 
    ggplot(aes(x = log(price), y = log(prazo_efetivo), z = review_score)) +
        stat_summary_2d(fun = "mean")

## Warning: Removed 27 rows containing non-finite values (stat_summary2d).

order_vs_delivered_date

df_olist %>%
    mutate(review_score = as_factor(review_score)) %>% 
    mutate(order_vs_delivered_date = 
             interval(order_purchase_timestamp, order_delivered_customer_date) %/% ddays(1)) %>% 
    ggplot(aes("", order_vs_delivered_date, color = review_score, fill = review_score)) +
      geom_boxplot(alpha = 0.6) +
      scale_color_brewer(palette = "RdYlGn") +
      scale_fill_brewer(palette = "RdYlGn") +
      facet_grid( . ~ review_score , labeller = label_both) +
      theme(legend.position = 'none') +
      labs(x="")

## Warning: Removed 3421 rows containing non-finite values (stat_boxplot).

order_status

A grande maioria dos pedidos foram entregues

df_olist %>% 
    janitor::tabyl(order_status) %>% 
    arrange(-n) %>% 
    mutate(percent = scales::percent(percent))

##  order_status      n  percent
##     delivered 115731 97.1297%
##       shipped   1256  1.0541%
##      canceled    750  0.6295%
##   unavailable    652  0.5472%
##      invoiced    378  0.3172%
##    processing    376  0.3156%
##       created      5  0.0042%
##      approved      3  0.0025%

product_category_name

top 80%

df_olist %>% 
  janitor::tabyl(product_category_name) %>% 
  arrange(-n) %>% 
  mutate(percent_acum = cumsum(percent)) %>% 
  filter(percent_acum < 0.8) %>% 
  mutate(across(contains("percent"), ~ scales::percent(.x)))

##   product_category_name     n percent valid_percent percent_acum
##         cama_mesa_banho 11990 10.063%        10.28%        10.1%
##            beleza_saude 10033  8.420%         8.60%        18.5%
##           esporte_lazer  9005  7.558%         7.72%        26.0%
##        moveis_decoracao  8833  7.413%         7.57%        33.5%
##  informatica_acessorios  8151  6.841%         6.99%        40.3%
##   utilidades_domesticas  7380  6.194%         6.33%        46.5%
##      relogios_presentes  6213  5.214%         5.33%        51.7%
##               telefonia  4726  3.966%         4.05%        55.7%
##      ferramentas_jardim  4590  3.852%         3.94%        59.5%
##              automotivo  4400  3.693%         3.77%        63.2%
##              brinquedos  4281  3.593%         3.67%        66.8%
##              cool_stuff  3999  3.356%         3.43%        70.2%
##              perfumaria  3589  3.012%         3.08%        73.2%
##                   bebes  3204  2.689%         2.75%        75.9%
##             eletronicos  2847  2.389%         2.44%        78.3%

categorias vs.

df_olist %>%
  mutate(Persona = ifelse(review_score > 3, "bom", "ruim")) %>%
  select(product_category_name, Persona) %>% 
  #filtrando os produtos presentes em pelo menos 500 compras
    group_by(product_category_name) %>% 
    filter(n() > 500) %>% 
    ungroup() %>%
  #filtrando as linhas com NA
    filter(!is.na(product_category_name)) %>% 
  #exemplo de um filtro por nome da categoria
    #filter(stringr::str_detect(product_category_name, "construcao|casa")) %>% 
  #ordenando as categorias no gráfico e depois invertendo a ordem
    mutate(product_category_name = forcats::fct_infreq(product_category_name)) %>%
    mutate(product_category_name = forcats::fct_rev(product_category_name)) %>% 
  #gráfico
  ggplot(aes(x = product_category_name, fill = Persona)) +
    geom_bar(data = . %>% filter(Persona == "bom")) +
    geom_bar(data = . %>% filter(Persona == "ruim"), aes( y = ..count..*(-1) ) ) +
    scale_y_continuous(limits = c(-10000, 10000), breaks = seq(-10000, 10000, 2000), labels = abs) +
    coord_flip()

payment_type

concentração de tipos de pagamento

df_olist %>% 
    janitor::tabyl(payment_type) %>% 
    arrange(-n) %>% 
    mutate(percent = scales::percent(percent)) %>% 
    mutate(valid_percent = scales::percent(valid_percent))

##  payment_type     n percent valid_percent
##   credit_card 87784   73.7%         73.7%
##        boleto 23190   19.5%         19.5%
##       voucher  6465    5.4%          5.4%
##    debit_card  1706    1.4%          1.4%
##   not_defined     3    0.0%          0.0%
##          <NA>     3    0.0%          <NA>

df_olist %>% 
    mutate(payment_type = forcats::fct_infreq(payment_type),
           payment_type = forcats::fct_explicit_na(payment_type, na_level = "NA"),
           payment_type = forcats::fct_recode(payment_type, 
                              missing = "not_defined", missing = "NA")) %>% 
    ggplot(aes(payment_type, color = payment_type, fill = payment_type)) +
        geom_bar(alpha = 0.4) +
        theme(legend.position = 'none')

review_score

df_olist %>% 
    mutate(review_score = as_factor(review_score)) %>% 
    ggplot(aes(review_score, fill = review_score)) +
    geom_bar(alpha = 0.6) + 
    scale_fill_brewer(palette = "RdYlGn") +
    theme(legend.position = 'none')

review_comment_message

Identificação de pedidos que tiveram comentários

df_olist %>% 
    mutate(review_comment_message_str = str_length(review_comment_message)) %>% 
    mutate(review_comment_message_index = 
               ifelse(is.na(review_comment_message), "nao_comentou", "comentou")) %>% 
    tabyl(review_comment_message_index) %>% 
  mutate(percent= scales::percent(percent))

##  review_comment_message_index     n percent
##                      comentou 51242     43%
##                  nao_comentou 67909     57%

entre os pedidos com comentário a % de avaliações negativas é superior

df_olist %>% 
    mutate(review_comment_message_str = str_length(review_comment_message)) %>% 
    mutate(review_comment_message_index = 
               ifelse(is.na(review_comment_message), "nao_comentou", "comentou")) %>% 
    mutate(review_score = as_factor(review_score)) %>% 
    ggplot(aes(review_comment_message_index, fill = review_score)) +
        geom_bar(position = "fill", alpha = 0.6) + 
        scale_fill_brewer(palette = "RdYlGn") +
        coord_flip() +
        theme(legend.position = 'bottom') +
        guides(fill = guide_legend(reverse = TRUE))

price

A média dos preços é de 120.6491086, versus uma mediana de 74.9, indicando a presença de outliers

summary(df_olist$price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.85   39.90   74.90  120.65  134.90 6735.00     833

Adicionalmente temos que 50% dos pedidos estão concentrados nos valores em 50 e 150 reais

df_olist%>% 
  ggplot(aes(price)) +
  geom_boxplot(color = "darkblue", fill = "darkblue", alpha = 0.4) +
  coord_cartesian(xlim = c(NA, 500)) +
  scale_x_continuous(breaks = seq(0,500, 50))

## Warning: Removed 833 rows containing non-finite values (stat_boxplot).

avaliando por tipos de pagamento, os clientes que pagam com cartão de crédito possuem uma ticket superior

df_olist %>% 
  ggplot(aes(payment_type, price)) +
  geom_boxplot(alpha = 0.4) +
  coord_cartesian(ylim = c(NA, 300))

## Warning: Removed 833 rows containing non-finite values (stat_boxplot).

  scale_y_continuous(breaks = seq(0,500, 50))

## <ScaleContinuousPosition>
##  Range:  
##  Limits:    0 --    1

customer_state vs. seller_state

SP concentra o número de vendedores e clientes, avaliando segundo a proporção SP concentra ~30% das transaçõe

df_olist %>%
  group_by(customer_state, seller_state) %>% 
  count() %>%
  ungroup() %>% 
  arrange(-n) %>% 
  mutate(p = n/sum(n)) %>% 
  mutate(acumulado = cumsum(n)) %>% 
  mutate(acumulado_p = acumulado/sum(n)) %>% 
  ggplot(aes(customer_state, seller_state)) + 
    geom_tile(aes(fill = p)) + 
    theme_minimal()

considerando a distribuição de clientes para cada um dos estados vendedores, SP, RJ e MG possuem o maior destaque

df_olist %>%
  group_by(customer_state, seller_state) %>% 
  count() %>%
  arrange(-n) %>% 
  group_by(seller_state) %>% 
  mutate(p = n/sum(n)) %>% 
  ggplot(aes(customer_state, seller_state)) + 
    geom_tile(aes(fill = p)) + 
    scale_fill_distiller(palette = "YlGnBu", direction = 1) +
    theme_minimal()

voltando ao resultado geral, e excluindo SP, temos que além de MG e RJ, a região Sul possui destaque

df_olist %>%
  group_by(customer_state, seller_state) %>% 
  count() %>%
  ungroup() %>% 
  arrange(-n) %>% 
  mutate(p = n/sum(n)) %>% 
  mutate(acumulado = cumsum(n)) %>% 
  mutate(acumulado_p = acumulado/sum(n)) %>% 
  filter(customer_state != "SP", seller_state != "SP") %>% 
  ggplot(aes(customer_state, seller_state)) + 
    geom_tile(aes(fill = p)) + 
    scale_fill_distiller(palette = "YlGnBu", direction = 1) +
    theme_minimal()

dataset olist - EDA

:: Libraries

:: Read

:: Join

:: View

:: Explore

ids

order_estimated_delivery_date

order_purchase_timestamp

order_purchase_timestamp

order_vs_delivered_date

order_status

product_category_name

top 80%

categorias vs.

payment_type

review_score

review_comment_message

price

customer_state vs. seller_state

.