if(!require("tidyverse")){install.packages("tidyverse")}
library(lubridate)
if(!require("janitor")){install.packages("janitor")}
#unzipped
df_sellers <- read_csv('data/olist_sellers_dataset.csv')
#zipped
df_customers <- readr::read_csv('data/olist_customers_dataset.csv.zip')
df_geolocation = read_csv('data/olist_geolocation_dataset.csv.zip')
df_order_items <- read_csv('data/olist_order_items_dataset.csv.zip')
df_order_payments = read_csv('data/olist_order_payments_dataset.csv.zip')
df_order_reviews = read_csv('data/olist_order_reviews_dataset.csv.zip')
df_orders <- read_csv('data/olist_orders_dataset.csv.zip')
df_products <- read_csv('data/olist_products_dataset.csv.zip')
df_olist <- df_orders %>%
left_join(df_order_items, by = "order_id") %>%
full_join(df_order_payments, by = "order_id") %>%
full_join(df_order_reviews, by = "order_id") %>%
full_join(df_products, by = "product_id") %>%
full_join(df_customers, by = "customer_id") %>%
full_join(df_sellers, by = "seller_id")
df_olist %>%
select(order_purchase_timestamp) %>%
mutate( order_purchase_timestamp_2 =
lubridate::floor_date( order_purchase_timestamp, unit = "day")) %>%
mutate()
## # A tibble: 119,151 x 2
## order_purchase_timestamp order_purchase_timestamp_2
## <dttm> <dttm>
## 1 2017-10-02 10:56:33 2017-10-02 00:00:00
## 2 2017-10-02 10:56:33 2017-10-02 00:00:00
## 3 2017-10-02 10:56:33 2017-10-02 00:00:00
## 4 2018-07-24 20:41:37 2018-07-24 00:00:00
## 5 2018-08-08 08:38:49 2018-08-08 00:00:00
## 6 2017-11-18 19:28:06 2017-11-18 00:00:00
## 7 2018-02-13 21:18:39 2018-02-13 00:00:00
## 8 2017-07-09 21:57:05 2017-07-09 00:00:00
## 9 2017-04-11 12:22:08 2017-04-11 00:00:00
## 10 2017-05-16 13:10:30 2017-05-16 00:00:00
## # ... with 119,141 more rows
glimpse
## function (x, width = NULL, ...)
## {
## UseMethod("glimpse")
## }
## <bytecode: 0x0000000014ea7790>
## <environment: namespace:tibble>
df_olist %>% glimpse
## Rows: 119,151
## Columns: 39
## $ order_id <chr> "e481f51cbdc54678b7cc49136f2d6af7", "...
## $ customer_id <chr> "9ef432eb6251297304e76186b10a928d", "...
## $ order_status <chr> "delivered", "delivered", "delivered"...
## $ order_purchase_timestamp <dttm> 2017-10-02 10:56:33, 2017-10-02 10:5...
## $ order_approved_at <dttm> 2017-10-02 11:07:15, 2017-10-02 11:0...
## $ order_delivered_carrier_date <dttm> 2017-10-04 19:55:00, 2017-10-04 19:5...
## $ order_delivered_customer_date <dttm> 2017-10-10 21:25:13, 2017-10-10 21:2...
## $ order_estimated_delivery_date <dttm> 2017-10-18, 2017-10-18, 2017-10-18, ...
## $ order_item_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ product_id <chr> "87285b34884572647811a353c7ac498a", "...
## $ seller_id <chr> "3504c0cb71d7fa48d967e0e4c94d59d9", "...
## $ shipping_limit_date <dttm> 2017-10-06 11:07:15, 2017-10-06 11:0...
## $ price <dbl> 29.99, 29.99, 29.99, 118.70, 159.90, ...
## $ freight_value <dbl> 8.72, 8.72, 8.72, 22.76, 19.22, 27.20...
## $ payment_sequential <dbl> 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1...
## $ payment_type <chr> "credit_card", "voucher", "voucher", ...
## $ payment_installments <dbl> 1, 1, 1, 1, 3, 1, 1, 6, 1, 3, 1, 1, 1...
## $ payment_value <dbl> 18.12, 2.00, 18.59, 141.46, 179.12, 7...
## $ review_id <chr> "a54f0611adc9ed256b57ede6b6eb5114", "...
## $ review_score <dbl> 4, 4, 4, 4, 5, 5, 5, 4, 2, 5, 1, 5, 5...
## $ review_comment_title <chr> NA, NA, NA, "Muito boa a loja", NA, N...
## $ review_comment_message <chr> "Não testei o produto ainda, mas ele ...
## $ review_creation_date <dttm> 2017-10-11, 2017-10-11, 2017-10-11, ...
## $ review_answer_timestamp <dttm> 2017-10-12 03:43:48, 2017-10-12 03:4...
## $ product_category_name <chr> "utilidades_domesticas", "utilidades_...
## $ product_name_lenght <dbl> 40, 40, 40, 29, 46, 59, 38, 49, NA, 5...
## $ product_description_lenght <dbl> 268, 268, 268, 178, 232, 468, 316, 60...
## $ product_photos_qty <dbl> 4, 4, 4, 1, 1, 3, 4, 1, NA, 1, 2, 1, ...
## $ product_weight_g <dbl> 500, 500, 500, 400, 420, 450, 250, 71...
## $ product_length_cm <dbl> 19, 19, 19, 19, 24, 30, 51, 65, 35, 1...
## $ product_height_cm <dbl> 8, 8, 8, 13, 19, 10, 15, 10, 35, 16, ...
## $ product_width_cm <dbl> 13, 13, 13, 19, 21, 20, 15, 65, 15, 1...
## $ customer_unique_id <chr> "7c396fd4830fd04220f754e42b4e5bff", "...
## $ customer_zip_code_prefix <chr> "03149", "03149", "03149", "47813", "...
## $ customer_city <chr> "sao paulo", "sao paulo", "sao paulo"...
## $ customer_state <chr> "SP", "SP", "SP", "BA", "GO", "RN", "...
## $ seller_zip_code_prefix <chr> "09350", "09350", "09350", "31570", "...
## $ seller_city <chr> "maua", "maua", "maua", "belo horizon...
## $ seller_state <chr> "SP", "SP", "SP", "SP", "SP", "MG", "...
Seguem algumas explorações feitas
A base possui 119151 linhas e 39 colunas, com 0.8345797 de clientes únicos, e um total de 32952 diferentes produtos vendidos. Considerando que houveram 96096 customer_unique_id
temos que a imensa maioria dos pedidos foram realizadas por clientes que compraram uma única vez. Esta informação pode ser utilizada tanto para uma investigação na base de dados, considerando a possibilidade que os customer_unique_id
não estejam sendo gerados adequadamente, quanto como uma oportunidade de negócio para a fidelização de tais clientes.
df_olist %>%
summarise(
n_linhas = n(),
n_distinct(order_id),
n_distinct(customer_id),
n_distinct(review_id),
n_distinct(customer_unique_id),
n_distinct(product_id),
) %>%
pivot_longer(cols = everything())
## # A tibble: 6 x 2
## name value
## <chr> <int>
## 1 n_linhas 119151
## 2 n_distinct(order_id) 99441
## 3 n_distinct(customer_id) 99441
## 4 n_distinct(review_id) 99173
## 5 n_distinct(customer_unique_id) 96096
## 6 n_distinct(product_id) 32952
df_olist %>%
select(order_estimated_delivery_date, order_delivered_customer_date) %>%
mutate(estimated_vs_delivered_date =
(order_delivered_customer_date - order_estimated_delivery_date) %/% ddays(1)) %>%
pull(estimated_vs_delivered_date) %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -146.00 -16.00 -12.00 -11.13 -6.00 188.00 3421
df_olist %>%
mutate( estimated_vs_delivered_date =
(order_delivered_customer_date - order_estimated_delivery_date) %/% ddays(1)) %>%
ggplot(aes(product_weight_g, estimated_vs_delivered_date)) +
geom_point(alpha = 0.2) +
coord_cartesian(xlim= c(NA, 30000))
## Warning: Removed 3441 rows containing missing values (geom_point).
A base possui dados entre 2016-09-04 e 2018-10-17
df_olist %>%
mutate(order_purchase_timestamp_safra =
lubridate::floor_date((order_purchase_timestamp), "month")) %>%
group_by(order_purchase_timestamp_safra) %>%
summarise(n = n_distinct(order_id))
## # A tibble: 25 x 2
## order_purchase_timestamp_safra n
## * <dttm> <int>
## 1 2016-09-01 00:00:00 4
## 2 2016-10-01 00:00:00 324
## 3 2016-12-01 00:00:00 1
## 4 2017-01-01 00:00:00 800
## 5 2017-02-01 00:00:00 1780
## 6 2017-03-01 00:00:00 2682
## 7 2017-04-01 00:00:00 2404
## 8 2017-05-01 00:00:00 3700
## 9 2017-06-01 00:00:00 3245
## 10 2017-07-01 00:00:00 4026
## # ... with 15 more rows
Tendo uma ascendencia na quantidade de pedidos
df_olist %>%
mutate(order_purchase_timestamp_safra =
lubridate::floor_date((order_purchase_timestamp), "month")) %>%
group_by(order_purchase_timestamp_safra) %>%
summarise(n = n_distinct(order_id)) %>%
ggplot(aes(order_purchase_timestamp_safra, n)) +
geom_line()
Tal crescimento foi concentrado nos pedidos com avaliação positiva
df_olist %>%
mutate(order_purchase_timestamp_safra =
lubridate::floor_date((order_purchase_timestamp), "month")) %>%
mutate(review_score = as_factor(review_score)) %>%
group_by(order_purchase_timestamp_safra, review_score) %>%
summarise(n = n()) %>%
mutate(order_purchase_timestamp_safra = as.Date(order_purchase_timestamp_safra)) %>%
ggplot(aes(order_purchase_timestamp_safra, n)) +
geom_area(aes(fill = review_score), alpha = 0.5) +
scale_fill_brewer(palette = "RdYlGn") +
scale_x_date(date_breaks = "2 months", labels = scales::date_format("%b/%y"),
guide = guide_axis(n.dodge = 2))
## `summarise()` has grouped output by 'order_purchase_timestamp_safra'. You can override using the `.groups` argument.
df_olist %>%
mutate(prazo_efetivo = interval(
order_purchase_timestamp,
order_delivered_customer_date) %/%
ddays(1)) %>%
filter(order_status == 'delivered') %>%
ggplot(aes(x = log(price), y = log(prazo_efetivo), z = review_score)) +
stat_summary_2d(fun = "mean")
## Warning: Removed 27 rows containing non-finite values (stat_summary2d).
df_olist %>%
mutate(review_score = as_factor(review_score)) %>%
mutate(order_vs_delivered_date =
interval(order_purchase_timestamp, order_delivered_customer_date) %/% ddays(1)) %>%
ggplot(aes("", order_vs_delivered_date, color = review_score, fill = review_score)) +
geom_boxplot(alpha = 0.6) +
scale_color_brewer(palette = "RdYlGn") +
scale_fill_brewer(palette = "RdYlGn") +
facet_grid( . ~ review_score , labeller = label_both) +
theme(legend.position = 'none') +
labs(x="")
## Warning: Removed 3421 rows containing non-finite values (stat_boxplot).
A grande maioria dos pedidos foram entregues
df_olist %>%
janitor::tabyl(order_status) %>%
arrange(-n) %>%
mutate(percent = scales::percent(percent))
## order_status n percent
## delivered 115731 97.1297%
## shipped 1256 1.0541%
## canceled 750 0.6295%
## unavailable 652 0.5472%
## invoiced 378 0.3172%
## processing 376 0.3156%
## created 5 0.0042%
## approved 3 0.0025%
df_olist %>%
janitor::tabyl(product_category_name) %>%
arrange(-n) %>%
mutate(percent_acum = cumsum(percent)) %>%
filter(percent_acum < 0.8) %>%
mutate(across(contains("percent"), ~ scales::percent(.x)))
## product_category_name n percent valid_percent percent_acum
## cama_mesa_banho 11990 10.063% 10.28% 10.1%
## beleza_saude 10033 8.420% 8.60% 18.5%
## esporte_lazer 9005 7.558% 7.72% 26.0%
## moveis_decoracao 8833 7.413% 7.57% 33.5%
## informatica_acessorios 8151 6.841% 6.99% 40.3%
## utilidades_domesticas 7380 6.194% 6.33% 46.5%
## relogios_presentes 6213 5.214% 5.33% 51.7%
## telefonia 4726 3.966% 4.05% 55.7%
## ferramentas_jardim 4590 3.852% 3.94% 59.5%
## automotivo 4400 3.693% 3.77% 63.2%
## brinquedos 4281 3.593% 3.67% 66.8%
## cool_stuff 3999 3.356% 3.43% 70.2%
## perfumaria 3589 3.012% 3.08% 73.2%
## bebes 3204 2.689% 2.75% 75.9%
## eletronicos 2847 2.389% 2.44% 78.3%
df_olist %>%
mutate(Persona = ifelse(review_score > 3, "bom", "ruim")) %>%
select(product_category_name, Persona) %>%
#filtrando os produtos presentes em pelo menos 500 compras
group_by(product_category_name) %>%
filter(n() > 500) %>%
ungroup() %>%
#filtrando as linhas com NA
filter(!is.na(product_category_name)) %>%
#exemplo de um filtro por nome da categoria
#filter(stringr::str_detect(product_category_name, "construcao|casa")) %>%
#ordenando as categorias no gráfico e depois invertendo a ordem
mutate(product_category_name = forcats::fct_infreq(product_category_name)) %>%
mutate(product_category_name = forcats::fct_rev(product_category_name)) %>%
#gráfico
ggplot(aes(x = product_category_name, fill = Persona)) +
geom_bar(data = . %>% filter(Persona == "bom")) +
geom_bar(data = . %>% filter(Persona == "ruim"), aes( y = ..count..*(-1) ) ) +
scale_y_continuous(limits = c(-10000, 10000), breaks = seq(-10000, 10000, 2000), labels = abs) +
coord_flip()
concentração de tipos de pagamento
df_olist %>%
janitor::tabyl(payment_type) %>%
arrange(-n) %>%
mutate(percent = scales::percent(percent)) %>%
mutate(valid_percent = scales::percent(valid_percent))
## payment_type n percent valid_percent
## credit_card 87784 73.7% 73.7%
## boleto 23190 19.5% 19.5%
## voucher 6465 5.4% 5.4%
## debit_card 1706 1.4% 1.4%
## not_defined 3 0.0% 0.0%
## <NA> 3 0.0% <NA>
df_olist %>%
mutate(payment_type = forcats::fct_infreq(payment_type),
payment_type = forcats::fct_explicit_na(payment_type, na_level = "NA"),
payment_type = forcats::fct_recode(payment_type,
missing = "not_defined", missing = "NA")) %>%
ggplot(aes(payment_type, color = payment_type, fill = payment_type)) +
geom_bar(alpha = 0.4) +
theme(legend.position = 'none')
df_olist %>%
mutate(review_score = as_factor(review_score)) %>%
ggplot(aes(review_score, fill = review_score)) +
geom_bar(alpha = 0.6) +
scale_fill_brewer(palette = "RdYlGn") +
theme(legend.position = 'none')
Identificação de pedidos que tiveram comentários
df_olist %>%
mutate(review_comment_message_str = str_length(review_comment_message)) %>%
mutate(review_comment_message_index =
ifelse(is.na(review_comment_message), "nao_comentou", "comentou")) %>%
tabyl(review_comment_message_index) %>%
mutate(percent= scales::percent(percent))
## review_comment_message_index n percent
## comentou 51242 43%
## nao_comentou 67909 57%
entre os pedidos com comentário a % de avaliações negativas é superior
df_olist %>%
mutate(review_comment_message_str = str_length(review_comment_message)) %>%
mutate(review_comment_message_index =
ifelse(is.na(review_comment_message), "nao_comentou", "comentou")) %>%
mutate(review_score = as_factor(review_score)) %>%
ggplot(aes(review_comment_message_index, fill = review_score)) +
geom_bar(position = "fill", alpha = 0.6) +
scale_fill_brewer(palette = "RdYlGn") +
coord_flip() +
theme(legend.position = 'bottom') +
guides(fill = guide_legend(reverse = TRUE))
A média dos preços é de 120.6491086, versus uma mediana de 74.9, indicando a presença de outliers
summary(df_olist$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.85 39.90 74.90 120.65 134.90 6735.00 833
Adicionalmente temos que 50% dos pedidos estão concentrados nos valores em 50 e 150 reais
df_olist%>%
ggplot(aes(price)) +
geom_boxplot(color = "darkblue", fill = "darkblue", alpha = 0.4) +
coord_cartesian(xlim = c(NA, 500)) +
scale_x_continuous(breaks = seq(0,500, 50))
## Warning: Removed 833 rows containing non-finite values (stat_boxplot).
avaliando por tipos de pagamento, os clientes que pagam com cartão de crédito possuem uma ticket superior
df_olist %>%
ggplot(aes(payment_type, price)) +
geom_boxplot(alpha = 0.4) +
coord_cartesian(ylim = c(NA, 300))
## Warning: Removed 833 rows containing non-finite values (stat_boxplot).
scale_y_continuous(breaks = seq(0,500, 50))
## <ScaleContinuousPosition>
## Range:
## Limits: 0 -- 1
SP concentra o número de vendedores e clientes, avaliando segundo a proporção SP concentra ~30% das transaçõe
df_olist %>%
group_by(customer_state, seller_state) %>%
count() %>%
ungroup() %>%
arrange(-n) %>%
mutate(p = n/sum(n)) %>%
mutate(acumulado = cumsum(n)) %>%
mutate(acumulado_p = acumulado/sum(n)) %>%
ggplot(aes(customer_state, seller_state)) +
geom_tile(aes(fill = p)) +
theme_minimal()
considerando a distribuição de clientes para cada um dos estados vendedores, SP, RJ e MG possuem o maior destaque
df_olist %>%
group_by(customer_state, seller_state) %>%
count() %>%
arrange(-n) %>%
group_by(seller_state) %>%
mutate(p = n/sum(n)) %>%
ggplot(aes(customer_state, seller_state)) +
geom_tile(aes(fill = p)) +
scale_fill_distiller(palette = "YlGnBu", direction = 1) +
theme_minimal()
voltando ao resultado geral, e excluindo SP, temos que além de MG e RJ, a região Sul possui destaque
df_olist %>%
group_by(customer_state, seller_state) %>%
count() %>%
ungroup() %>%
arrange(-n) %>%
mutate(p = n/sum(n)) %>%
mutate(acumulado = cumsum(n)) %>%
mutate(acumulado_p = acumulado/sum(n)) %>%
filter(customer_state != "SP", seller_state != "SP") %>%
ggplot(aes(customer_state, seller_state)) +
geom_tile(aes(fill = p)) +
scale_fill_distiller(palette = "YlGnBu", direction = 1) +
theme_minimal()