Preparation: Libraries and Datasets

Here are some techniques that I use to standardize datasets for convenient purposes. Likewise, packages are used up to help me to intepret datasets


#Set working directory
setwd("C:/Users/DellPC/Desktop/brazilian-ecommerce")

#Load libraries

suppressMessages(library(tidyverse))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(scales))
suppressMessages(library(plotly))
suppressMessages(library(gridExtra))
suppressMessages(library(crosstalk))
suppressMessages(library(kableExtra))
suppressMessages(library(DT))
suppressMessages(library(leaflet))
suppressMessages(library(lubridate))
suppressMessages(library(RColorBrewer))
suppressMessages(library(zoo))
suppressMessages(library(patchwork))

theme_set(theme_minimal())
#Load datasets

customers <- read.csv("olist_customers_dataset.csv", header = TRUE)
geolocation <-
               read.csv("olist_geolocation_dataset.csv", header = TRUE)
items <- read.csv("olist_order_items_dataset.csv", header = TRUE)
payments <-
               read.csv("olist_order_payments_dataset.csv", header = TRUE)
reviews <-
               read.csv("olist_order_reviews_dataset.csv", header = TRUE)
orders <- read.csv("olist_orders_dataset.csv", header = TRUE)
products <- read.csv("olist_products_dataset.csv", header = TRUE)
sellers <- read.csv("olist_sellers_dataset.csv", header = TRUE)
product_category_name <-
               read.csv("product_category_name_translation.csv", header = TRUE)

# Standardize the datasets

customers <-
               customers %>% mutate(customer_city = str_to_title(customer_city))
geolocation <-
               geolocation %>% mutate(geolocation_city = str_to_title(geolocation_city))
sellers <-
               sellers %>% transmute(
                              seller_city = str_to_title(seller_city),
                              seller_id = seller_id,
                              seller_zip_code_prefix = seller_zip_code_prefix,
                              seller_state = seller_state
               )
items <-
               items %>% mutate(shipping_limit_date = as.POSIXct(shipping_limit_date, format =
                                                                                "Y-%m-%d %H:%m:%s"))
orders <-
               orders %>% mutate(
                              order_purchase_timestamp = as.POSIXct(
                                             order_purchase_timestamp,
                                             format = "Y-%m-%d %H:%m:%s",
                                             order_approved_at = as.POSIXct(order_approved_at, format = "Y-%m-%d %H:%m:%s"),
                                             order_delivered_carrier_date = as.POSIXct(
                                                            order_delivered_carrier_date,
                                                            format = "Y-%m-%d %H:%m:%s",
                                                            order_delivered_customer_date = as.POSIXct(order_estimated_delivery_date, format =
                                                                                                                      "Y-%m-%d %H:%m:%s"),
                                                            order_estimated_delivery_date = as.POSIXct(order_estimated_delivery_date, format =
                                                                                                                      "Y-%m-%d %H:%m:%s")
                                             )
                              )
               )
payments <-
               payments %>% mutate(
                              payment_type = fct_recode(
                                             payment_type,
                                             "Credit card" = "credit_card",
                                             "Boleto" = "boleto",
                                             "Debit card" = "debit_card",
                                             "Not defined" = "not_defined",
                                             "Voucher" = "voucher"
                              )
               )
product_category_name <-
               product_category_name %>% transmute(
                              product_category_name = ï..product_category_name,
                              product_category_name_english = product_category_name_english
               ) %>% mutate(product_category_name_english = str_to_title(product_category_name_english))
products <-
               products %>% transmute(
                              product_category_name = str_to_title(product_category_name),
                              weight = product_weight_g,
                              quantity = product_photos_qty,
                              weight = product_weight_g,
                              height = product_height_cm,
                              width = product_width_cm
               )
reviews <-
               reviews %>% transmute(
                              score = review_score,
                              comment = review_comment_message,
                              creation_date = review_creation_date,
                              answer = review_answer_timestamp
               ) %>% mutate(creation_date = as.POSIXct(creation_date, format = "%Y-%m-%d %H:%M:%s"))

It’s time to have a first look at Olist datasets

Customer

I don’t know why they just store in prefix-zip-code in datasets. Because This is not enough elements to determine each geolocation for each customers. To visualize geolocation, I just calculate the mean of longitude and latitude to have a general view.

           - customer_id
           - customer_unique_id
           - customer_zip_code_prefix
           - customer_city
           - customer_state
           
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state
06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 Franca SP
18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 Sao Bernardo Do Campo SP
4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 Sao Paulo SP
b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 Mogi Das Cruzes SP
4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 Campinas SP
879864dab9bc3047522c92c82e1212b8 4c93744516667ad3b8f1fb645a3116a4 89254 Jaragua Do Sul SC
fd826e7cf63160e536e0908c76c3f441 addec96d2e059c80c30fe6871d30d177 4534 Sao Paulo SP
5e274e7a0c3809e14aba7ad5aae0d407 57b2a98a409812fe9618067b6b8ebe4f 35182 Timoteo MG
5adf08e34b2e993982a47070956c5c65 1175e95fb47ddff9de6b2b06188f7e0d 81560 Curitiba PR
4b7139f34592b3a31687243a302fa75b 9afe194fb833f79e300e37e580171f22 30575 Belo Horizonte MG

Geolocation

There are some typo in processing text, but it’s more popular in less frequent geolocation_city

        - geolocation_zip_code_prefix
           - geolocation_lat
           - geolocation_lng
           - geolocation_city
           - geolocation_state
           
           
geolocation_zip_code_prefix geolocation_lat geolocation_lng geolocation_city geolocation_state
1037 -23.54562 -46.63929 Sao Paulo SP
1046 -23.54608 -46.64482 Sao Paulo SP
1046 -23.54613 -46.64295 Sao Paulo SP
1041 -23.54439 -46.63950 Sao Paulo SP
1035 -23.54158 -46.64161 Sao Paulo SP
1012 -23.54776 -46.63536 Sã£O Paulo SP
1047 -23.54627 -46.64123 Sao Paulo SP
1013 -23.54692 -46.63426 Sao Paulo SP
1029 -23.54377 -46.63428 Sao Paulo SP
1011 -23.54764 -46.63603 Sao Paulo SP

Order

There are some typo in processing text, but it’s more popular in less frequent geolocation_city

        - order_id
           - customer_id
           - order_status
           - order_purchase_timestamp
           - order_approved_at
           - order_delivered_carrier_date
           - order_delivered_customer_date
           - order_estimated_delivery_date
           
           
order_id customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered NA 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 00:00:00
53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered NA 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13 00:00:00
47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered NA 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04 00:00:00
949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered NA 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15 00:00:00
ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered NA 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26 00:00:00
a4591c265e18cb1dcee52889e2d8acc3 503740e9ca751ccdda7ba28e9ab8f608 delivered NA 2017-07-09 22:10:13 2017-07-11 14:58:04 2017-07-26 10:57:55 2017-08-01 00:00:00
136cce7faa42fdb2cefd53fdc79a6098 ed0271e0b7da060a393796590e7b737a invoiced NA 2017-04-13 13:25:17 2017-05-09 00:00:00
6514b8ad8028c9f2cc2374ded245783f 9bdf08b4b3b52b5526ff42d37d47f222 delivered NA 2017-05-16 13:22:11 2017-05-22 10:07:46 2017-05-26 12:55:51 2017-06-07 00:00:00
76c6e866289321a7c93b82b54852dc33 f54a9f0e6b351c431402b8461ea51999 delivered NA 2017-01-25 02:50:47 2017-01-26 14:16:31 2017-02-02 14:08:10 2017-03-06 00:00:00
e69bfb5eb88e0ed6a785585b27e16dbf 31ad1d1b63eb9962463f764d4e6e0c9d delivered NA 2017-07-29 12:05:32 2017-08-10 19:45:24 2017-08-16 17:14:30 2017-08-23 00:00:00

Item

I don’t understand Is order_item_id like as order quantity?

        - order_id
           - order_item_id
           - product_id
           - seller_id
           - shipping_limit_date
           - price
           - freight_value
           
           
order_id order_item_id product_id seller_id shipping_limit_date price freight_value
00010242fe8c5a6d1ba2dd792cb16214 1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 NA 58.90 13.29
00018f77f2f0320c557190d7a144bdd3 1 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 NA 239.90 19.93
000229ec398224ef6ca0657da4fc703e 1 c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d NA 199.00 17.87
00024acbcdf0a6daa1e931b038114c75 1 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 NA 12.99 12.79
00042b26cf59d7ce69dfabb4e55b4fd9 1 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 NA 199.90 18.14
00048cc3ae777c65dbb7d2a0634bc1ea 1 ef92defde845ab8450f9d70c526ef70f 6426d21aca402a131fc0a5d0960a3c90 NA 21.90 12.69
00054e8431b9d7675808bcb819fb4a32 1 8d4f2bb7e93e6710a28f34fa83ee7d28 7040e82f899a04d1b434b795a43b4617 NA 19.90 11.85
000576fe39319847cbb9d288c5617fa6 1 557d850972a7d6f792fd18ae1400d9b6 5996cddab893a4652a15592fb58ab8db NA 810.00 70.75
0005a1a1728c9d785b8e2b08b904576c 1 310ae3c140ff94b03219ad0adc3c778f a416b6a846a11724393025641d4edd5e NA 145.95 11.65
0005f50442cb953dcd1d21e1fb923495 1 4535b0e1091c278dfd193e5a1d63b39f ba143b05f0110f0dc71ad71b4466ce92 NA 53.99 11.40

Payment

I don’t understand payment_sequential like as payment quantity?

        - order_id
           - payment_sequential
           - payment_type
           - payment_installments
           - payment_value
           
           
order_id payment_sequential payment_type payment_installments payment_value
b81ef226f3fe1789b1e8b2acac839d17 1 Credit card 8 99.33
a9810da82917af2d9aefd1278f1dcfa0 1 Credit card 1 24.39
25e8ea4e93396b6fa0d3dd708e76c1bd 1 Credit card 1 65.71
ba78997921bbcdc1373bb41e913ab953 1 Credit card 8 107.78
42fdf880ba16b47b59251dd489d4441a 1 Credit card 2 128.45
298fcdf1f73eb413e4d26d01b25bc1cd 1 Credit card 2 96.12
771ee386b001f06208a7419e4fc1bbd7 1 Credit card 1 81.16
3d7239c394a212faae122962df514ac7 1 Credit card 3 51.84
1f78449c87a54faf9e96e88ba1491fa9 1 Credit card 6 341.09
0573b5e23cbd798006520e1d5b4c6714 1 Boleto 1 51.95

Review

I don’t know Portugese, so the review comment is quite difficult for me to understand. But my first think approach is use sentiment

        - review_id
           - order_id
           - review_score
           - review_comment_title
           - review_comment_message
           - review_creation_date
           - review_answer_timestamp
           
           
score comment creation_date answer
4 1970-01-01 07:00:00 2018-01-18 21:46:59
5 1970-01-01 07:00:00 2018-03-11 03:05:13
5 1970-01-01 07:00:00 2018-02-18 14:36:24
5 Recebi bem antes do prazo estipulado. 1970-01-01 07:00:00 2017-04-21 22:02:06
5 Parabéns lojas lannister adorei comprar pela Internet seguro e prático Parabéns a todos feliz Páscoa 1970-01-01 07:00:00 2018-03-02 10:26:53
1 1970-01-01 07:00:00 2018-04-16 00:39:37
5 1970-01-01 07:00:00 2017-07-18 19:30:34
5 1970-01-01 07:00:00 2018-08-14 21:36:06
5 1970-01-01 07:00:00 2017-05-18 12:05:37
4 aparelho eficiente. no site a marca do aparelho esta impresso como 3desinfector e ao chegar esta com outro nome…atualizar com a marca correta uma vez que é o mesmo aparelho 1970-01-01 07:00:00 2018-05-23 16:45:47

Seller

        - seller_city
           - seller_id
           - seller_zip_code_prefix
           - seller_state
           
           
seller_city seller_id seller_zip_code_prefix seller_state
Campinas 3442f8959a84dea7ee197c632cb2df15 13023 SP
Mogi Guacu d1b65fc7debc3361ea86b5f14c68d2e2 13844 SP
Rio De Janeiro ce3ad9de960102d0677a81f5d0bb7b2d 20031 RJ
Sao Paulo c0f3eea2e14555b6faeea3dd58c1b1c3 4195 SP
Braganca Paulista 51a04a8a6bdcb23deccc82b0b80742cf 12914 SP
Rio De Janeiro c240c4061717ac1806ae6ee72be3533b 20920 RJ
Brejao e49c26c3edfa46d227d5121a6b6e4d37 55325 PE
Penapolis 1b938a7ec6ac5061a66a3766e0e75f90 16304 SP
Sao Paulo 768a86e36ad6aae3d03ee3c6433d61df 1529 SP
Curitiba ccc4bbb5f32a6ab2b7066a4130f114e3 80310 PR

Product

        - product_category_name
           - weight
           - quantity
           - height
           - widths
           
product_category_name weight quantity height width
Perfumaria 225 1 10 14
Artes 1000 1 18 20
Esporte_lazer 154 1 9 15
Bebes 371 1 4 26
Utilidades_domesticas 625 4 17 13
Instrumentos_musicais 200 1 5 11
Cool_stuff 18350 4 24 44
Moveis_decoracao 900 2 8 40
Eletrodomesticos 400 1 13 17
Brinquedos 600 1 10 12

Product_Category_Name

        - product_category_name
           - product_category_name_english
           
product_category_name product_category_name_english
beleza_saude Health_beauty
informatica_acessorios Computers_accessories
automotivo Auto
cama_mesa_banho Bed_bath_table
moveis_decoracao Furniture_decor
esporte_lazer Sports_leisure
perfumaria Perfumery
utilidades_domesticas Housewares
telefonia Telephony
relogios_presentes Watches_gifts

Customers and Seller dive deeply

Customers Perspective

#Statistic in column for city and state
gg <-
               customers %>% group_by(customer_state) %>% count() %>% arrange(desc(n)) %>%
               ggplot(aes(
                              x = fct_reorder(customer_state, n),
                              y = n,
                              label = n,
                              fill = customer_state
               )) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1)) +
               coord_flip() + labs(title = "Number of customers in each state") + xlab("Brazilian State") +
               ylab("Number of customers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
gg <-
               customers %>% transmute(a = fct_lump(customer_city, 30)) %>% group_by(a) %>%
               count() %>% ggplot(aes(
                              x = fct_reorder(a, n),
                              y = n,
                              label = n,
                              fill = a
               )) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1), legend.title = element_blank()) +
               coord_flip() + labs(title = "Number of customers in each most city") + xlab("Brazilian city") +
               ylab("Number of customers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
#Draw the geographical map for city and state

"Cities"
## [1] "Cities"
##Prepare data set
j_cus <-
               customers %>% select_at(vars(customer_city, customer_zip_code_prefix, customer_state)) %>% group_by(customer_city) %>%
               count() %>% filter(n > 100)

"Cities that have equal and more than 100 customers"
## [1] "Cities that have equal and more than 100 customers"
j_geo <-
               geolocation %>% select_at(
                              vars(
                                             geolocation_zip_code_prefix,
                                             geolocation_lat,
                                             geolocation_lng,
                                             geolocation_state,
                                             geolocation_city
                              )
               ) %>% unique.data.frame() %>% group_by(geolocation_city) %>% summarise(
                              geolocation_lng = mean(geolocation_lng),
                              geolocatio_lat = mean(geolocation_lat)
               )

"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_cus_geo <-
               inner_join(j_cus, j_geo, by = c("customer_city" = "geolocation_city")) %>%
               mutate(popup = paste(customer_city, "-", n))

##Draw the map (customer_city)

               j_cus_geo %>% leaflet() %>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
               addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%
               addCircleMarkers(
                              lng =  ~ geolocation_lng,
                              lat =  ~ geolocatio_lat,
                              popup = ~ popup,
                              radius = ~ sqrt(n) / 5,
                              label =  ~ customer_city,
                              color = "black",
                              fill = "white"
               ) %>% addPopups(
                              lng = ~ geolocation_lng,
                              lat =  ~ geolocatio_lat,
                              popup = ~ popup,
                              options = popupOptions(closeOnClick = TRUE)
               )
"States"
## [1] "States"
##Prepare data set
j_cus <-
               customers %>% select_at(vars(customer_city, customer_zip_code_prefix, customer_state)) %>% group_by(customer_state) %>%
               count()

"There are 27 states in Brazil"
## [1] "There are 27 states in Brazil"
j_geo <-
               geolocation %>% select_at(
                              vars(
                                             geolocation_zip_code_prefix,
                                             geolocation_lat,
                                             geolocation_lng,
                                             geolocation_state,
                                             geolocation_city
                              )
               ) %>% unique.data.frame() %>% group_by(geolocation_state) %>% summarise(
                              geolocation_lng = mean(geolocation_lng),
                              geolocatio_lat = mean(geolocation_lat)
               )

"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_cus_geo <-
               inner_join(j_cus, j_geo, by = c("customer_state" = "geolocation_state")) %>%
               mutate(popup = paste(customer_state, "-", n))

##Draw the map (state_map)

               j_cus_geo %>% leaflet() %>% addTiles() %>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
               addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%
               addCircleMarkers(
                              lng =  ~ geolocation_lng,
                              lat =  ~ geolocatio_lat,
                              popup = ~ popup,
                              radius = ~ sqrt(n) / 5,
                              label =  ~ customer_state,
                              color = "black",
                              fill = "white"
               ) %>% addPopups(
                              lng = ~ geolocation_lng,
                              lat =  ~ geolocatio_lat,
                              popup = ~ popup,
                              options = popupOptions(closeOnClick = FALSE)
               )
#Payments in each city and state
j_p_o <- left_join(payments, orders, by = "order_id")
j_p_o <-
               j_p_o %>% select(order_id, payment_type, payment_value, order_status)

j_o_c <- left_join(orders, customers, by = c("customer_id"))
j_o_c <- j_o_c %>% select(order_id, customer_zip_code_prefix)

j_o_g <-
               left_join(
                              customers,
                              geolocation,
                              by = c("customer_zip_code_prefix" = "geolocation_zip_code_prefix")
               )
j_o_g <-
               j_o_g %>% select(
                              customer_zip_code_prefix,
                              geolocation_city,
                              geolocation_state,
                              geolocation_lat,
                              geolocation_lng
               ) %>% group_by(customer_zip_code_prefix,
                              geolocation_city,
                              geolocation_state) %>% summarise(
                                             geolocation_lng = mean(geolocation_lng),
                                             geolocation_lat = mean(geolocation_lat)
                              )

j1 <-
               left_join(j_p_o, j_o_c, by = "order_id") %>% group_by(customer_zip_code_prefix, payment_type) %>%
               summarise(payment_value = mean(payment_value))
j2 <-
               left_join(j1, j_o_g, by = "customer_zip_code_prefix") %>% group_by(payment_type, geolocation_state) %>%
               summarise(
                              geolocation_lng = mean(geolocation_lng),
                              geolocation_lat = mean(geolocation_lat),
                              payment_value = mean(payment_value)
               ) %>% drop_na()

values <-
               c(brewer.pal(9, "Set1"),
                 brewer.pal(9, "RdPu"),
                 brewer.pal(9, "YlOrBr"))

               j2 %>% leaflet() %>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
               addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>% addCircleMarkers(
                              lng =  ~ geolocation_lng,
                              lat =  ~ geolocation_lat,
                              label =  ~ paste(payment_type, "-", geolocation_state),
                              radius = ~ sqrt(payment_value),
                              group = ~ geolocation_state,
                              color = ~ values
               ) %>% addLayersControl(overlayGroups = c(j2$geolocation_state, j2$payment_type))
"Duplicated recoreds, Because of the first time in cleaning mistakes, I can't visualize it within city. Likewise, State scope is a potential choice"
## [1] "Duplicated recoreds, Because of the first time in cleaning mistakes, I can't visualize it within city. Likewise, State scope is a potential choice"
#Number of customers in each year (The trend of the number)

##Is customer_id unique in orders dataset?
table(duplicated(orders$customer_id))
## 
## FALSE 
## 99441
"Yes"
## [1] "Yes"
time_line_dt <-
               orders %>% select(customer_id, order_approved_at) %>% mutate(order_approved_at =
                                                                                           as.yearmon(as.Date(order_approved_at)),
                                                                            year = year(as.Date(order_approved_at))) %>% group_by(order_approved_at, year) %>%
               count()%>%drop_na()%>%ungroup()

gg<-time_line_dt %>% ggplot(aes(x = factor(order_approved_at), y = n)) + geom_line(aes(group =
                                                                                       year, color = factor(year)), lwd = 2) + theme(axis.text.x = element_text(angle = 90, vjust =
                                                                                                                                                                    1,color = "red",size = 12),legend.title = element_blank()) + scale_color_manual(values = c("red", "green", "yellow")) + geom_point(fill ="darkorchid", size = 3,shape=21)+xlab("Date")+ylab("Number of Userid")+labs(title="The increasing of number userid in each year")+scale_y_continuous(labels = comma)



ggplotly(gg)
gg+coord_polar()


Sellers Perspective

#Statistic in column for city and state
gg <-
               sellers %>% group_by(seller_state) %>% count() %>% arrange(desc(n)) %>% ggplot(aes(
                              x = fct_reorder(seller_state, n),
                              y = n,
                              label = n,
                              fill = seller_state
               )) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1)) +
               coord_flip() + labs(title = "Number of sellers in each state") + xlab("Brazilian State") +
               ylab("Number of sellers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
gg <-
               sellers %>% mutate(a = fct_lump(seller_city,30)) %>% group_by(a) %>% count() %>%
               ggplot(aes(
                              x = fct_reorder(a, n),
                              y = n,
                              label = n,
                              fill = a
               )) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1),
                                       legend.title = element_blank()) + coord_flip() + labs(title = "Number of sellers in each most city") +
               xlab("Brazilian city") + ylab("Number of sellers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
#Draw the geographical map for city and state

"Cities"
## [1] "Cities"
##Prepare data set
j_sel<-sellers%>%select_at(vars(seller_city,seller_zip_code_prefix,seller_state))%>% group_by(seller_city)%>%count()

"Cities that have equal and more than 100 sellers"
## [1] "Cities that have equal and more than 100 sellers"
j_geo<- geolocation %>%select_at(vars(geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_state,geolocation_city))%>%unique.data.frame()%>%group_by(geolocation_city)%>%summarise(geolocation_lng=mean(geolocation_lng),geolocatio_lat=mean(geolocation_lat))

"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_sel_geo<- inner_join(j_sel,j_geo,by=c("seller_city"="geolocation_city"))%>%mutate(popup=paste(seller_city,"-",n))

##Draw the map (sellers map_city map)

j_sel_geo%>%leaflet()%>%addTiles()%>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
               addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%addCircleMarkers(lng=~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,radius = ~sqrt(n)/5,label=~seller_city,color ="black",fill="white")%>%addPopups(lng = ~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,options = popupOptions(closeOnClick = TRUE))
"States"
## [1] "States"
##Prepare data set
j_sel<-sellers%>%select_at(vars(seller_city,seller_zip_code_prefix,seller_state))%>% group_by(seller_state)%>%count()

"There are 27 states in Brazil"
## [1] "There are 27 states in Brazil"
j_geo<- geolocation %>%select_at(vars(geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_state,geolocation_city))%>%unique.data.frame()%>%group_by(geolocation_state)%>%summarise(geolocation_lng=mean(geolocation_lng),geolocatio_lat=mean(geolocation_lat))

"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_sel_geo<- inner_join(j_sel,j_geo,by=c("seller_state"="geolocation_state"))%>%mutate(popup=paste(seller_state,"-",n))

##Draw the map (sellers map_ state map)

j_sel_geo%>%leaflet()%>%addTiles()%>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
               addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%addCircleMarkers(lng=~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,radius = ~sqrt(n)/5,label=~seller_state,color ="black",fill="white")%>%addPopups(lng = ~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,options = popupOptions(closeOnClick = FALSE))
#Number of sellers in each year (The trend of the number)

it<-items%>%mutate(value=price*order_item_id)%>%group_by(order_id,seller_id)%>%summarise(value=sum(value),freight_value=sum(freight_value))%>%ungroup()

j_it_or<- left_join(it,orders,by="order_id")%>%select(seller_id,order_approved_at,freight_value,value)%>%transmute(order_approved_at=as.yearmon(as.Date(order_approved_at,format="%Y-%m-%d")),seller_id,freight_value,value,year=year(order_approved_at))



time_line_dt <-j_it_or %>% group_by(order_approved_at, year) %>% count()%>%drop_na()%>%ungroup()

gg<-time_line_dt %>% ggplot(aes(x = factor(order_approved_at), y = n)) + geom_line(aes(group =
                                                                                       year, color = factor(year)), lwd = 2) + theme(axis.text.x = element_text(angle = 90, vjust =
                                                                                                                                                                    1,color = "red",size = 12),legend.title = element_blank()) + scale_color_manual(values = c("red", "green", "yellow")) + geom_point(fill ="darkorchid", size = 3,shape=21)+xlab("Date")+ylab("Number of Userid")+labs(title="The increasing of number sellers in each year")+scale_y_continuous(labels = comma)

ggplotly(gg)
gg+coord_polar()