Here are some techniques that I use to standardize datasets for convenient purposes. Likewise, packages are used up to help me to intepret datasets
str_to_title() function to convert string into right formatgridExtra, but it is very convenient for manipulating#Set working directory
setwd("C:/Users/DellPC/Desktop/brazilian-ecommerce")
#Load libraries
suppressMessages(library(tidyverse))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(scales))
suppressMessages(library(plotly))
suppressMessages(library(gridExtra))
suppressMessages(library(crosstalk))
suppressMessages(library(kableExtra))
suppressMessages(library(DT))
suppressMessages(library(leaflet))
suppressMessages(library(lubridate))
suppressMessages(library(RColorBrewer))
suppressMessages(library(zoo))
suppressMessages(library(patchwork))
theme_set(theme_minimal())
#Load datasets
customers <- read.csv("olist_customers_dataset.csv", header = TRUE)
geolocation <-
read.csv("olist_geolocation_dataset.csv", header = TRUE)
items <- read.csv("olist_order_items_dataset.csv", header = TRUE)
payments <-
read.csv("olist_order_payments_dataset.csv", header = TRUE)
reviews <-
read.csv("olist_order_reviews_dataset.csv", header = TRUE)
orders <- read.csv("olist_orders_dataset.csv", header = TRUE)
products <- read.csv("olist_products_dataset.csv", header = TRUE)
sellers <- read.csv("olist_sellers_dataset.csv", header = TRUE)
product_category_name <-
read.csv("product_category_name_translation.csv", header = TRUE)
# Standardize the datasets
customers <-
customers %>% mutate(customer_city = str_to_title(customer_city))
geolocation <-
geolocation %>% mutate(geolocation_city = str_to_title(geolocation_city))
sellers <-
sellers %>% transmute(
seller_city = str_to_title(seller_city),
seller_id = seller_id,
seller_zip_code_prefix = seller_zip_code_prefix,
seller_state = seller_state
)
items <-
items %>% mutate(shipping_limit_date = as.POSIXct(shipping_limit_date, format =
"Y-%m-%d %H:%m:%s"))
orders <-
orders %>% mutate(
order_purchase_timestamp = as.POSIXct(
order_purchase_timestamp,
format = "Y-%m-%d %H:%m:%s",
order_approved_at = as.POSIXct(order_approved_at, format = "Y-%m-%d %H:%m:%s"),
order_delivered_carrier_date = as.POSIXct(
order_delivered_carrier_date,
format = "Y-%m-%d %H:%m:%s",
order_delivered_customer_date = as.POSIXct(order_estimated_delivery_date, format =
"Y-%m-%d %H:%m:%s"),
order_estimated_delivery_date = as.POSIXct(order_estimated_delivery_date, format =
"Y-%m-%d %H:%m:%s")
)
)
)
payments <-
payments %>% mutate(
payment_type = fct_recode(
payment_type,
"Credit card" = "credit_card",
"Boleto" = "boleto",
"Debit card" = "debit_card",
"Not defined" = "not_defined",
"Voucher" = "voucher"
)
)
product_category_name <-
product_category_name %>% transmute(
product_category_name = ï..product_category_name,
product_category_name_english = product_category_name_english
) %>% mutate(product_category_name_english = str_to_title(product_category_name_english))
products <-
products %>% transmute(
product_category_name = str_to_title(product_category_name),
weight = product_weight_g,
quantity = product_photos_qty,
weight = product_weight_g,
height = product_height_cm,
width = product_width_cm
)
reviews <-
reviews %>% transmute(
score = review_score,
comment = review_comment_message,
creation_date = review_creation_date,
answer = review_answer_timestamp
) %>% mutate(creation_date = as.POSIXct(creation_date, format = "%Y-%m-%d %H:%M:%s"))
I don’t know why they just store in prefix-zip-code in datasets. Because This is not enough elements to determine each geolocation for each customers. To visualize geolocation, I just calculate the mean of longitude and latitude to have a general view.
- customer_id
- customer_unique_id
- customer_zip_code_prefix
- customer_city
- customer_state
| customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state |
|---|---|---|---|---|
| 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | 14409 | Franca | SP |
| 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | 9790 | Sao Bernardo Do Campo | SP |
| 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | 1151 | Sao Paulo | SP |
| b2b6027bc5c5109e529d4dc6358b12c3 | 259dac757896d24d7702b9acbbff3f3c | 8775 | Mogi Das Cruzes | SP |
| 4f2d8ab171c80ec8364f7c12e35b23ad | 345ecd01c38d18a9036ed96c73b8d066 | 13056 | Campinas | SP |
| 879864dab9bc3047522c92c82e1212b8 | 4c93744516667ad3b8f1fb645a3116a4 | 89254 | Jaragua Do Sul | SC |
| fd826e7cf63160e536e0908c76c3f441 | addec96d2e059c80c30fe6871d30d177 | 4534 | Sao Paulo | SP |
| 5e274e7a0c3809e14aba7ad5aae0d407 | 57b2a98a409812fe9618067b6b8ebe4f | 35182 | Timoteo | MG |
| 5adf08e34b2e993982a47070956c5c65 | 1175e95fb47ddff9de6b2b06188f7e0d | 81560 | Curitiba | PR |
| 4b7139f34592b3a31687243a302fa75b | 9afe194fb833f79e300e37e580171f22 | 30575 | Belo Horizonte | MG |
There are some typo in processing text, but it’s more popular in less frequent geolocation_city
- geolocation_zip_code_prefix
- geolocation_lat
- geolocation_lng
- geolocation_city
- geolocation_state
| geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state |
|---|---|---|---|---|
| 1037 | -23.54562 | -46.63929 | Sao Paulo | SP |
| 1046 | -23.54608 | -46.64482 | Sao Paulo | SP |
| 1046 | -23.54613 | -46.64295 | Sao Paulo | SP |
| 1041 | -23.54439 | -46.63950 | Sao Paulo | SP |
| 1035 | -23.54158 | -46.64161 | Sao Paulo | SP |
| 1012 | -23.54776 | -46.63536 | Sã£O Paulo | SP |
| 1047 | -23.54627 | -46.64123 | Sao Paulo | SP |
| 1013 | -23.54692 | -46.63426 | Sao Paulo | SP |
| 1029 | -23.54377 | -46.63428 | Sao Paulo | SP |
| 1011 | -23.54764 | -46.63603 | Sao Paulo | SP |
There are some typo in processing text, but it’s more popular in less frequent geolocation_city
- order_id
- customer_id
- order_status
- order_purchase_timestamp
- order_approved_at
- order_delivered_carrier_date
- order_delivered_customer_date
- order_estimated_delivery_date
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date |
|---|---|---|---|---|---|---|---|
| e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | NA | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 |
| 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | NA | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 00:00:00 |
| 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | NA | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 00:00:00 |
| 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | NA | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 00:00:00 |
| ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | NA | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 00:00:00 |
| a4591c265e18cb1dcee52889e2d8acc3 | 503740e9ca751ccdda7ba28e9ab8f608 | delivered | NA | 2017-07-09 22:10:13 | 2017-07-11 14:58:04 | 2017-07-26 10:57:55 | 2017-08-01 00:00:00 |
| 136cce7faa42fdb2cefd53fdc79a6098 | ed0271e0b7da060a393796590e7b737a | invoiced | NA | 2017-04-13 13:25:17 | 2017-05-09 00:00:00 | ||
| 6514b8ad8028c9f2cc2374ded245783f | 9bdf08b4b3b52b5526ff42d37d47f222 | delivered | NA | 2017-05-16 13:22:11 | 2017-05-22 10:07:46 | 2017-05-26 12:55:51 | 2017-06-07 00:00:00 |
| 76c6e866289321a7c93b82b54852dc33 | f54a9f0e6b351c431402b8461ea51999 | delivered | NA | 2017-01-25 02:50:47 | 2017-01-26 14:16:31 | 2017-02-02 14:08:10 | 2017-03-06 00:00:00 |
| e69bfb5eb88e0ed6a785585b27e16dbf | 31ad1d1b63eb9962463f764d4e6e0c9d | delivered | NA | 2017-07-29 12:05:32 | 2017-08-10 19:45:24 | 2017-08-16 17:14:30 | 2017-08-23 00:00:00 |
I don’t understand Is order_item_id like as order quantity?
- order_id
- order_item_id
- product_id
- seller_id
- shipping_limit_date
- price
- freight_value
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value |
|---|---|---|---|---|---|---|
| 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | NA | 58.90 | 13.29 |
| 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | NA | 239.90 | 19.93 |
| 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | NA | 199.00 | 17.87 |
| 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | NA | 12.99 | 12.79 |
| 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | NA | 199.90 | 18.14 |
| 00048cc3ae777c65dbb7d2a0634bc1ea | 1 | ef92defde845ab8450f9d70c526ef70f | 6426d21aca402a131fc0a5d0960a3c90 | NA | 21.90 | 12.69 |
| 00054e8431b9d7675808bcb819fb4a32 | 1 | 8d4f2bb7e93e6710a28f34fa83ee7d28 | 7040e82f899a04d1b434b795a43b4617 | NA | 19.90 | 11.85 |
| 000576fe39319847cbb9d288c5617fa6 | 1 | 557d850972a7d6f792fd18ae1400d9b6 | 5996cddab893a4652a15592fb58ab8db | NA | 810.00 | 70.75 |
| 0005a1a1728c9d785b8e2b08b904576c | 1 | 310ae3c140ff94b03219ad0adc3c778f | a416b6a846a11724393025641d4edd5e | NA | 145.95 | 11.65 |
| 0005f50442cb953dcd1d21e1fb923495 | 1 | 4535b0e1091c278dfd193e5a1d63b39f | ba143b05f0110f0dc71ad71b4466ce92 | NA | 53.99 | 11.40 |
I don’t understand payment_sequential like as payment quantity?
- order_id
- payment_sequential
- payment_type
- payment_installments
- payment_value
| order_id | payment_sequential | payment_type | payment_installments | payment_value |
|---|---|---|---|---|
| b81ef226f3fe1789b1e8b2acac839d17 | 1 | Credit card | 8 | 99.33 |
| a9810da82917af2d9aefd1278f1dcfa0 | 1 | Credit card | 1 | 24.39 |
| 25e8ea4e93396b6fa0d3dd708e76c1bd | 1 | Credit card | 1 | 65.71 |
| ba78997921bbcdc1373bb41e913ab953 | 1 | Credit card | 8 | 107.78 |
| 42fdf880ba16b47b59251dd489d4441a | 1 | Credit card | 2 | 128.45 |
| 298fcdf1f73eb413e4d26d01b25bc1cd | 1 | Credit card | 2 | 96.12 |
| 771ee386b001f06208a7419e4fc1bbd7 | 1 | Credit card | 1 | 81.16 |
| 3d7239c394a212faae122962df514ac7 | 1 | Credit card | 3 | 51.84 |
| 1f78449c87a54faf9e96e88ba1491fa9 | 1 | Credit card | 6 | 341.09 |
| 0573b5e23cbd798006520e1d5b4c6714 | 1 | Boleto | 1 | 51.95 |
I don’t know Portugese, so the review comment is quite difficult for me to understand. But my first think approach is use sentiment
- review_id
- order_id
- review_score
- review_comment_title
- review_comment_message
- review_creation_date
- review_answer_timestamp
| score | comment | creation_date | answer |
|---|---|---|---|
| 4 | 1970-01-01 07:00:00 | 2018-01-18 21:46:59 | |
| 5 | 1970-01-01 07:00:00 | 2018-03-11 03:05:13 | |
| 5 | 1970-01-01 07:00:00 | 2018-02-18 14:36:24 | |
| 5 | Recebi bem antes do prazo estipulado. | 1970-01-01 07:00:00 | 2017-04-21 22:02:06 |
| 5 | Parabéns lojas lannister adorei comprar pela Internet seguro e prático Parabéns a todos feliz Páscoa | 1970-01-01 07:00:00 | 2018-03-02 10:26:53 |
| 1 | 1970-01-01 07:00:00 | 2018-04-16 00:39:37 | |
| 5 | 1970-01-01 07:00:00 | 2017-07-18 19:30:34 | |
| 5 | 1970-01-01 07:00:00 | 2018-08-14 21:36:06 | |
| 5 | 1970-01-01 07:00:00 | 2017-05-18 12:05:37 | |
| 4 | aparelho eficiente. no site a marca do aparelho esta impresso como 3desinfector e ao chegar esta com outro nome…atualizar com a marca correta uma vez que é o mesmo aparelho | 1970-01-01 07:00:00 | 2018-05-23 16:45:47 |
- seller_city
- seller_id
- seller_zip_code_prefix
- seller_state
| seller_city | seller_id | seller_zip_code_prefix | seller_state |
|---|---|---|---|
| Campinas | 3442f8959a84dea7ee197c632cb2df15 | 13023 | SP |
| Mogi Guacu | d1b65fc7debc3361ea86b5f14c68d2e2 | 13844 | SP |
| Rio De Janeiro | ce3ad9de960102d0677a81f5d0bb7b2d | 20031 | RJ |
| Sao Paulo | c0f3eea2e14555b6faeea3dd58c1b1c3 | 4195 | SP |
| Braganca Paulista | 51a04a8a6bdcb23deccc82b0b80742cf | 12914 | SP |
| Rio De Janeiro | c240c4061717ac1806ae6ee72be3533b | 20920 | RJ |
| Brejao | e49c26c3edfa46d227d5121a6b6e4d37 | 55325 | PE |
| Penapolis | 1b938a7ec6ac5061a66a3766e0e75f90 | 16304 | SP |
| Sao Paulo | 768a86e36ad6aae3d03ee3c6433d61df | 1529 | SP |
| Curitiba | ccc4bbb5f32a6ab2b7066a4130f114e3 | 80310 | PR |
- product_category_name
- weight
- quantity
- height
- widths
| product_category_name | weight | quantity | height | width |
|---|---|---|---|---|
| Perfumaria | 225 | 1 | 10 | 14 |
| Artes | 1000 | 1 | 18 | 20 |
| Esporte_lazer | 154 | 1 | 9 | 15 |
| Bebes | 371 | 1 | 4 | 26 |
| Utilidades_domesticas | 625 | 4 | 17 | 13 |
| Instrumentos_musicais | 200 | 1 | 5 | 11 |
| Cool_stuff | 18350 | 4 | 24 | 44 |
| Moveis_decoracao | 900 | 2 | 8 | 40 |
| Eletrodomesticos | 400 | 1 | 13 | 17 |
| Brinquedos | 600 | 1 | 10 | 12 |
- product_category_name
- product_category_name_english
| product_category_name | product_category_name_english |
|---|---|
| beleza_saude | Health_beauty |
| informatica_acessorios | Computers_accessories |
| automotivo | Auto |
| cama_mesa_banho | Bed_bath_table |
| moveis_decoracao | Furniture_decor |
| esporte_lazer | Sports_leisure |
| perfumaria | Perfumery |
| utilidades_domesticas | Housewares |
| telefonia | Telephony |
| relogios_presentes | Watches_gifts |
#Statistic in column for city and state
gg <-
customers %>% group_by(customer_state) %>% count() %>% arrange(desc(n)) %>%
ggplot(aes(
x = fct_reorder(customer_state, n),
y = n,
label = n,
fill = customer_state
)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1)) +
coord_flip() + labs(title = "Number of customers in each state") + xlab("Brazilian State") +
ylab("Number of customers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
gg <-
customers %>% transmute(a = fct_lump(customer_city, 30)) %>% group_by(a) %>%
count() %>% ggplot(aes(
x = fct_reorder(a, n),
y = n,
label = n,
fill = a
)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1), legend.title = element_blank()) +
coord_flip() + labs(title = "Number of customers in each most city") + xlab("Brazilian city") +
ylab("Number of customers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
#Draw the geographical map for city and state
"Cities"
## [1] "Cities"
##Prepare data set
j_cus <-
customers %>% select_at(vars(customer_city, customer_zip_code_prefix, customer_state)) %>% group_by(customer_city) %>%
count() %>% filter(n > 100)
"Cities that have equal and more than 100 customers"
## [1] "Cities that have equal and more than 100 customers"
j_geo <-
geolocation %>% select_at(
vars(
geolocation_zip_code_prefix,
geolocation_lat,
geolocation_lng,
geolocation_state,
geolocation_city
)
) %>% unique.data.frame() %>% group_by(geolocation_city) %>% summarise(
geolocation_lng = mean(geolocation_lng),
geolocatio_lat = mean(geolocation_lat)
)
"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_cus_geo <-
inner_join(j_cus, j_geo, by = c("customer_city" = "geolocation_city")) %>%
mutate(popup = paste(customer_city, "-", n))
##Draw the map (customer_city)
j_cus_geo %>% leaflet() %>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%
addCircleMarkers(
lng = ~ geolocation_lng,
lat = ~ geolocatio_lat,
popup = ~ popup,
radius = ~ sqrt(n) / 5,
label = ~ customer_city,
color = "black",
fill = "white"
) %>% addPopups(
lng = ~ geolocation_lng,
lat = ~ geolocatio_lat,
popup = ~ popup,
options = popupOptions(closeOnClick = TRUE)
)
"States"
## [1] "States"
##Prepare data set
j_cus <-
customers %>% select_at(vars(customer_city, customer_zip_code_prefix, customer_state)) %>% group_by(customer_state) %>%
count()
"There are 27 states in Brazil"
## [1] "There are 27 states in Brazil"
j_geo <-
geolocation %>% select_at(
vars(
geolocation_zip_code_prefix,
geolocation_lat,
geolocation_lng,
geolocation_state,
geolocation_city
)
) %>% unique.data.frame() %>% group_by(geolocation_state) %>% summarise(
geolocation_lng = mean(geolocation_lng),
geolocatio_lat = mean(geolocation_lat)
)
"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_cus_geo <-
inner_join(j_cus, j_geo, by = c("customer_state" = "geolocation_state")) %>%
mutate(popup = paste(customer_state, "-", n))
##Draw the map (state_map)
j_cus_geo %>% leaflet() %>% addTiles() %>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%
addCircleMarkers(
lng = ~ geolocation_lng,
lat = ~ geolocatio_lat,
popup = ~ popup,
radius = ~ sqrt(n) / 5,
label = ~ customer_state,
color = "black",
fill = "white"
) %>% addPopups(
lng = ~ geolocation_lng,
lat = ~ geolocatio_lat,
popup = ~ popup,
options = popupOptions(closeOnClick = FALSE)
)
#Payments in each city and state
j_p_o <- left_join(payments, orders, by = "order_id")
j_p_o <-
j_p_o %>% select(order_id, payment_type, payment_value, order_status)
j_o_c <- left_join(orders, customers, by = c("customer_id"))
j_o_c <- j_o_c %>% select(order_id, customer_zip_code_prefix)
j_o_g <-
left_join(
customers,
geolocation,
by = c("customer_zip_code_prefix" = "geolocation_zip_code_prefix")
)
j_o_g <-
j_o_g %>% select(
customer_zip_code_prefix,
geolocation_city,
geolocation_state,
geolocation_lat,
geolocation_lng
) %>% group_by(customer_zip_code_prefix,
geolocation_city,
geolocation_state) %>% summarise(
geolocation_lng = mean(geolocation_lng),
geolocation_lat = mean(geolocation_lat)
)
j1 <-
left_join(j_p_o, j_o_c, by = "order_id") %>% group_by(customer_zip_code_prefix, payment_type) %>%
summarise(payment_value = mean(payment_value))
j2 <-
left_join(j1, j_o_g, by = "customer_zip_code_prefix") %>% group_by(payment_type, geolocation_state) %>%
summarise(
geolocation_lng = mean(geolocation_lng),
geolocation_lat = mean(geolocation_lat),
payment_value = mean(payment_value)
) %>% drop_na()
values <-
c(brewer.pal(9, "Set1"),
brewer.pal(9, "RdPu"),
brewer.pal(9, "YlOrBr"))
j2 %>% leaflet() %>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>% addCircleMarkers(
lng = ~ geolocation_lng,
lat = ~ geolocation_lat,
label = ~ paste(payment_type, "-", geolocation_state),
radius = ~ sqrt(payment_value),
group = ~ geolocation_state,
color = ~ values
) %>% addLayersControl(overlayGroups = c(j2$geolocation_state, j2$payment_type))
"Duplicated recoreds, Because of the first time in cleaning mistakes, I can't visualize it within city. Likewise, State scope is a potential choice"
## [1] "Duplicated recoreds, Because of the first time in cleaning mistakes, I can't visualize it within city. Likewise, State scope is a potential choice"
#Number of customers in each year (The trend of the number)
##Is customer_id unique in orders dataset?
table(duplicated(orders$customer_id))
##
## FALSE
## 99441
"Yes"
## [1] "Yes"
time_line_dt <-
orders %>% select(customer_id, order_approved_at) %>% mutate(order_approved_at =
as.yearmon(as.Date(order_approved_at)),
year = year(as.Date(order_approved_at))) %>% group_by(order_approved_at, year) %>%
count()%>%drop_na()%>%ungroup()
gg<-time_line_dt %>% ggplot(aes(x = factor(order_approved_at), y = n)) + geom_line(aes(group =
year, color = factor(year)), lwd = 2) + theme(axis.text.x = element_text(angle = 90, vjust =
1,color = "red",size = 12),legend.title = element_blank()) + scale_color_manual(values = c("red", "green", "yellow")) + geom_point(fill ="darkorchid", size = 3,shape=21)+xlab("Date")+ylab("Number of Userid")+labs(title="The increasing of number userid in each year")+scale_y_continuous(labels = comma)
ggplotly(gg)
gg+coord_polar()
#Statistic in column for city and state
gg <-
sellers %>% group_by(seller_state) %>% count() %>% arrange(desc(n)) %>% ggplot(aes(
x = fct_reorder(seller_state, n),
y = n,
label = n,
fill = seller_state
)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1)) +
coord_flip() + labs(title = "Number of sellers in each state") + xlab("Brazilian State") +
ylab("Number of sellers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
gg <-
sellers %>% mutate(a = fct_lump(seller_city,30)) %>% group_by(a) %>% count() %>%
ggplot(aes(
x = fct_reorder(a, n),
y = n,
label = n,
fill = a
)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1),
legend.title = element_blank()) + coord_flip() + labs(title = "Number of sellers in each most city") +
xlab("Brazilian city") + ylab("Number of sellers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
#Draw the geographical map for city and state
"Cities"
## [1] "Cities"
##Prepare data set
j_sel<-sellers%>%select_at(vars(seller_city,seller_zip_code_prefix,seller_state))%>% group_by(seller_city)%>%count()
"Cities that have equal and more than 100 sellers"
## [1] "Cities that have equal and more than 100 sellers"
j_geo<- geolocation %>%select_at(vars(geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_state,geolocation_city))%>%unique.data.frame()%>%group_by(geolocation_city)%>%summarise(geolocation_lng=mean(geolocation_lng),geolocatio_lat=mean(geolocation_lat))
"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_sel_geo<- inner_join(j_sel,j_geo,by=c("seller_city"="geolocation_city"))%>%mutate(popup=paste(seller_city,"-",n))
##Draw the map (sellers map_city map)
j_sel_geo%>%leaflet()%>%addTiles()%>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%addCircleMarkers(lng=~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,radius = ~sqrt(n)/5,label=~seller_city,color ="black",fill="white")%>%addPopups(lng = ~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,options = popupOptions(closeOnClick = TRUE))
"States"
## [1] "States"
##Prepare data set
j_sel<-sellers%>%select_at(vars(seller_city,seller_zip_code_prefix,seller_state))%>% group_by(seller_state)%>%count()
"There are 27 states in Brazil"
## [1] "There are 27 states in Brazil"
j_geo<- geolocation %>%select_at(vars(geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_state,geolocation_city))%>%unique.data.frame()%>%group_by(geolocation_state)%>%summarise(geolocation_lng=mean(geolocation_lng),geolocatio_lat=mean(geolocation_lat))
"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_sel_geo<- inner_join(j_sel,j_geo,by=c("seller_state"="geolocation_state"))%>%mutate(popup=paste(seller_state,"-",n))
##Draw the map (sellers map_ state map)
j_sel_geo%>%leaflet()%>%addTiles()%>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%addCircleMarkers(lng=~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,radius = ~sqrt(n)/5,label=~seller_state,color ="black",fill="white")%>%addPopups(lng = ~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,options = popupOptions(closeOnClick = FALSE))
#Number of sellers in each year (The trend of the number)
it<-items%>%mutate(value=price*order_item_id)%>%group_by(order_id,seller_id)%>%summarise(value=sum(value),freight_value=sum(freight_value))%>%ungroup()
j_it_or<- left_join(it,orders,by="order_id")%>%select(seller_id,order_approved_at,freight_value,value)%>%transmute(order_approved_at=as.yearmon(as.Date(order_approved_at,format="%Y-%m-%d")),seller_id,freight_value,value,year=year(order_approved_at))
time_line_dt <-j_it_or %>% group_by(order_approved_at, year) %>% count()%>%drop_na()%>%ungroup()
gg<-time_line_dt %>% ggplot(aes(x = factor(order_approved_at), y = n)) + geom_line(aes(group =
year, color = factor(year)), lwd = 2) + theme(axis.text.x = element_text(angle = 90, vjust =
1,color = "red",size = 12),legend.title = element_blank()) + scale_color_manual(values = c("red", "green", "yellow")) + geom_point(fill ="darkorchid", size = 3,shape=21)+xlab("Date")+ylab("Number of Userid")+labs(title="The increasing of number sellers in each year")+scale_y_continuous(labels = comma)
ggplotly(gg)
gg+coord_polar()