Preparation: Libraries and Datasets
#Set working directory
setwd("C:/Users/DellPC/Desktop/brazilian-ecommerce")
#Load libraries
suppressMessages(library(tidyverse))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(scales))
suppressMessages(library(plotly))
suppressMessages(library(gridExtra))
suppressMessages(library(crosstalk))
suppressMessages(library(kableExtra))
suppressMessages(library(DT))
suppressMessages(library(leaflet))
suppressMessages(library(lubridate))
suppressMessages(library(RColorBrewer))
suppressMessages(library(zoo))
theme_set(theme_minimal())
#Load datasets
customers <- read.csv("olist_customers_dataset.csv", header = TRUE)
geolocation <-
read.csv("olist_geolocation_dataset.csv", header = TRUE)
items <- read.csv("olist_order_items_dataset.csv", header = TRUE)
payments <-
read.csv("olist_order_payments_dataset.csv", header = TRUE)
reviews <-
read.csv("olist_order_reviews_dataset.csv", header = TRUE)
orders <- read.csv("olist_orders_dataset.csv", header = TRUE)
products <- read.csv("olist_products_dataset.csv", header = TRUE)
sellers <- read.csv("olist_sellers_dataset.csv", header = TRUE)
product_category_name <-
read.csv("product_category_name_translation.csv", header = TRUE)
# Standardize the datasets
customers <-
customers %>% mutate(customer_city = str_to_title(customer_city))
geolocation <-
geolocation %>% mutate(geolocation_city = str_to_title(geolocation_city))
sellers <-
sellers %>% transmute(
seller_city = str_to_title(seller_city),
seller_id = seller_id,
seller_zip_code_prefix = seller_zip_code_prefix,
seller_state = seller_state
)
items <-
items %>% mutate(shipping_limit_date = as.POSIXct(shipping_limit_date, format =
"Y-%m-%d %H:%m:%s"))
orders <-
orders %>% mutate(
order_purchase_timestamp = as.POSIXct(
order_purchase_timestamp,
format = "Y-%m-%d %H:%m:%s",
order_approved_at = as.POSIXct(order_approved_at, format = "Y-%m-%d %H:%m:%s"),
order_delivered_carrier_date = as.POSIXct(
order_delivered_carrier_date,
format = "Y-%m-%d %H:%m:%s",
order_delivered_customer_date = as.POSIXct(order_estimated_delivery_date, format =
"Y-%m-%d %H:%m:%s"),
order_estimated_delivery_date = as.POSIXct(order_estimated_delivery_date, format =
"Y-%m-%d %H:%m:%s")
)
)
)
payments <-
payments %>% mutate(
payment_type = fct_recode(
payment_type,
"Credit card" = "credit_card",
"Boleto" = "boleto",
"Debit card" = "debit_card",
"Not defined" = "not_defined",
"Voucher" = "voucher"
)
)
product_category_name <-
product_category_name %>% transmute(
product_category_name = ï..product_category_name,
product_category_name_english = product_category_name_english
) %>% mutate(product_category_name_english = str_to_title(product_category_name_english))
products <-
products %>% transmute(
product_category_name = str_to_title(product_category_name),
weight = product_weight_g,
quantity = product_photos_qty,
weight = product_weight_g,
height = product_height_cm,
width = product_width_cm
)
reviews <-
reviews %>% transmute(
score = review_score,
comment = review_comment_message,
creation_date = review_creation_date,
answer = review_answer_timestamp
) %>% mutate(creation_date = as.POSIXct(creation_date, format = "%Y-%m-%d %H:%M:%s"))
Customers Perspective
#Statistic in column for city and state
gg <-
customers %>% group_by(customer_state) %>% count() %>% arrange(desc(n)) %>%
ggplot(aes(
x = fct_reorder(customer_state, n),
y = n,
label = n,
fill = customer_state
)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1)) +
coord_flip() + labs(title = "Number of customers in each state") + xlab("Brazilian State") +
ylab("Number of customers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
gg <-
customers %>% transmute(a = fct_lump(customer_city, 30)) %>% group_by(a) %>%
count() %>% ggplot(aes(
x = fct_reorder(a, n),
y = n,
label = n,
fill = a
)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1), legend.title = element_blank()) +
coord_flip() + labs(title = "Number of customers in each most city") + xlab("Brazilian city") +
ylab("Number of customers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
#Draw the geographical map for city and state
"Cities"
## [1] "Cities"
##Prepare data set
j_cus <-
customers %>% select_at(vars(customer_city, customer_zip_code_prefix, customer_state)) %>% group_by(customer_city) %>%
count() %>% filter(n > 100)
"Cities that have equal and more than 100 customers"
## [1] "Cities that have equal and more than 100 customers"
j_geo <-
geolocation %>% select_at(
vars(
geolocation_zip_code_prefix,
geolocation_lat,
geolocation_lng,
geolocation_state,
geolocation_city
)
) %>% unique.data.frame() %>% group_by(geolocation_city) %>% summarise(
geolocation_lng = mean(geolocation_lng),
geolocatio_lat = mean(geolocation_lat)
)
"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_cus_geo <-
inner_join(j_cus, j_geo, by = c("customer_city" = "geolocation_city")) %>%
mutate(popup = paste(customer_city, "-", n))
##Draw the map (customer_city)
j_cus_geo %>% leaflet() %>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%
addCircleMarkers(
lng = ~ geolocation_lng,
lat = ~ geolocatio_lat,
popup = ~ popup,
radius = ~ sqrt(n) / 5,
label = ~ customer_city,
color = "black",
fill = "white"
) %>% addPopups(
lng = ~ geolocation_lng,
lat = ~ geolocatio_lat,
popup = ~ popup,
options = popupOptions(closeOnClick = TRUE)
)
"States"
## [1] "States"
##Prepare data set
j_cus <-
customers %>% select_at(vars(customer_city, customer_zip_code_prefix, customer_state)) %>% group_by(customer_state) %>%
count()
"There are 27 states in Brazil"
## [1] "There are 27 states in Brazil"
j_geo <-
geolocation %>% select_at(
vars(
geolocation_zip_code_prefix,
geolocation_lat,
geolocation_lng,
geolocation_state,
geolocation_city
)
) %>% unique.data.frame() %>% group_by(geolocation_state) %>% summarise(
geolocation_lng = mean(geolocation_lng),
geolocatio_lat = mean(geolocation_lat)
)
"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_cus_geo <-
inner_join(j_cus, j_geo, by = c("customer_state" = "geolocation_state")) %>%
mutate(popup = paste(customer_state, "-", n))
##Draw the map (state_map)
j_cus_geo %>% leaflet() %>% addTiles() %>% addProviderTiles(provider = providers$OpenMapSurfer.Hillshade) %>%
addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%
addCircleMarkers(
lng = ~ geolocation_lng,
lat = ~ geolocatio_lat,
popup = ~ popup,
radius = ~ sqrt(n) / 5,
label = ~ customer_state,
color = "black",
fill = "white"
) %>% addPopups(
lng = ~ geolocation_lng,
lat = ~ geolocatio_lat,
popup = ~ popup,
options = popupOptions(closeOnClick = FALSE)
)
#Payments in each city and state
j_p_o <- left_join(payments, orders, by = "order_id")
j_p_o <-
j_p_o %>% select(order_id, payment_type, payment_value, order_status)
j_o_c <- left_join(orders, customers, by = c("customer_id"))
j_o_c <- j_o_c %>% select(order_id, customer_zip_code_prefix)
j_o_g <-
left_join(
customers,
geolocation,
by = c("customer_zip_code_prefix" = "geolocation_zip_code_prefix")
)
j_o_g <-
j_o_g %>% select(
customer_zip_code_prefix,
geolocation_city,
geolocation_state,
geolocation_lat,
geolocation_lng
) %>% group_by(customer_zip_code_prefix,
geolocation_city,
geolocation_state) %>% summarise(
geolocation_lng = mean(geolocation_lng),
geolocation_lat = mean(geolocation_lat)
)
j1 <-
left_join(j_p_o, j_o_c, by = "order_id") %>% group_by(customer_zip_code_prefix, payment_type) %>%
summarise(payment_value = mean(payment_value))
j2 <-
left_join(j1, j_o_g, by = "customer_zip_code_prefix") %>% group_by(payment_type, geolocation_state) %>%
summarise(
geolocation_lng = mean(geolocation_lng),
geolocation_lat = mean(geolocation_lat),
payment_value = mean(payment_value)
) %>% drop_na()
values <-
c(brewer.pal(9, "Set1"),
brewer.pal(9, "RdPu"),
brewer.pal(9, "YlOrBr"))
j2 %>% leaflet() %>% addTiles() %>% addCircleMarkers(
lng = ~ geolocation_lng,
lat = ~ geolocation_lat,
label = ~ paste(payment_type, "-", geolocation_state),
radius = ~ sqrt(payment_value),
group = ~ geolocation_state,
color = ~ values
) %>% addLayersControl(overlayGroups = c(j2$geolocation_state, j2$payment_type))
"Duplicated recoreds, Because of the first time in cleaning mistakes, I can't visualize it within city. Likewise, State scope is a potential choice"
## [1] "Duplicated recoreds, Because of the first time in cleaning mistakes, I can't visualize it within city. Likewise, State scope is a potential choice"
#Number of customers in each year (The trend of the number)
##Is customer_id unique in orders dataset?
table(duplicated(orders$customer_id))
##
## FALSE
## 99441
"Yes"
## [1] "Yes"
time_line_dt <-
orders %>% select(customer_id, order_approved_at) %>% mutate(order_approved_at =
as.yearmon(as.Date(order_approved_at)),
year = year(as.Date(order_approved_at))) %>% group_by(order_approved_at, year) %>%
count()%>%drop_na()%>%ungroup()
gg<-time_line_dt %>% ggplot(aes(x = factor(order_approved_at), y = n)) + geom_line(aes(group =
year, color = factor(year)), lwd = 2) + theme(axis.text.x = element_text(angle = 90, vjust =
1,color = "red",size = 12),legend.title = element_blank()) + scale_color_manual(values = c("red", "green", "yellow")) + geom_point(fill ="darkorchid", size = 3,shape=21)+xlab("Date")+ylab("Number of Userid")+labs(title="The increasing of number userid in each year")+scale_y_continuous(labels = comma)
ggplotly(gg)
gg+coord_polar()

Sellers Perspective
#Statistic in column for city and state
gg <-
sellers %>% group_by(seller_state) %>% count() %>% arrange(desc(n)) %>% ggplot(aes(
x = fct_reorder(seller_state, n),
y = n,
label = n,
fill = seller_state
)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1)) +
coord_flip() + labs(title = "Number of sellers in each state") + xlab("Brazilian State") +
ylab("Number of sellers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
gg <-
sellers %>% mutate(a = fct_lump(seller_city,30)) %>% group_by(a) %>% count() %>%
ggplot(aes(
x = fct_reorder(a, n),
y = n,
label = n,
fill = a
)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1),
legend.title = element_blank()) + coord_flip() + labs(title = "Number of sellers in each most city") +
xlab("Brazilian city") + ylab("Number of sellers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
#Draw the geographical map for city and state
"Cities"
## [1] "Cities"
##Prepare data set
j_sel<-sellers%>%select_at(vars(seller_city,seller_zip_code_prefix,seller_state))%>% group_by(seller_city)%>%count()
"Cities that have equal and more than 100 sellers"
## [1] "Cities that have equal and more than 100 sellers"
j_geo<- geolocation %>%select_at(vars(geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_state,geolocation_city))%>%unique.data.frame()%>%group_by(geolocation_city)%>%summarise(geolocation_lng=mean(geolocation_lng),geolocatio_lat=mean(geolocation_lat))
"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_sel_geo<- inner_join(j_sel,j_geo,by=c("seller_city"="geolocation_city"))%>%mutate(popup=paste(seller_city,"-",n))
##Draw the map (sellers map_city map)
j_sel_geo%>%leaflet()%>%addTiles()%>%addProviderTiles(provider = providers$Esri.WorldTopoMap)%>%addProviderTiles(provider = providers$HERE.mapLabels)%>%addProviderTiles(providers$Stamen.TonerLines)%>%addCircleMarkers(lng=~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,radius = ~sqrt(n)/5,label=~seller_city,color ="black",fill="white")%>%addPopups(lng = ~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,options = popupOptions(closeOnClick = TRUE))
"States"
## [1] "States"
##Prepare data set
j_sel<-sellers%>%select_at(vars(seller_city,seller_zip_code_prefix,seller_state))%>% group_by(seller_state)%>%count()
"There are 27 states in Brazil"
## [1] "There are 27 states in Brazil"
j_geo<- geolocation %>%select_at(vars(geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_state,geolocation_city))%>%unique.data.frame()%>%group_by(geolocation_state)%>%summarise(geolocation_lng=mean(geolocation_lng),geolocatio_lat=mean(geolocation_lat))
"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_sel_geo<- inner_join(j_sel,j_geo,by=c("seller_state"="geolocation_state"))%>%mutate(popup=paste(seller_state,"-",n))
##Draw the map (sellers map_ state map)
j_sel_geo%>%leaflet()%>%addTiles()%>%addProviderTiles(provider = providers$OpenMapSurfer.Hillshade)%>%addProviderTiles(provider = providers$HERE.mapLabels)%>%addProviderTiles(providers$Stamen.TonerLines)%>%addCircleMarkers(lng=~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,radius = ~sqrt(n)/5,label=~seller_state,color ="black",fill="white")%>%addPopups(lng = ~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,options = popupOptions(closeOnClick = FALSE))
#Number of sellers in each year (The trend of the number)
it<-items%>%mutate(value=price*order_item_id)%>%group_by(order_id,seller_id)%>%summarise(value=sum(value),freight_value=sum(freight_value))%>%ungroup()
j_it_or<- left_join(it,orders,by="order_id")%>%select(seller_id,order_approved_at,freight_value,value)%>%transmute(order_approved_at=as.yearmon(as.Date(order_approved_at,format="%Y-%m-%d")),seller_id,freight_value,value,year=year(order_approved_at))
time_line_dt <-j_it_or %>% group_by(order_approved_at, year) %>% count()%>%drop_na()%>%ungroup()
gg<-time_line_dt %>% ggplot(aes(x = factor(order_approved_at), y = n)) + geom_line(aes(group =
year, color = factor(year)), lwd = 2) + theme(axis.text.x = element_text(angle = 90, vjust =
1,color = "red",size = 12),legend.title = element_blank()) + scale_color_manual(values = c("red", "green", "yellow")) + geom_point(fill ="darkorchid", size = 3,shape=21)+xlab("Date")+ylab("Number of Userid")+labs(title="The increasing of number sellers in each year")+scale_y_continuous(labels = comma)
ggplotly(gg)
gg+coord_polar()
