Preparation: Libraries and Datasets

#Set working directory
setwd("C:/Users/DellPC/Desktop/brazilian-ecommerce")

#Load libraries

suppressMessages(library(tidyverse))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(scales))
suppressMessages(library(plotly))
suppressMessages(library(gridExtra))
suppressMessages(library(crosstalk))
suppressMessages(library(kableExtra))
suppressMessages(library(DT))
suppressMessages(library(leaflet))
suppressMessages(library(lubridate))
suppressMessages(library(RColorBrewer))
suppressMessages(library(zoo))

theme_set(theme_minimal())
#Load datasets

customers <- read.csv("olist_customers_dataset.csv", header = TRUE)
geolocation <-
               read.csv("olist_geolocation_dataset.csv", header = TRUE)
items <- read.csv("olist_order_items_dataset.csv", header = TRUE)
payments <-
               read.csv("olist_order_payments_dataset.csv", header = TRUE)
reviews <-
               read.csv("olist_order_reviews_dataset.csv", header = TRUE)
orders <- read.csv("olist_orders_dataset.csv", header = TRUE)
products <- read.csv("olist_products_dataset.csv", header = TRUE)
sellers <- read.csv("olist_sellers_dataset.csv", header = TRUE)
product_category_name <-
               read.csv("product_category_name_translation.csv", header = TRUE)

# Standardize the datasets

customers <-
               customers %>% mutate(customer_city = str_to_title(customer_city))
geolocation <-
               geolocation %>% mutate(geolocation_city = str_to_title(geolocation_city))
sellers <-
               sellers %>% transmute(
                              seller_city = str_to_title(seller_city),
                              seller_id = seller_id,
                              seller_zip_code_prefix = seller_zip_code_prefix,
                              seller_state = seller_state
               )
items <-
               items %>% mutate(shipping_limit_date = as.POSIXct(shipping_limit_date, format =
                                                                                "Y-%m-%d %H:%m:%s"))
orders <-
               orders %>% mutate(
                              order_purchase_timestamp = as.POSIXct(
                                             order_purchase_timestamp,
                                             format = "Y-%m-%d %H:%m:%s",
                                             order_approved_at = as.POSIXct(order_approved_at, format = "Y-%m-%d %H:%m:%s"),
                                             order_delivered_carrier_date = as.POSIXct(
                                                            order_delivered_carrier_date,
                                                            format = "Y-%m-%d %H:%m:%s",
                                                            order_delivered_customer_date = as.POSIXct(order_estimated_delivery_date, format =
                                                                                                                      "Y-%m-%d %H:%m:%s"),
                                                            order_estimated_delivery_date = as.POSIXct(order_estimated_delivery_date, format =
                                                                                                                      "Y-%m-%d %H:%m:%s")
                                             )
                              )
               )
payments <-
               payments %>% mutate(
                              payment_type = fct_recode(
                                             payment_type,
                                             "Credit card" = "credit_card",
                                             "Boleto" = "boleto",
                                             "Debit card" = "debit_card",
                                             "Not defined" = "not_defined",
                                             "Voucher" = "voucher"
                              )
               )
product_category_name <-
               product_category_name %>% transmute(
                              product_category_name = ï..product_category_name,
                              product_category_name_english = product_category_name_english
               ) %>% mutate(product_category_name_english = str_to_title(product_category_name_english))
products <-
               products %>% transmute(
                              product_category_name = str_to_title(product_category_name),
                              weight = product_weight_g,
                              quantity = product_photos_qty,
                              weight = product_weight_g,
                              height = product_height_cm,
                              width = product_width_cm
               )
reviews <-
               reviews %>% transmute(
                              score = review_score,
                              comment = review_comment_message,
                              creation_date = review_creation_date,
                              answer = review_answer_timestamp
               ) %>% mutate(creation_date = as.POSIXct(creation_date, format = "%Y-%m-%d %H:%M:%s"))

Customers Perspective

#Statistic in column for city and state
gg <-
               customers %>% group_by(customer_state) %>% count() %>% arrange(desc(n)) %>%
               ggplot(aes(
                              x = fct_reorder(customer_state, n),
                              y = n,
                              label = n,
                              fill = customer_state
               )) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1)) +
               coord_flip() + labs(title = "Number of customers in each state") + xlab("Brazilian State") +
               ylab("Number of customers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
gg <-
               customers %>% transmute(a = fct_lump(customer_city, 30)) %>% group_by(a) %>%
               count() %>% ggplot(aes(
                              x = fct_reorder(a, n),
                              y = n,
                              label = n,
                              fill = a
               )) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1), legend.title = element_blank()) +
               coord_flip() + labs(title = "Number of customers in each most city") + xlab("Brazilian city") +
               ylab("Number of customers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
#Draw the geographical map for city and state

"Cities"
## [1] "Cities"
##Prepare data set
j_cus <-
               customers %>% select_at(vars(customer_city, customer_zip_code_prefix, customer_state)) %>% group_by(customer_city) %>%
               count() %>% filter(n > 100)

"Cities that have equal and more than 100 customers"
## [1] "Cities that have equal and more than 100 customers"
j_geo <-
               geolocation %>% select_at(
                              vars(
                                             geolocation_zip_code_prefix,
                                             geolocation_lat,
                                             geolocation_lng,
                                             geolocation_state,
                                             geolocation_city
                              )
               ) %>% unique.data.frame() %>% group_by(geolocation_city) %>% summarise(
                              geolocation_lng = mean(geolocation_lng),
                              geolocatio_lat = mean(geolocation_lat)
               )

"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_cus_geo <-
               inner_join(j_cus, j_geo, by = c("customer_city" = "geolocation_city")) %>%
               mutate(popup = paste(customer_city, "-", n))

##Draw the map (customer_city)

               j_cus_geo %>% leaflet() %>% addTiles() %>% addProviderTiles(provider = providers$Esri.WorldTopoMap) %>%
               addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%
               addCircleMarkers(
                              lng =  ~ geolocation_lng,
                              lat =  ~ geolocatio_lat,
                              popup = ~ popup,
                              radius = ~ sqrt(n) / 5,
                              label =  ~ customer_city,
                              color = "black",
                              fill = "white"
               ) %>% addPopups(
                              lng = ~ geolocation_lng,
                              lat =  ~ geolocatio_lat,
                              popup = ~ popup,
                              options = popupOptions(closeOnClick = TRUE)
               )
"States"
## [1] "States"
##Prepare data set
j_cus <-
               customers %>% select_at(vars(customer_city, customer_zip_code_prefix, customer_state)) %>% group_by(customer_state) %>%
               count()

"There are 27 states in Brazil"
## [1] "There are 27 states in Brazil"
j_geo <-
               geolocation %>% select_at(
                              vars(
                                             geolocation_zip_code_prefix,
                                             geolocation_lat,
                                             geolocation_lng,
                                             geolocation_state,
                                             geolocation_city
                              )
               ) %>% unique.data.frame() %>% group_by(geolocation_state) %>% summarise(
                              geolocation_lng = mean(geolocation_lng),
                              geolocatio_lat = mean(geolocation_lat)
               )

"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_cus_geo <-
               inner_join(j_cus, j_geo, by = c("customer_state" = "geolocation_state")) %>%
               mutate(popup = paste(customer_state, "-", n))

##Draw the map (state_map)

               j_cus_geo %>% leaflet() %>% addTiles() %>% addProviderTiles(provider = providers$OpenMapSurfer.Hillshade) %>%
               addProviderTiles(provider = providers$HERE.mapLabels) %>% addProviderTiles(providers$Stamen.TonerLines) %>%
               addCircleMarkers(
                              lng =  ~ geolocation_lng,
                              lat =  ~ geolocatio_lat,
                              popup = ~ popup,
                              radius = ~ sqrt(n) / 5,
                              label =  ~ customer_state,
                              color = "black",
                              fill = "white"
               ) %>% addPopups(
                              lng = ~ geolocation_lng,
                              lat =  ~ geolocatio_lat,
                              popup = ~ popup,
                              options = popupOptions(closeOnClick = FALSE)
               )
#Payments in each city and state
j_p_o <- left_join(payments, orders, by = "order_id")
j_p_o <-
               j_p_o %>% select(order_id, payment_type, payment_value, order_status)

j_o_c <- left_join(orders, customers, by = c("customer_id"))
j_o_c <- j_o_c %>% select(order_id, customer_zip_code_prefix)

j_o_g <-
               left_join(
                              customers,
                              geolocation,
                              by = c("customer_zip_code_prefix" = "geolocation_zip_code_prefix")
               )
j_o_g <-
               j_o_g %>% select(
                              customer_zip_code_prefix,
                              geolocation_city,
                              geolocation_state,
                              geolocation_lat,
                              geolocation_lng
               ) %>% group_by(customer_zip_code_prefix,
                              geolocation_city,
                              geolocation_state) %>% summarise(
                                             geolocation_lng = mean(geolocation_lng),
                                             geolocation_lat = mean(geolocation_lat)
                              )

j1 <-
               left_join(j_p_o, j_o_c, by = "order_id") %>% group_by(customer_zip_code_prefix, payment_type) %>%
               summarise(payment_value = mean(payment_value))
j2 <-
               left_join(j1, j_o_g, by = "customer_zip_code_prefix") %>% group_by(payment_type, geolocation_state) %>%
               summarise(
                              geolocation_lng = mean(geolocation_lng),
                              geolocation_lat = mean(geolocation_lat),
                              payment_value = mean(payment_value)
               ) %>% drop_na()

values <-
               c(brewer.pal(9, "Set1"),
                 brewer.pal(9, "RdPu"),
                 brewer.pal(9, "YlOrBr"))

               j2 %>% leaflet() %>% addTiles() %>% addCircleMarkers(
                              lng =  ~ geolocation_lng,
                              lat =  ~ geolocation_lat,
                              label =  ~ paste(payment_type, "-", geolocation_state),
                              radius = ~ sqrt(payment_value),
                              group = ~ geolocation_state,
                              color = ~ values
               ) %>% addLayersControl(overlayGroups = c(j2$geolocation_state, j2$payment_type))
"Duplicated recoreds, Because of the first time in cleaning mistakes, I can't visualize it within city. Likewise, State scope is a potential choice"
## [1] "Duplicated recoreds, Because of the first time in cleaning mistakes, I can't visualize it within city. Likewise, State scope is a potential choice"
#Number of customers in each year (The trend of the number)

##Is customer_id unique in orders dataset?
table(duplicated(orders$customer_id))
## 
## FALSE 
## 99441
"Yes"
## [1] "Yes"
time_line_dt <-
               orders %>% select(customer_id, order_approved_at) %>% mutate(order_approved_at =
                                                                                           as.yearmon(as.Date(order_approved_at)),
                                                                            year = year(as.Date(order_approved_at))) %>% group_by(order_approved_at, year) %>%
               count()%>%drop_na()%>%ungroup()

gg<-time_line_dt %>% ggplot(aes(x = factor(order_approved_at), y = n)) + geom_line(aes(group =
                                                                                       year, color = factor(year)), lwd = 2) + theme(axis.text.x = element_text(angle = 90, vjust =
                                                                                                                                                                    1,color = "red",size = 12),legend.title = element_blank()) + scale_color_manual(values = c("red", "green", "yellow")) + geom_point(fill ="darkorchid", size = 3,shape=21)+xlab("Date")+ylab("Number of Userid")+labs(title="The increasing of number userid in each year")+scale_y_continuous(labels = comma)



ggplotly(gg)
gg+coord_polar()

Sellers Perspective

#Statistic in column for city and state
gg <-
               sellers %>% group_by(seller_state) %>% count() %>% arrange(desc(n)) %>% ggplot(aes(
                              x = fct_reorder(seller_state, n),
                              y = n,
                              label = n,
                              fill = seller_state
               )) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1)) +
               coord_flip() + labs(title = "Number of sellers in each state") + xlab("Brazilian State") +
               ylab("Number of sellers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
gg <-
               sellers %>% mutate(a = fct_lump(seller_city,30)) %>% group_by(a) %>% count() %>%
               ggplot(aes(
                              x = fct_reorder(a, n),
                              y = n,
                              label = n,
                              fill = a
               )) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 1),
                                       legend.title = element_blank()) + coord_flip() + labs(title = "Number of sellers in each most city") +
               xlab("Brazilian city") + ylab("Number of sellers ") + scale_y_continuous(labels = comma)
ggplotly(gg)
#Draw the geographical map for city and state

"Cities"
## [1] "Cities"
##Prepare data set
j_sel<-sellers%>%select_at(vars(seller_city,seller_zip_code_prefix,seller_state))%>% group_by(seller_city)%>%count()

"Cities that have equal and more than 100 sellers"
## [1] "Cities that have equal and more than 100 sellers"
j_geo<- geolocation %>%select_at(vars(geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_state,geolocation_city))%>%unique.data.frame()%>%group_by(geolocation_city)%>%summarise(geolocation_lng=mean(geolocation_lng),geolocatio_lat=mean(geolocation_lat))

"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_sel_geo<- inner_join(j_sel,j_geo,by=c("seller_city"="geolocation_city"))%>%mutate(popup=paste(seller_city,"-",n))

##Draw the map (sellers map_city map)

j_sel_geo%>%leaflet()%>%addTiles()%>%addProviderTiles(provider = providers$Esri.WorldTopoMap)%>%addProviderTiles(provider = providers$HERE.mapLabels)%>%addProviderTiles(providers$Stamen.TonerLines)%>%addCircleMarkers(lng=~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,radius = ~sqrt(n)/5,label=~seller_city,color ="black",fill="white")%>%addPopups(lng = ~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,options = popupOptions(closeOnClick = TRUE))
"States"
## [1] "States"
##Prepare data set
j_sel<-sellers%>%select_at(vars(seller_city,seller_zip_code_prefix,seller_state))%>% group_by(seller_state)%>%count()

"There are 27 states in Brazil"
## [1] "There are 27 states in Brazil"
j_geo<- geolocation %>%select_at(vars(geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_state,geolocation_city))%>%unique.data.frame()%>%group_by(geolocation_state)%>%summarise(geolocation_lng=mean(geolocation_lng),geolocatio_lat=mean(geolocation_lat))

"This dataset have some duplicated rows, the long and lat of different cities are the same"
## [1] "This dataset have some duplicated rows, the long and lat of different cities are the same"
j_sel_geo<- inner_join(j_sel,j_geo,by=c("seller_state"="geolocation_state"))%>%mutate(popup=paste(seller_state,"-",n))

##Draw the map (sellers map_ state map)

j_sel_geo%>%leaflet()%>%addTiles()%>%addProviderTiles(provider = providers$OpenMapSurfer.Hillshade)%>%addProviderTiles(provider = providers$HERE.mapLabels)%>%addProviderTiles(providers$Stamen.TonerLines)%>%addCircleMarkers(lng=~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,radius = ~sqrt(n)/5,label=~seller_state,color ="black",fill="white")%>%addPopups(lng = ~geolocation_lng,lat=~geolocatio_lat,popup = ~popup,options = popupOptions(closeOnClick = FALSE))
#Number of sellers in each year (The trend of the number)

it<-items%>%mutate(value=price*order_item_id)%>%group_by(order_id,seller_id)%>%summarise(value=sum(value),freight_value=sum(freight_value))%>%ungroup()

j_it_or<- left_join(it,orders,by="order_id")%>%select(seller_id,order_approved_at,freight_value,value)%>%transmute(order_approved_at=as.yearmon(as.Date(order_approved_at,format="%Y-%m-%d")),seller_id,freight_value,value,year=year(order_approved_at))



time_line_dt <-j_it_or %>% group_by(order_approved_at, year) %>% count()%>%drop_na()%>%ungroup()

gg<-time_line_dt %>% ggplot(aes(x = factor(order_approved_at), y = n)) + geom_line(aes(group =
                                                                                       year, color = factor(year)), lwd = 2) + theme(axis.text.x = element_text(angle = 90, vjust =
                                                                                                                                                                    1,color = "red",size = 12),legend.title = element_blank()) + scale_color_manual(values = c("red", "green", "yellow")) + geom_point(fill ="darkorchid", size = 3,shape=21)+xlab("Date")+ylab("Number of Userid")+labs(title="The increasing of number sellers in each year")+scale_y_continuous(labels = comma)

ggplotly(gg)
gg+coord_polar()