E-commerce is a trending business sector, of electronically buying and selling products over the internet. This fastgrowing industry has also adapted features like online payments, internet marketing, door-step delivery, returns and exchanges, etc. Since it is a widespread industry that adds to human convenience and luxuries, there is a lot of scope for improvements and business growth, which brings profits to both consumers and industry.
This is a Brazilian ecommerce public dataset of orders made at Olist Store. The dataset has information of 100k orders from 2016 to 2018 made at multiple marketplaces in Brazil. The Dataset contains multiple tables providing information on actual ecommerce purchases along with various other dimensions of a particular purchase like Customer Information, Seller Information, Product Metadata as well as Customer reviews on their purchase experience. The dataset also has geolocation file that relates Brazilian zip codes to latitude/longitude coordinates.
a <- items_data %>% left_join(orders_data) %>% mutate(mny = format(strptime(order_purchase_timestamp, "%Y-%m-%d %H:%M:%S"),'%Y-%m')) %>% group_by(mny) %>% summarise(total = sum(price)) %>% arrange(mny)
brazil_holidays <- brazil_holidays_data %>% mutate(mny = strftime(Date, format = "%Y-%m")) %>% group_by(mny) %>% mutate(holidays_by_week = paste0(Holiday, collapse = ",")) %>% select(mny, holidays_by_week)
brazil_holidays <- brazil_holidays[!duplicated(brazil_holidays$mny),]
p <- plot_ly(a, x = ~mny, y = ~total, type = 'scatter', mode = 'lines', color = "red")
p <- p %>%
add_trace(
type = 'bar',
x = brazil_holidays$mny,
y = 1000000,
text = brazil_holidays$holidays_by_week,
hoverinfo = 'text',
marker = list(color='yellow'),
showlegend = F,
width = 0.3
) %>% layout(xaxis = list(autotick = F, dtick = 1)) %>%
layout(xaxis = list(title = 'Time (Year - Month)',
autotick = F, dtick = 1),
yaxis = list(title = 'Total Purchase (in $)'))
p
Clearly, the sale increases significantly when there is an event during a particular month.
order_weekday <- orders_data %>% mutate(purchase_weekday = wday(order_purchase_timestamp), purchase_hour = format(strptime(order_purchase_timestamp, "%Y-%m-%d %H:%M:%S"),'%H')) %>% group_by(purchase_weekday, purchase_hour) %>% summarise(Count = n())
p <- plot_ly(data = order_weekday,
x = ~purchase_hour,
y = ~purchase_weekday,
z = ~Count,
type = "heatmap",
width = 1050,
height = 500,
colors = colorRamp(c("white","yellow", "red"))) %>%
layout(xaxis = list(title = 'Hour'),
yaxis = list(title = 'Day', tickvals = c(1, 2, 3, 4, 5, 6, 7), ticktext = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))) %>%
add_annotations(x = order_weekday$purchase_hour, y = order_weekday$purchase_weekday, text = order_weekday$Count, xref = 'x', yref = 'y', showarrow = FALSE, font=list(color='black'))
p
The heatmap indiciates that the majority of the transactions occur over the weekdays during office hours. We can thus increases our marketing strategies during this period.
# customer's geo location data
customers_geo_data <- customers_data %>%
left_join(geo_data,by=c("customer_state"="geolocation_state",
"customer_city"="geolocation_city",
"customer_zip_code_prefix" ="geolocation_zip_code_prefix")) %>%
rename("customer_geolocation_lat" = "geolocation_lat",
"customer_geolocation_lng" = "geolocation_lng" ) %>%
drop_na(customer_geolocation_lat,customer_geolocation_lng)
# seller's geo location data
sellers_geo_data <- sellers_data %>%
left_join(geo_data,by=c("seller_state"="geolocation_state",
"seller_city"="geolocation_city",
"seller_zip_code_prefix" ="geolocation_zip_code_prefix")) %>%
rename("seller_geolocation_lat" = "geolocation_lat",
"seller_geolocation_lng" = "geolocation_lng" ) %>%
drop_na(seller_geolocation_lat,seller_geolocation_lng)
# data manipulation data frame for sellers and orders delivered by these sellers
seller_orders_data <- items_data %>% left_join(sellers_geo_data, by="seller_ID") %>%
left_join(orders_data, by="order_ID") %>%
left_join(customers_geo_data, by="customer_ID") %>%
drop_na(order_approved_at,order_purchase_timestamp,order_delivered_carrier_date,
order_delivered_customer_date,seller_geolocation_lat,seller_geolocation_lng,
customer_geolocation_lat,customer_geolocation_lng) %>%
filter(order_status == "delivered") %>%
mutate(approved_in_days = round(difftime(strptime(order_approved_at,
format = "%Y-%m-%d %H:%M:%S"),
strptime(order_purchase_timestamp,
format="%Y-%m-%d %H:%M:%S"),
units="days")),
delivered_in_days = round(difftime(strptime(order_delivered_customer_date,
format = "%Y-%m-%d %H:%M:%S"),
strptime(order_purchase_timestamp,
format="%Y-%m-%d %H:%M:%S"),
units="days")),
estimated_in_days = round(difftime(strptime(order_estimated_delivery_date,
format = "%Y-%m-%d"),
strptime(order_purchase_timestamp,
format="%Y-%m-%d %H:%M:%S"),
units="days")),
del_to_carrier_in = round(difftime(strptime(order_delivered_carrier_date,
format = "%Y-%m-%d %H:%M:%S"),
strptime(order_approved_at,
format="%Y-%m-%d %H:%M:%S"),
units="days")),
delay = round(difftime(strptime(order_delivered_customer_date,
format = "%Y-%m-%d %H:%M:%S"),
strptime(order_estimated_delivery_date,
format="%Y-%m-%d"),
units="days"))) %>%
separate(order_purchase_timestamp , into=c("purchase_year",
"purchase_month", "purchase_date"),
sep="-") %>%
separate(purchase_date , into=c("purchase_date", "purchase_time"), sep=" ") %>%
select(seller_ID, seller_city, seller_zip_code_prefix, seller_state,
seller_geolocation_lat, seller_geolocation_lng,
customer_ID, customer_city, customer_state,customer_zip_code_prefix,
customer_geolocation_lat,customer_geolocation_lng,
order_ID, product_ID, approved_in_days,delivered_in_days,
estimated_in_days, del_to_carrier_in, delay,
purchase_year,purchase_month, purchase_date) %>%
mutate(delay = as.factor(if_else(delay <= 0 , 0, 1)))
# Top 10 sellers
top_sellers <- seller_orders_data %>% group_by(seller_ID) %>%
summarise(total_orders=n()) %>% top_n(10,total_orders)
# Top 10 Product Categories
products_data_english <- left_join(products_data,translations_data, by = c("product_category_name"))
products_data_eng_items <- left_join(items_data,products_data_english, by = "product_ID")
Top10_Product <- products_data_eng_items %>%
filter(!is.na(product_category_name_english)) %>%
group_by(product_category_name_english) %>%
tally(sort = TRUE) %>%
top_n(10)
# Yearly Analysis
yearly_product_sold <- seller_orders_data %>%
left_join(products_data, by="product_ID") %>%
left_join(translations_data,by="product_category_name") %>%
group_by(seller_ID, purchase_year, purchase_month) %>%
filter(seller_ID %in% top_sellers$seller_ID)%>%
mutate(seller_name = paste0("Seller_", seller_ID),
product_category_name_english = ifelse(as.character(product_category_name_english) %in% Top10_Product$product_category_name_english, as.character(product_category_name_english), "Others"))
# Plot
top_seller_yearly_categories <- ggplot(yearly_product_sold,
aes(x=purchase_year, fill=product_category_name_english,
name="Product Categories")) +
geom_bar() + facet_grid(. ~ seller_name) +
labs(x = "Year", y= "Product Count")+
guides(fill=guide_legend("Product Categories"))
ggplotly(top_seller_yearly_categories, height = 800, width = 1200)
# Delivery distribution of top 10 sellers
delivery_distribution <- top_sellers %>%
left_join(seller_orders_data,by="seller_ID") %>% group_by(seller_ID) %>%
select(seller_ID,order_ID,approved_in_days,
delivered_in_days,estimated_in_days, del_to_carrier_in)%>%
mutate(seller_name = paste0("Seller_", seller_ID))
# Average Delivery Estimation by top 10 Sellers
mean_delivery_estimate <- delivery_distribution %>% group_by(seller_ID) %>%
summarize(avg_estimated_days = mean(estimated_in_days)) %>%
mutate(seller_name = paste0("Seller_", seller_ID))
# Average Delivery to Carrier
avg_carrier_del_days <- delivery_distribution %>% group_by(seller_ID) %>%
summarize(mean_carrier_del = mean(del_to_carrier_in)) %>%
mutate(seller_name = paste0("Seller_", seller_ID))
# Box plot for Delivery Distribution and Estimated Delivery
delivery_dist_plot <- plot_ly(data = mean_delivery_estimate, x = ~seller_name, y = ~avg_estimated_days,
type='scatter', mode="lines", name ="Estimated Delivery", height = 600, width = 900) %>%
add_trace(data= avg_carrier_del_days, x= ~seller_name, y= ~mean_carrier_del,
type='scatter',mode="lines", name ="Carrier Delivery") %>%
add_boxplot(data = delivery_distribution, x = ~seller_name, y = ~delivered_in_days,
color = ~seller_name, type = "box" , name= ~seller_name) %>%
layout(xaxis=list(title = "Top 10 Sellers", type = "category",
categoryorder = "array",
categoryarray = mean_delivery_estimate$seller_name,
size=8),
yaxis=list(title = "Number of Days",
range = c(0,50),
size=8))
delivery_dist_plot
orders_data <- mutate(orders_data, estimatedLead = interval(ymd_hms(orders_data$order_purchase_timestamp), ymd_hms(orders_data$order_estimated_delivery_date)) %/% days(x = 1))
orders_data <- mutate(orders_data, actualLead = interval(ymd_hms(orders_data$order_purchase_timestamp), ymd_hms(orders_data$order_delivered_customer_date)) %/% days(x = 1))
orders_data <- mutate(orders_data, approvalTime = interval(ymd_hms(orders_data$order_purchase_timestamp), ymd_hms(orders_data$order_approved_at)) %/% minutes(x = 1))
orders_data <- mutate(orders_data, carrierTime = interval(ymd_hms(orders_data$order_purchase_timestamp), ymd_hms(orders_data$order_delivered_carrier_date)) %/% days(x = 1))
orders_data <- mutate(orders_data, carrierToCustomerTime = interval(ymd_hms(orders_data$order_delivered_carrier_date), ymd_hms(orders_data$order_delivered_customer_date)) %/% days(x = 1))
orders_data <- mutate(orders_data, purchaseQuarter = quarter(orders_data$order_purchase_timestamp, with_year = TRUE))
orders_data <- mutate(orders_data, purchaseMonth = month(orders_data$order_purchase_timestamp, label = TRUE, abbr = FALSE))
orders_data <- mutate(orders_data, purchaseYear = year(orders_data$order_purchase_timestamp))
lead_time <- orders_data %>% filter(orders_data$order_status == "delivered") %>% group_by_at(vars(purchaseQuarter)) %>%
summarise_at(c("estimatedLead","actualLead","approvalTime", "carrierTime", "carrierToCustomerTime"),
mean, na.rm = TRUE) %>% arrange_at(vars(purchaseQuarter))
lead_time <- mutate(lead_time, Quart.Y = paste0(purchaseQuarter, sep ="Q"))
lead_time$Quart.Y <- as.factor(lead_time$Quart.Y)
lead_time %>% ggplot(aes(x = purchaseQuarter, y =estimatedLead))+
geom_line(color = "red", alpha = 0.81, linetype = "longdash")+
geom_segment(aes(xend = 2018.4, yend = estimatedLead), linetype = 2, colour = 'grey51')+
geom_text(aes(x = 2018.3, label = Quart.Y, hjust = 1))+
geom_point(size=3, color="red", fill=alpha("orange", 0.63), alpha=0.54, shape=21, stroke=1)+
# facet_wrap(~order_status)+
coord_polar(start = 0)+
transition_reveal(purchaseQuarter)+
theme(axis.text.x = element_blank(),
axis.ticks = element_blank(),
panel.background = element_blank(),
panel.grid.major.x = element_line(color = "coral", linetype = 3),
panel.grid.major.y = element_line(color = "coral", linetype = 3),
strip.background = element_blank())+
labs(title = "Change in average Estimated Lead Time by Quarters",
x = "Purchasing Quarter/Year",
y= " Estimated average lead time [in Days]")
freq_count <- as.data.frame(table(ordertable$customer_unique_id))
odtbl <- ordertable %>% select_at(vars(year, month, customer_unique_id)) %>%
distinct_at(vars(year,month, customer_unique_id)) %>%
arrange_at(vars(year, month)) %>%
count_(vars(year, month)) %>%
ggplot() + geom_line(mapping = aes(x = month, y = n, group = 1, color="red"),show.legend = FALSE) +facet_wrap(~year)+geom_point(mapping = aes(x = month, y = n, group = 1, color="red",show.legend=FALSE),show.legend = FALSE)+labs(x="Month", y="Count of new customers added each month", title = "New customers purchased every consecutive month")
ggplotly(odtbl, height = 800, width = 1000)
Products_details1 <- left_join(products_data,translations_data, by = c("product_category_name" = "product_category_name"))
Products_orders_details1 <- left_join(items_data,Products_details1, by = "product_ID")
Products_orders_details1 <- Products_orders_details1 %>% mutate(volume = product_length_cm * product_height_cm * product_width_cm)
Products_orders_items1 <- left_join(Products_orders_details1,orders_data, by = "order_ID")
Products_orders_items1 <- left_join(Products_orders_items1, customers_data, by = "customer_ID")
Products_orders_items1 <- left_join(Products_orders_items1, sellers_data, by = "seller_ID")
Products_orders_items1 <- left_join(Products_orders_items1, reviews_data, by = "order_ID")
p5 <- Products_orders_items1 %>% mutate(product_category_name_english = ifelse(as.character(product_category_name_english) %in% Top10_Product$product_category_name_english, as.character(product_category_name_english), "Others"))%>% group_by(product_category_name_english,review_score)
p6 <- p5 %>% summarise(fv = mean(freight_value), pv = mean(price))
plot_ly(p6, x = ~review_score, y = ~pv, z = ~fv, color = ~product_category_name_english , colors = c('green', 'red'), height = 600, width=900) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Mean Review Score'),
yaxis = list(title = 'Mean Price'),
zaxis = list(title = 'Mean Freight Value')))%>%
layout(xaxis = list(title = 'State'), yaxis = list(title = 'Delivery days'))