Olist is a brazilian company founded in 2014 that operates a generalist e-marketplace that connects businesses with individual consumers. Unlike Amazon, Olist simply functions as an interface and does not hold any inventory nor sells product of its own.
The dataset we analyze was found on Kaggle. It contains anonymized record of 100k orders made on the platform. Each observation on the main csv represents one order and, using adjunct csv provided, we can match those order to additional information such as the type of product, the location of the seller, the buyer, the review given, and much more.
organization
Once datasets have been aggregated, each row will represent one order and be described with the following variables:
We have also created a bunch of auxiliary variables
The main dataframe we work with contains observation with information of each of the variables listed above. Although the dataset contains orders passed on a time span as long as two years, we have chosen to analyze only orders made in the last twelve months (Sep2017 to Aug2018) because prior to that sales are too few and volatile.
In this report we will first go through a succint exploratory analysis of interactions between variables of the dataset and from that exploration we will extract business insights or follow up questions for managers. Thereafter we will endeavour to build a predictive model for customer order satisfaction and propose practical guidelines for manager to use this model effectively.
1. Sales
LTM %>%
ggplot(aes(x=shipping_limit_date))+
geom_histogram(binwidth=1,fill="white",color="navy")+
geom_hline(yintercept = nrow(LTM)/365,color="darkred")+
theme_minimal()+
labs(x="Shipping Date",y="Count")
ggmap(Brazil_terrain,)+
geom_point(data=LTM,aes(x= customer_long, y=customer_lat),alpha=0.3,size=0.1,
color="navy")
## Warning: Removed 17 rows containing missing values (geom_point).
image
ggmap(Brazil_terrain,)+ #XX
geom_density_2d(data=LTM,aes(x= customer_long, y=customer_lat),color="darkred")
## Warning: Removed 17 rows containing non-finite values (stat_density2d).
ggmap(Sao,)+ #Density x customer 35/36
geom_density_2d(data=LTM%>%group_by(customer_id)%>%
summarise(sum=sum(total_value),lon=mean(customer_long),
lat=mean(customer_lat))%>%filter(sum>500/0.3),aes(
x=lon,y=lat))+
geom_density2d(data=LTM%>%group_by(customer_id)%>%
summarise(sum=sum(total_value),lon=mean(customer_long),
lat=mean(customer_lat))%>%filter(sum<100/0.3),aes(
x=lon,y=lat),color="red",size=0.1,alpha=0.8)
## Warning: Removed 417 rows containing non-finite values (stat_density2d).
## Warning: Removed 46659 rows containing non-finite values (stat_density2d).
ggmap(Brazil_terrain,)+ #32
geom_density_2d(data=LTM%>%
filter(price>500/0.3),aes(y=customer_lat,x=customer_long,),
size=0.5,color="blue")+
geom_density_2d(data=LTM%>%
filter(price<100/0.3),aes(y=customer_lat,x=customer_long),
size=0.5,color="red")+
theme(legend.position = "none",axis.title.x = element_blank(),
axis.title.y = element_blank())
## Warning: Removed 16 rows containing non-finite values (stat_density2d).
2. Prices
LTM%>%
group_by(Month)%>%
summarise(AveragePrice=mean(total_price))%>%
ggplot(aes(x=Month,y=AveragePrice*0.3))+
geom_col(fill = "Navy",color="Navy",alpha=0.5)+
theme_minimal()+
labs(x="Shipping Month",y="Average Dollar Value")
LTM%>%group_by(customer_id)%>%
summarise(n=n()) %>% .$n %>% summary()
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|
| 1 | 1 | 1 | 1.189167 | 1 | 34 |
LTM%>%
filter(total_value<1000&total_value>0)%>%
ggplot(aes(x=total_value))+
geom_density(color="navy",fill="Navy",alpha=0.5)+
theme_minimal()+
labs(x="Value of Order",y="Density")
LTM%>%
mutate(BinValue = cut(total_value,c(0,50,100,200,400,800,1600,Inf)))%>%
group_by(BinValue)%>%
summarise(VolumeSales = sum(total_value), Orders = n())%>%
mutate(per_volume=VolumeSales/sum(VolumeSales),per_Orders=Orders/sum(Orders))
| BinValue | VolumeSales | Orders | per_volume | per_Orders |
|---|---|---|---|---|
| (0,50] | 556182.6 | 15099 | 0.0394860 | 0.1724122 |
| (50,100] | 1991768.5 | 27141 | 0.1414050 | 0.3099172 |
| (100,200] | 3917057.0 | 27570 | 0.2780903 | 0.3148159 |
| (200,400] | 3319098.7 | 12280 | 0.2356384 | 0.1402227 |
| (400,800] | 2076930.0 | 3806 | 0.1474510 | 0.0434599 |
| (800,1.6e+03] | 1445121.6 | 1350 | 0.1025960 | 0.0154154 |
| (1.6e+03,Inf] | 779399.7 | 329 | 0.0553332 | 0.0037568 |
3. Payments
LTM %>% #(6)
group_by(payment_type)%>%
summarise(vol=sum(total_value))%>%
mutate(per=vol/sum(vol))%>%
ggplot(aes(x="",group=reorder(payment_type,-vol),y=per,fill=payment_type))+
geom_bar(color="black",stat="identity",alpha=0.8)+
theme_minimal()+
scale_fill_brewer(palette="Paired",name="Payment Type")+
labs(x="Payment Type",y="% of Sales")
LTM %>%
ggplot(aes(x=log(total_value),color=payment_type))+
geom_density(size=1)+
theme_minimal()+
scale_color_brewer(palette="Spectral",name="Payment Type")
LTM%>% group_by(payment_type)%>%
summarise(n=n(),avg_spend=mean(total_value))
| payment_type | n | avg_spend |
|---|---|---|
| boleto | 16890 | 150.3465 |
| credit_card | 65423 | 165.4633 |
| debit_card | 1419 | 143.0357 |
| voucher | 3843 | 134.8241 |
t.test(LTM%>%filter(payment_type=="boleto")%>%.$total_value,
LTM%>%filter(payment_type=="credit_card")%>%.$total_value)
##
## Welch Two Sample t-test
##
## data: LTM %>% filter(payment_type == "boleto") %>% .$total_value and LTM %>% filter(payment_type == "credit_card") %>% .$total_value
## t = -7.6851, df = 26753, p-value = 1.581e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -18.97228 -11.26133
## sample estimates:
## mean of x mean of y
## 150.3465 165.4633
4. Installments
LTM %>% filter(payment_installments<=10,payment_installments>0)%>%
ggplot(aes(x=as.factor(payment_installments)))+
geom_bar(fill="Navy",color="Navy",alpha=0.5)+
theme_minimal()+
labs(x="Payment Installments",y="count")
LTM %>%
ggplot(aes(x=log(total_value),
group=cut(payment_installments,c(-Inf,2,8,+Inf)),
color=cut(payment_installments,c(-Inf,2,8,+Inf))))+
geom_density()+
theme_minimal()+
scale_color_discrete(name="Groups of Installments")
LTM %>% group_by(cut(payment_installments,c(-Inf,2,8,+Inf)))%>%
summarise(avg_value = mean(total_value),n=n())
| cut(payment_installments, c(-Inf, 2, 8, +Inf)) | avg_value | n |
|---|---|---|
| (-Inf,2] | 126.4004 | 54464 |
| (2,8] | 187.5936 | 27649 |
| (8, Inf] | 368.8228 | 5462 |
LTM %>% group_by(cut(payment_installments,c(-Inf,1,Inf)))%>%
summarise(sm = sum(total_value),n=n()) %>%
mutate(prop_n = n/sum(n),prop_val=sm/sum(sm))
| cut(payment_installments, c(-Inf, 1, Inf)) | sm | n | prop_n | prop_val |
|---|---|---|---|---|
| (-Inf,1] | 5536983 | 43997 | 0.5023922 | 0.3930965 |
| (1, Inf] | 8548575 | 43578 | 0.4976078 | 0.6069035 |
5. Categories
LTM%>% #(12)
group_by(product_category_name_english)%>%
summarise(vol=sum(total_value))%>%
mutate(rank=rank(-vol))%>%
arrange(rank)%>%
mutate(cum=cumsum(vol))%>%
mutate(cumper=cum/max(cum))%>%
ggplot(aes(x=as.factor(rank),y=cumper,fill=rank))+
geom_col(alpha=0.8,color="white",size=0.001)+
theme_minimal()+
geom_vline(xintercept = 3,color="darkred")+
geom_vline(xintercept = 7,color="darkred")+
geom_vline(xintercept = 14,color="darkred")+
labs(x="Item Rank",y="Cumulative % of Sales")+
theme(legend.position = "none",
axis.text.x = element_text(size=4))+
scale_fill_distiller(palette ="Spectral")
LTM %>%
group_by(product_category_name_english)%>%
summarise(vol=sum(total_value))%>%
mutate(ShareVol=vol/sum(vol))%>%
filter(vol>90000/0.3)%>%
ggplot(aes(x=reorder(product_category_name_english,vol),y=ShareVol))+
geom_col(aes(fill=vol),alpha=0.9,color="black")+
theme_minimal()+
labs(x="",y="% Total Sales")+
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position="none")+
coord_flip()+
scale_y_continuous(labels=percent)
LTM %>%
group_by(Month,product_category_name_english)%>%
summarise(vol=sum(total_value))%>%
group_by(Month)%>%
filter(vol>sort(vol,T)[6])%>%
group_by(Month)%>%
mutate(rank=rank(-vol))%>%
ggplot(aes(x=Month,y=rank,fill=product_category_name_english))+
geom_tile(colour="black")+
theme_minimal()+
labs(x="Month",y="Rank(1=best)")+
theme(legend.title=element_blank(),
legend.text=element_text(size=15))
5. Freight
LTM %>%
group_by(product_category_name_english)%>%
summarise(volume=sum(total_value),avg=mean(freight_value),wh=mean(product_weight_g))%>%
filter(avg>(7/0.3))%>%
ggplot(aes(x=reorder(product_category_name_english,avg),y=avg*0.3))+
geom_col(aes(fill=volume<90000/0.3),color="black",alpha=0.8)+
theme_minimal()+
labs(x="",y="Average $ Delivery Price")+
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position="none")+
coord_flip()+
scale_fill_brewer(palette="Set1")
LTM %>%
group_by(product_category_name_english)%>%
summarise(ratio=sum(total_freight)/sum(total_value),vol=sum(total_value))%>%
arrange(desc(ratio))%>%
filter(ratio>0.18)%>%
ggplot(aes(x=reorder(product_category_name_english,ratio),y=ratio))+
geom_col(aes(fill=vol<90000/0.3),color="black")+
coord_flip()+
theme_minimal()+
labs(x="",y="DeliveryFee/OverallValue")+
scale_y_continuous(labels = percent)+
scale_fill_brewer(palette="Set1", name="Top 15 Product",labels=c("TRUE","FALSE"))
6. Sellers
LTM%>% #17
group_by(seller_id)%>%
summarise(vol = sum(total_value))%>%
mutate(per=vol/sum(vol))%>%
arrange(desc(vol))%>%
mutate(cum=cumsum(per),r=rank(-per,ties.method = "random"))%>%
filter(r<2000)%>%
ggplot(aes(y=cum,x=r))+
geom_col(fill="navy",color="Navy",alpha=0.1)+
geom_vline(xintercept = 27,color="darkred")+
geom_vline(xintercept = 125,color="darkred")+
geom_vline(xintercept = 420,color="darkred")+
theme_minimal()+
labs(x="Rank",y="Cumulative percentage of Sales")
ggmap(Brazil_terrain,)+ #18
geom_point(data=LTM%>%
group_by(seller_id)%>%
summarise(vol = sum(total_value),lat=mean(seller_lat),lon=mean(seller_long))%>%
mutate(per=vol/sum(vol,na.rm=TRUE))%>%
arrange(desc(vol))%>%
mutate(cum=cumsum(per),r=rank(-per,ties.method = "random"))%>%
filter(r<125),aes(x=lon,y=lat,size=vol,color=r<27),
alpha=0.5)+
theme(legend.position = "none",axis.title.x = element_blank(),
axis.title.y = element_blank())
7. Delivery Process
LTM %>% # 19
ggplot(aes(x=clicktodoor))+
geom_histogram(bins=60,fill="navy",color="Black",alpha=0.8)+
xlim(0,60)+
theme_minimal()+
labs(y="Click-to-Reception Days")
## Warning: Removed 255 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
half of the orders arrives in 10 days to destination after the customer first clicked to purchase. This is much higher Amazon which orders on average arrive in only 3.28 days. This is surely a kpi which Olist should strive to improve, but this is nonetheless not a main point of preoccupation given the relative youth of the company. 3 years ago, Amazon package took on average 6 days to come to destination.
The interesting distribution below, of orders as a function of distance traveled, is not smooth at all. The first two peaks can be explained in the following manner: Rio and Sao Paulo are the two main centers of economic activity and are separated by roughly 400 km, the distance between the peaks. The massive amount of bilateral trade between those cities, who trade either to their agglomeration or the agglomeration of the other cities, cause this pattern of booms and busts. The other peaks should also be the result of a similar phenomenon at play.
LTM %>% #20
filter(distance<3000)%>%
ggplot(aes(x=distance))+
geom_density(fill="darkred",alpha=0.3)+
geom_histogram(bins=100,fill="Navy",color="grey50",aes(y=..density..))+
theme_minimal()
On median, a package travels 427 Km, which, if divided by the median time taken by a package to reach its destination, gives 42km a day, which does not seem like a lot. Another key performance indicator that managers should focus on improving.
LTM%>% #21
group_by(payment_type)%>%
summarise(mn=mean(approval_time),mnn=mean(review_score))%>%
ggplot(aes(x=payment_type,y=mn))+
geom_col(fill="Navy",alpha=0.8,color="black")+
theme_minimal()+
labs(x="Payment Type",y="Payment Approval Time in Days")
LTM %>% #24
filter(transit_logistic<90,distance<4000)%>%
ggplot(aes(x=distance,y=transit_logistic))+
geom_smooth(aes(x=distance,y=transit_logistic))+
theme_minimal()+
labs(y="Carrier Transit")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
LTM %>% #23
group_by(Sendoff_time)%>%
summarise(cn=n(),avg=mean(review_score))%>%
filter(Sendoff_time>=0& Sendoff_time<35)%>%
ggplot(aes(x=Sendoff_time,y=avg))+
geom_col(fill="Navy",color="black",alpha=0.7)+
geom_smooth()+
theme_minimal()+
labs(x="Sendoff Time",y="Review Score")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
LTM %>% #25
filter(transit_logistic<90,distance<4000,price>10/0.3) %>%
group_by(product_category_name_english)%>%
mutate(bigpay=ratio_delivery>=1*median(ratio_delivery))%>%
group_by(product_category_name_english,bigpay)%>%
summarise(avg=mean(transit_logistic))%>%
ggplot(aes(x=reorder(product_category_name_english,-avg),
y=avg,
fill=bigpay))+
theme_minimal()+
geom_col(position="identity",alpha=0.5)+
theme(axis.text.x = element_blank())+
labs(x="Categories",y="Average Transit Days")+
scale_fill_brewer(palette = "Set1")
LTM$lag <- LTM$expectedtransit-LTM$transit
LTM%>% #26
ggplot()+
geom_smooth(aes(x=distance,y=expectedtransit))+
geom_smooth(aes(x=distance,y=transit),color="red")+
theme_minimal()+labs(y="Red=actual/Blue=Expected")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).
LTM %>% #27
filter(transit<60)%>%
ggplot(position="identity")+
geom_density(aes(x=transit),fill="Navy",alpha=0.5)+
geom_density(aes(x=expectedtransit),fill="red",alpha=0.5)+
theme_minimal()+
ggtitle("Blue = Actual , Red = Predicted")+
labs(x="Transit Days",y="Density")
t.test(LTM$transit,LTM$expectedtransit)
##
## Welch Two Sample t-test
##
## data: LTM$transit and LTM$expectedtransit
## t = -263.74, df = 174335, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -11.49431 -11.32473
## sample estimates:
## mean of x mean of y
## 11.93916 23.34867
LTM %>% #28
filter(lag>-30&lag<50)%>%
ggplot(aes(x=as.integer(lag)))+
geom_histogram(aes(y=..density..),bins=80,fill="navy",color="grey50")+
geom_density()+
theme_minimal()+
geom_vline(color="darkred",xintercept = 0,size=1)
ggmap(Brazil_terrain,)+ #29
geom_density_2d(data=LTM%>%
filter(lag<0),aes(y=customer_lat,x=customer_long,size=-(as.integer(lag))),
alpha=0.8,color="red")+
geom_density_2d(data=LTM%>%
filter(lag>10),aes(y=customer_lat,x=customer_long,size=-(as.integer(lag))),
alpha=0.8,)+
theme(legend.position = "none",axis.title.x = element_blank(),
axis.title.y = element_blank())
## Warning: Removed 2 rows containing non-finite values (stat_density2d).
## Warning: Removed 7 rows containing non-finite values (stat_density2d).
8. Reviews
LTM %>% ggplot(aes(x=review_score ,y=stat(count)/sum(stat(count))))+
geom_histogram(color="black",fill="navy",alpha=0.8)+
theme_minimal()+
labs(x="Review Score",y="Count")+
scale_y_continuous(labels=percent)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
LTM %>% group_by(product_category_name_english)%>%
summarise(mn=mean(review_score),vol=sum(total_value),cn=n())%>%filter(cn>500)%>%
arrange(-mn)%>%
ggplot(aes(x=reorder(product_category_name_english,mn),
y=mn-3,
fill=vol>90000/0.3))+
geom_col(color="black",alpha=0.8)+coord_flip()+
scale_fill_brewer(palette = "Set1",name="Top 15 Sales")+
scale_y_continuous(labels = function(x){x+3})+
labs(y="Rating",x="")+
theme_minimal()
LTM %>% filter(lag>-50&lag<50)%>%ggplot(aes(x=lag))+
geom_density(aes(color=as.factor(review_score)),size=0.5)+
scale_color_discrete(name="Review Score")+
theme_minimal()+
labs(x="Day Lags",y="")+
scale_color_brewer(palette="Spectral")
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
LTM$Timetoreview <- LTM$review_answer_timestamp-LTM$order_delivered_customer_date
LTM %>% filter(Timetoreview>-5&Timetoreview<10)%>%
ggplot(aes(x=Timetoreview,
fill=as.factor(review_score),
y=..density..))+
geom_histogram(position="dodge")+
theme_minimal()+
scale_fill_brewer(palette="Spectral")+
theme(legend.position="none")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.